dbdiff 0.6.7__tar.gz → 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dbdiff-0.6.7 → dbdiff-0.7.0}/LICENSE +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/PKG-INFO +29 -8
- {dbdiff-0.6.7 → dbdiff-0.7.0}/README.md +3 -0
- dbdiff-0.7.0/setup.py +67 -0
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/__init__.py +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/__main__.py +1 -0
- dbdiff-0.7.0/src/dbdiff/cli.py +491 -0
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/logging.json +1 -1
- dbdiff-0.7.0/src/dbdiff/main.py +728 -0
- dbdiff-0.7.0/src/dbdiff/report.py +165 -0
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/all_keys_count.sql +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/all_keys_sample.sql +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/create_dedup.sql +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/create_dup.sql +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/create_joined_table_from_selectinto.sql +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/create_temp_table.sql +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/diff_column.sql +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/diff_column_hier.sql +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/diff_column_numeric_diffs_binned.sql +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/diff_column_numeric_diffs_sorted.sql +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/diff_column_raw.sql +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/diff_column_summary.sql +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/diff_rows_sample.sql +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/html/base.html +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/html/report.html +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/insert_diff.sql +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/insert_joined_table.sql +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/joined_column.sql +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/joined_column_hier.sql +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/joined_column_numeric_diffs_binned.sql +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/joined_column_numeric_diffs_sorted.sql +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/joined_count.sql +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/joined_rows_count.sql +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/joined_rows_sample.sql +1 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/sub_keys_count.sql +0 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/sub_keys_grouped.sql +0 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/sub_keys_sample.sql +0 -1
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/table_columns.sql +1 -1
- dbdiff-0.7.0/src/dbdiff/templates/table_drop.sql +1 -0
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/table_exists.sql +1 -1
- dbdiff-0.7.0/src/dbdiff/templates/table_rows.sql +2 -0
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/table_rows_uniq.sql +1 -1
- dbdiff-0.7.0/src/dbdiff/vertica.py +126 -0
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff.egg-info/PKG-INFO +29 -8
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff.egg-info/requires.txt +1 -1
- dbdiff-0.7.0/tests/test_dbdiff.py +478 -0
- dbdiff-0.6.7/setup.py +0 -84
- dbdiff-0.6.7/src/dbdiff/cli.py +0 -362
- dbdiff-0.6.7/src/dbdiff/main.py +0 -583
- dbdiff-0.6.7/src/dbdiff/report.py +0 -114
- dbdiff-0.6.7/src/dbdiff/templates/table_drop.sql +0 -1
- dbdiff-0.6.7/src/dbdiff/templates/table_rows.sql +0 -2
- dbdiff-0.6.7/src/dbdiff/vertica.py +0 -126
- dbdiff-0.6.7/tests/test_dbdiff.py +0 -449
- {dbdiff-0.6.7 → dbdiff-0.7.0}/AUTHORS.md +0 -0
- {dbdiff-0.6.7 → dbdiff-0.7.0}/CHANGELOG.md +0 -0
- {dbdiff-0.6.7 → dbdiff-0.7.0}/MANIFEST.in +0 -0
- {dbdiff-0.6.7 → dbdiff-0.7.0}/setup.cfg +0 -0
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/create_joined_table.sql +0 -0
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/first_key_base.sql +0 -0
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/first_key_count.sql +0 -0
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/first_key_sample.sql +0 -0
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/joined_column_raw.sql +0 -0
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/sub_keys_base.sql +0 -0
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff.egg-info/SOURCES.txt +0 -0
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff.egg-info/dependency_links.txt +0 -0
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff.egg-info/entry_points.txt +0 -0
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff.egg-info/not-zip-safe +0 -0
- {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff.egg-info/top_level.txt +0 -0
|
@@ -6,4 +6,4 @@ Permission is hereby granted, free of charge, to any person obtaining a copy of
|
|
|
6
6
|
|
|
7
7
|
The above copyright notice and this permission notice (including the next paragraph) shall be included in all copies or substantial portions of the Software.
|
|
8
8
|
|
|
9
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
9
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -1,13 +1,12 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: dbdiff
|
|
3
|
-
Version: 0.6.7
|
|
3
|
+
Version: 0.7.0
|
|
4
4
|
Summary: Compare two tables on Vertica.
|
|
5
5
|
Home-page: https://github.com/andyreagan/dbdiff
|
|
6
6
|
Author: Andy Reagan
|
|
7
7
|
Author-email: andy@andyreagan.com
|
|
8
8
|
License: MIT
|
|
9
9
|
Project-URL: Issue Tracker, https://github.com/andyreagan/dbdiff/issues
|
|
10
|
-
Platform: UNKNOWN
|
|
11
10
|
Classifier: Development Status :: 5 - Production/Stable
|
|
12
11
|
Classifier: Intended Audience :: Developers
|
|
13
12
|
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -15,17 +14,41 @@ Classifier: Operating System :: Unix
|
|
|
15
14
|
Classifier: Operating System :: POSIX
|
|
16
15
|
Classifier: Programming Language :: Python
|
|
17
16
|
Classifier: Programming Language :: Python :: 3
|
|
18
|
-
Classifier: Programming Language :: Python :: 3.
|
|
19
|
-
Classifier: Programming Language :: Python :: 3.
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
21
|
Classifier: Topic :: Utilities
|
|
21
|
-
Requires-Python: >=3.
|
|
22
|
+
Requires-Python: >=3.9
|
|
22
23
|
Description-Content-Type: text/markdown
|
|
23
24
|
License-File: LICENSE
|
|
24
25
|
License-File: AUTHORS.md
|
|
26
|
+
Requires-Dist: click
|
|
27
|
+
Requires-Dist: requests
|
|
28
|
+
Requires-Dist: pandas>=1.0.0
|
|
29
|
+
Requires-Dist: Jinja2
|
|
30
|
+
Requires-Dist: python-dotenv
|
|
31
|
+
Requires-Dist: vertica_python
|
|
32
|
+
Requires-Dist: xlsxwriter
|
|
33
|
+
Dynamic: author
|
|
34
|
+
Dynamic: author-email
|
|
35
|
+
Dynamic: classifier
|
|
36
|
+
Dynamic: description
|
|
37
|
+
Dynamic: description-content-type
|
|
38
|
+
Dynamic: home-page
|
|
39
|
+
Dynamic: license
|
|
40
|
+
Dynamic: license-file
|
|
41
|
+
Dynamic: project-url
|
|
42
|
+
Dynamic: requires-dist
|
|
43
|
+
Dynamic: requires-python
|
|
44
|
+
Dynamic: summary
|
|
25
45
|
|
|
26
46
|
Overview
|
|
27
47
|
========
|
|
28
48
|
|
|
49
|
+
[![Tests](https://github.com/andyreagan/dbdiff/actions/workflows/vertica-test.yml/badge.svg)](https://github.com/andyreagan/dbdiff/actions/workflows/vertica-test.yml)
|
|
50
|
+
[![PyPI version](https://badge.fury.io/py/dbdiff.svg)](https://badge.fury.io/py/dbdiff)
|
|
51
|
+
|
|
29
52
|
Compare two tables on Vertica,
|
|
30
53
|
that are expected to be exactly the same.
|
|
31
54
|
|
|
@@ -168,5 +191,3 @@ Locally, in a separate terminal window, you can start one of these like:
|
|
|
168
191
|
To run the all tests run:
|
|
169
192
|
|
|
170
193
|
tox
|
|
171
|
-
|
|
172
|
-
|
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
Overview
|
|
2
2
|
========
|
|
3
3
|
|
|
4
|
+
[![Tests](https://github.com/andyreagan/dbdiff/actions/workflows/vertica-test.yml/badge.svg)](https://github.com/andyreagan/dbdiff/actions/workflows/vertica-test.yml)
|
|
5
|
+
[![PyPI version](https://badge.fury.io/py/dbdiff.svg)](https://badge.fury.io/py/dbdiff)
|
|
6
|
+
|
|
4
7
|
Compare two tables on Vertica,
|
|
5
8
|
that are expected to be exactly the same.
|
|
6
9
|
|
dbdiff-0.7.0/setup.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
from glob import glob
|
|
3
|
+
from pathlib import Path, PurePath
|
|
4
|
+
|
|
5
|
+
from setuptools import find_packages, setup
|
|
6
|
+
|
|
7
|
+
from src import dbdiff
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def read(*names, **kwargs):
    """Return the text content of a file located relative to this script.

    ``names`` are path components joined onto the directory that contains
    this file. The optional ``encoding`` keyword selects the text encoding
    and defaults to ``"utf8"``.
    """
    base_dir = Path(__file__).parent
    target = base_dir.joinpath(*names)
    encoding = kwargs.get("encoding", "utf8")
    with target.open(encoding=encoding) as handle:
        return handle.read()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Package metadata, collected in one mapping and handed to setuptools below.
_SETUP_KWARGS = {
    "name": "dbdiff",
    # The version is read from the package itself (src/dbdiff/__init__.py).
    "version": dbdiff.__version__,
    "license": "MIT",
    "description": "Compare two tables on Vertica.",
    # The README doubles as the long description shown on the package index.
    "long_description": read("README.md"),
    "long_description_content_type": "text/markdown",
    "author": "Andy Reagan",
    "author_email": "andy@andyreagan.com",
    "url": "https://github.com/andyreagan/dbdiff",
    # Sources live under src/ (the "src layout").
    "packages": find_packages("src"),
    "package_dir": {"": "src"},
    # Any top-level modules sitting directly in src/ are shipped as well.
    "py_modules": [PurePath(path).stem for path in glob("src/*.py")],
    "include_package_data": True,
    "zip_safe": False,
    "classifiers": [
        # complete classifier list: http://pypi.python.org/pypi?%3Aaction=list_classifiers
        "Development Status :: 5 - Production/Stable",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Operating System :: Unix",
        "Operating System :: POSIX",
        "Programming Language :: Python",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Topic :: Utilities",
    ],
    "project_urls": {
        "Issue Tracker": "https://github.com/andyreagan/dbdiff/issues",
    },
    "keywords": [],
    "python_requires": ">=3.9",
    "install_requires": [
        "click",
        "requests",
        "pandas>=1.0.0",
        "Jinja2",
        "python-dotenv",
        "vertica_python",
        "xlsxwriter",
    ],
    "extras_require": {},
    # Installs the `dbdiff` command, bound to the click entry point in cli.py.
    "entry_points": {
        "console_scripts": [
            "dbdiff = dbdiff.cli:cli",
        ]
    },
}

setup(**_SETUP_KWARGS)
|
|
@@ -0,0 +1,491 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
import logging.config
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import click
|
|
8
|
+
import pandas as pd
|
|
9
|
+
from jinja2 import Environment, PackageLoader
|
|
10
|
+
from vertica_python.vertica.cursor import Cursor
|
|
11
|
+
|
|
12
|
+
from dbdiff import __version__
|
|
13
|
+
from dbdiff.main import (
|
|
14
|
+
check_primary_key,
|
|
15
|
+
create_diff_table,
|
|
16
|
+
create_joined_table,
|
|
17
|
+
get_all_col_info,
|
|
18
|
+
get_column_diffs,
|
|
19
|
+
get_column_diffs_from_joined,
|
|
20
|
+
get_diff_columns,
|
|
21
|
+
get_diff_rows,
|
|
22
|
+
get_diff_rows_from_joined,
|
|
23
|
+
get_unmatched_rows,
|
|
24
|
+
get_unmatched_rows_straight,
|
|
25
|
+
insert_diff_table,
|
|
26
|
+
select_distinct_rows,
|
|
27
|
+
)
|
|
28
|
+
from dbdiff.report import excel_report, html_report
|
|
29
|
+
from dbdiff.vertica import get_cur
|
|
30
|
+
|
|
31
|
+
JINJA_ENV = Environment(loader=PackageLoader("dbdiff", "templates"))
|
|
32
|
+
DEFAULT_LOGGING_CONFIG = Path(__file__).with_name("logging.json")
|
|
33
|
+
LOGGER = logging.getLogger(__name__)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def initialize_logging(config: Path) -> None:
    """
    Configure the ``logging`` module from a JSON file.

    The file at ``config`` is parsed as JSON and the resulting dictionary is
    passed to ``logging.config.dictConfig``.
    """
    settings = json.loads(config.read_text())
    logging.config.dictConfig(settings)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def df_to_dict(x: Any) -> Any:
    """Convert a DataFrame to a list of row dicts; return anything else as-is."""
    if not isinstance(x, pd.DataFrame):
        return x
    return x.to_dict("records")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def get_summary_from_all_info(d: dict) -> dict:
    """Build a summary dict from the results dict ``d``, without DataFrames.

    Scalar-like entries are copied through unchanged. Entries that are (or
    contain, one or two mapping levels down) DataFrames are converted with
    ``df_to_dict`` so that each DataFrame becomes a list of row dicts.
    """

    def _convert_values(mapping: dict) -> dict:
        # Convert every value of a flat mapping; non-DataFrames pass through.
        return {key: df_to_dict(value) for key, value in mapping.items()}

    passthrough_keys = (
        "x_schema",
        "x_table",
        "y_schema",
        "y_table",
        "join_cols",
        "total_row_count",
        "dedup_info",
    )
    summary = {key: d[key] for key in passthrough_keys}

    # column name -> {field name -> value or DataFrame}
    summary["column_info"] = {
        col: _convert_values(info) for col, info in d["column_info"].items()
    }
    # this is a dataframe:
    summary["column_match_info"] = df_to_dict(d["column_match_info"])
    # side -> {field name -> value or DataFrame}
    summary["missing_join_info"] = {
        side: _convert_values(info) for side, info in d["missing_join_info"].items()
    }
    # copied through unchanged
    summary["diff_summary"] = d["diff_summary"]
    # column name -> side -> {field name -> value or DataFrame}
    summary["hierarchical_join_info"] = {
        col: {side: _convert_values(info) for side, info in col_info.items()}
        for col, col_info in d["hierarchical_join_info"].items()
    }
    return summary
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _split_csv(value: str) -> list:
    """Split a comma-separated option into lower-cased, stripped, non-empty names."""
    return [part.strip().lower() for part in value.split(",") if part.strip()]


def _temp_table_from_query_file(cur: Cursor, query_file: str, label: str) -> str:
    """Create a temp table from the SQL stored in ``query_file``.

    The table is named after the file stem, which is returned. ``label`` is
    only used in the log message ("x" or "y").
    """
    with open(query_file) as f:
        query = f.read()
    table_name = Path(query_file).stem
    LOGGER.info("Creating temp table from query for %s.", label)
    q_create = JINJA_ENV.get_template("create_temp_table.sql").render(
        table_name=table_name, query=query
    )
    LOGGER.info(q_create)
    cur.execute(q_create)
    return table_name


# Command-line entry point. The docstring of `cli` is the text click prints
# for --help, so developer notes live in comments instead:
#   * y_schema / output_schema default to SCHEMA when not given.
#   * JOIN_COLS and --exclude-columns are comma separated; names are
#     lower-cased, surrounding whitespace is stripped and empty entries are
#     dropped (so the default --exclude-columns "" yields an empty set).
#   * With --x-table-query / --y-table-query the positional table argument is
#     a path to a .sql file; a temp table named after the file stem is created
#     and the corresponding schema becomes "v_temp_schema".
#   * Writes <x_table>_report.html or <x_table>_report.xlsx, and optionally
#     <x_table>_diff_summary.json, into the current working directory.
@click.command()
@click.argument("schema")
@click.argument("x_table")
@click.argument("y_table")
@click.argument("join_cols")
@click.option(
    "--y-schema", default=None, help="If the schema for the y_table is different, specify it."
)
@click.option(
    "--output-schema",
    default=None,
    help="If you want the schema for the output tables to be different, specify it.",
)
@click.option(
    "--drop-output-tables",
    is_flag=True,
    help="Drop the joined and diff tables created and used here.",
)
@click.option(
    "--x-table-query",
    is_flag=True,
    help="If X_TABLE is not a table in Vertica, but rather a query stored in a file, add this flag and the query will be read and instantiated into a temporary table. Ex: 'temp_xtable_name_to_use.sql'.",
)
@click.option(
    "--y-table-query",
    is_flag=True,
    help="If Y_TABLE is not a table in Vertica, but rather a query stored in a file, add this flag and the query will be read and instantiated into a temporary table.",
)
@click.option(
    "--exclude-columns", default="", help="Comma separated string of column names to exclude."
)
@click.option(
    "--hierarchical-join",
    is_flag=True,
    help="If multiple join keys, and join key #2 is a subset of join key #1. We expect matches for all of #1 from both tables even if we dont match on #1 and #2. This way, we can have more nuanced output by first breaking out missing on the first key.",
)
@click.option(
    "--max-rows-all",
    default=10,
    help="Limit of full rows to pull that have differences.",
    show_default=True,
)
@click.option(
    "--max-rows-column",
    default=10,
    help="Limit of grouped and raw column level differences to pull.",
    show_default=True,
)
@click.option(
    "--output-format", type=click.Choice(["HTML", "XLSX"], case_sensitive=False), default="HTML"
)
@click.option(
    "--save-column-summary", is_flag=True, help="Save the column dtype and match summary."
)
@click.option(
    "--save-column-summary-format",
    type=click.Choice(["CSV", "PICKLE"], case_sensitive=False),
    default="CSV",
)
@click.option(
    "--skip-row-total",
    is_flag=True,
    help="Skip counting the total # of rows with differences, only use cell differences.",
)
@click.option("--use-diff-table", is_flag=True, help="Use a diff table in the middle.")
@click.option("--logging-config", type=Path, default=DEFAULT_LOGGING_CONFIG)
@click.option(
    "--case-insensitive",
    is_flag=True,
    help="If using this flag, all case sensitivity is turned off.",
)
@click.option("--save-json-summary", is_flag=True, help="Save a .json file of the diff summary.")
@click.version_option(__version__)
def cli(
    schema: str,
    x_table: str,
    y_table: str,
    join_cols: str,
    y_schema: str,
    output_schema: str,
    drop_output_tables: bool,
    x_table_query: bool,
    y_table_query: bool,
    exclude_columns: str,
    hierarchical_join: bool,
    max_rows_all: int,
    max_rows_column: int,
    output_format: str,
    save_column_summary: bool,
    save_column_summary_format: str,
    skip_row_total: bool,
    use_diff_table: bool,
    logging_config: Path,
    case_insensitive: bool,
    save_json_summary: bool,
):
    """Compare two flat files X_TABLE and Y_TABLE, using Vertica as the join engine.
    Assume they are both in the same schema = SCHEMA.
    Join them on the columns in comma-separated string JOIN_COLS.
    Expects that the join columns have matching data type or will implicitly cast for comparison,
    and implicity cast into the type in [X_TABLE] for the JOINED table.
    Expects that all other columns with matchings names (those that will be compared)
    can be compared directly (it will cast all dtypes for comparison to the type in X_TABLE).

    Will drop [X_TABLE]_DIFF (if --use-diff-table) and will drop [X_TABLE]_JOINED if they exist."""
    # default y_schema to be the same as x
    if y_schema is None:
        y_schema = schema
    if output_schema is None:
        output_schema = schema

    join_cols_list = _split_csv(join_cols)
    if not join_cols_list:
        # Fail early with a usage error instead of sending SQL without join keys.
        raise click.BadParameter(
            "at least one join column is required", param_hint="JOIN_COLS"
        )
    exclude_columns_set = set(_split_csv(exclude_columns))
    initialize_logging(logging_config)

    with get_cur() as cur:
        if x_table_query:
            x_table = _temp_table_from_query_file(cur, x_table, "x")
            schema = "v_temp_schema"
        if y_table_query:
            y_table = _temp_table_from_query_file(cur, y_table, "y")
            y_schema = "v_temp_schema"

        all_info = main(
            cur=cur,
            x_schema=schema,
            x_table=x_table,
            y_schema=y_schema,
            y_table=y_table,
            output_schema=output_schema,
            join_cols=join_cols_list,
            exclude_columns=exclude_columns_set,
            max_rows_all=max_rows_all,
            max_rows_column=max_rows_column,
            drop_output_tables=drop_output_tables,
            hierarchical_join=hierarchical_join,
            save_column_summary=save_column_summary,
            save_column_summary_format=save_column_summary_format,
            skip_row_total=skip_row_total,
            use_diff_table=use_diff_table,
            case_insensitive=case_insensitive,
        )

        # The choice is case-insensitive; normalise in case click hands back
        # the value as the user typed it.
        chosen_format = output_format.upper()
        if chosen_format == "HTML":
            report = html_report(**all_info)
            with open(x_table + "_report.html", "w") as f:
                f.write(report)
        elif chosen_format == "XLSX":
            reports = excel_report(**all_info)
            with pd.ExcelWriter(x_table + "_report.xlsx", engine="xlsxwriter") as writer:
                for sheet_name, df in reports:
                    df.to_excel(writer, sheet_name=sheet_name, index=False)

        if save_json_summary:
            # DataFrames are converted to lists of row dicts; anything else
            # json cannot encode is stringified via default=str.
            summary_info = get_summary_from_all_info(all_info)
            Path(f"{x_table}_diff_summary.json").write_text(
                json.dumps(summary_info, indent=4, default=str)
            )
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def main(
    cur: Cursor,
    x_schema: str,
    x_table: str,
    y_schema: str,
    y_table: str,
    output_schema: str,
    join_cols: list,
    exclude_columns: set,
    max_rows_all: int,
    max_rows_column: int,
    drop_output_tables: bool,
    hierarchical_join: bool,
    save_column_summary: bool,
    save_column_summary_format: str,
    skip_row_total: bool,
    use_diff_table: bool,
    case_insensitive: bool,
):
    """Compare two tables and return everything needed to build a report.

    A separate function from cli() so that it can be imported easily as well.

    Steps, in order: optionally switch the session to a case-insensitive
    locale; collect column info for both tables; verify every join column is
    comparable; check uniqueness of the join keys; optionally collect
    per-key unmatched rows (``hierarchical_join``); de-duplicate either table
    if its keys are not unique; collect unmatched rows; build the
    ``<x_table>_JOINED`` table in ``output_schema``; compute column and row
    differences (through a ``<x_table>_DIFF`` table when ``use_diff_table``);
    optionally drop the output tables.

    Returns:
        dict with keys ``x_schema``, ``y_schema``, ``x_table``, ``y_table``,
        ``join_cols``, ``total_row_count``, ``column_info``,
        ``column_match_info``, ``missing_join_info``,
        ``hierarchical_join_info``, ``dedup_info`` and ``diff_summary``.
        Schema/table names reflect any de-duplicated replacements.

    Raises:
        RuntimeError: if a join column is not among the comparable columns.
    """
    if case_insensitive:
        LOGGER.info("Setting to case insensitive.")
        cur.execute("SET LOCALE TO 'en_US@colstrength=1';")
        # clear the results
        cur.fetchall()

    all_col_info_df = get_all_col_info(
        cur,
        x_schema,
        x_table,
        y_schema,
        y_table,
        exclude_columns,
        save_column_summary,
        save_column_summary_format,
    )
    # Columns present on both sides, comparable, and not excluded.
    comparable_filter = (
        ~all_col_info_df.exclude
        & all_col_info_df.comparable
        & ~all_col_info_df.x_dtype.isnull()
        & ~all_col_info_df.y_dtype.isnull()
    )
    # Columns present on both sides whose dtypes cannot be compared.
    all_col_info_df["uncomparable"] = (
        (~all_col_info_df.comparable)
        & (~all_col_info_df.x_dtype.isnull())
        & (~all_col_info_df.y_dtype.isnull())
    )
    # check that the join cols exist on both tables
    for col in join_cols:
        if all_col_info_df.loc[comparable_filter & (all_col_info_df.index == col), :].shape[0] == 0:
            # A column absent from both tables has no row at all; guard the
            # lookup so the intended RuntimeError is raised, not a KeyError.
            if col in all_col_info_df.index:
                col_details = all_col_info_df.loc[col, :].to_string()
            else:
                col_details = "(no information: column not found in either table)"
            raise RuntimeError(
                "Column `{}` not in comparable columns (missing from one, both, or bad dtype). Here is the info we do have about that col:\n".format(
                    col
                )
                + col_details
            )

    LOGGER.info("Checking primary keys.")
    x = check_primary_key(cur=cur, schema=x_schema, table=x_table, join_cols=join_cols)
    y = check_primary_key(cur=cur, schema=y_schema, table=y_table, join_cols=join_cols)
    # Keyed by the table names as passed in, i.e. before any de-duplication.
    dedup_info = {x_table: {"count": x}, y_table: {"count": y}}

    if hierarchical_join:
        LOGGER.info("Getting rows that are missing on each join key.")
        hierarchical_join_info = get_unmatched_rows(
            cur=cur,
            x_schema=x_schema,
            y_schema=y_schema,
            x_table=x_table,
            y_table=y_table,
            join_cols=join_cols,
            max_rows_column=max_rows_column,
        )
    else:
        hierarchical_join_info = {}

    # create sub-tables to allow a comparison:
    if x != 0:
        LOGGER.info("X table was not unique on join keys, creating _dedup and _dup versions.")
        # Rebind x_schema (not a throwaway name) so later queries look for the
        # de-duplicated table where it was created, mirroring the y branch.
        x_schema, x_table = select_distinct_rows(
            cur,
            x_schema,
            x_table,
            join_cols,
            use_temp_tables=(drop_output_tables or x_schema == "v_temp_schema"),
        )
    if y != 0:
        LOGGER.info("Y table was not unique on join keys, creating _dedup and _dup versions.")
        y_schema, y_table = select_distinct_rows(
            cur,
            y_schema,
            y_table,
            join_cols,
            use_temp_tables=(drop_output_tables or y_schema == "v_temp_schema"),
        )

    # Output table names derive from the (possibly de-duplicated) x table name.
    joined_table = x_table + "_JOINED"
    diff_table = x_table + "_DIFF"

    LOGGER.info("Getting rows that did not match (not in joined table) after deduping.")
    missing_join_info = get_unmatched_rows_straight(
        cur=cur,
        x_schema=x_schema,
        y_schema=y_schema,
        x_table=x_table,
        y_table=y_table,
        join_cols=join_cols,
        max_rows_column=max_rows_column,
    )

    # build the joined table
    LOGGER.info("Building joined table %s", joined_table)
    joined_row_count = create_joined_table(
        cur=cur,
        x_schema=x_schema,
        y_schema=y_schema,
        x_table=x_table,
        y_table=y_table,
        join_cols=join_cols,
        compare_cols=all_col_info_df.loc[comparable_filter, :],
        joined_schema=output_schema,
        joined_table=joined_table,
    )

    if use_diff_table:
        # build the diff table
        LOGGER.info("Building diff table %s.", diff_table)
        create_diff_table(
            cur=cur,
            schema=output_schema,
            table=diff_table,
            join_cols=join_cols,
            all_col_info_df=all_col_info_df,
        )
        # One insert per comparable, non-join column.
        non_join_columns = all_col_info_df.loc[
            comparable_filter & ~all_col_info_df.index.isin(join_cols), :
        ].index.values
        for column in non_join_columns:
            LOGGER.info("Inserting column " + column + " into diff table.")
            insert_diff_table(
                cur=cur,
                joined_schema=output_schema,
                joined_table=joined_table,
                diff_schema=output_schema,
                diff_table=diff_table,
                join_cols=join_cols,
                column=column,
            )

        ############################################################################
        # Result 1: Get rows with at least N=1 difference (count, query, dataframe),
        ############################################################################
        diff_summary = get_diff_rows(
            cur, output_schema, x_table, join_cols, max_rows_all, skip_row_total
        )

        ############################################################################
        # Result 2: Get ordered list of columns by # of differences (query, dataframe).
        ############################################################################
        diff_columns = get_diff_columns(cur, output_schema, x_table)

        ############################################################################
        # Result 3: Get detailed column diffs.
        ############################################################################
        grouped_column_diffs = get_column_diffs(
            diff_columns,
            cur,
            output_schema,
            x_schema,
            x_table,
            y_schema,
            y_table,
            join_cols,
            max_rows_column,
            all_col_info_df,
            hierarchical_join,
        )

    else:
        # Without a diff table, both results are computed from the joined table.
        grouped_column_diffs = get_column_diffs_from_joined(
            cur=cur,
            output_schema=output_schema,
            x_schema=x_schema,
            x_table=x_table,
            y_schema=y_schema,
            y_table=y_table,
            join_cols=join_cols,
            max_rows_column=max_rows_column,
            all_col_info_df=all_col_info_df,
            comparable_filter=comparable_filter,
            hierarchical=hierarchical_join,
        )
        diff_summary = get_diff_rows_from_joined(
            cur=cur,
            grouped_column_diffs=grouped_column_diffs,
            output_schema=output_schema,
            x_table=x_table,
            join_cols=join_cols,
            max_rows_all=max_rows_all,
            skip_row_total=skip_row_total,
        )

    all_info = {
        "x_schema": x_schema,
        "y_schema": y_schema,
        "x_table": x_table,
        "y_table": y_table,
        "join_cols": join_cols,
        "total_row_count": joined_row_count,
        "column_info": grouped_column_diffs,
        "column_match_info": all_col_info_df,
        "missing_join_info": missing_join_info,
        "hierarchical_join_info": hierarchical_join_info,
        "dedup_info": dedup_info,
        "diff_summary": diff_summary,
    }

    if drop_output_tables:
        LOGGER.info("Dropping output tables. WARNING: queries in the report won't work!")
        drop_template = JINJA_ENV.get_template("table_drop.sql")
        cur.execute(drop_template.render(schema_name=output_schema, table_name=joined_table))
        if use_diff_table:
            cur.execute(drop_template.render(schema_name=output_schema, table_name=diff_table))

    return all_info
|