dbdiff 0.6.7__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. {dbdiff-0.6.7 → dbdiff-0.7.0}/LICENSE +1 -1
  2. {dbdiff-0.6.7 → dbdiff-0.7.0}/PKG-INFO +29 -8
  3. {dbdiff-0.6.7 → dbdiff-0.7.0}/README.md +3 -0
  4. dbdiff-0.7.0/setup.py +67 -0
  5. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/__init__.py +1 -1
  6. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/__main__.py +1 -0
  7. dbdiff-0.7.0/src/dbdiff/cli.py +491 -0
  8. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/logging.json +1 -1
  9. dbdiff-0.7.0/src/dbdiff/main.py +728 -0
  10. dbdiff-0.7.0/src/dbdiff/report.py +165 -0
  11. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/all_keys_count.sql +1 -1
  12. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/all_keys_sample.sql +1 -1
  13. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/create_dedup.sql +1 -1
  14. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/create_dup.sql +1 -1
  15. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/create_joined_table_from_selectinto.sql +1 -1
  16. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/create_temp_table.sql +1 -1
  17. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/diff_column.sql +1 -1
  18. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/diff_column_hier.sql +1 -1
  19. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/diff_column_numeric_diffs_binned.sql +1 -1
  20. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/diff_column_numeric_diffs_sorted.sql +1 -1
  21. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/diff_column_raw.sql +1 -1
  22. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/diff_column_summary.sql +1 -1
  23. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/diff_rows_sample.sql +1 -1
  24. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/html/base.html +1 -1
  25. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/html/report.html +1 -1
  26. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/insert_diff.sql +1 -1
  27. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/insert_joined_table.sql +1 -1
  28. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/joined_column.sql +1 -1
  29. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/joined_column_hier.sql +1 -1
  30. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/joined_column_numeric_diffs_binned.sql +1 -1
  31. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/joined_column_numeric_diffs_sorted.sql +1 -1
  32. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/joined_count.sql +1 -1
  33. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/joined_rows_count.sql +1 -1
  34. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/joined_rows_sample.sql +1 -1
  35. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/sub_keys_count.sql +0 -1
  36. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/sub_keys_grouped.sql +0 -1
  37. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/sub_keys_sample.sql +0 -1
  38. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/table_columns.sql +1 -1
  39. dbdiff-0.7.0/src/dbdiff/templates/table_drop.sql +1 -0
  40. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/table_exists.sql +1 -1
  41. dbdiff-0.7.0/src/dbdiff/templates/table_rows.sql +2 -0
  42. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/table_rows_uniq.sql +1 -1
  43. dbdiff-0.7.0/src/dbdiff/vertica.py +126 -0
  44. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff.egg-info/PKG-INFO +29 -8
  45. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff.egg-info/requires.txt +1 -1
  46. dbdiff-0.7.0/tests/test_dbdiff.py +478 -0
  47. dbdiff-0.6.7/setup.py +0 -84
  48. dbdiff-0.6.7/src/dbdiff/cli.py +0 -362
  49. dbdiff-0.6.7/src/dbdiff/main.py +0 -583
  50. dbdiff-0.6.7/src/dbdiff/report.py +0 -114
  51. dbdiff-0.6.7/src/dbdiff/templates/table_drop.sql +0 -1
  52. dbdiff-0.6.7/src/dbdiff/templates/table_rows.sql +0 -2
  53. dbdiff-0.6.7/src/dbdiff/vertica.py +0 -126
  54. dbdiff-0.6.7/tests/test_dbdiff.py +0 -449
  55. {dbdiff-0.6.7 → dbdiff-0.7.0}/AUTHORS.md +0 -0
  56. {dbdiff-0.6.7 → dbdiff-0.7.0}/CHANGELOG.md +0 -0
  57. {dbdiff-0.6.7 → dbdiff-0.7.0}/MANIFEST.in +0 -0
  58. {dbdiff-0.6.7 → dbdiff-0.7.0}/setup.cfg +0 -0
  59. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/create_joined_table.sql +0 -0
  60. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/first_key_base.sql +0 -0
  61. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/first_key_count.sql +0 -0
  62. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/first_key_sample.sql +0 -0
  63. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/joined_column_raw.sql +0 -0
  64. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff/templates/sub_keys_base.sql +0 -0
  65. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff.egg-info/SOURCES.txt +0 -0
  66. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff.egg-info/dependency_links.txt +0 -0
  67. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff.egg-info/entry_points.txt +0 -0
  68. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff.egg-info/not-zip-safe +0 -0
  69. {dbdiff-0.6.7 → dbdiff-0.7.0}/src/dbdiff.egg-info/top_level.txt +0 -0
@@ -6,4 +6,4 @@ Permission is hereby granted, free of charge, to any person obtaining a copy of
6
6
 
7
7
  The above copyright notice and this permission notice (including the next paragraph) shall be included in all copies or substantial portions of the Software.
8
8
 
9
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -1,13 +1,12 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: dbdiff
3
- Version: 0.6.7
3
+ Version: 0.7.0
4
4
  Summary: Compare two tables on Vertica.
5
5
  Home-page: https://github.com/andyreagan/dbdiff
6
6
  Author: Andy Reagan
7
7
  Author-email: andy@andyreagan.com
8
8
  License: MIT
9
9
  Project-URL: Issue Tracker, https://github.com/andyreagan/dbdiff/issues
10
- Platform: UNKNOWN
11
10
  Classifier: Development Status :: 5 - Production/Stable
12
11
  Classifier: Intended Audience :: Developers
13
12
  Classifier: License :: OSI Approved :: MIT License
@@ -15,17 +14,41 @@ Classifier: Operating System :: Unix
15
14
  Classifier: Operating System :: POSIX
16
15
  Classifier: Programming Language :: Python
17
16
  Classifier: Programming Language :: Python :: 3
18
- Classifier: Programming Language :: Python :: 3.6
19
- Classifier: Programming Language :: Python :: 3.7
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
20
21
  Classifier: Topic :: Utilities
21
- Requires-Python: >=3.6
22
+ Requires-Python: >=3.9
22
23
  Description-Content-Type: text/markdown
23
24
  License-File: LICENSE
24
25
  License-File: AUTHORS.md
26
+ Requires-Dist: click
27
+ Requires-Dist: requests
28
+ Requires-Dist: pandas>=1.0.0
29
+ Requires-Dist: Jinja2
30
+ Requires-Dist: python-dotenv
31
+ Requires-Dist: vertica_python
32
+ Requires-Dist: xlsxwriter
33
+ Dynamic: author
34
+ Dynamic: author-email
35
+ Dynamic: classifier
36
+ Dynamic: description
37
+ Dynamic: description-content-type
38
+ Dynamic: home-page
39
+ Dynamic: license
40
+ Dynamic: license-file
41
+ Dynamic: project-url
42
+ Dynamic: requires-dist
43
+ Dynamic: requires-python
44
+ Dynamic: summary
25
45
 
26
46
  Overview
27
47
  ========
28
48
 
49
+ [![Test Status](https://github.com/andyreagan/dbdiff/actions/workflows/vertica-test.yml/badge.svg)](https://github.com/andyreagan/dbdiff/actions/workflows/vertica-test.yml)
50
+ [![PyPI version](https://badge.fury.io/py/dbdiff.svg)](https://badge.fury.io/py/dbdiff)
51
+
29
52
  Compare two tables on Vertica,
30
53
  that are expected to be exactly the same.
31
54
 
@@ -168,5 +191,3 @@ Locally, in a separate terminal window, you can start one of these like:
168
191
  To run the all tests run:
169
192
 
170
193
  tox
171
-
172
-
@@ -1,6 +1,9 @@
1
1
  Overview
2
2
  ========
3
3
 
4
+ [![Test Status](https://github.com/andyreagan/dbdiff/actions/workflows/vertica-test.yml/badge.svg)](https://github.com/andyreagan/dbdiff/actions/workflows/vertica-test.yml)
5
+ [![PyPI version](https://badge.fury.io/py/dbdiff.svg)](https://badge.fury.io/py/dbdiff)
6
+
4
7
  Compare two tables on Vertica,
5
8
  that are expected to be exactly the same.
6
9
 
dbdiff-0.7.0/setup.py ADDED
@@ -0,0 +1,67 @@
1
+ #!/usr/bin/env python
2
+ from glob import glob
3
+ from pathlib import Path, PurePath
4
+
5
+ from setuptools import find_packages, setup
6
+
7
+ from src import dbdiff
8
+
9
+
10
def read(*names, **kwargs):
    """Return the text of a file located relative to this setup script.

    ``names`` are path components joined onto the directory containing this
    file.  The optional ``encoding`` keyword selects the text encoding and
    defaults to ``"utf8"``.
    """
    target = Path(__file__).parent.joinpath(*names)
    encoding = kwargs.get("encoding", "utf8")
    return target.read_text(encoding=encoding)
15
+
16
+
17
# Trove classifiers advertised on the package index.
# complete classifier list: http://pypi.python.org/pypi?%3Aaction=list_classifiers
_CLASSIFIERS = [
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Operating System :: Unix",
    "Operating System :: POSIX",
    "Programming Language :: Python",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Utilities",
]

# Runtime dependencies installed alongside the package.
_INSTALL_REQUIRES = [
    "click",
    "requests",
    "pandas>=1.0.0",
    "Jinja2",
    "python-dotenv",
    "vertica_python",
    "xlsxwriter",
]

# Exposes the `dbdiff` command, backed by dbdiff.cli:cli.
_ENTRY_POINTS = {
    "console_scripts": [
        "dbdiff = dbdiff.cli:cli",
    ]
}

setup(
    name="dbdiff",
    # single-sourced from the package's __init__
    version=dbdiff.__version__,
    license="MIT",
    description="Compare two tables on Vertica.",
    long_description=read("README.md"),
    long_description_content_type="text/markdown",
    author="Andy Reagan",
    author_email="andy@andyreagan.com",
    url="https://github.com/andyreagan/dbdiff",
    # src/ layout: packages live under src/, plus any top-level modules there
    packages=find_packages("src"),
    package_dir={"": "src"},
    py_modules=[PurePath(path).stem for path in glob("src/*.py")],
    include_package_data=True,
    zip_safe=False,
    classifiers=_CLASSIFIERS,
    project_urls={
        "Issue Tracker": "https://github.com/andyreagan/dbdiff/issues",
    },
    keywords=[],
    python_requires=">=3.9",
    install_requires=_INSTALL_REQUIRES,
    extras_require={},
    entry_points=_ENTRY_POINTS,
)
@@ -4,4 +4,4 @@
4
4
  # See that warning on Step 6 here:
5
5
  # https://packaging.python.org/guides/single-sourcing-package-version/
6
6
  # If we want to do imports here, there is a different approach.
7
- __version__ = '0.6.7'
7
+ __version__ = "0.7.0"
@@ -8,6 +8,7 @@ Why does this file exist, and why __main__? For more info, read:
8
8
  - https://docs.python.org/2/using/cmdline.html#cmdoption-m
9
9
  - https://docs.python.org/3/using/cmdline.html#cmdoption-m
10
10
  """
11
+
11
12
  from dbdiff.cli import cli
12
13
 
13
14
  if __name__ == "__main__":
@@ -0,0 +1,491 @@
1
+ import json
2
+ import logging
3
+ import logging.config
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ import click
8
+ import pandas as pd
9
+ from jinja2 import Environment, PackageLoader
10
+ from vertica_python.vertica.cursor import Cursor
11
+
12
+ from dbdiff import __version__
13
+ from dbdiff.main import (
14
+ check_primary_key,
15
+ create_diff_table,
16
+ create_joined_table,
17
+ get_all_col_info,
18
+ get_column_diffs,
19
+ get_column_diffs_from_joined,
20
+ get_diff_columns,
21
+ get_diff_rows,
22
+ get_diff_rows_from_joined,
23
+ get_unmatched_rows,
24
+ get_unmatched_rows_straight,
25
+ insert_diff_table,
26
+ select_distinct_rows,
27
+ )
28
+ from dbdiff.report import excel_report, html_report
29
+ from dbdiff.vertica import get_cur
30
+
31
+ JINJA_ENV = Environment(loader=PackageLoader("dbdiff", "templates"))
32
+ DEFAULT_LOGGING_CONFIG = Path(__file__).with_name("logging.json")
33
+ LOGGER = logging.getLogger(__name__)
34
+
35
+
36
def initialize_logging(config: Path) -> None:
    """
    Configure the logging module from the JSON file at ``config``.

    The file is parsed as JSON and the resulting dictionary is handed to
    ``logging.config.dictConfig``.
    """
    settings = json.loads(config.read_text())
    logging.config.dictConfig(settings)
43
+
44
+
45
def df_to_dict(x: Any) -> Any:
    """Convert a DataFrame to a list of per-row dicts; return anything else as-is."""
    return x.to_dict("records") if isinstance(x, pd.DataFrame) else x
50
+
51
+
52
def get_summary_from_all_info(d: dict) -> dict:
    """Build a summary of ``d`` in which every DataFrame value is replaced.

    Scalar-like entries are copied through unchanged; the nested mappings
    (``column_info``, ``missing_join_info``, ``hierarchical_join_info``) and
    ``column_match_info`` have their values passed through ``df_to_dict``.
    Keys are emitted in a fixed order.
    """

    def convert_values(mapping: dict) -> dict:
        # Apply df_to_dict to each value of a flat mapping.
        return {key: df_to_dict(value) for key, value in mapping.items()}

    summary = {}
    for passthrough_key in (
        "x_schema",
        "x_table",
        "y_schema",
        "y_table",
        "join_cols",
        "total_row_count",
        "dedup_info",
    ):
        summary[passthrough_key] = d[passthrough_key]

    summary["column_info"] = {
        col: convert_values(info) for col, info in d["column_info"].items()
    }
    # this is a dataframe:
    summary["column_match_info"] = df_to_dict(d["column_match_info"])
    summary["missing_join_info"] = {
        side: convert_values(info) for side, info in d["missing_join_info"].items()
    }
    # just the counts from the diff summary:
    summary["diff_summary"] = d["diff_summary"]
    # two levels deep: column -> side -> values
    summary["hierarchical_join_info"] = {
        col: {side: convert_values(info) for side, info in col_info.items()}
        for col, col_info in d["hierarchical_join_info"].items()
    }
    return summary
80
+
81
+
82
def _split_column_list(raw: str) -> list:
    """Split a comma-separated string into lower-cased, stripped column names.

    Empty entries (from an empty string, doubled commas or a trailing comma)
    are dropped, so ``""`` yields ``[]`` rather than ``[""]``.
    """
    return [name.strip().lower() for name in raw.split(",") if name.strip()]


def _create_temp_table_from_query(cur, query_file: str, side: str) -> str:
    """Create a temporary table from the SQL query stored in ``query_file``.

    The table is named after the file's stem and is created by rendering the
    ``create_temp_table.sql`` template and executing it on ``cur``.  ``side``
    ("x" or "y") only appears in the log message.

    Returns the name of the table that was created.
    """
    query_path = Path(query_file)
    query = query_path.read_text()
    table_name = query_path.stem
    LOGGER.info("Creating temp table from query for %s.", side)
    q_create = JINJA_ENV.get_template("create_temp_table.sql").render(
        table_name=table_name, query=query
    )
    LOGGER.info(q_create)
    cur.execute(q_create)
    return table_name


@click.command()
@click.argument("schema")
@click.argument("x_table")
@click.argument("y_table")
@click.argument("join_cols")
@click.option(
    "--y-schema", default=None, help="If the schema for the y_table is different, specify it."
)
@click.option(
    "--output-schema",
    default=None,
    help="If you want the schema for the output tables to be different, specify it.",
)
@click.option(
    "--drop-output-tables",
    is_flag=True,
    help="Drop the joined and diff tables created and used here.",
)
@click.option(
    "--x-table-query",
    is_flag=True,
    help="If X_TABLE is not a table in Vertica, but rather a query stored in a file, add this flag and the query will be read and instantiated into a temporary table. Ex: 'temp_xtable_name_to_use.sql'.",
)
@click.option(
    "--y-table-query",
    is_flag=True,
    help="If Y_TABLE is not a table in Vertica, but rather a query stored in a file, add this flag and the query will be read and instantiated into a temporary table.",
)
@click.option(
    "--exclude-columns", default="", help="Comma separated string of column names to exclude."
)
@click.option(
    "--hierarchical-join",
    is_flag=True,
    help="If multiple join keys, and join key #2 is a subset of join key #1. We expect matches for all of #1 from both tables even if we dont match on #1 and #2. This way, we can have more nuanced output by first breaking out missing on the first key.",
)
@click.option(
    "--max-rows-all",
    default=10,
    help="Limit of full rows to pull that have differences.",
    show_default=True,
)
@click.option(
    "--max-rows-column",
    default=10,
    help="Limit of grouped and raw column level differences to pull.",
    show_default=True,
)
@click.option(
    "--output-format", type=click.Choice(["HTML", "XLSX"], case_sensitive=False), default="HTML"
)
@click.option(
    "--save-column-summary", is_flag=True, help="Save the column dtype and match summary."
)
@click.option(
    "--save-column-summary-format",
    type=click.Choice(["CSV", "PICKLE"], case_sensitive=False),
    default="CSV",
)
@click.option(
    "--skip-row-total",
    is_flag=True,
    help="Skip counting the total # of rows with differences, only use cell differences.",
)
@click.option("--use-diff-table", is_flag=True, help="Use a diff table in the middle.")
@click.option("--logging-config", type=Path, default=DEFAULT_LOGGING_CONFIG)
@click.option(
    "--case-insensitive",
    is_flag=True,
    help="If using this flag, all case sensitivity is turned off.",
)
@click.option("--save-json-summary", is_flag=True, help="Save a .json file of the diff summary.")
@click.version_option(__version__)
def cli(
    schema: str,
    x_table: str,
    y_table: str,
    join_cols: str,
    y_schema: str,
    output_schema: str,
    drop_output_tables: bool,
    x_table_query: bool,
    y_table_query: bool,
    exclude_columns: str,
    hierarchical_join: bool,
    max_rows_all: int,
    max_rows_column: int,
    output_format: str,
    save_column_summary: bool,
    save_column_summary_format: str,
    skip_row_total: bool,
    use_diff_table: bool,
    logging_config: Path,
    case_insensitive: bool,
    save_json_summary: bool,
):
    """Compare two flat files X_TABLE and Y_TABLE, using Vertica as the join engine.
    Assume they are both in the same schema = SCHEMA.
    Join them on the columns in comma-separated string JOIN_COLS.
    Expects that the join columns have matching data type or will implicitly cast for comparison,
    and implicitly cast into the type in [X_TABLE] for the JOINED table.
    Expects that all other columns with matchings names (those that will be compared)
    can be compared directly (it will cast all dtypes for comparison to the type in X_TABLE).

    Will drop [X_TABLE]_DIFF (if --use-diff-table) and will drop [X_TABLE]_JOINED if they exist."""
    # NOTE: the docstring above is what click prints for --help; developer
    # notes belong in comments so they do not leak into the usage text.

    # default the y and output schemas to be the same as x
    if y_schema is None:
        y_schema = schema
    if output_schema is None:
        output_schema = schema

    # Column names are compared lower-cased; blanks and stray whitespace
    # around the commas are ignored.
    join_cols_list = _split_column_list(join_cols)
    if not join_cols_list:
        raise click.BadParameter(
            "JOIN_COLS must name at least one column.", param_hint="JOIN_COLS"
        )
    exclude_columns_set = set(_split_column_list(exclude_columns))
    initialize_logging(logging_config)

    with get_cur() as cur:
        # When a side is given as a query file, materialize it as a temp
        # table and point that side at the temp schema.
        if x_table_query:
            x_table = _create_temp_table_from_query(cur, x_table, "x")
            schema = "v_temp_schema"
        if y_table_query:
            y_table = _create_temp_table_from_query(cur, y_table, "y")
            y_schema = "v_temp_schema"

        all_info = main(
            cur=cur,
            x_schema=schema,
            x_table=x_table,
            y_schema=y_schema,
            y_table=y_table,
            output_schema=output_schema,
            join_cols=join_cols_list,
            exclude_columns=exclude_columns_set,
            max_rows_all=max_rows_all,
            max_rows_column=max_rows_column,
            drop_output_tables=drop_output_tables,
            hierarchical_join=hierarchical_join,
            save_column_summary=save_column_summary,
            save_column_summary_format=save_column_summary_format,
            skip_row_total=skip_row_total,
            use_diff_table=use_diff_table,
            case_insensitive=case_insensitive,
        )

    # The reports are built purely from the collected results, so the
    # database cursor is no longer needed here.
    # Normalise the case so the comparison does not depend on how click
    # reports a case-insensitive choice.
    report_format = output_format.upper()
    if report_format == "HTML":
        report = html_report(**all_info)
        with open(x_table + "_report.html", "w") as f:
            f.write(report)
    elif report_format == "XLSX":
        reports = excel_report(**all_info)
        with pd.ExcelWriter(x_table + "_report.xlsx", engine="xlsxwriter") as writer:
            for sheet_name, df in reports:
                df.to_excel(writer, sheet_name=sheet_name, index=False)

    if save_json_summary:
        # DataFrames are converted to plain records; anything else that is
        # not JSON-serializable is stringified via default=str.
        summary_info = get_summary_from_all_info(all_info)
        Path(f"{x_table}_diff_summary.json").write_text(
            json.dumps(summary_info, indent=4, default=str)
        )
256
+
257
+
258
def main(
    cur: Cursor,
    x_schema: str,
    x_table: str,
    y_schema: str,
    y_table: str,
    output_schema: str,
    join_cols: list,
    exclude_columns: set,
    max_rows_all: int,
    max_rows_column: int,
    drop_output_tables: bool,
    hierarchical_join: bool,
    save_column_summary: bool,
    save_column_summary_format: str,
    skip_row_total: bool,
    use_diff_table: bool,
    case_insensitive: bool,
):
    """Main method to be called by CLI.
    A separate function from cli() so that it can be imported easily as well.

    Args:
        cur: Cursor on which every query is executed.
        x_schema, x_table: Location of the first table.
        y_schema, y_table: Location of the second table.
        output_schema: Schema in which ``<x_table>_JOINED`` (and, with
            ``use_diff_table``, ``<x_table>_DIFF``) are created.
        join_cols: Column names the two tables are joined on.
        exclude_columns: Column names forwarded to ``get_all_col_info`` as
            columns to exclude.
        max_rows_all: Row limit forwarded to the row-level diff queries.
        max_rows_column: Row limit forwarded to the column-level and
            unmatched-row queries.
        drop_output_tables: Drop the joined (and diff) table before
            returning; also requests temp tables when deduplicating.
        hierarchical_join: Additionally collect unmatched rows per join key.
        save_column_summary, save_column_summary_format: Forwarded to
            ``get_all_col_info``.
        skip_row_total: Forwarded to the row-level diff queries.
        use_diff_table: Build an intermediate diff table and compute the
            results from it; otherwise compute them from the joined table.
        case_insensitive: Switch the session locale to
            ``en_US@colstrength=1`` before running any comparison.

    Returns:
        A dict with the keys ``x_schema``, ``y_schema``, ``x_table``,
        ``y_table``, ``join_cols``, ``total_row_count``, ``column_info``,
        ``column_match_info``, ``missing_join_info``,
        ``hierarchical_join_info``, ``dedup_info`` and ``diff_summary``.
        Schema and table names reflect any deduplicated tables that were
        substituted for the inputs.

    Raises:
        RuntimeError: If a join column is not among the comparable columns.
    """

    if case_insensitive:
        LOGGER.info("Setting to case insensitive.")
        cur.execute("SET LOCALE TO 'en_US@colstrength=1';")
        # clear the results
        cur.fetchall()

    all_col_info_df = get_all_col_info(
        cur,
        x_schema,
        x_table,
        y_schema,
        y_table,
        exclude_columns,
        save_column_summary,
        save_column_summary_format,
    )
    # Columns that take part in the comparison: not excluded, comparable,
    # and with a dtype known on both sides.
    comparable_filter = (
        ~all_col_info_df.exclude
        & all_col_info_df.comparable
        & ~all_col_info_df.x_dtype.isnull()
        & ~all_col_info_df.y_dtype.isnull()
    )
    # Columns with a dtype on both sides that still cannot be compared.
    all_col_info_df["uncomparable"] = (
        (~all_col_info_df.comparable)
        & (~all_col_info_df.x_dtype.isnull())
        & (~all_col_info_df.y_dtype.isnull())
    )
    # check that the join cols exist on both tables
    for col in join_cols:
        if all_col_info_df.loc[comparable_filter & (all_col_info_df.index == col), :].shape[0] == 0:
            # Looking up a label that is absent from the index raises
            # KeyError, which would hide the intended error below.
            if col in all_col_info_df.index:
                known_info = all_col_info_df.loc[col, :].to_string()
            else:
                known_info = "(none: no entry for this column in the column info)"
            raise RuntimeError(
                "Column `{}` not in comparable columns (missing from one, both, or bad dtype). Here is the info we do have about that col:\n".format(
                    col
                )
                + known_info
            )

    LOGGER.info("Checking primary keys.")
    x = check_primary_key(cur=cur, schema=x_schema, table=x_table, join_cols=join_cols)
    y = check_primary_key(cur=cur, schema=y_schema, table=y_table, join_cols=join_cols)
    dedup_info = {x_table: {"count": x}, y_table: {"count": y}}

    # A non-zero result is deliberately not a hard stop: the affected table
    # is replaced by a deduplicated version further down.

    if hierarchical_join:
        LOGGER.info("Getting rows that are missing on each join key.")
        hierarchical_join_info = get_unmatched_rows(
            cur=cur,
            x_schema=x_schema,
            y_schema=y_schema,
            x_table=x_table,
            y_table=y_table,
            join_cols=join_cols,
            max_rows_column=max_rows_column,
        )
    else:
        hierarchical_join_info = {}

    # create sub-tables to allow a comparison:
    if x != 0:
        LOGGER.info("X table was not unique on join keys, creating _dedup and _dup versions.")
        # Rebind BOTH the schema and the table, exactly as for y below, so
        # every later query addresses the deduplicated table in the schema
        # it was created in.
        x_schema, x_table = select_distinct_rows(
            cur,
            x_schema,
            x_table,
            join_cols,
            use_temp_tables=(drop_output_tables or x_schema == "v_temp_schema"),
        )
    if y != 0:
        LOGGER.info("Y table was not unique on join keys, creating _dedup and _dup versions.")
        y_schema, y_table = select_distinct_rows(
            cur,
            y_schema,
            y_table,
            join_cols,
            use_temp_tables=(drop_output_tables or y_schema == "v_temp_schema"),
        )

    # Output table names derive from the (possibly deduplicated) x table, so
    # they must be computed only after the dedup step above.
    joined_table = x_table + "_JOINED"
    diff_table = x_table + "_DIFF"

    LOGGER.info("Getting rows that did not match (not in joined table) after deduping.")
    missing_join_info = get_unmatched_rows_straight(
        cur=cur,
        x_schema=x_schema,
        y_schema=y_schema,
        x_table=x_table,
        y_table=y_table,
        join_cols=join_cols,
        max_rows_column=max_rows_column,
    )

    # build the joined table
    LOGGER.info("Building joined table %s", joined_table)
    joined_row_count = create_joined_table(
        cur=cur,
        x_schema=x_schema,
        y_schema=y_schema,
        x_table=x_table,
        y_table=y_table,
        join_cols=join_cols,
        compare_cols=all_col_info_df.loc[comparable_filter, :],
        joined_schema=output_schema,
        joined_table=joined_table,
    )

    if use_diff_table:
        # build the diff table
        LOGGER.info("Building diff table %s.", diff_table)
        create_diff_table(
            cur=cur,
            schema=output_schema,
            table=diff_table,
            join_cols=join_cols,
            all_col_info_df=all_col_info_df,
        )
        # One insert per comparable, non-join column.
        for column in all_col_info_df.loc[
            comparable_filter & ~all_col_info_df.index.isin(join_cols), :
        ].index.values:
            LOGGER.info("Inserting column %s into diff table.", column)
            insert_diff_table(
                cur=cur,
                joined_schema=output_schema,
                joined_table=joined_table,
                diff_schema=output_schema,
                diff_table=diff_table,
                join_cols=join_cols,
                column=column,
            )

        ############################################################################
        # Result 1: Get rows with at least N=1 difference (count, query, dataframe),
        ############################################################################
        diff_summary = get_diff_rows(
            cur, output_schema, x_table, join_cols, max_rows_all, skip_row_total
        )

        ############################################################################
        # Result 2: Get ordered list of columns by # of differences (query, dataframe).
        ############################################################################
        diff_columns = get_diff_columns(cur, output_schema, x_table)

        ############################################################################
        # Result 3: Get detailed column diffs.
        ############################################################################
        grouped_column_diffs = get_column_diffs(
            diff_columns,
            cur,
            output_schema,
            x_schema,
            x_table,
            y_schema,
            y_table,
            join_cols,
            max_rows_column,
            all_col_info_df,
            hierarchical_join,
        )

    else:
        # Without a diff table, the column diffs are computed first and the
        # row-level summary is derived from them.
        grouped_column_diffs = get_column_diffs_from_joined(
            cur=cur,
            output_schema=output_schema,
            x_schema=x_schema,
            x_table=x_table,
            y_schema=y_schema,
            y_table=y_table,
            join_cols=join_cols,
            max_rows_column=max_rows_column,
            all_col_info_df=all_col_info_df,
            comparable_filter=comparable_filter,
            hierarchical=hierarchical_join,
        )
        diff_summary = get_diff_rows_from_joined(
            cur=cur,
            grouped_column_diffs=grouped_column_diffs,
            output_schema=output_schema,
            x_table=x_table,
            join_cols=join_cols,
            max_rows_all=max_rows_all,
            skip_row_total=skip_row_total,
        )

    all_info = {
        "x_schema": x_schema,
        "y_schema": y_schema,
        "x_table": x_table,
        "y_table": y_table,
        "join_cols": join_cols,
        "total_row_count": joined_row_count,
        "column_info": grouped_column_diffs,
        "column_match_info": all_col_info_df,
        "missing_join_info": missing_join_info,
        "hierarchical_join_info": hierarchical_join_info,
        "dedup_info": dedup_info,
        "diff_summary": diff_summary,
    }

    if drop_output_tables:
        LOGGER.info("Dropping output tables. WARNING: queries in the report won't work!")
        drop_template = JINJA_ENV.get_template("table_drop.sql")
        cur.execute(drop_template.render(schema_name=output_schema, table_name=joined_table))
        # The diff table only exists when it was requested.
        if use_diff_table:
            cur.execute(drop_template.render(schema_name=output_schema, table_name=diff_table))

    return all_info
@@ -21,4 +21,4 @@
21
21
  "handlers": ["stderr"],
22
22
  "level": "DEBUG"
23
23
  }
24
- }
24
+ }