dbdiff 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. dbdiff/__init__.py +7 -0
  2. dbdiff/__main__.py +15 -0
  3. dbdiff/cli.py +491 -0
  4. dbdiff/logging.json +24 -0
  5. dbdiff/main.py +728 -0
  6. dbdiff/report.py +165 -0
  7. dbdiff/templates/all_keys_count.sql +6 -0
  8. dbdiff/templates/all_keys_sample.sql +10 -0
  9. dbdiff/templates/create_dedup.sql +11 -0
  10. dbdiff/templates/create_dup.sql +11 -0
  11. dbdiff/templates/create_joined_table.sql +13 -0
  12. dbdiff/templates/create_joined_table_from_selectinto.sql +14 -0
  13. dbdiff/templates/create_temp_table.sql +1 -0
  14. dbdiff/templates/diff_column.sql +28 -0
  15. dbdiff/templates/diff_column_hier.sql +5 -0
  16. dbdiff/templates/diff_column_numeric_diffs_binned.sql +15 -0
  17. dbdiff/templates/diff_column_numeric_diffs_sorted.sql +6 -0
  18. dbdiff/templates/diff_column_raw.sql +11 -0
  19. dbdiff/templates/diff_column_summary.sql +5 -0
  20. dbdiff/templates/diff_rows_sample.sql +8 -0
  21. dbdiff/templates/first_key_base.sql +18 -0
  22. dbdiff/templates/first_key_count.sql +4 -0
  23. dbdiff/templates/first_key_sample.sql +10 -0
  24. dbdiff/templates/html/base.html +34 -0
  25. dbdiff/templates/html/report.html +241 -0
  26. dbdiff/templates/insert_diff.sql +9 -0
  27. dbdiff/templates/insert_joined_table.sql +26 -0
  28. dbdiff/templates/joined_column.sql +7 -0
  29. dbdiff/templates/joined_column_hier.sql +11 -0
  30. dbdiff/templates/joined_column_numeric_diffs_binned.sql +15 -0
  31. dbdiff/templates/joined_column_numeric_diffs_sorted.sql +6 -0
  32. dbdiff/templates/joined_column_raw.sql +6 -0
  33. dbdiff/templates/joined_count.sql +3 -0
  34. dbdiff/templates/joined_rows_count.sql +3 -0
  35. dbdiff/templates/joined_rows_sample.sql +3 -0
  36. dbdiff/templates/sub_keys_base.sql +33 -0
  37. dbdiff/templates/sub_keys_count.sql +4 -0
  38. dbdiff/templates/sub_keys_grouped.sql +14 -0
  39. dbdiff/templates/sub_keys_sample.sql +12 -0
  40. dbdiff/templates/table_columns.sql +5 -0
  41. dbdiff/templates/table_drop.sql +1 -0
  42. dbdiff/templates/table_exists.sql +4 -0
  43. dbdiff/templates/table_rows.sql +2 -0
  44. dbdiff/templates/table_rows_uniq.sql +6 -0
  45. dbdiff/vertica.py +126 -0
  46. dbdiff-0.7.1.dist-info/METADATA +200 -0
  47. dbdiff-0.7.1.dist-info/RECORD +51 -0
  48. dbdiff-0.7.1.dist-info/WHEEL +4 -0
  49. dbdiff-0.7.1.dist-info/entry_points.txt +2 -0
  50. dbdiff-0.7.1.dist-info/licenses/AUTHORS.md +4 -0
  51. dbdiff-0.7.1.dist-info/licenses/LICENSE +9 -0
dbdiff/report.py ADDED
@@ -0,0 +1,165 @@
1
+ import pandas as pd
2
+ from jinja2 import Environment, PackageLoader
3
+
4
+ MAX_EXCEL_SHEET_NAME_LEN = 31
5
+ JINJA_ENV = Environment(loader=PackageLoader("dbdiff", "templates"))
6
+
7
+
8
def get_max_diferences(column_info: dict) -> int:
    """Return the largest per-column difference count in ``column_info``.

    Args:
        column_info: Mapping of column name to a per-column info mapping
            that has a ``"count"`` entry.

    Returns:
        The highest ``"count"`` among all columns, or ``0`` when
        ``column_info`` is empty.
    """
    # Take a real maximum instead of trusting the first entry, so the result
    # is correct even if the caller did not sort by count descending.
    return max((info["count"] for info in column_info.values()), default=0)
13
+
14
+
15
def reformat_missing_join_info(d: dict, x_table: str, y_table: str) -> dict:
    """Re-key a ``{"x": ..., "y": ...}`` mapping by table name.

    Args:
        d: Mapping with exactly the keys ``"x"`` and ``"y"`` read from it.
        x_table: Label to use for the ``"x"`` entry.
        y_table: Label to use for the ``"y"`` entry.

    Returns:
        ``{x_table: d["x"], y_table: d["y"]}``. When both tables share the
        same name the labels are prefixed with ``x_`` / ``y_`` so that neither
        side overwrites the other.

    Raises:
        KeyError: If ``d`` lacks an ``"x"`` or ``"y"`` entry.
    """
    x_label, y_label = x_table, y_table
    if x_label == y_label:
        # Same table name on both sides (e.g. two schemas): a plain dict
        # would keep only the "y" entry and silently drop the "x" one.
        x_label, y_label = f"x_{x_table}", f"y_{y_table}"
    return {x_label: d["x"], y_label: d["y"]}
17
+
18
+
19
def reformat_hierarchical_join_info(d: dict, x_table: str, y_table: str) -> dict:
    """Apply ``reformat_missing_join_info`` to every value of ``d``.

    Args:
        d: Mapping whose values are each a ``{"x": ..., "y": ...}`` mapping.
        x_table: Label passed through for the ``"x"`` side.
        y_table: Label passed through for the ``"y"`` side.

    Returns:
        A new dict with the same keys as ``d`` and re-keyed values.
    """
    relabelled = {}
    for column, sides in d.items():
        relabelled[column] = reformat_missing_join_info(sides, x_table, y_table)
    return relabelled
21
+
22
+
23
def html_report(
    x_schema: str,
    y_schema: str,
    x_table: str,
    y_table: str,
    join_cols: list,
    diff_summary: dict,
    total_row_count: int,
    column_info: dict,
    column_match_info: pd.DataFrame,
    missing_join_info: dict,
    hierarchical_join_info: dict,
    dedup_info: dict,
) -> str:
    """Render the ``html/report.html`` template for one table comparison.

    The ``comma``, ``code`` and ``dfhtml`` filters are (re-)registered on the
    shared module-level ``JINJA_ENV`` on every call. ``missing_join_info`` and
    ``hierarchical_join_info`` are re-keyed from ``"x"``/``"y"`` to the table
    names before being handed to the template.

    Args:
        x_schema, y_schema: Schema names, passed to the template unchanged.
        x_table, y_table: Table names; also used as labels for the two sides.
        join_cols: Join column names; excluded from the compared-column count.
        diff_summary: Passed to the template unchanged.
        total_row_count: Passed to the template unchanged.
        column_info: Mapping of column name to per-column info (each with a
            ``"count"`` entry); passed to the template unchanged.
        column_match_info: Frame with boolean ``exclude`` and ``comparable``
            columns and ``x_dtype`` / ``y_dtype`` columns; its index is
            matched against ``join_cols``.
        missing_join_info: Mapping with ``"x"`` and ``"y"`` entries.
        hierarchical_join_info: Mapping whose values each have ``"x"`` and
            ``"y"`` entries.
        dedup_info: Passed to the template unchanged.

    Returns:
        The rendered HTML document as a string.
    """
    # Local import keeps this block self-contained (stdlib only).
    from html import escape

    def comma(value, format="{0:,d}"):
        # Thousands-separated integer by default; the keyword name is part of
        # the filter's call signature, so it is kept despite shadowing a builtin.
        return format.format(value)

    def code(value, codeclass="plaintext"):
        # The environment does not autoescape, and rendered queries contain
        # "<"/">" (e.g. the "<=>" operator), so escape before embedding.
        body = escape(str(value), quote=False)
        return f'<code class="{escape(codeclass)}">{body}</code>'

    def dfhtml(df, classes=("table", "table-bordered", "table-striped", "table-hover", "table-sm")):
        # Tuple default avoids sharing one mutable list between calls.
        return df.to_html(index=False, classes=classes)

    JINJA_ENV.filters["comma"] = comma
    JINJA_ENV.filters["code"] = code
    JINJA_ENV.filters["dfhtml"] = dfhtml

    template = JINJA_ENV.get_template("html/report.html")

    max_differences = get_max_diferences(column_info)
    missing_join_info = reformat_missing_join_info(missing_join_info, x_table, y_table)
    hierarchical_join_info = reformat_hierarchical_join_info(
        hierarchical_join_info, x_table, y_table
    )

    # This combined boolean filter cannot be expressed in the Jinja template,
    # so the count is computed here: columns that are not excluded, are
    # comparable, exist on both sides and are not join columns.
    compared_column_count = (
        (~column_match_info.exclude)
        & column_match_info.comparable
        & (~column_match_info.x_dtype.isnull())
        & (~column_match_info.y_dtype.isnull())
        & (~column_match_info.index.isin(join_cols))
    ).sum()

    return template.render(
        {
            "x_schema": x_schema,
            "y_schema": y_schema,
            "x_table": x_table,
            "y_table": y_table,
            "join_cols": join_cols,
            "diff_summary": diff_summary,
            "total_row_count": total_row_count,
            "column_info": column_info,
            "max_differences": max_differences,
            "missing_join_info": missing_join_info,
            "hierarchical_join_info": hierarchical_join_info,
            "dedup_info": dedup_info,
            "compared_column_count": compared_column_count,
            "column_match_info": column_match_info,
        }
    )
87
+
88
+
89
def _unique_sheet_name(name: str, taken: set) -> str:
    """Truncate ``name`` to the sheet-name limit without reusing a taken name.

    ``taken`` holds lower-cased names already in use (Excel treats sheet names
    case-insensitively). On a collision the name is shortened further and a
    ``~N`` suffix is appended until it is unique.
    """
    candidate = name[:MAX_EXCEL_SHEET_NAME_LEN]
    attempt = 1
    while candidate.lower() in taken:
        suffix = f"~{attempt}"
        candidate = name[: MAX_EXCEL_SHEET_NAME_LEN - len(suffix)] + suffix
        attempt += 1
    return candidate


def excel_report(
    x_schema: str,
    y_schema: str,
    x_table: str,
    y_table: str,
    join_cols: list,
    diff_summary: dict,
    total_row_count: int,
    column_info: dict,
    column_match_info: pd.DataFrame,
    missing_join_info: dict,
    hierarchical_join_info: dict,
    dedup_info: dict,
) -> list:
    """Build the sheets of the Excel diff report.

    The signature mirrors ``html_report``; only ``x_table``, ``y_table``,
    ``diff_summary``, ``column_info`` and ``missing_join_info`` are read here.

    Returns:
        A list of ``(sheet_name, data)`` pairs: a ``"Summary"`` sheet first,
        then ``"Missing in x"`` / ``"Missing in y"`` / ``"Mismatched rows"``
        when their counts are positive, then one sheet per entry of
        ``column_info`` holding its ``"df_raw"`` value. Column sheet names are
        cut to ``MAX_EXCEL_SHEET_NAME_LEN`` characters and made unique.

    Raises:
        KeyError: If an expected ``"x"``, ``"y"``, ``"count"``, ``"sample"``
            or ``"df_raw"`` entry is missing.
    """
    x_side = missing_join_info["x"]
    y_side = missing_join_info["y"]
    max_differences = get_max_diferences(column_info)

    # NOTE(review): the "x" entry is described as rows present in x_table with
    # no match in y_table, matching the HTML report's wording for the same
    # data; the earlier text stated the reverse. Confirm against the caller.
    summary_lines = [
        f'Diff report between tables {x_table} (herein, "x") and {y_table} (herein, "y").',
        "----",
        f"There are {x_side['count']} rows in {x_table} that are not in {y_table}.",
        f"There are {y_side['count']} rows in {y_table} that are not in {x_table}.",
        f"There are {diff_summary['count']} rows matched between tables that don't line up exactly.",
        f"There are {len(column_info)} columns that have differences.",
        f"The maximum number of differences on any individual column is {max_differences}.",
    ]

    all_sheets = [("Summary", pd.DataFrame([{"Summary": line} for line in summary_lines]))]

    if x_side["count"] > 0:
        all_sheets.append(("Missing in x", x_side["sample"]))
    if y_side["count"] > 0:
        all_sheets.append(("Missing in y", y_side["sample"]))
    if diff_summary["count"] > 0:
        all_sheets.append(("Mismatched rows", diff_summary["sample"]))

    # Long column names that share a prefix, or a column named like one of the
    # fixed sheets, would otherwise yield duplicate sheet names.
    taken = {name.lower() for name, _ in all_sheets}
    for column, info in column_info.items():
        sheet_name = _unique_sheet_name(column, taken)
        taken.add(sheet_name.lower())
        all_sheets.append((sheet_name, info["df_raw"]))
    return all_sheets
@@ -0,0 +1,6 @@
1
+ SELECT COUNT(*)
2
+ FROM {{ x_schema }}.{{ x_table }} x
3
+ FULL OUTER JOIN {{ y_schema }}.{{ y_table }} y
4
+ ON {% for col in join_cols %}x.{{ col }} <=> y.{{ col }}{% if not loop.last %} AND {% endif %}
5
+ {% endfor -%}
6
+ WHERE {% if x %}y{% else %}x{% endif %}.{{ join_cols[0] }} IS NULL
@@ -0,0 +1,10 @@
1
+ SELECT {% for col in join_cols -%}
2
+ {% if x %}x{% else %}y{% endif %}.{{ col }} AS {{ col }}{% if not loop.last %},
3
+ {% endif %}{% endfor %}
4
+ FROM {{ x_schema }}.{{ x_table }} x
5
+ FULL OUTER JOIN {{ y_schema }}.{{ y_table }} y
6
+ ON {% for col in join_cols %}x.{{ col }} <=> y.{{ col }}{% if not loop.last %} AND {% endif %}
7
+ {% endfor -%}
8
+ WHERE {% if x %}y{% else %}x{% endif %}.{{ join_cols[0] }} IS NULL
9
+ ORDER BY {% for col in join_cols %}{% if x %}x{% else %}y{% endif %}.{{ col }}{% if not loop.last %}, {% endif %}{% endfor %}
10
+ LIMIT {{ max_rows_column }}
@@ -0,0 +1,11 @@
1
+ SELECT x.*
2
+ {% if not use_temp_table %}INTO {{ schema_name }}.{{ table_name_dedup }}{% endif %}
3
+ FROM {{ schema_name }}.{{ table_name }} x
4
+ INNER JOIN (
5
+ SELECT {{ group_cols }},
6
+ COUNT(*) AS dup_count
7
+ FROM {{ schema_name }}.{{ table_name }}
8
+ GROUP BY {{ group_cols }}
9
+ ) y
10
+ ON {{ join_cols }}
11
+ WHERE y.dup_count = 1
@@ -0,0 +1,11 @@
1
+ SELECT x.*, y.dup_count
2
+ {% if not use_temp_table %}INTO {{ schema_name }}.{{ table_name_dup }}{% endif %}
3
+ FROM {{ schema_name }}.{{ table_name }} x
4
+ INNER JOIN (
5
+ SELECT {{ group_cols }},
6
+ COUNT(*) AS dup_count
7
+ FROM {{ schema_name }}.{{ table_name }}
8
+ GROUP BY {{ group_cols }}
9
+ ) y
10
+ ON {{ join_cols }}
11
+ WHERE y.dup_count > 1
@@ -0,0 +1,13 @@
1
+ CREATE TABLE {{ joined_schema }}.{{ joined_table }} (
2
+ {% for i, row in compare_cols.iterrows() %}
3
+ {% if row.name in join_cols %}
4
+ {{ row.name }} {{ row.x_dtype }}
5
+ {% else %}
6
+ x_{{ row.name }} {{ row.x_dtype }},
7
+ y_{{ row.name }} {{ row.x_dtype }}
8
+ {% endif %}
9
+ {% if not loop.last %},{% endif %}
10
+ {% endfor %}
11
+ )
12
+ ORDER BY {{ join_cols|join(", ") }}
13
+ ;
@@ -0,0 +1,14 @@
1
+ SELECT {% for i, row in compare_cols.iterrows() -%}
2
+ {% if row.name in join_cols -%}
3
+ COALESCE(x.{{ row.name }}, y.{{ row.name }}) AS {{ row.name -}}
4
+ {% else -%}
5
+ x.{{ row.name }}::{{ row.x_dtype }} AS x_{{ row.name }},
6
+ y.{{ row.name }}::{{ row.x_dtype }} AS y_{{ row.name -}}
7
+ {%- endif -%}
8
+ {%- if not loop.last %},{% endif -%}
9
+ {%- endfor %}
10
+ INTO {{ joined_schema }}.{{ joined_table }}
11
+ FROM {{ x_schema }}.{{ x_table }} AS x
12
+ INNER JOIN {{ y_schema }}.{{ y_table }} AS y
13
+ ON {% for col in join_cols %}x.{{ col }} <=> y.{{ col }}{% if not loop.last %} AND {% endif %}
14
+ {% endfor %}
@@ -0,0 +1 @@
1
+ CREATE LOCAL TEMP TABLE {{ table_name }} ON COMMIT PRESERVE ROWS AS ({{ query }})
@@ -0,0 +1,28 @@
1
+ -- could pull them out of the original table:
2
+ -- SELECT x_{{ column }},
3
+ -- y_{{ column }},
4
+ -- COUNT(*) AS ct
5
+ -- FROM (
6
+ -- SELECT x_{{ column }}, y_{{ column }},
7
+ -- x_{{ column }} <=> y_{{ column }} AS {{ column }}_eq
8
+ -- FROM {{ joined_schema }}.{{ joined_table }}
9
+ -- ) AS t1
10
+ -- WHERE t1.{{ column }}_eq IS FALSE
11
+ -- GROUP BY x_{{ column }}, y_{{ column }}
12
+ -- ORDER BY ct DESC
13
+ -- but instead use the results we already have
14
+ -- some critique of the following: the group by in the subquery isn't
15
+ -- necessary based on the design (the insert into this diff table)
16
+ SELECT joined.x_{{ column }},
17
+ joined.y_{{ column }},
18
+ COUNT(*) AS ct
19
+ FROM {{ joined_schema }}.{{ joined_table }} joined
20
+ INNER JOIN (
21
+ SELECT {{ group_cols }}
22
+ FROM {{ diff_schema }}.{{ diff_table }}
23
+ WHERE column_name = '{{ column }}'
24
+ GROUP BY {{ group_cols }}
25
+ ) diff
26
+ ON {{ join_cols }}
27
+ GROUP BY joined.x_{{ column }}, joined.y_{{ column }}
28
+ ORDER BY ct DESC
@@ -0,0 +1,5 @@
1
+ SELECT {{ join_cols }},
2
+ {{ column }}
3
+ FROM {{ schema }}.{{ table }}
4
+ WHERE {{ first_join_col }} IN (SELECT {{ first_join_col }} FROM {{ diff_schema }}.{{ diff_table }} WHERE column_name = '{{ column }}' GROUP BY {{ first_join_col }} ORDER BY {{ first_join_col }} {% if limit %}LIMIT {{ limit }}{% endif %})
5
+ ORDER BY {{ join_cols }}
@@ -0,0 +1,15 @@
1
+ SELECT MIN(diff) AS min_diff,
2
+ MAX(diff) AS max_diff,
3
+ SUM(ct) AS ct
4
+ FROM (
5
+ SELECT NTILE({{ tiles }}) OVER (ORDER BY ABS(raw.x_{{ column }}-raw.y_{{ column }})) AS n_tile,
6
+ ABS(raw.x_{{ column }}-raw.y_{{ column }}) AS diff,
7
+ raw.ct
8
+ FROM (
9
+ {% include "diff_column.sql" %}
10
+ ) raw
11
+ WHERE raw.x_{{ column }} IS NOT NULL
12
+ AND raw.y_{{ column }} IS NOT NULL
13
+ ) tiled
14
+ GROUP BY n_tile
15
+ ORDER BY MIN(diff) ASC, MAX(diff) ASC
@@ -0,0 +1,6 @@
1
+ SELECT {{ join_cols|join(", ") }},
2
+ x_{{ column }},
3
+ y_{{ column }},
4
+ ABS(x_{{ column }} - y_{{ column }}) AS abs_diff
5
+ FROM ({% include "diff_column_raw.sql" %}) q_raw
6
+ ORDER BY ABS(x_{{ column }} - y_{{ column }}) DESC
@@ -0,0 +1,11 @@
1
+ SELECT {% for col in join_cols %}joined.{{ col }},
2
+ {% endfor %}joined.x_{{ column }},
3
+ joined.y_{{ column }}
4
+ FROM {{ joined_schema }}.{{ joined_table }} joined
5
+ INNER JOIN (
6
+ SELECT {{ join_cols|join(", ") }}
7
+ FROM {{ diff_schema }}.{{ diff_table }}
8
+ WHERE column_name = '{{ column }}'
9
+ GROUP BY {{ join_cols|join(", ") }}
10
+ ) diff
11
+ ON {{ join_cols_join }}
@@ -0,0 +1,5 @@
1
+ SELECT column_name,
2
+ COUNT(*)
3
+ FROM {{ schema_name }}.{{ table_name }}
4
+ GROUP BY 1
5
+ ORDER BY 2 DESC;
@@ -0,0 +1,8 @@
1
+ SELECT x.*
2
+ FROM {{ schema_name }}.{{ joined_table }} x
3
+ INNER JOIN (
4
+ SELECT {{ group_cols }}
5
+ FROM {{ schema_name }}.{{ diff_table }}
6
+ GROUP BY {{ group_cols }}
7
+ ) joined
8
+ ON {{ join_cols }}
@@ -0,0 +1,18 @@
1
+ {% block select %}
2
+ {% endblock %}
3
+ {% block core %}
4
+ FROM (
5
+ SELECT {{ join_col }} FROM {{ x_schema }}.{{ x_table }} GROUP BY 1
6
+ ) x
7
+ FULL OUTER JOIN (
8
+ SELECT {{ join_col }} FROM {{ y_schema }}.{{ y_table }} GROUP BY 1
9
+ ) y
10
+ ON x.{{ join_col }} = y.{{ join_col }}
11
+ {% endblock %}
12
+ {% block where %}
13
+ WHERE {% if x %}y{% else %}x{% endif %}.{{ join_col }} IS NULL
14
+ {% endblock %}
15
+ {% block orderby %}
16
+ {% endblock %}
17
+ {% block limit %}
18
+ {% endblock %}
@@ -0,0 +1,4 @@
1
+ {% extends "first_key_base.sql" %}
2
+ {% block select %}
3
+ SELECT COUNT(*)
4
+ {% endblock %}
@@ -0,0 +1,10 @@
1
+ {% extends "first_key_base.sql" %}
2
+ {% block select %}
3
+ SELECT {% if x %}x{% else %}y{% endif %}.{{ join_col }} AS {{ join_col }}
4
+ {% endblock %}
5
+ {% block orderby %}
6
+ ORDER BY {% if x %}x{% else %}y{% endif %}.{{ join_col }}
7
+ {% endblock %}
8
+ {% block limit %}
9
+ LIMIT {{ max_rows_column }}
10
+ {% endblock %}
@@ -0,0 +1,34 @@
1
+ <!doctype html>
2
+ <html lang="en">
3
+ <head>
4
+ {% block header %}
5
+ {% block headermeta %}
6
+ <!-- Required meta tags -->
7
+ <meta charset="utf-8">
8
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
9
+ {% endblock %}
10
+ {% block headercss %}
11
+ <!-- Bootstrap CSS -->
12
+ <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
13
+ {% endblock %}
14
+ {% block headerextracss %}
15
+ {% endblock %}
16
+ {% block headertitle %}
17
+ <title>Base HTML template</title>
18
+ {% endblock %}
19
+ {% endblock %}
20
+ </head>
21
+ <body>
22
+ {% block body %}
23
+ {% endblock %}
24
+ {% block bodyjs %}
25
+ <!-- jQuery first, then Popper.js, then Bootstrap JS -->
26
+ <script src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script>
27
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"></script>
28
+ <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"></script>
29
+ {% endblock %}
30
+ <!-- Optional JavaScript -->
31
+ {% block bodyextrajs %}
32
+ {% endblock %}
33
+ </body>
34
+ </html>
@@ -0,0 +1,241 @@
1
+ {% extends "html/base.html" %}
2
+
3
+ {% block headerextracss %}
4
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.15.10/styles/atom-one-light.min.css" integrity="sha256-VCcaD9+X/d4QGYRX7l5aMJ8BWgwfA8d3S7i/HC9rvvw=" crossorigin="anonymous" />
5
+ {% endblock %}
6
+ {% block headertitle %}
7
+ <title>Diff Report</title>
8
+ {% endblock %}
9
+
10
+ {% block body %}
11
+ <div class="container">
12
+ <div class="jumbotron" style="margin-top: 30px;">
13
+ <h1 class="display-4">
14
+ Diff report between tables <ul>
15
+ <li>{{ x_table|code }} and</li>
16
+ <li>{{ y_table|code }}.</li>
17
+ </ul>
18
+ </h1>
19
+ <h2>
20
+ Joined using columns: {% for col in join_cols %}{{ col|code }}{% if not loop.last %}, {% endif %}{% endfor %}.
21
+ </h2>
22
+ <hr class="my-4">
23
+ <h3>Summary:</h3>
24
+ <ul>
25
+ {% for side, info in dedup_info.items() %}
26
+ {% if info.count > 0 %}
27
+ <li>There are {{ info.count|comma|code }} rows in {{ side|code }} that were not uniquely identified by the join keys; they will be ignored for cell-by-cell differences, but considered for hierarchical join key analysis.</li>
28
+ {% endif %}
29
+ {% endfor %}
30
+ {% for side, info in missing_join_info.items() %}
31
+ <li>There are {{ info.count|comma|code }} rows in {{ side|code }} that are not in the other table.</li>
32
+ {% endfor %}
33
+ {% if diff_summary.count %}
34
+ <li>There are {{ diff_summary.count|comma|code }} / {{ total_row_count|comma|code }} rows matched between tables that don't line up exactly, and {{ diff_summary.total_count|comma|code }} total cell differences.</li>
35
+ {% else %}
36
+ <li>There are {{ total_row_count|comma|code }} rows matched between tables.</li>
37
+ <li>There are {{ diff_summary.total_count|comma|code }} total cell differences.</li>
38
+ {% endif %}
39
+ {% if column_info|length == 1 %}
40
+ <li>There is {{ 1|comma|code }} / {{ compared_column_count|comma|code }} columns that has a difference.</li>
41
+ {% else %}
42
+ <li>There are {{ column_info|length|comma|code }} / {{ compared_column_count|comma|code }} columns that have differences.</li>
43
+ {% endif %}
44
+ {% if (column_match_info.uncomparable).sum() > 0 %}
45
+ <li>There {% if (column_match_info.uncomparable).sum() > 1 %}are{% else %}is{% endif %} {{ (column_match_info.uncomparable).sum()|comma|code }} column{% if (column_match_info.uncomparable).sum() > 1 %}s{% endif %} that matched on name
46
+ but were not compared based on dtype matching:
47
+ <ul>
48
+ {% for i, row in column_match_info.loc[column_match_info.uncomparable].iterrows() %}
49
+ <li>Column {{ row.name|code }} has type {{ row.x_dtype|code }} in x and type {{ row.y_dtype|code }} in y.</li>
50
+ {% endfor %}
51
+ </ul>
52
+ </li>
53
+ {% endif %}
54
+ {% if column_match_info.x_dtype.isnull().sum() > 0 %}
55
+ <li>There {% if column_match_info.x_dtype.isnull().sum() > 1 %}are{% else %}is{% endif %} {{ column_match_info.x_dtype.isnull().sum()|comma|code }} column(s) missing from x:
56
+ <ul>
57
+ {% for i, row in column_match_info.loc[column_match_info.x_dtype.isnull()].iterrows() %}
58
+ <li>Column {{ row.name|code }}.</li>
59
+ {% endfor %}
60
+ </ul>
61
+ </li>
62
+ {% endif %}
63
+ {% if column_match_info.y_dtype.isnull().sum() > 0 %}
64
+ <li>There {% if column_match_info.y_dtype.isnull().sum() > 1 %}are{% else %}is{% endif %} {{ column_match_info.y_dtype.isnull().sum()|comma|code }} column(s) missing from y:
65
+ <ul>
66
+ {% for i, row in column_match_info.loc[column_match_info.y_dtype.isnull()].iterrows() %}
67
+ <li>Column {{ row.name|code }}.</li>
68
+ {% endfor %}
69
+ </ul>
70
+ </li>
71
+ {% endif %}
72
+ <li>The maximum number of differences on any individual column is {{ max_differences|comma|code }}.</li>
73
+ {% if column_match_info.exclude.sum() > 0 %}
74
+ <li>For reference, this report excluded the following {{ column_match_info.exclude.sum()|comma|code }} column(s):
75
+ <ul>
76
+ {% for i, row in column_match_info.loc[column_match_info.exclude].iterrows() %}
77
+ <li>Column {{ row.name|code }}.</li>
78
+ {% endfor %}
79
+ </ul>
80
+ </li>
81
+ {% endif %}
82
+ </ul>
83
+ </div>
84
+
85
+ <div class="row">
86
+ <div class="col">
87
+ <h2>
88
+ (1) Missing rows, (2) mismatched rows, and (3) differences by each column.
89
+ Click to expand.
90
+ </h2>
91
+ <p>Columns are ordered by the number of differences on each column.
92
+ For the mismatched rows, columns with prefix "x" are from <code class="plaintext">{{ x_schema }}.{{ x_table }}</code> and columns with prefix "y" are from <code class="plaintext">{{ y_schema }}.{{ y_table }}</code>.
93
+ The same column labels are used in the "grouped differences" output,
94
+ in which we take all pairs of cells that are different for the given column,
95
+ and count up the number of such pairs.
96
+ The code to pull the difference data is printed below it.</p>
97
+ </div>
98
+ </div>
99
+
100
+ <div class="row" style="margin-bottom: 30px;">
101
+ <div class="col">
102
+ <div class="accordion" id="accordionExample">
103
+ {% for side, info in missing_join_info.items() %}
104
+ {% if info.count > 0 %}
105
+ <div class="card">
106
+ <div class="card-header" id="headingmissing{{ side }}">
107
+ <h2 class="mb-0">
108
+ <button class="btn btn-link" type="button" data-toggle="collapse" data-target="#collapsemissing{{ side }}" aria-expanded="true" aria-controls="collapsemissing{{ side }}">
109
+ Rows in {{ side|code }} that aren't in the other table.
110
+ </button>
111
+ </h2>
112
+ </div>
113
+ <div id="collapsemissing{{ side }}" class="collapse show" aria-labelledby="headingmissing{{ side }}" data-parent="#accordionExample">
114
+ <div class="card-body overflow-auto">
115
+ {{ info.sample|dfhtml|safe }}
116
+ <h3>Query:</h3>
117
+ <pre>{{ info.query|code("pgsql") }}</pre>
118
+ </div>
119
+ </div>
120
+ </div>
121
+ {% endif %}
122
+ {% endfor %}
123
+ {% for col, m in hierarchical_join_info.items() %}
124
+ {% for side, info in m.items() %}
125
+ {% if info.count > 0 %}
126
+ <div class="card">
127
+ <div class="card-header" id="headingx{{ col }}{{ side }}{{ info.count }}">
128
+ <h2 class="mb-0">
129
+ <button class="btn btn-link" type="button" data-toggle="collapse" data-target="#collapsex{{ col }}{{ side }}{{ info.count }}" aria-expanded="true" aria-controls="collapsex{{ col }}{{ side }}{{ info.count }}">
130
+ Rows in table {{ side|code }} that aren't in the other table for column {{ col|code }} (sample out of {{ info.count|comma }})
131
+ </button>
132
+ </h2>
133
+ </div>
134
+ <div id="collapsex{{ col }}{{ side }}{{ info.count }}" class="collapse show" aria-labelledby="headingx{{ col }}{{ side }}{{ info.count }}" data-parent="#accordionExample">
135
+ <div class="card-body overflow-auto">
136
+ {{ info.sample|dfhtml|safe }}
137
+ {% if "sample_grouped" in info %}
138
+ <h4>Grouped:</h4>
139
+ {{ info.sample_grouped|dfhtml|safe }}
140
+ {% endif %}
141
+ <h3>Queries:</h3>
142
+ <pre>{{ info.query|code("pgsql") }}</pre>
143
+ {% if "sample_grouped" in info %}
144
+ <pre>{{ info.query_grouped|code("pgsql") }}</pre>
145
+ {% endif %}
146
+ </div>
147
+ </div>
148
+ </div>
149
+ {% endif %}
150
+ {% endfor %}
151
+ {% endfor %}
152
+ {% if diff_summary.count %}
153
+ <div class="card">
154
+ <div class="card-header" id="headingThree">
155
+ <h2 class="mb-0">
156
+ <button class="btn btn-link" type="button" data-toggle="collapse" data-target="#collapseThree" aria-expanded="true" aria-controls="collapseThree">
157
+ Sample mismatched rows
158
+ </button>
159
+ </h2>
160
+ </div>
161
+ <div id="collapseThree" class="collapse show" aria-labelledby="headingThree" data-parent="#accordionExample">
162
+ <div class="card-body overflow-auto">
163
+ {{ diff_summary.sample|dfhtml|safe }}
164
+ </div>
165
+ </div>
166
+ </div>
167
+ {% endif %}
168
+ {% for column, info in column_info.items() %}
169
+ <div class="card">
170
+ <div class="card-header" id="heading{{ loop.index }}">
171
+ <h2 class="mb-0">
172
+ <button class="btn btn-link collapsed" type="button" data-toggle="collapse" data-target="#collapse{{ loop.index }}" aria-expanded="false" aria-controls="collapse{{ loop.index }}">
173
+ Column {{ column|code("highlighter-rogue") }} has {{ info.count|comma }} differences.
174
+ </button>
175
+ </h2>
176
+ </div>
177
+ <div id="collapse{{ loop.index }}" class="collapse" aria-labelledby="heading{{ loop.index }}" data-parent="#accordionExample">
178
+ <div class="card-body overflow-auto">
179
+ <h3>Grouped differences, 100 most common:</h3>
180
+ <p>
181
+ {{ info.df|dfhtml|safe }}
182
+ </p>
183
+ <h3>Raw differences samples:</h3>
184
+ <p>
185
+ {{ info.df_raw|dfhtml|safe }}
186
+ </p>
187
+ {% if info.q_h_x %}
188
+ <h3>Raw differences by first join column sample for x:</h3>
189
+ <p>
190
+ {{ info.df_h_x|dfhtml|safe }}
191
+ </p>
192
+ <h3>Raw differences by first join column sample for y:</h3>
193
+ <p>
194
+ {{ info.df_h_y|dfhtml|safe }}
195
+ </p>
196
+ {% endif %}
197
+ {% if info.q_n %}
198
+ <h3>Binned differences between values:</h3>
199
+ <p>
200
+ {{ info.df_n|dfhtml|safe }}
201
+ </p>
202
+ <h3>Biggest differences between values:</h3>
203
+ <p>
204
+ {{ info.df_n_sample|dfhtml|safe }}
205
+ </p>
206
+ {% endif %}
207
+ <hr class="my-4">
208
+ <h3>Query for grouped differences:</h3>
209
+ <pre>{{ info.q|code("pgsql") }}</pre>
210
+ <h3>Query for raw differences:</h3>
211
+ <pre>{{ info.q_raw|code("pgsql") }}</pre>
212
+ {% if info.q_h_x %}
213
+ <h3>Query for raw differences by first join column in x:</h3>
214
+ <pre>{{ info.q_h_x|code("pgsql") }}</pre>
215
+ <h3>Query for raw differences by first join column in y:</h3>
216
+ <pre>{{ info.q_h_y|code("pgsql") }}</pre>
217
+ {% endif %}
218
+ {% if info.q_n %}
219
+ <h3>Query for grouped numeric differences:</h3>
220
+ <pre>{{ info.q_n|code("pgsql") }}</pre>
221
+ <h3>Query for sample biggest numeric differences:</h3>
222
+ <pre>{{ info.q_n_sample|code("pgsql") }}</pre>
223
+ {% endif %}
224
+ </div>
225
+ </div>
226
+ </div>
227
+ {% endfor %}
228
+ </div>
229
+ </div>
230
+ </div>
231
+ </div>
232
+ {% endblock %}
233
+
234
+ {% block bodyextrajs %}
235
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.15.10/highlight.min.js" integrity="sha256-1zu+3BnLYV9LdiY85uXMzii3bdrkelyp37e0ZyTAQh0=" crossorigin="anonymous"></script>
236
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.15.10/languages/pgsql.min.js" integrity="sha256-kS8L87/ytXlzSR5Zcd2qCLLg8j0uVuGcirrE9+Oi7Ns=" crossorigin="anonymous"></script>
237
+ <script>
238
+ hljs.initHighlightingOnLoad();
239
+ </script>
240
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.4/clipboard.min.js"></script>
241
+ {% endblock %}
@@ -0,0 +1,9 @@
1
+ INSERT INTO {{ diff_schema }}.{{ diff_table }} ( {{ join_cols|join(", ") }}, column_name )
2
+ SELECT {{ join_cols|join(", ") }},
3
+ '{{ column }}' as column_name
4
+ FROM (
5
+ SELECT {{ join_cols|join(", ") }},
6
+ x_{{ column }} <=> y_{{ column }} AS {{ column }}_eq
7
+ FROM {{ joined_schema }}.{{ joined_table }}
8
+ ) AS t1
9
+ WHERE t1.{{ column }}_eq IS FALSE;