dcs-sdk 1.6.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_diff/__init__.py +221 -0
- data_diff/__main__.py +517 -0
- data_diff/abcs/__init__.py +13 -0
- data_diff/abcs/compiler.py +27 -0
- data_diff/abcs/database_types.py +402 -0
- data_diff/config.py +141 -0
- data_diff/databases/__init__.py +38 -0
- data_diff/databases/_connect.py +323 -0
- data_diff/databases/base.py +1417 -0
- data_diff/databases/bigquery.py +376 -0
- data_diff/databases/clickhouse.py +217 -0
- data_diff/databases/databricks.py +262 -0
- data_diff/databases/duckdb.py +207 -0
- data_diff/databases/mssql.py +343 -0
- data_diff/databases/mysql.py +189 -0
- data_diff/databases/oracle.py +238 -0
- data_diff/databases/postgresql.py +293 -0
- data_diff/databases/presto.py +222 -0
- data_diff/databases/redis.py +93 -0
- data_diff/databases/redshift.py +233 -0
- data_diff/databases/snowflake.py +222 -0
- data_diff/databases/sybase.py +720 -0
- data_diff/databases/trino.py +73 -0
- data_diff/databases/vertica.py +174 -0
- data_diff/diff_tables.py +489 -0
- data_diff/errors.py +17 -0
- data_diff/format.py +369 -0
- data_diff/hashdiff_tables.py +1026 -0
- data_diff/info_tree.py +76 -0
- data_diff/joindiff_tables.py +434 -0
- data_diff/lexicographic_space.py +253 -0
- data_diff/parse_time.py +88 -0
- data_diff/py.typed +0 -0
- data_diff/queries/__init__.py +13 -0
- data_diff/queries/api.py +213 -0
- data_diff/queries/ast_classes.py +811 -0
- data_diff/queries/base.py +38 -0
- data_diff/queries/extras.py +43 -0
- data_diff/query_utils.py +70 -0
- data_diff/schema.py +67 -0
- data_diff/table_segment.py +583 -0
- data_diff/thread_utils.py +112 -0
- data_diff/utils.py +1022 -0
- data_diff/version.py +15 -0
- dcs_core/__init__.py +13 -0
- dcs_core/__main__.py +17 -0
- dcs_core/__version__.py +15 -0
- dcs_core/cli/__init__.py +13 -0
- dcs_core/cli/cli.py +165 -0
- dcs_core/core/__init__.py +19 -0
- dcs_core/core/common/__init__.py +13 -0
- dcs_core/core/common/errors.py +50 -0
- dcs_core/core/common/models/__init__.py +13 -0
- dcs_core/core/common/models/configuration.py +284 -0
- dcs_core/core/common/models/dashboard.py +24 -0
- dcs_core/core/common/models/data_source_resource.py +75 -0
- dcs_core/core/common/models/metric.py +160 -0
- dcs_core/core/common/models/profile.py +75 -0
- dcs_core/core/common/models/validation.py +216 -0
- dcs_core/core/common/models/widget.py +44 -0
- dcs_core/core/configuration/__init__.py +13 -0
- dcs_core/core/configuration/config_loader.py +139 -0
- dcs_core/core/configuration/configuration_parser.py +262 -0
- dcs_core/core/configuration/configuration_parser_arc.py +328 -0
- dcs_core/core/datasource/__init__.py +13 -0
- dcs_core/core/datasource/base.py +62 -0
- dcs_core/core/datasource/manager.py +112 -0
- dcs_core/core/datasource/search_datasource.py +421 -0
- dcs_core/core/datasource/sql_datasource.py +1094 -0
- dcs_core/core/inspect.py +163 -0
- dcs_core/core/logger/__init__.py +13 -0
- dcs_core/core/logger/base.py +32 -0
- dcs_core/core/logger/default_logger.py +94 -0
- dcs_core/core/metric/__init__.py +13 -0
- dcs_core/core/metric/base.py +220 -0
- dcs_core/core/metric/combined_metric.py +98 -0
- dcs_core/core/metric/custom_metric.py +34 -0
- dcs_core/core/metric/manager.py +137 -0
- dcs_core/core/metric/numeric_metric.py +403 -0
- dcs_core/core/metric/reliability_metric.py +90 -0
- dcs_core/core/profiling/__init__.py +13 -0
- dcs_core/core/profiling/datasource_profiling.py +136 -0
- dcs_core/core/profiling/numeric_field_profiling.py +72 -0
- dcs_core/core/profiling/text_field_profiling.py +67 -0
- dcs_core/core/repository/__init__.py +13 -0
- dcs_core/core/repository/metric_repository.py +77 -0
- dcs_core/core/utils/__init__.py +13 -0
- dcs_core/core/utils/log.py +29 -0
- dcs_core/core/utils/tracking.py +105 -0
- dcs_core/core/utils/utils.py +44 -0
- dcs_core/core/validation/__init__.py +13 -0
- dcs_core/core/validation/base.py +230 -0
- dcs_core/core/validation/completeness_validation.py +153 -0
- dcs_core/core/validation/custom_query_validation.py +24 -0
- dcs_core/core/validation/manager.py +282 -0
- dcs_core/core/validation/numeric_validation.py +276 -0
- dcs_core/core/validation/reliability_validation.py +91 -0
- dcs_core/core/validation/uniqueness_validation.py +61 -0
- dcs_core/core/validation/validity_validation.py +738 -0
- dcs_core/integrations/__init__.py +13 -0
- dcs_core/integrations/databases/__init__.py +13 -0
- dcs_core/integrations/databases/bigquery.py +187 -0
- dcs_core/integrations/databases/databricks.py +51 -0
- dcs_core/integrations/databases/db2.py +652 -0
- dcs_core/integrations/databases/elasticsearch.py +61 -0
- dcs_core/integrations/databases/mssql.py +829 -0
- dcs_core/integrations/databases/mysql.py +409 -0
- dcs_core/integrations/databases/opensearch.py +64 -0
- dcs_core/integrations/databases/oracle.py +719 -0
- dcs_core/integrations/databases/postgres.py +482 -0
- dcs_core/integrations/databases/redshift.py +53 -0
- dcs_core/integrations/databases/snowflake.py +48 -0
- dcs_core/integrations/databases/spark_df.py +111 -0
- dcs_core/integrations/databases/sybase.py +1069 -0
- dcs_core/integrations/storage/__init__.py +13 -0
- dcs_core/integrations/storage/local_file.py +149 -0
- dcs_core/integrations/utils/__init__.py +13 -0
- dcs_core/integrations/utils/utils.py +36 -0
- dcs_core/report/__init__.py +13 -0
- dcs_core/report/dashboard.py +211 -0
- dcs_core/report/models.py +88 -0
- dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
- dcs_core/report/static/assets/images/docs.svg +6 -0
- dcs_core/report/static/assets/images/github.svg +4 -0
- dcs_core/report/static/assets/images/logo.svg +7 -0
- dcs_core/report/static/assets/images/slack.svg +13 -0
- dcs_core/report/static/index.js +2 -0
- dcs_core/report/static/index.js.LICENSE.txt +3971 -0
- dcs_sdk/__init__.py +13 -0
- dcs_sdk/__main__.py +18 -0
- dcs_sdk/__version__.py +15 -0
- dcs_sdk/cli/__init__.py +13 -0
- dcs_sdk/cli/cli.py +163 -0
- dcs_sdk/sdk/__init__.py +58 -0
- dcs_sdk/sdk/config/__init__.py +13 -0
- dcs_sdk/sdk/config/config_loader.py +491 -0
- dcs_sdk/sdk/data_diff/__init__.py +13 -0
- dcs_sdk/sdk/data_diff/data_differ.py +821 -0
- dcs_sdk/sdk/rules/__init__.py +15 -0
- dcs_sdk/sdk/rules/rules_mappping.py +31 -0
- dcs_sdk/sdk/rules/rules_repository.py +214 -0
- dcs_sdk/sdk/rules/schema_rules.py +65 -0
- dcs_sdk/sdk/utils/__init__.py +13 -0
- dcs_sdk/sdk/utils/serializer.py +25 -0
- dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
- dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
- dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
- dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
- dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
- dcs_sdk/sdk/utils/table.py +475 -0
- dcs_sdk/sdk/utils/themes.py +40 -0
- dcs_sdk/sdk/utils/utils.py +349 -0
- dcs_sdk-1.6.5.dist-info/METADATA +150 -0
- dcs_sdk-1.6.5.dist-info/RECORD +159 -0
- dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
- dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
data_diff/diff_tables.py
ADDED
|
@@ -0,0 +1,489 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""Provides classes for performing a table diff"""
|
|
16
|
+
|
|
17
|
+
import threading
|
|
18
|
+
from abc import ABC, abstractmethod
|
|
19
|
+
from collections import defaultdict
|
|
20
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
21
|
+
from contextlib import contextmanager
|
|
22
|
+
from enum import Enum
|
|
23
|
+
from operator import methodcaller
|
|
24
|
+
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union
|
|
25
|
+
|
|
26
|
+
import attrs
|
|
27
|
+
|
|
28
|
+
# logger = getLogger(__name__)
|
|
29
|
+
from loguru import logger
|
|
30
|
+
|
|
31
|
+
from data_diff.abcs.database_types import IKey, Integer, StringType
|
|
32
|
+
from data_diff.errors import DataDiffMismatchingKeyTypesError
|
|
33
|
+
from data_diff.info_tree import InfoTree, SegmentInfo
|
|
34
|
+
from data_diff.table_segment import TableSegment, create_mesh_from_points
|
|
35
|
+
from data_diff.thread_utils import ThreadedYielder
|
|
36
|
+
from data_diff.utils import Vector, getLogger, safezip
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class Algorithm(Enum):
    """Enumeration of the available table-diffing algorithms.

    ``AUTO`` leaves the choice to the caller/configuration layer;
    ``JOINDIFF`` and ``HASHDIFF`` select a specific strategy.
    """

    AUTO = "auto"
    JOINDIFF = "joindiff"
    HASHDIFF = "hashdiff"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# A diff is a stream of (sign, row) pairs: sign is "-" (row only in table1,
# the source) or "+" (row only in table2, the target); `row` is a tuple of
# the diffed column values.
DiffResult = Iterator[Tuple[str, tuple]]  # Iterator[Tuple[Literal["+", "-"], tuple]]
# Batched variant: yields lists of (sign, row) pairs instead of single pairs.
DiffResultList = Iterator[List[Tuple[str, tuple]]]
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@attrs.define(frozen=False)
class ThreadBase:
    "Provides utility methods for optional threading"

    # When False, every helper below degrades to plain sequential iteration.
    threaded: bool = True
    max_threadpool_size: Optional[int] = 1

    def _thread_map(self, func, iterable):
        # Apply `func` to each item, threaded or not. Note: in the threaded
        # path the executor's `with` block exits (shutdown with wait=True)
        # before the returned iterator is consumed — all tasks were submitted
        # eagerly by Executor.map, so results are already complete.
        if not self.threaded:
            return map(func, iterable)

        with ThreadPoolExecutor(max_workers=self.max_threadpool_size) as task_pool:
            return task_pool.map(func, iterable)

    def _threaded_call(self, func, iterable):
        "Calls a method for each object in iterable."
        # `func` is a method *name*; methodcaller invokes it on each item.
        # Materialized to a list so all calls finish before returning.
        return list(self._thread_map(methodcaller(func), iterable))

    def _thread_as_completed(self, func, iterable):
        # Generator variant: yields results as soon as each task finishes,
        # i.e. in completion order (not submission order) when threaded.
        if not self.threaded:
            yield from map(func, iterable)
            return

        with ThreadPoolExecutor(max_workers=self.max_threadpool_size) as task_pool:
            futures = [task_pool.submit(func, item) for item in iterable]
            for future in as_completed(futures):
                # .result() re-raises any exception from the worker thread.
                yield future.result()

    def _threaded_call_as_completed(self, func, iterable):
        "Calls a method for each object in iterable. Returned in order of completion."
        return self._thread_as_completed(methodcaller(func), iterable)

    @contextmanager
    def _run_in_background(self, *funcs):
        # Run the given zero-arg callables concurrently while the caller's
        # `with` body executes; on exit, block until done and re-raise the
        # first worker exception via future.result(). None entries skipped.
        with ThreadPoolExecutor(max_workers=self.max_threadpool_size) as task_pool:
            futures = [task_pool.submit(f) for f in funcs if f is not None]
            yield futures
            for f in futures:
                f.result()
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@attrs.define(frozen=True)
class DiffStats:
    """Aggregated statistics of a completed diff (built by DiffResultWrapper._get_stats)."""

    # Counts per diff sign: "-" = exclusive to table A, "+" = exclusive to
    # table B, "!" = key present on both sides with differing values.
    diff_by_sign: Dict[str, int]
    table1_count: int
    table2_count: int
    unchanged: int
    # diff_percent: float
    # Per non-key column: how many rows differ in that column.
    extra_column_diffs: Optional[Dict[str, int]]
    # Key tuples present only in the source / only in the target.
    exclusive_source_ids: List[tuple]
    exclusive_target_ids: List[tuple]
    # Key tuples occurring more than once on one side (first-seen order).
    duplicate_source_ids: List[tuple]
    duplicate_target_ids: List[tuple]
    # Key tuples (non-duplicated on either side) whose row values differ.
    diff_values_ids: List[tuple]
    diff_pk_percent: float
    rows_downloaded: int
    # NOTE(review): mutable [] default — attrs' @define API converts mutable
    # literal defaults to per-instance factories, so instances don't share it.
    comparison_tracker: Optional[List] = []
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
@attrs.define(frozen=True)
class DiffResultWrapper:
    """Lazily-consumed diff result.

    Wraps the raw diff iterator so rows are cached into ``result_list`` on
    first iteration, and exposes aggregate statistics over the cached rows.
    """

    # Fix: the annotation was `iter` (the builtin function, not a type);
    # `Iterator` is the intended type (see the original `# DiffResult` note).
    diff: Iterator  # DiffResult
    info_tree: InfoTree
    stats: dict
    result_list: list = attrs.field(factory=list)

    def __iter__(self) -> Iterator[Any]:
        # Replay rows already consumed, then keep draining the underlying
        # iterator, caching each row as we go.
        yield from self.result_list
        for i in self.diff:
            self.result_list.append(i)
            yield i

    def _get_stats(self) -> DiffStats:
        """Aggregate the diff rows into a :class:`DiffStats`.

        Three passes over ``result_list``:
        1. count key occurrences per side;
        2. classify keys as duplicates and/or exclusive to one side;
        3. mark keys seen with both signs as updated ("!") and count which
           non-key columns actually differ.
        """
        list(self)  # Consume the iterator into result_list, if we haven't already

        key_columns = self.info_tree.info.tables[0].key_columns
        len_key_columns = len(key_columns)
        diff_by_key = {}
        extra_column_values_store = {}
        extra_columns = self.info_tree.info.tables[0].extra_columns
        extra_column_diffs = {k: 0 for k in extra_columns}
        source_rows_by_key = defaultdict(int)
        target_rows_by_key = defaultdict(int)
        exclusive_source_ids = []
        exclusive_target_ids = []
        duplicate_source_ids = []
        duplicate_target_ids = []
        diff_values_ids = []
        # Fix: the original used `k not in duplicate_source_ids` (an O(n)
        # list scan) inside the per-row loop — accidental O(n^2). These set
        # mirrors give O(1) membership; the lists still record first-seen
        # order, so the output is unchanged.
        seen_duplicate_source = set()
        seen_duplicate_target = set()

        # Pass 1: per-side occurrence counts for every key tuple.
        for sign, values in self.result_list:
            k = values[:len_key_columns]
            if sign == "-":
                source_rows_by_key[k] += 1
            elif sign == "+":
                target_rows_by_key[k] += 1

        # Pass 2: duplicates (count > 1 on a side) and exclusives (key absent
        # on the other side). Exclusive lists intentionally keep one entry
        # per matching row, as before.
        for sign, values in self.result_list:
            k = values[:len_key_columns]
            if sign == "-":
                if source_rows_by_key[k] > 1 and k not in seen_duplicate_source:
                    seen_duplicate_source.add(k)
                    duplicate_source_ids.append(k)
                if k not in target_rows_by_key:
                    exclusive_source_ids.append(k)
            elif sign == "+":
                if target_rows_by_key[k] > 1 and k not in seen_duplicate_target:
                    seen_duplicate_target.add(k)
                    duplicate_target_ids.append(k)
                if k not in source_rows_by_key:
                    exclusive_target_ids.append(k)

        # Pass 3: a key first stores its sign and row values; seeing the
        # opposite sign flips it to "!" and compares non-key columns against
        # the stored values.
        for sign, values in self.result_list:
            k = values[:len_key_columns]
            if k in diff_by_key:
                if sign != diff_by_key[k]:
                    diff_by_key[k] = "!"
                    # Only count as a "differing values" key when the key is
                    # unique on both sides.
                    if source_rows_by_key[k] <= 1 and target_rows_by_key[k] <= 1:
                        diff_values_ids.append(k)
                    extra_column_values = values[len_key_columns:]
                    for i in range(0, len(extra_columns)):
                        if extra_column_values[i] != extra_column_values_store[k][i]:
                            extra_column_diffs[extra_columns[i]] += 1
            else:
                diff_by_key[k] = sign
                extra_column_values_store[k] = values[len_key_columns:]

        diff_by_sign = {k: 0 for k in "+-!"}
        for sign in diff_by_key.values():
            diff_by_sign[sign] += 1

        table1_count = self.info_tree.info.tables[0].count()
        table2_count = self.info_tree.info.tables[1].count()

        total_exclusive_pks = len(exclusive_source_ids) + len(exclusive_target_ids)
        total_source_unique_pks = table1_count - len(duplicate_source_ids)
        total_unique_pks = total_source_unique_pks + len(exclusive_target_ids)
        # Guard against empty tables to avoid ZeroDivisionError.
        diff_pk_percent = (total_exclusive_pks / total_unique_pks) if total_unique_pks > 0 else 0.0
        differing_pks = diff_by_sign["!"]
        exclusive_pks = total_exclusive_pks
        unchanged = total_unique_pks - differing_pks - exclusive_pks
        # diff_percent = 1 - unchanged / max(table1_count, table2_count) if max(table1_count, table2_count) > 0 else 0.0
        rows_downloaded = self.stats.get("rows_downloaded", 0)
        comparison_tracker = self.stats.get("comparison_tracker", [])
        return DiffStats(
            diff_by_sign,
            table1_count,
            table2_count,
            unchanged,
            # diff_percent,
            extra_column_diffs,
            exclusive_source_ids,
            exclusive_target_ids,
            duplicate_source_ids,
            duplicate_target_ids,
            diff_values_ids,
            diff_pk_percent,
            rows_downloaded,
            comparison_tracker,
        )

    def get_stats_string(self):
        """Return (human-readable summary string, summary dict)."""
        diff_stats = self._get_stats()

        string_output = ""
        # string_output += f"{diff_stats.table1_count} rows in table A\n"
        # string_output += f"{diff_stats.table2_count} rows in table B\n"
        string_output += f"{diff_stats.diff_by_sign['-']} rows exclusive to table A (not present in B)\n"
        string_output += f"{diff_stats.diff_by_sign['+']} rows exclusive to table B (not present in A)\n"
        string_output += f"{diff_stats.diff_by_sign['!']} rows updated\n"
        # string_output += f"{diff_stats.unchanged} rows unchanged\n"
        # string_output += f"{100*diff_stats.diff_percent:.2f}% difference score\n"

        # if self.stats:
        #     string_output += "\nExtra-Info:\n"
        #     for k, v in sorted(self.stats.items()):
        #         string_output += f"  {k} = {v}\n"
        for k, v in diff_stats.extra_column_diffs.items():
            string_output += f"{v} rows with different values in column: {k}\n"
        json_output = {
            "rows_A": diff_stats.table1_count,
            "rows_B": diff_stats.table2_count,
            "exclusive_A": diff_stats.diff_by_sign["-"],
            "exclusive_B": diff_stats.diff_by_sign["+"],
            "updated": diff_stats.diff_by_sign["!"],
            "total": sum(diff_stats.diff_by_sign.values()),
        }
        json_output["values"] = diff_stats.extra_column_diffs or {}
        return string_output, json_output

    def get_stats_dict(self):
        """Return the full statistics as a JSON-serializable dict."""
        diff_stats = self._get_stats()
        json_output = {
            "rows_A": diff_stats.table1_count,
            "rows_B": diff_stats.table2_count,
            "exclusive_A": diff_stats.diff_by_sign["-"],
            "exclusive_B": diff_stats.diff_by_sign["+"],
            # "updated": diff_stats.diff_by_sign["!"],
            # "total": sum(diff_stats.diff_by_sign.values()),
            "exclusive_source_ids": diff_stats.exclusive_source_ids,
            "exclusive_target_ids": diff_stats.exclusive_target_ids,
            "duplicate_source_ids": diff_stats.duplicate_source_ids,
            "duplicate_target_ids": diff_stats.duplicate_target_ids,
            "diff_values_ids": diff_stats.diff_values_ids,
            "diff_pk_percent": diff_stats.diff_pk_percent,
            "unchanged": diff_stats.unchanged,
            "rows_downloaded": diff_stats.rows_downloaded,
            "comparison_tracker": diff_stats.comparison_tracker,
        }
        json_output["values"] = diff_stats.extra_column_diffs or {}
        return json_output
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
@attrs.define(frozen=False)
class TableDiffer(ThreadBase, ABC):
    """Base class for table-diffing strategies.

    Splits both tables into aligned key-range segments and dispatches each
    pair to the subclass's ``_diff_segments`` via a thread pool.
    """

    # NOTE: intentionally unannotated — annotating these would turn them
    # into attrs fields and change the generated __init__.
    INFO_TREE_CLASS = InfoTree

    bisection_factor = 32
    stats: dict = {}

    # Columns to exclude from checksums/diffing, per side (see ignore_column).
    ignored_columns1: Set[str] = attrs.field(factory=set)
    ignored_columns2: Set[str] = attrs.field(factory=set)
    # Guards the two sets above against concurrent mutation from worker threads.
    _ignored_columns_lock: threading.Lock = attrs.field(factory=threading.Lock, init=False)
    yield_list: bool = False
    t1_row_count: int = attrs.field(default=0, init=False)
    t2_row_count: int = attrs.field(default=0, init=False)

    def diff_tables(
        self, table1: TableSegment, table2: TableSegment, info_tree: Optional[InfoTree] = None
    ) -> DiffResultWrapper:
        """Diff the given tables.

        Parameters:
            table1 (TableSegment): The "before" table to compare. Or: source table
            table2 (TableSegment): The "after" table to compare. Or: target table

        Returns:
            An iterator that yield pair-tuples, representing the diff. Items can be either -
            ('-', row) for items in table1 but not in table2.
            ('+', row) for items in table2 but not in table1.
            Where `row` is a tuple of values, corresponding to the diffed columns.
        """
        if info_tree is None:
            segment_info = self.INFO_TREE_CLASS.SEGMENT_INFO_CLASS([table1, table2])
            info_tree = self.INFO_TREE_CLASS(segment_info)
        # The wrapper consumes the generator lazily and caches results.
        return DiffResultWrapper(self._diff_tables_wrapper(table1, table2, info_tree), info_tree, self.stats)

    def _diff_tables_wrapper(self, table1: TableSegment, table2: TableSegment, info_tree: InfoTree) -> DiffResult:
        """Generator wrapper: validates schemas, runs the diff, and always aggregates the info tree."""
        # If either dialect can overflow on concat, enable prevention on both
        # sides so the two databases hash comparable values.
        if table1.database.dialect.PREVENT_OVERFLOW_WHEN_CONCAT or table2.database.dialect.PREVENT_OVERFLOW_WHEN_CONCAT:
            table1.database.dialect.enable_preventing_type_overflow()
            table2.database.dialect.enable_preventing_type_overflow()

        error = None
        try:
            # Query and validate schema
            table1, table2 = self._threaded_call("with_schema", [table1, table2])
            self._validate_and_adjust_columns(table1, table2)

            yield from self._diff_tables_root(table1, table2, info_tree)

        except BaseException as e:  # Catch KeyboardInterrupt too
            error = e
        finally:
            # Aggregate even on failure, then re-raise the captured error.
            info_tree.aggregate_info()
            if error:
                raise error

    def _validate_and_adjust_columns(self, table1: TableSegment, table2: TableSegment) -> None:
        # Hook for subclasses; no-op by default.
        pass

    def _diff_tables_root(
        self, table1: TableSegment, table2: TableSegment, info_tree: InfoTree
    ) -> Union[DiffResult, DiffResultList]:
        return self._bisect_and_diff_tables(table1, table2, info_tree)

    @abstractmethod
    def _diff_segments(
        self,
        ti: ThreadedYielder,
        table1: TableSegment,
        table2: TableSegment,
        info_tree: InfoTree,
        max_rows: int,
        level=0,
        segment_index=None,
        segment_count=None,
    ): ...

    def _bisect_and_diff_tables(self, table1: TableSegment, table2: TableSegment, info_tree):
        """Validate key columns, compute key ranges, and kick off recursive segment diffing.

        Returns a ThreadedYielder that produces the diff rows.
        """
        if len(table1.key_columns) != len(table2.key_columns):
            raise ValueError("Tables should have an equivalent number of key columns!")

        key_types1 = [table1._schema[i] for i in table1.key_columns]
        key_types2 = [table2._schema[i] for i in table2.key_columns]

        for kt in key_types1 + key_types2:
            if not isinstance(kt, IKey):
                raise NotImplementedError(f"Cannot use a column of type {kt} as a key")

        mismatched_key_types = False
        for i, (kt1, kt2) in enumerate(safezip(key_types1, key_types2)):
            if kt1.python_type is not kt2.python_type:
                # Allow integer vs string, and string vs string variants for diffing, but mark as mismatched
                if (isinstance(kt1, Integer) and isinstance(kt2, StringType)) or (
                    isinstance(kt2, Integer) and isinstance(kt1, StringType)
                ):
                    mismatched_key_types = True
                elif isinstance(kt1, StringType) and isinstance(kt2, StringType):
                    mismatched_key_types = True
                else:
                    k1 = table1.key_columns[i]
                    k2 = table2.key_columns[i]
                    raise DataDiffMismatchingKeyTypesError(
                        f"Key columns {k1} type: {kt1.python_type} and {k2} type: {kt2.python_type} can't be compared due to different types."
                    )

        # Query min/max values
        key_ranges = self._threaded_call_as_completed("query_key_range", [table1, table2])

        # Start with the first completed value, so we don't waste time waiting
        min_key1, max_key1 = self._parse_key_range_result(key_types1, next(key_ranges))

        # Both tables are bounded by table1's key range first; the leftover
        # regions of table2 are handled in the second pass below.
        btable1 = table1.new_key_bounds(min_key=min_key1, max_key=max_key1, key_types=key_types1)
        btable2 = table2.new_key_bounds(min_key=min_key1, max_key=max_key1, key_types=key_types2)

        logger.info(
            f"Diffing segments at key-range: {btable1.min_key}..{btable2.max_key}. "
            f"size: table1 <= {btable1.approximate_size(self.t1_row_count)}, table2 <= {btable2.approximate_size(self.t2_row_count)}"
        )

        ti = ThreadedYielder(self.max_threadpool_size, self.yield_list)
        # Bisect (split) the table into segments, and diff them recursively.
        ti.submit(self._bisect_and_diff_segments, ti, btable1, btable2, info_tree, priority=999)

        # Now we check for the second min-max, to diff the portions we "missed".
        # This is achieved by subtracting the table ranges, and dividing the resulting space into aligned boxes.
        # For example, given tables A & B, and a 2D compound key, where A was queried first for key-range,
        # the regions of B we need to diff in this second pass are marked by B1..8:
        # ┌──┬──────┬──┐
        # │B1│  B2  │B3│
        # ├──┼──────┼──┤
        # │B4│  A   │B5│
        # ├──┼──────┼──┤
        # │B6│  B7  │B8│
        # └──┴──────┴──┘
        # Overall, the max number of new regions in this 2nd pass is 3^|k| - 1

        # Note: python types can be the same, but the rendering parameters (e.g. casing) can differ.
        # If key types mismatched (e.g., int vs string), skip the second meshing pass to avoid
        # attempting to sort mixed-type tuples (e.g., ArithAlphanumeric vs int).
        if not mismatched_key_types:
            min_key2, max_key2 = self._parse_key_range_result(key_types2, next(key_ranges))

            points = [list(sorted(p)) for p in safezip(min_key1, min_key2, max_key1, max_key2)]
            box_mesh = create_mesh_from_points(*points)

            # Keep only non-empty boxes outside the range already covered above.
            new_regions = [(p1, p2) for p1, p2 in box_mesh if p1 < p2 and not (p1 >= min_key1 and p2 <= max_key1)]

            for p1, p2 in new_regions:
                extra_table1 = table1.new_key_bounds(min_key=p1, max_key=p2, key_types=key_types1)
                extra_table2 = table2.new_key_bounds(min_key=p1, max_key=p2, key_types=key_types2)
                ti.submit(
                    self._bisect_and_diff_segments,
                    ti,
                    extra_table1,
                    extra_table2,
                    info_tree,
                    priority=999,
                )

        return ti

    def _parse_key_range_result(self, key_types, key_range) -> Tuple[Vector, Vector]:
        """Convert raw (min, max) DB values into typed key Vectors; max is made exclusive."""
        min_key_values, max_key_values = key_range

        # We add 1 because our ranges are exclusive of the end (like in Python)
        try:
            min_key = Vector(key_type.make_value(mn) for key_type, mn in safezip(key_types, min_key_values))
            max_key = Vector(key_type.make_value(mx) + 1 for key_type, mx in safezip(key_types, max_key_values))
        except (TypeError, ValueError) as e:
            raise type(e)(f"Cannot apply {key_types} to '{min_key_values}', '{max_key_values}'.") from e

        return min_key, max_key

    def _bisect_and_diff_segments(
        self,
        ti: ThreadedYielder,
        table1: TableSegment,
        table2: TableSegment,
        info_tree: InfoTree,
        level=0,
        max_rows=None,
    ):
        """Split one bounded key range into up to `bisection_factor` aligned segments and diff each pair."""
        assert table1.is_bounded and table2.is_bounded

        # Choose evenly spaced checkpoints (according to min_key and max_key)
        biggest_table = max(
            table1, table2, key=methodcaller("approximate_size", max(self.t1_row_count, self.t2_row_count))
        )
        checkpoints = biggest_table.choose_checkpoints(self.bisection_factor - 1)

        # Get it thread-safe, to avoid segment misalignment because of bad timing.
        with self._ignored_columns_lock:
            table1 = attrs.evolve(table1, ignored_columns=frozenset(self.ignored_columns1))
            table2 = attrs.evolve(table2, ignored_columns=frozenset(self.ignored_columns2))

        # Create new instances of TableSegment between each checkpoint
        segmented1 = table1.segment_by_checkpoints(checkpoints)
        segmented2 = table2.segment_by_checkpoints(checkpoints)

        # Recursively compare each pair of corresponding segments between table1 and table2
        for i, (t1, t2) in enumerate(safezip(segmented1, segmented2)):
            info_node = info_tree.add_node(t1, t2, max_rows=max_rows)
            ti.submit(
                self._diff_segments,
                ti,
                t1,
                t2,
                info_node,
                max_rows,
                level + 1,
                i + 1,
                len(segmented1),
                priority=level,
            )

    def ignore_column(self, column_name1: str, column_name2: str) -> None:
        """
        Ignore the column (by name on sides A & B) in md5s & diffs from now on.

        This affects 2 places:

        - The columns are not checksumed for new(!) segments.
        - The columns are ignored in in-memory diffing for running segments.

        The columns are never ignored in the fetched values, whether they are
        the same or different — for data consistency.

        Use this feature to collect relatively well-represented differences
        across all columns if one of them is highly different in the beginning
        of a table (as per the order of segmentation/bisection). Otherwise,
        that one column might easily hit the limit and stop the whole diff.
        """
        with self._ignored_columns_lock:
            self.ignored_columns1.add(column_name1)
            self.ignored_columns2.add(column_name2)
|
data_diff/errors.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class DataDiffMismatchingKeyTypesError(Exception):
    """Raised when the key types of two tables do not match, like VARCHAR and INT."""
|