dcs-sdk 1.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. data_diff/__init__.py +221 -0
  2. data_diff/__main__.py +517 -0
  3. data_diff/abcs/__init__.py +13 -0
  4. data_diff/abcs/compiler.py +27 -0
  5. data_diff/abcs/database_types.py +402 -0
  6. data_diff/config.py +141 -0
  7. data_diff/databases/__init__.py +38 -0
  8. data_diff/databases/_connect.py +323 -0
  9. data_diff/databases/base.py +1417 -0
  10. data_diff/databases/bigquery.py +376 -0
  11. data_diff/databases/clickhouse.py +217 -0
  12. data_diff/databases/databricks.py +262 -0
  13. data_diff/databases/duckdb.py +207 -0
  14. data_diff/databases/mssql.py +343 -0
  15. data_diff/databases/mysql.py +189 -0
  16. data_diff/databases/oracle.py +238 -0
  17. data_diff/databases/postgresql.py +293 -0
  18. data_diff/databases/presto.py +222 -0
  19. data_diff/databases/redis.py +93 -0
  20. data_diff/databases/redshift.py +233 -0
  21. data_diff/databases/snowflake.py +222 -0
  22. data_diff/databases/sybase.py +720 -0
  23. data_diff/databases/trino.py +73 -0
  24. data_diff/databases/vertica.py +174 -0
  25. data_diff/diff_tables.py +489 -0
  26. data_diff/errors.py +17 -0
  27. data_diff/format.py +369 -0
  28. data_diff/hashdiff_tables.py +1026 -0
  29. data_diff/info_tree.py +76 -0
  30. data_diff/joindiff_tables.py +434 -0
  31. data_diff/lexicographic_space.py +253 -0
  32. data_diff/parse_time.py +88 -0
  33. data_diff/py.typed +0 -0
  34. data_diff/queries/__init__.py +13 -0
  35. data_diff/queries/api.py +213 -0
  36. data_diff/queries/ast_classes.py +811 -0
  37. data_diff/queries/base.py +38 -0
  38. data_diff/queries/extras.py +43 -0
  39. data_diff/query_utils.py +70 -0
  40. data_diff/schema.py +67 -0
  41. data_diff/table_segment.py +583 -0
  42. data_diff/thread_utils.py +112 -0
  43. data_diff/utils.py +1022 -0
  44. data_diff/version.py +15 -0
  45. dcs_core/__init__.py +13 -0
  46. dcs_core/__main__.py +17 -0
  47. dcs_core/__version__.py +15 -0
  48. dcs_core/cli/__init__.py +13 -0
  49. dcs_core/cli/cli.py +165 -0
  50. dcs_core/core/__init__.py +19 -0
  51. dcs_core/core/common/__init__.py +13 -0
  52. dcs_core/core/common/errors.py +50 -0
  53. dcs_core/core/common/models/__init__.py +13 -0
  54. dcs_core/core/common/models/configuration.py +284 -0
  55. dcs_core/core/common/models/dashboard.py +24 -0
  56. dcs_core/core/common/models/data_source_resource.py +75 -0
  57. dcs_core/core/common/models/metric.py +160 -0
  58. dcs_core/core/common/models/profile.py +75 -0
  59. dcs_core/core/common/models/validation.py +216 -0
  60. dcs_core/core/common/models/widget.py +44 -0
  61. dcs_core/core/configuration/__init__.py +13 -0
  62. dcs_core/core/configuration/config_loader.py +139 -0
  63. dcs_core/core/configuration/configuration_parser.py +262 -0
  64. dcs_core/core/configuration/configuration_parser_arc.py +328 -0
  65. dcs_core/core/datasource/__init__.py +13 -0
  66. dcs_core/core/datasource/base.py +62 -0
  67. dcs_core/core/datasource/manager.py +112 -0
  68. dcs_core/core/datasource/search_datasource.py +421 -0
  69. dcs_core/core/datasource/sql_datasource.py +1094 -0
  70. dcs_core/core/inspect.py +163 -0
  71. dcs_core/core/logger/__init__.py +13 -0
  72. dcs_core/core/logger/base.py +32 -0
  73. dcs_core/core/logger/default_logger.py +94 -0
  74. dcs_core/core/metric/__init__.py +13 -0
  75. dcs_core/core/metric/base.py +220 -0
  76. dcs_core/core/metric/combined_metric.py +98 -0
  77. dcs_core/core/metric/custom_metric.py +34 -0
  78. dcs_core/core/metric/manager.py +137 -0
  79. dcs_core/core/metric/numeric_metric.py +403 -0
  80. dcs_core/core/metric/reliability_metric.py +90 -0
  81. dcs_core/core/profiling/__init__.py +13 -0
  82. dcs_core/core/profiling/datasource_profiling.py +136 -0
  83. dcs_core/core/profiling/numeric_field_profiling.py +72 -0
  84. dcs_core/core/profiling/text_field_profiling.py +67 -0
  85. dcs_core/core/repository/__init__.py +13 -0
  86. dcs_core/core/repository/metric_repository.py +77 -0
  87. dcs_core/core/utils/__init__.py +13 -0
  88. dcs_core/core/utils/log.py +29 -0
  89. dcs_core/core/utils/tracking.py +105 -0
  90. dcs_core/core/utils/utils.py +44 -0
  91. dcs_core/core/validation/__init__.py +13 -0
  92. dcs_core/core/validation/base.py +230 -0
  93. dcs_core/core/validation/completeness_validation.py +153 -0
  94. dcs_core/core/validation/custom_query_validation.py +24 -0
  95. dcs_core/core/validation/manager.py +282 -0
  96. dcs_core/core/validation/numeric_validation.py +276 -0
  97. dcs_core/core/validation/reliability_validation.py +91 -0
  98. dcs_core/core/validation/uniqueness_validation.py +61 -0
  99. dcs_core/core/validation/validity_validation.py +738 -0
  100. dcs_core/integrations/__init__.py +13 -0
  101. dcs_core/integrations/databases/__init__.py +13 -0
  102. dcs_core/integrations/databases/bigquery.py +187 -0
  103. dcs_core/integrations/databases/databricks.py +51 -0
  104. dcs_core/integrations/databases/db2.py +652 -0
  105. dcs_core/integrations/databases/elasticsearch.py +61 -0
  106. dcs_core/integrations/databases/mssql.py +829 -0
  107. dcs_core/integrations/databases/mysql.py +409 -0
  108. dcs_core/integrations/databases/opensearch.py +64 -0
  109. dcs_core/integrations/databases/oracle.py +719 -0
  110. dcs_core/integrations/databases/postgres.py +482 -0
  111. dcs_core/integrations/databases/redshift.py +53 -0
  112. dcs_core/integrations/databases/snowflake.py +48 -0
  113. dcs_core/integrations/databases/spark_df.py +111 -0
  114. dcs_core/integrations/databases/sybase.py +1069 -0
  115. dcs_core/integrations/storage/__init__.py +13 -0
  116. dcs_core/integrations/storage/local_file.py +149 -0
  117. dcs_core/integrations/utils/__init__.py +13 -0
  118. dcs_core/integrations/utils/utils.py +36 -0
  119. dcs_core/report/__init__.py +13 -0
  120. dcs_core/report/dashboard.py +211 -0
  121. dcs_core/report/models.py +88 -0
  122. dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  123. dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  124. dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  125. dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  126. dcs_core/report/static/assets/images/docs.svg +6 -0
  127. dcs_core/report/static/assets/images/github.svg +4 -0
  128. dcs_core/report/static/assets/images/logo.svg +7 -0
  129. dcs_core/report/static/assets/images/slack.svg +13 -0
  130. dcs_core/report/static/index.js +2 -0
  131. dcs_core/report/static/index.js.LICENSE.txt +3971 -0
  132. dcs_sdk/__init__.py +13 -0
  133. dcs_sdk/__main__.py +18 -0
  134. dcs_sdk/__version__.py +15 -0
  135. dcs_sdk/cli/__init__.py +13 -0
  136. dcs_sdk/cli/cli.py +163 -0
  137. dcs_sdk/sdk/__init__.py +58 -0
  138. dcs_sdk/sdk/config/__init__.py +13 -0
  139. dcs_sdk/sdk/config/config_loader.py +491 -0
  140. dcs_sdk/sdk/data_diff/__init__.py +13 -0
  141. dcs_sdk/sdk/data_diff/data_differ.py +821 -0
  142. dcs_sdk/sdk/rules/__init__.py +15 -0
  143. dcs_sdk/sdk/rules/rules_mappping.py +31 -0
  144. dcs_sdk/sdk/rules/rules_repository.py +214 -0
  145. dcs_sdk/sdk/rules/schema_rules.py +65 -0
  146. dcs_sdk/sdk/utils/__init__.py +13 -0
  147. dcs_sdk/sdk/utils/serializer.py +25 -0
  148. dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
  149. dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
  150. dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
  151. dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
  152. dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
  153. dcs_sdk/sdk/utils/table.py +475 -0
  154. dcs_sdk/sdk/utils/themes.py +40 -0
  155. dcs_sdk/sdk/utils/utils.py +349 -0
  156. dcs_sdk-1.6.5.dist-info/METADATA +150 -0
  157. dcs_sdk-1.6.5.dist-info/RECORD +159 -0
  158. dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
  159. dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,489 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Provides classes for performing a table diff"""
16
+
17
+ import threading
18
+ from abc import ABC, abstractmethod
19
+ from collections import defaultdict
20
+ from concurrent.futures import ThreadPoolExecutor, as_completed
21
+ from contextlib import contextmanager
22
+ from enum import Enum
23
+ from operator import methodcaller
24
+ from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union
25
+
26
+ import attrs
27
+
28
+ # logger = getLogger(__name__)
29
+ from loguru import logger
30
+
31
+ from data_diff.abcs.database_types import IKey, Integer, StringType
32
+ from data_diff.errors import DataDiffMismatchingKeyTypesError
33
+ from data_diff.info_tree import InfoTree, SegmentInfo
34
+ from data_diff.table_segment import TableSegment, create_mesh_from_points
35
+ from data_diff.thread_utils import ThreadedYielder
36
+ from data_diff.utils import Vector, getLogger, safezip
37
+
38
+
39
class Algorithm(Enum):
    """The diffing strategies this package can run.

    AUTO lets the tool pick; JOINDIFF diffs via an in-database JOIN;
    HASHDIFF diffs by comparing segment checksums.
    """

    AUTO = "auto"
    JOINDIFF = "joindiff"
    HASHDIFF = "hashdiff"
43
+
44
+
45
# Type aliases for diff output. Each yielded item pairs a sign string
# ("+", "-") with the row tuple it applies to; DiffResultList yields
# batches of such pairs instead of single pairs.
DiffResult = Iterator[Tuple[str, tuple]]  # Iterator[Tuple[Literal["+", "-"], tuple]]
DiffResultList = Iterator[List[Tuple[str, tuple]]]
47
+
48
+
49
@attrs.define(frozen=False)
class ThreadBase:
    """Utility mixin: run callables either serially or on a thread pool.

    When ``threaded`` is False every helper degrades to a plain serial
    ``map``; otherwise work is fanned out over a ``ThreadPoolExecutor``
    bounded by ``max_threadpool_size``.
    """

    threaded: bool = True
    max_threadpool_size: Optional[int] = 1

    def _thread_map(self, func, iterable):
        # Serial fallback keeps behavior identical minus the parallelism.
        if not self.threaded:
            return map(func, iterable)

        with ThreadPoolExecutor(max_workers=self.max_threadpool_size) as pool:
            return pool.map(func, iterable)

    def _threaded_call(self, func, iterable):
        "Calls a method for each object in iterable."
        return list(self._thread_map(methodcaller(func), iterable))

    def _thread_as_completed(self, func, iterable):
        # Yield results in completion order (serially: input order).
        if not self.threaded:
            yield from map(func, iterable)
            return

        with ThreadPoolExecutor(max_workers=self.max_threadpool_size) as pool:
            pending = [pool.submit(func, item) for item in iterable]
            for done in as_completed(pending):
                yield done.result()

    def _threaded_call_as_completed(self, func, iterable):
        "Calls a method for each object in iterable. Returned in order of completion."
        return self._thread_as_completed(methodcaller(func), iterable)

    @contextmanager
    def _run_in_background(self, *funcs):
        # Submit each non-None callable, hand the futures to the caller,
        # then block until all complete (re-raising any exception).
        with ThreadPoolExecutor(max_workers=self.max_threadpool_size) as pool:
            tasks = [pool.submit(fn) for fn in funcs if fn is not None]
            yield tasks
            for task in tasks:
                task.result()
88
+
89
+
90
@attrs.define(frozen=True)
class DiffStats:
    """Aggregated statistics for a completed table diff.

    Produced by ``DiffResultWrapper._get_stats``; all ``*_ids`` lists hold
    primary-key tuples (the first ``len(key_columns)`` values of each row).
    """

    diff_by_sign: Dict[str, int]  # counts keyed by sign: "+", "-", "!"
    table1_count: int
    table2_count: int
    unchanged: int
    # diff_percent: float
    extra_column_diffs: Optional[Dict[str, int]]  # per-column differing-row counts
    exclusive_source_ids: List[tuple]
    exclusive_target_ids: List[tuple]
    duplicate_source_ids: List[tuple]
    duplicate_target_ids: List[tuple]
    diff_values_ids: List[tuple]
    diff_pk_percent: float
    rows_downloaded: int
    # Fixed: the original declared `= []`, a mutable default shared across
    # instances; a per-instance factory is the safe equivalent.
    comparison_tracker: Optional[List] = attrs.field(factory=list)
106
+
107
+
108
@attrs.define(frozen=True)
class DiffResultWrapper:
    """Lazily-consumed diff result.

    Wraps the raw diff iterator; every (sign, row) pair yielded is also
    cached in ``result_list`` so the diff can be re-iterated and so
    ``_get_stats`` can aggregate over the complete result.
    """

    diff: iter  # DiffResult: iterator of (sign, row-tuple) pairs
    info_tree: InfoTree
    stats: dict
    result_list: list = attrs.field(factory=list)

    def __iter__(self) -> Iterator[Any]:
        # Replay anything already consumed, then keep draining the
        # underlying iterator, caching each item as it is produced.
        yield from self.result_list
        for i in self.diff:
            self.result_list.append(i)
            yield i

    def _get_stats(self) -> DiffStats:
        """Aggregate the cached diff rows into a :class:`DiffStats`.

        Runs three passes over ``result_list``:
          1. count occurrences of each key on each side,
          2. collect exclusive and duplicate keys per side,
          3. merge signs per key ("!" = present on both sides) and count
             per-column value differences.
        """
        list(self)  # Consume the iterator into result_list, if we haven't already

        key_columns = self.info_tree.info.tables[0].key_columns
        len_key_columns = len(key_columns)
        diff_by_key = {}
        extra_column_values_store = {}
        extra_columns = self.info_tree.info.tables[0].extra_columns
        extra_column_diffs = {k: 0 for k in extra_columns}
        source_rows_by_key = defaultdict(int)
        target_rows_by_key = defaultdict(int)
        exclusive_source_ids = []
        exclusive_target_ids = []
        duplicate_source_ids = []
        duplicate_target_ids = []
        diff_values_ids = []

        # Pass 1: per-side occurrence counts, keyed by the PK tuple.
        # "-" rows come from the source table, "+" rows from the target.
        for sign, values in self.result_list:
            k = values[:len_key_columns]
            if sign == "-":
                source_rows_by_key[k] += 1
            elif sign == "+":
                target_rows_by_key[k] += 1

        # Pass 2: keys seen more than once on a side are duplicates; keys
        # absent from the other side's counter are exclusives.
        for sign, values in self.result_list:
            k = values[:len_key_columns]
            if sign == "-":
                if source_rows_by_key[k] > 1 and k not in duplicate_source_ids:
                    duplicate_source_ids.append(k)
                if k not in target_rows_by_key:
                    exclusive_source_ids.append(k)
            elif sign == "+":
                if target_rows_by_key[k] > 1 and k not in duplicate_target_ids:
                    duplicate_target_ids.append(k)
                if k not in source_rows_by_key:
                    exclusive_target_ids.append(k)

        # Pass 3: a key seen with both signs is an update ("!"). For
        # non-duplicated updated keys, record the key and compare the
        # non-key column values against the first-seen row's values.
        for sign, values in self.result_list:
            k = values[:len_key_columns]
            if k in diff_by_key:
                if sign != diff_by_key[k]:
                    diff_by_key[k] = "!"
                    if source_rows_by_key[k] <= 1 and target_rows_by_key[k] <= 1:
                        diff_values_ids.append(k)
                    extra_column_values = values[len_key_columns:]
                    for i in range(0, len(extra_columns)):
                        if extra_column_values[i] != extra_column_values_store[k][i]:
                            extra_column_diffs[extra_columns[i]] += 1
            else:
                diff_by_key[k] = sign
                extra_column_values_store[k] = values[len_key_columns:]

        diff_by_sign = {k: 0 for k in "+-!"}
        for sign in diff_by_key.values():
            diff_by_sign[sign] += 1

        table1_count = self.info_tree.info.tables[0].count()
        table2_count = self.info_tree.info.tables[1].count()

        # PK-difference ratio: exclusives over the total distinct PKs
        # across both tables (source uniques + target-only keys).
        total_exclusive_pks = len(exclusive_source_ids) + len(exclusive_target_ids)
        total_source_unique_pks = table1_count - len(duplicate_source_ids)
        total_unique_pks = total_source_unique_pks + len(exclusive_target_ids)
        diff_pk_percent = (total_exclusive_pks / total_unique_pks) if total_unique_pks > 0 else 0.0
        differing_pks = diff_by_sign["!"]
        exclusive_pks = total_exclusive_pks
        unchanged = total_unique_pks - differing_pks - exclusive_pks
        # diff_percent = 1 - unchanged / max(table1_count, table2_count) if max(table1_count, table2_count) > 0 else 0.0
        rows_downloaded = self.stats.get("rows_downloaded", 0)
        comparison_tracker = self.stats.get("comparison_tracker", [])
        return DiffStats(
            diff_by_sign,
            table1_count,
            table2_count,
            unchanged,
            # diff_percent,
            extra_column_diffs,
            exclusive_source_ids,
            exclusive_target_ids,
            duplicate_source_ids,
            duplicate_target_ids,
            diff_values_ids,
            diff_pk_percent,
            rows_downloaded,
            comparison_tracker,
        )

    def get_stats_string(self):
        """Return (human-readable summary, summary dict) for the diff."""
        diff_stats = self._get_stats()

        string_output = ""
        # string_output += f"{diff_stats.table1_count} rows in table A\n"
        # string_output += f"{diff_stats.table2_count} rows in table B\n"
        string_output += f"{diff_stats.diff_by_sign['-']} rows exclusive to table A (not present in B)\n"
        string_output += f"{diff_stats.diff_by_sign['+']} rows exclusive to table B (not present in A)\n"
        string_output += f"{diff_stats.diff_by_sign['!']} rows updated\n"
        # string_output += f"{diff_stats.unchanged} rows unchanged\n"
        # string_output += f"{100*diff_stats.diff_percent:.2f}% difference score\n"

        # if self.stats:
        #     string_output += "\nExtra-Info:\n"
        #     for k, v in sorted(self.stats.items()):
        #         string_output += f"  {k} = {v}\n"
        for k, v in diff_stats.extra_column_diffs.items():
            string_output += f"{v} rows with different values in column: {k}\n"
        json_output = {
            "rows_A": diff_stats.table1_count,
            "rows_B": diff_stats.table2_count,
            "exclusive_A": diff_stats.diff_by_sign["-"],
            "exclusive_B": diff_stats.diff_by_sign["+"],
            "updated": diff_stats.diff_by_sign["!"],
            "total": sum(diff_stats.diff_by_sign.values()),
        }
        json_output["values"] = diff_stats.extra_column_diffs or {}
        return string_output, json_output

    def get_stats_dict(self):
        """Return the diff statistics as a JSON-serializable dict."""
        diff_stats = self._get_stats()
        json_output = {
            "rows_A": diff_stats.table1_count,
            "rows_B": diff_stats.table2_count,
            "exclusive_A": diff_stats.diff_by_sign["-"],
            "exclusive_B": diff_stats.diff_by_sign["+"],
            # "updated": diff_stats.diff_by_sign["!"],
            # "total": sum(diff_stats.diff_by_sign.values()),
            "exclusive_source_ids": diff_stats.exclusive_source_ids,
            "exclusive_target_ids": diff_stats.exclusive_target_ids,
            "duplicate_source_ids": diff_stats.duplicate_source_ids,
            "duplicate_target_ids": diff_stats.duplicate_target_ids,
            "diff_values_ids": diff_stats.diff_values_ids,
            "diff_pk_percent": diff_stats.diff_pk_percent,
            "unchanged": diff_stats.unchanged,
            "rows_downloaded": diff_stats.rows_downloaded,
            "comparison_tracker": diff_stats.comparison_tracker,
        }
        json_output["values"] = diff_stats.extra_column_diffs or {}
        return json_output
257
+
258
+
259
@attrs.define(frozen=False)
class TableDiffer(ThreadBase, ABC):
    """Base class for table-diffing strategies.

    Handles schema validation, key-range discovery, and recursive
    bisection of the key space into segments; subclasses implement
    ``_diff_segments`` to actually compare a pair of segments.
    """

    INFO_TREE_CLASS = InfoTree

    # Number of segments each key range is split into per bisection level.
    bisection_factor = 32
    # NOTE(review): class-level mutable default — shared across instances
    # unless a subclass/caller replaces it; confirm this is intentional.
    stats: dict = {}

    # Columns to skip when checksumming/diffing, per side (see ignore_column).
    ignored_columns1: Set[str] = attrs.field(factory=set)
    ignored_columns2: Set[str] = attrs.field(factory=set)
    # Guards the ignored-column sets, which worker threads read while the
    # user may concurrently add to them.
    _ignored_columns_lock: threading.Lock = attrs.field(factory=threading.Lock, init=False)
    yield_list: bool = False
    t1_row_count: int = attrs.field(default=0, init=False)
    t2_row_count: int = attrs.field(default=0, init=False)

    def diff_tables(self, table1: TableSegment, table2: TableSegment, info_tree: InfoTree = None) -> DiffResultWrapper:
        """Diff the given tables.

        Parameters:
            table1 (TableSegment): The "before" table to compare. Or: source table
            table2 (TableSegment): The "after" table to compare. Or: target table

        Returns:
            An iterator that yield pair-tuples, representing the diff. Items can be either -
            ('-', row) for items in table1 but not in table2.
            ('+', row) for items in table2 but not in table1.
            Where `row` is a tuple of values, corresponding to the diffed columns.
        """
        if info_tree is None:
            segment_info = self.INFO_TREE_CLASS.SEGMENT_INFO_CLASS([table1, table2])
            info_tree = self.INFO_TREE_CLASS(segment_info)
        return DiffResultWrapper(self._diff_tables_wrapper(table1, table2, info_tree), info_tree, self.stats)

    def _diff_tables_wrapper(self, table1: TableSegment, table2: TableSegment, info_tree: InfoTree) -> DiffResult:
        """Generator wrapping the diff: schema setup, then segment diffing,
        guaranteeing ``info_tree.aggregate_info()`` runs even on error."""
        # If either dialect needs concat-overflow protection, enable it on both
        # sides so the checksummed expressions stay comparable.
        if table1.database.dialect.PREVENT_OVERFLOW_WHEN_CONCAT or table2.database.dialect.PREVENT_OVERFLOW_WHEN_CONCAT:
            table1.database.dialect.enable_preventing_type_overflow()
            table2.database.dialect.enable_preventing_type_overflow()

        error = None
        try:
            # Query and validate schema
            table1, table2 = self._threaded_call("with_schema", [table1, table2])
            self._validate_and_adjust_columns(table1, table2)

            yield from self._diff_tables_root(table1, table2, info_tree)

        except BaseException as e:  # Catch KeyboardInterrupt too
            error = e
        finally:
            # Always aggregate, then re-raise any captured error.
            info_tree.aggregate_info()
            if error:
                raise error

    def _validate_and_adjust_columns(self, table1: TableSegment, table2: TableSegment) -> None:
        # Hook for subclasses; no-op by default.
        pass

    def _diff_tables_root(
        self, table1: TableSegment, table2: TableSegment, info_tree: InfoTree
    ) -> Union[DiffResult, DiffResultList]:
        return self._bisect_and_diff_tables(table1, table2, info_tree)

    @abstractmethod
    def _diff_segments(
        self,
        ti: ThreadedYielder,
        table1: TableSegment,
        table2: TableSegment,
        info_tree: InfoTree,
        max_rows: int,
        level=0,
        segment_index=None,
        segment_count=None,
    ): ...

    def _bisect_and_diff_tables(self, table1: TableSegment, table2: TableSegment, info_tree):
        """Validate key columns, discover key ranges, and kick off the
        recursive segment diff. Returns a ThreadedYielder of diff rows."""
        if len(table1.key_columns) != len(table2.key_columns):
            raise ValueError("Tables should have an equivalent number of key columns!")

        key_types1 = [table1._schema[i] for i in table1.key_columns]
        key_types2 = [table2._schema[i] for i in table2.key_columns]

        for kt in key_types1 + key_types2:
            if not isinstance(kt, IKey):
                raise NotImplementedError(f"Cannot use a column of type {kt} as a key")

        mismatched_key_types = False
        for i, (kt1, kt2) in enumerate(safezip(key_types1, key_types2)):
            if kt1.python_type is not kt2.python_type:
                # Allow integer vs string, and string vs string variants for diffing, but mark as mismatched
                if (isinstance(kt1, Integer) and isinstance(kt2, StringType)) or (
                    isinstance(kt2, Integer) and isinstance(kt1, StringType)
                ):
                    mismatched_key_types = True
                elif isinstance(kt1, StringType) and isinstance(kt2, StringType):
                    mismatched_key_types = True
                else:
                    k1 = table1.key_columns[i]
                    k2 = table2.key_columns[i]
                    raise DataDiffMismatchingKeyTypesError(
                        f"Key columns {k1} type: {kt1.python_type} and {k2} type: {kt2.python_type} can't be compared due to different types."
                    )

        # Query min/max values
        key_ranges = self._threaded_call_as_completed("query_key_range", [table1, table2])

        # Start with the first completed value, so we don't waste time waiting
        min_key1, max_key1 = self._parse_key_range_result(key_types1, next(key_ranges))

        # Bound both tables to the first table's key range; each keeps its own key types.
        btable1 = table1.new_key_bounds(min_key=min_key1, max_key=max_key1, key_types=key_types1)
        btable2 = table2.new_key_bounds(min_key=min_key1, max_key=max_key1, key_types=key_types2)

        logger.info(
            f"Diffing segments at key-range: {btable1.min_key}..{btable2.max_key}. "
            f"size: table1 <= {btable1.approximate_size(self.t1_row_count)}, table2 <= {btable2.approximate_size(self.t2_row_count)}"
        )

        ti = ThreadedYielder(self.max_threadpool_size, self.yield_list)
        # Bisect (split) the table into segments, and diff them recursively.
        ti.submit(self._bisect_and_diff_segments, ti, btable1, btable2, info_tree, priority=999)

        # Now we check for the second min-max, to diff the portions we "missed".
        # This is achieved by subtracting the table ranges, and dividing the resulting space into aligned boxes.
        # For example, given tables A & B, and a 2D compound key, where A was queried first for key-range,
        # the regions of B we need to diff in this second pass are marked by B1..8:
        # ┌──┬──────┬──┐
        # │B1│  B2  │B3│
        # ├──┼──────┼──┤
        # │B4│  A   │B5│
        # ├──┼──────┼──┤
        # │B6│  B7  │B8│
        # └──┴──────┴──┘
        # Overall, the max number of new regions in this 2nd pass is 3^|k| - 1

        # Note: python types can be the same, but the rendering parameters (e.g. casing) can differ.
        # If key types mismatched (e.g., int vs string), skip the second meshing pass to avoid
        # attempting to sort mixed-type tuples (e.g., ArithAlphanumeric vs int).
        if not mismatched_key_types:
            min_key2, max_key2 = self._parse_key_range_result(key_types2, next(key_ranges))

            points = [list(sorted(p)) for p in safezip(min_key1, min_key2, max_key1, max_key2)]
            box_mesh = create_mesh_from_points(*points)

            # Keep only non-empty boxes that are not fully contained in the already-diffed range.
            new_regions = [(p1, p2) for p1, p2 in box_mesh if p1 < p2 and not (p1 >= min_key1 and p2 <= max_key1)]

            for p1, p2 in new_regions:
                extra_table1 = table1.new_key_bounds(min_key=p1, max_key=p2, key_types=key_types1)
                extra_table2 = table2.new_key_bounds(min_key=p1, max_key=p2, key_types=key_types2)
                ti.submit(
                    self._bisect_and_diff_segments,
                    ti,
                    extra_table1,
                    extra_table2,
                    info_tree,
                    priority=999,
                )

        return ti

    def _parse_key_range_result(self, key_types, key_range) -> Tuple[Vector, Vector]:
        """Convert raw (min, max) DB values into typed Vector bounds."""
        min_key_values, max_key_values = key_range

        # We add 1 because our ranges are exclusive of the end (like in Python)
        try:
            min_key = Vector(key_type.make_value(mn) for key_type, mn in safezip(key_types, min_key_values))
            max_key = Vector(key_type.make_value(mx) + 1 for key_type, mx in safezip(key_types, max_key_values))
        except (TypeError, ValueError) as e:
            raise type(e)(f"Cannot apply {key_types} to '{min_key_values}', '{max_key_values}'.") from e

        return min_key, max_key

    def _bisect_and_diff_segments(
        self,
        ti: ThreadedYielder,
        table1: TableSegment,
        table2: TableSegment,
        info_tree: InfoTree,
        level=0,
        max_rows=None,
    ):
        """Split the bounded tables into aligned segments and submit each
        pair to ``_diff_segments`` (shallower levels run first)."""
        assert table1.is_bounded and table2.is_bounded

        # Choose evenly spaced checkpoints (according to min_key and max_key)
        biggest_table = max(
            table1, table2, key=methodcaller("approximate_size", max(self.t1_row_count, self.t2_row_count))
        )
        checkpoints = biggest_table.choose_checkpoints(self.bisection_factor - 1)

        # Get it thread-safe, to avoid segment misalignment because of bad timing.
        with self._ignored_columns_lock:
            table1 = attrs.evolve(table1, ignored_columns=frozenset(self.ignored_columns1))
            table2 = attrs.evolve(table2, ignored_columns=frozenset(self.ignored_columns2))

        # Create new instances of TableSegment between each checkpoint
        segmented1 = table1.segment_by_checkpoints(checkpoints)
        segmented2 = table2.segment_by_checkpoints(checkpoints)

        # Recursively compare each pair of corresponding segments between table1 and table2
        for i, (t1, t2) in enumerate(safezip(segmented1, segmented2)):
            info_node = info_tree.add_node(t1, t2, max_rows=max_rows)
            ti.submit(
                self._diff_segments,
                ti,
                t1,
                t2,
                info_node,
                max_rows,
                level + 1,
                i + 1,
                len(segmented1),
                priority=level,
            )

    def ignore_column(self, column_name1: str, column_name2: str) -> None:
        """
        Ignore the column (by name on sides A & B) in md5s & diffs from now on.

        This affects 2 places:

        - The columns are not checksumed for new(!) segments.
        - The columns are ignored in in-memory diffing for running segments.

        The columns are never ignored in the fetched values, whether they are
        the same or different — for data consistency.

        Use this feature to collect relatively well-represented differences
        across all columns if one of them is highly different in the beginning
        of a table (as per the order of segmentation/bisection). Otherwise,
        that one column might easily hit the limit and stop the whole diff.
        """
        with self._ignored_columns_lock:
            self.ignored_columns1.add(column_name1)
            self.ignored_columns2.add(column_name2)
data_diff/errors.py ADDED
@@ -0,0 +1,17 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
class DataDiffMismatchingKeyTypesError(Exception):
    """Raised when the key types of two tables do not match, like VARCHAR and INT."""