dcs-sdk 1.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. data_diff/__init__.py +221 -0
  2. data_diff/__main__.py +517 -0
  3. data_diff/abcs/__init__.py +13 -0
  4. data_diff/abcs/compiler.py +27 -0
  5. data_diff/abcs/database_types.py +402 -0
  6. data_diff/config.py +141 -0
  7. data_diff/databases/__init__.py +38 -0
  8. data_diff/databases/_connect.py +323 -0
  9. data_diff/databases/base.py +1417 -0
  10. data_diff/databases/bigquery.py +376 -0
  11. data_diff/databases/clickhouse.py +217 -0
  12. data_diff/databases/databricks.py +262 -0
  13. data_diff/databases/duckdb.py +207 -0
  14. data_diff/databases/mssql.py +343 -0
  15. data_diff/databases/mysql.py +189 -0
  16. data_diff/databases/oracle.py +238 -0
  17. data_diff/databases/postgresql.py +293 -0
  18. data_diff/databases/presto.py +222 -0
  19. data_diff/databases/redis.py +93 -0
  20. data_diff/databases/redshift.py +233 -0
  21. data_diff/databases/snowflake.py +222 -0
  22. data_diff/databases/sybase.py +720 -0
  23. data_diff/databases/trino.py +73 -0
  24. data_diff/databases/vertica.py +174 -0
  25. data_diff/diff_tables.py +489 -0
  26. data_diff/errors.py +17 -0
  27. data_diff/format.py +369 -0
  28. data_diff/hashdiff_tables.py +1026 -0
  29. data_diff/info_tree.py +76 -0
  30. data_diff/joindiff_tables.py +434 -0
  31. data_diff/lexicographic_space.py +253 -0
  32. data_diff/parse_time.py +88 -0
  33. data_diff/py.typed +0 -0
  34. data_diff/queries/__init__.py +13 -0
  35. data_diff/queries/api.py +213 -0
  36. data_diff/queries/ast_classes.py +811 -0
  37. data_diff/queries/base.py +38 -0
  38. data_diff/queries/extras.py +43 -0
  39. data_diff/query_utils.py +70 -0
  40. data_diff/schema.py +67 -0
  41. data_diff/table_segment.py +583 -0
  42. data_diff/thread_utils.py +112 -0
  43. data_diff/utils.py +1022 -0
  44. data_diff/version.py +15 -0
  45. dcs_core/__init__.py +13 -0
  46. dcs_core/__main__.py +17 -0
  47. dcs_core/__version__.py +15 -0
  48. dcs_core/cli/__init__.py +13 -0
  49. dcs_core/cli/cli.py +165 -0
  50. dcs_core/core/__init__.py +19 -0
  51. dcs_core/core/common/__init__.py +13 -0
  52. dcs_core/core/common/errors.py +50 -0
  53. dcs_core/core/common/models/__init__.py +13 -0
  54. dcs_core/core/common/models/configuration.py +284 -0
  55. dcs_core/core/common/models/dashboard.py +24 -0
  56. dcs_core/core/common/models/data_source_resource.py +75 -0
  57. dcs_core/core/common/models/metric.py +160 -0
  58. dcs_core/core/common/models/profile.py +75 -0
  59. dcs_core/core/common/models/validation.py +216 -0
  60. dcs_core/core/common/models/widget.py +44 -0
  61. dcs_core/core/configuration/__init__.py +13 -0
  62. dcs_core/core/configuration/config_loader.py +139 -0
  63. dcs_core/core/configuration/configuration_parser.py +262 -0
  64. dcs_core/core/configuration/configuration_parser_arc.py +328 -0
  65. dcs_core/core/datasource/__init__.py +13 -0
  66. dcs_core/core/datasource/base.py +62 -0
  67. dcs_core/core/datasource/manager.py +112 -0
  68. dcs_core/core/datasource/search_datasource.py +421 -0
  69. dcs_core/core/datasource/sql_datasource.py +1094 -0
  70. dcs_core/core/inspect.py +163 -0
  71. dcs_core/core/logger/__init__.py +13 -0
  72. dcs_core/core/logger/base.py +32 -0
  73. dcs_core/core/logger/default_logger.py +94 -0
  74. dcs_core/core/metric/__init__.py +13 -0
  75. dcs_core/core/metric/base.py +220 -0
  76. dcs_core/core/metric/combined_metric.py +98 -0
  77. dcs_core/core/metric/custom_metric.py +34 -0
  78. dcs_core/core/metric/manager.py +137 -0
  79. dcs_core/core/metric/numeric_metric.py +403 -0
  80. dcs_core/core/metric/reliability_metric.py +90 -0
  81. dcs_core/core/profiling/__init__.py +13 -0
  82. dcs_core/core/profiling/datasource_profiling.py +136 -0
  83. dcs_core/core/profiling/numeric_field_profiling.py +72 -0
  84. dcs_core/core/profiling/text_field_profiling.py +67 -0
  85. dcs_core/core/repository/__init__.py +13 -0
  86. dcs_core/core/repository/metric_repository.py +77 -0
  87. dcs_core/core/utils/__init__.py +13 -0
  88. dcs_core/core/utils/log.py +29 -0
  89. dcs_core/core/utils/tracking.py +105 -0
  90. dcs_core/core/utils/utils.py +44 -0
  91. dcs_core/core/validation/__init__.py +13 -0
  92. dcs_core/core/validation/base.py +230 -0
  93. dcs_core/core/validation/completeness_validation.py +153 -0
  94. dcs_core/core/validation/custom_query_validation.py +24 -0
  95. dcs_core/core/validation/manager.py +282 -0
  96. dcs_core/core/validation/numeric_validation.py +276 -0
  97. dcs_core/core/validation/reliability_validation.py +91 -0
  98. dcs_core/core/validation/uniqueness_validation.py +61 -0
  99. dcs_core/core/validation/validity_validation.py +738 -0
  100. dcs_core/integrations/__init__.py +13 -0
  101. dcs_core/integrations/databases/__init__.py +13 -0
  102. dcs_core/integrations/databases/bigquery.py +187 -0
  103. dcs_core/integrations/databases/databricks.py +51 -0
  104. dcs_core/integrations/databases/db2.py +652 -0
  105. dcs_core/integrations/databases/elasticsearch.py +61 -0
  106. dcs_core/integrations/databases/mssql.py +829 -0
  107. dcs_core/integrations/databases/mysql.py +409 -0
  108. dcs_core/integrations/databases/opensearch.py +64 -0
  109. dcs_core/integrations/databases/oracle.py +719 -0
  110. dcs_core/integrations/databases/postgres.py +482 -0
  111. dcs_core/integrations/databases/redshift.py +53 -0
  112. dcs_core/integrations/databases/snowflake.py +48 -0
  113. dcs_core/integrations/databases/spark_df.py +111 -0
  114. dcs_core/integrations/databases/sybase.py +1069 -0
  115. dcs_core/integrations/storage/__init__.py +13 -0
  116. dcs_core/integrations/storage/local_file.py +149 -0
  117. dcs_core/integrations/utils/__init__.py +13 -0
  118. dcs_core/integrations/utils/utils.py +36 -0
  119. dcs_core/report/__init__.py +13 -0
  120. dcs_core/report/dashboard.py +211 -0
  121. dcs_core/report/models.py +88 -0
  122. dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  123. dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  124. dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  125. dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  126. dcs_core/report/static/assets/images/docs.svg +6 -0
  127. dcs_core/report/static/assets/images/github.svg +4 -0
  128. dcs_core/report/static/assets/images/logo.svg +7 -0
  129. dcs_core/report/static/assets/images/slack.svg +13 -0
  130. dcs_core/report/static/index.js +2 -0
  131. dcs_core/report/static/index.js.LICENSE.txt +3971 -0
  132. dcs_sdk/__init__.py +13 -0
  133. dcs_sdk/__main__.py +18 -0
  134. dcs_sdk/__version__.py +15 -0
  135. dcs_sdk/cli/__init__.py +13 -0
  136. dcs_sdk/cli/cli.py +163 -0
  137. dcs_sdk/sdk/__init__.py +58 -0
  138. dcs_sdk/sdk/config/__init__.py +13 -0
  139. dcs_sdk/sdk/config/config_loader.py +491 -0
  140. dcs_sdk/sdk/data_diff/__init__.py +13 -0
  141. dcs_sdk/sdk/data_diff/data_differ.py +821 -0
  142. dcs_sdk/sdk/rules/__init__.py +15 -0
  143. dcs_sdk/sdk/rules/rules_mappping.py +31 -0
  144. dcs_sdk/sdk/rules/rules_repository.py +214 -0
  145. dcs_sdk/sdk/rules/schema_rules.py +65 -0
  146. dcs_sdk/sdk/utils/__init__.py +13 -0
  147. dcs_sdk/sdk/utils/serializer.py +25 -0
  148. dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
  149. dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
  150. dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
  151. dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
  152. dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
  153. dcs_sdk/sdk/utils/table.py +475 -0
  154. dcs_sdk/sdk/utils/themes.py +40 -0
  155. dcs_sdk/sdk/utils/utils.py +349 -0
  156. dcs_sdk-1.6.5.dist-info/METADATA +150 -0
  157. dcs_sdk-1.6.5.dist-info/RECORD +159 -0
  158. dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
  159. dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
data_diff/hashdiff_tables.py
@@ -0,0 +1,1026 @@
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import logging
+ import os
+ import time
+ from collections import defaultdict
+ from numbers import Number
+ from threading import Lock
+ from typing import Any, Collection, Dict, Iterator, List, Optional, Sequence, Set, Tuple
+
+ import attrs
+ from loguru import logger
+ from typing_extensions import Literal
+
+ from data_diff.abcs.database_types import (
+     JSON,
+     Boolean,
+     ColType_UUID,
+     NumericType,
+     PrecisionType,
+     StringType,
+ )
+ from data_diff.diff_tables import TableDiffer
+ from data_diff.info_tree import InfoTree
+ from data_diff.table_segment import TableSegment
+ from data_diff.thread_utils import ThreadedYielder
+ from data_diff.utils import diffs_are_equiv_jsons, safezip
+
+ BENCHMARK = os.environ.get("BENCHMARK", False)
+
+ DEFAULT_BISECTION_THRESHOLD = 1024 * 16
+ DEFAULT_BISECTION_FACTOR = 32
+ DEFAULT_PER_COLUMN_DIFF_LIMIT = 100
+ DEFAULT_EGRESS_LIMIT = 500_000
+ DEFAULT_TIMEOUT = 60 * 5  # in minutes
+
+ # logger = logging.getLogger("hashdiff_tables")
+
+ # Just for local readability. TODO: later switch to real type declarations of these.
+ _Op = Literal["+", "-"]
+ _PK = Sequence[Any]
+ _Row = Tuple[Any]
+
+
+ class PerColumnDiffTracker:
+     """Thread-safe tracker that counts differences per column and enforces per-column limits."""
+
+     def __init__(
+         self,
+         per_column_diff_limit: int,
+         columns1: Sequence[str],
+         columns2: Sequence[str],
+         ignored_columns1: Collection[str],
+         ignored_columns2: Collection[str],
+     ):
+         self.per_column_diff_limit = per_column_diff_limit
+         self.column_diff_counts = defaultdict(int)
+         self.stopped_columns = set()
+         self.exclusive_pk_count = 0
+         self.duplicate_pk_count = 0
+         self._lock = Lock()
+
+         # Store original column mappings
+         self.original_columns1 = list(columns1)
+         self.original_columns2 = list(columns2)
+         self.original_ignored_columns1 = set(ignored_columns1)
+         self.original_ignored_columns2 = set(ignored_columns2)
+
+         # Create column name to index mappings for non-ignored columns
+         self.active_columns1 = [col for col in columns1 if col not in ignored_columns1]
+         self.active_columns2 = [col for col in columns2 if col not in ignored_columns2]
+         self.column1_to_index = {col: idx for idx, col in enumerate(self.active_columns1)}
+         self.column2_to_index = {col: idx for idx, col in enumerate(self.active_columns2)}
+
+     def should_process_column_diff(self, column_index: int) -> bool:
+         """Check if we should continue processing diffs for this column."""
+         with self._lock:
+             return column_index not in self.stopped_columns
+
+     def record_column_diff(self, column_index: int) -> bool:
+         """Record a diff for a column; return True if we should continue processing this column."""
+         with self._lock:
+             if column_index in self.stopped_columns:
+                 return False
+
+             if self.column_diff_counts[column_index] >= self.per_column_diff_limit:
+                 self.stopped_columns.add(column_index)
+                 column_name = (
+                     self.active_columns1[column_index]
+                     if column_index < len(self.active_columns1)
+                     else f"column_{column_index}"
+                 )
+                 logger.info(
+                     f"Column '{column_name}' reached diff limit of {self.per_column_diff_limit}, "
+                     "stopping further diff tracking for this column"
+                 )
+                 return False
+             self.column_diff_counts[column_index] += 1
+             return True
+
+     def record_exclusive_pk(self) -> bool:
+         """Record an exclusive PK; return True if we should continue processing."""
+         with self._lock:
+             self.exclusive_pk_count += 1
+             return self.exclusive_pk_count < self.per_column_diff_limit
+
+     def record_duplicate_pk(self) -> bool:
+         """Record a duplicate PK; return True if we should continue processing."""
+         with self._lock:
+             self.duplicate_pk_count += 1
+             return self.duplicate_pk_count < self.per_column_diff_limit
+
+     def has_active_targets(self, total_columns: int) -> bool:
+         """Check if there are still columns being actively tracked."""
+         with self._lock:
+             return (
+                 len(self.stopped_columns) < total_columns
+                 or self.exclusive_pk_count < self.per_column_diff_limit
+                 or self.duplicate_pk_count < self.per_column_diff_limit
+             )
+
+     def get_updated_ignored_columns(self) -> Tuple[Set[str], Set[str]]:
+         """Get the updated ignored columns, including stopped columns."""
+         with self._lock:
+             updated_ignored1 = set(self.original_ignored_columns1)
+             updated_ignored2 = set(self.original_ignored_columns2)
+
+             # Add stopped columns to the ignored columns
+             for col_idx in self.stopped_columns:
+                 if col_idx < len(self.active_columns1):
+                     updated_ignored1.add(self.active_columns1[col_idx])
+                 if col_idx < len(self.active_columns2):
+                     updated_ignored2.add(self.active_columns2[col_idx])
+
+             return updated_ignored1, updated_ignored2
+
+     def get_active_columns_for_checksum(self) -> Tuple[List[str], List[str]]:
+         """Get the columns that should be included in the checksum (excluding stopped columns)."""
+         with self._lock:
+             active_checksum_columns1 = []
+             active_checksum_columns2 = []
+
+             for idx, col in enumerate(self.active_columns1):
+                 if idx not in self.stopped_columns:
+                     active_checksum_columns1.append(col)
+
+             for idx, col in enumerate(self.active_columns2):
+                 if idx not in self.stopped_columns:
+                     active_checksum_columns2.append(col)
+
+             return active_checksum_columns1, active_checksum_columns2
+
+     def get_stopped_columns(self) -> Set[int]:
+         with self._lock:
+             return self.stopped_columns.copy()
+
+
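+ # --- Illustrative note (editor's addition, not part of the package source) ---
+ # A minimal sketch of the tracker's contract, assuming a hypothetical
+ # three-column table and per_column_diff_limit=2:
+ #
+ #     tracker = PerColumnDiffTracker(2, ["id", "a", "b"], ["id", "a", "b"], set(), set())
+ #     tracker.record_column_diff(1)   # -> True  (first recorded diff for column "a")
+ #     tracker.record_column_diff(1)   # -> True  (second diff; the count now equals the limit)
+ #     tracker.record_column_diff(1)   # -> False (column "a" is now in stopped_columns)
+ #     tracker.get_updated_ignored_columns()   # -> ({"a"}, {"a"})
+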
169
+ def diff_sets(
+     a: Sequence[_Row],
+     b: Sequence[_Row],
+     *,
+     json_cols: Optional[dict] = None,
+     columns1: Sequence[str],
+     columns2: Sequence[str],
+     key_columns1: Sequence[str],
+     key_columns2: Sequence[str],
+     ignored_columns1: Collection[str],
+     ignored_columns2: Collection[str],
+     diff_tracker: Optional[PerColumnDiffTracker] = None,
+ ) -> Iterator:
+     # Initialize a per-column diff tracker if one was not provided
+     if diff_tracker is None:
+         diff_tracker = PerColumnDiffTracker(
+             DEFAULT_PER_COLUMN_DIFF_LIMIT, columns1, columns2, ignored_columns1, ignored_columns2
+         )
+
+     # Get the updated ignored columns (including stopped columns)
+     updated_ignored1, updated_ignored2 = diff_tracker.get_updated_ignored_columns()
+
+     # Group full rows by PKs on each side. The first items are the PK: TableSegment.relevant_columns
+     rows_by_pks1: Dict[_PK, List[_Row]] = defaultdict(list)
+     rows_by_pks2: Dict[_PK, List[_Row]] = defaultdict(list)
+     for row in a:
+         pk: _PK = tuple(val for col, val in zip(key_columns1, row))
+         rows_by_pks1[pk].append(row)
+     for row in b:
+         pk: _PK = tuple(val for col, val in zip(key_columns2, row))
+         rows_by_pks2[pk].append(row)
+
+     # Calculate the total number of active columns for tracking
+     total_columns = len([col for col in columns1 if col not in updated_ignored1])
+
+     # Mind that the same pk MUST go in full, with all its -/+ rows at once, for grouping.
+     diffs_by_pks: Dict[_PK, List[Tuple[_Op, _Row]]] = defaultdict(list)
+
+     warned_diff_cols = set()
+
+     for pk in sorted(set(rows_by_pks1) | set(rows_by_pks2)):
+         if not diff_tracker.has_active_targets(total_columns):
+             logger.info(
+                 "Diffing stopped because columns with potential differences have reached their configured diff limits."
+             )
+             break
+
+         cutrows1: List[_Row] = [tuple(row1) for row1 in rows_by_pks1[pk]]
+         cutrows2: List[_Row] = [tuple(row2) for row2 in rows_by_pks2[pk]]
+
+         # Handle exclusive rows (present on only one side)
+         if len(rows_by_pks1[pk]) == 0 or len(rows_by_pks2[pk]) == 0:
+             if not diff_tracker.record_exclusive_pk():
+                 continue
+
+             for row1 in rows_by_pks1[pk]:
+                 diffs_by_pks[pk].append(("-", row1))
+             for row2 in rows_by_pks2[pk]:
+                 diffs_by_pks[pk].append(("+", row2))
+             continue
+
+         # Handle duplicate PKs (2+ rows on either side)
+         if len(cutrows1) > 1 or len(cutrows2) > 1:
+             if not diff_tracker.record_duplicate_pk():
+                 continue
+
+             for row1 in rows_by_pks1[pk]:
+                 diffs_by_pks[pk].append(("-", row1))
+             for row2 in rows_by_pks2[pk]:
+                 diffs_by_pks[pk].append(("+", row2))
+             continue
+
+         if len(cutrows1) == 1 and len(cutrows2) == 1:
+             row1, row2 = cutrows1[0], cutrows2[0]
+
+             # Find all differing columns and attempt to record them
+             has_recordable_diff = False
+
+             for col_idx, (val1, val2) in enumerate(zip(row1, row2)):
+                 if val1 != val2:  # This column has a difference
+                     # Try to record it if the column is still being tracked
+                     if diff_tracker.should_process_column_diff(col_idx):
+                         if diff_tracker.record_column_diff(col_idx):
+                             has_recordable_diff = True
+                     # Continue checking other columns even if this one just got exhausted
+
+             # Include the row pair if we successfully recorded at least one difference
+             if has_recordable_diff:
+                 for row1 in rows_by_pks1[pk]:
+                     diffs_by_pks[pk].append(("-", row1))
+                 for row2 in rows_by_pks2[pk]:
+                     diffs_by_pks[pk].append(("+", row2))
+
+     # Process and yield the collected diffs
+     for diffs in (diffs_by_pks[pk] for pk in sorted(diffs_by_pks)):
+         if json_cols:
+             parsed_match, overridden_diff_cols = diffs_are_equiv_jsons(diffs, json_cols)
+             if parsed_match:
+                 to_warn = overridden_diff_cols - warned_diff_cols
+                 for w in to_warn:
+                     logger.warning(
+                         f"Equivalent JSON objects with different string representations detected "
+                         f"in column '{w}'. These cases are NOT reported as differences."
+                     )
+                     warned_diff_cols.add(w)
+                 continue
+         yield from diffs
+
+
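+ # --- Illustrative note (editor's addition, not part of the package source) ---
+ # diff_sets() yields ("-", row) / ("+", row) tuples grouped by primary key, where
+ # "-" rows come from side `a` and "+" rows from side `b`. A sketch with a
+ # single-column PK and one changed row:
+ #
+ #     a = [(1, "x"), (2, "old")]
+ #     b = [(1, "x"), (2, "new")]
+ #     list(diff_sets(a, b, columns1=["id", "v"], columns2=["id", "v"],
+ #                    key_columns1=["id"], key_columns2=["id"],
+ #                    ignored_columns1=set(), ignored_columns2=set()))
+ #     # -> [("-", (2, "old")), ("+", (2, "new"))]
+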
279
+ @attrs.define(frozen=False)
+ class HashDiffer(TableDiffer):
+     """Finds the diff between two SQL tables.
+
+     The algorithm uses hashing to quickly check if the tables are different, and then applies a
+     bisection search recursively to find the differences efficiently.
+
+     Works best for comparing tables that are mostly the same, with minor discrepancies.
+
+     Parameters:
+         bisection_factor (int): Into how many segments to bisect per iteration.
+         bisection_threshold (Number): Row-count threshold below which we stop bisecting and compare locally.
+         threaded (bool): Enable/disable threaded diffing. Needed to take advantage of database threads.
+         max_threadpool_size (int): Maximum size of each threadpool. ``None`` means auto.
+             Only relevant when `threaded` is ``True``.
+             There may be many pools, so the number of actual threads can be a lot higher.
+         per_column_diff_limit (int): Stop tracking a column after finding this many differing values.
+             The same limit applies to exclusive and duplicate PKs. Once no targets are left,
+             diffing stops.
+         egress_limit (int): Maximum total number of rows to download.
+         strict (bool): Enable strict type checking. If ``False``, does not raise errors on incompatible types.
+     """
+
+     bisection_factor: int = DEFAULT_BISECTION_FACTOR
+     bisection_threshold: int = DEFAULT_BISECTION_THRESHOLD
+     bisection_disabled: bool = False  # i.e. always download the rows (used in tests)
+     strict: bool = True  # i.e. strict type check
+     per_column_diff_limit: int = DEFAULT_PER_COLUMN_DIFF_LIMIT
+     egress_limit: int = DEFAULT_EGRESS_LIMIT  # Rows download limit
+     stats: dict = attrs.field(factory=dict)
+     t1_row_count: int = 0
+     t2_row_count: int = 0
+     start_time: float = attrs.Factory(lambda: time.monotonic())
+     timeout_limit: int = DEFAULT_TIMEOUT
+
+     # Thread-safe diff tracker instance
+     _diff_tracker: PerColumnDiffTracker = attrs.field(default=None, init=False)
+
+     def __attrs_post_init__(self) -> None:
+         # Validate options
+         if self.bisection_factor >= self.bisection_threshold:
+             raise ValueError("Incorrect param values (bisection factor must be lower than threshold)")
+         if self.bisection_factor < 2:
+             raise ValueError("Must have at least two segments per iteration (i.e. bisection_factor >= 2)")
+         if self.per_column_diff_limit <= 0:
+             raise ValueError("per_column_diff_limit must be a positive integer")
+
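+     # --- Illustrative note (editor's addition, not part of the package source) ---
+     # The validation above rejects degenerate configurations, e.g. (hypothetical
+     # values, assuming the remaining fields keep their defaults):
+     #
+     #     HashDiffer(bisection_factor=32, bisection_threshold=16384)   # ok
+     #     HashDiffer(bisection_factor=16384, bisection_threshold=32)   # ValueError
+     #     HashDiffer(per_column_diff_limit=0)                          # ValueError
+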
326
+     def _initialize_diff_tracker(self, table1: TableSegment, table2: TableSegment) -> None:
+         """Initialize the diff tracker with table information."""
+         if self._diff_tracker is None:
+             self._diff_tracker = PerColumnDiffTracker(
+                 self.per_column_diff_limit,
+                 table1.relevant_columns,
+                 table2.relevant_columns,
+                 self.ignored_columns1,
+                 self.ignored_columns2,
+             )
+
+     def update_comparison_tracker(self, reason_type: str, segment: str) -> None:
+         if "comparison_tracker" not in self.stats:
+             self.stats["comparison_tracker"] = []
+
+         if reason_type == "per_column_diff_limit":
+             reason = (
+                 "Diffing stopped because columns with potential differences have reached their configured diff limits."
+             )
+         elif reason_type == "egress_limit":
+             reason = f"Row download limit reached, {self.stats.get('rows_downloaded')}"
+         elif reason_type == "timeout":
+             reason = f"Timeout limit reached, {self.timeout_limit} min"
+
+         tracker = self.stats["comparison_tracker"]
+         reason_index_map = {
+             entry.get("reason_type"): idx for idx, entry in enumerate(tracker) if "reason_type" in entry
+         }
+
+         new_entry = {"reason": reason, "segment": segment, "reason_type": reason_type}
+
+         if reason_type in reason_index_map:
+             tracker[reason_index_map[reason_type]] = new_entry
+         else:
+             tracker.append(new_entry)
+
+         self.stats["comparison_tracker"] = tracker
+
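+     # --- Illustrative note (editor's addition, not part of the package source) ---
+     # Each stats["comparison_tracker"] entry has a stable shape, and entries are
+     # de-duplicated by reason_type (the latest segment wins), e.g.:
+     #
+     #     {"reason": "Timeout limit reached, 300 min",
+     #      "segment": "1..50000",
+     #      "reason_type": "timeout"}
+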
364
+     def _get_checksum_columns(self, table1: TableSegment, table2: TableSegment) -> Tuple[List[str], List[str]]:
+         """Get the columns to include in the checksum, excluding stopped columns."""
+         if self._diff_tracker is None:
+             # If there is no diff tracker, use all relevant columns
+             return list(table1.relevant_columns), list(table2.relevant_columns)
+
+         # Get active columns for the checksum (excluding stopped columns)
+         active_cols1, active_cols2 = self._diff_tracker.get_active_columns_for_checksum()
+
+         # If no active columns are left, use the key columns only
+         if not active_cols1 or not active_cols2:
+             return list(table1.key_columns), list(table2.key_columns)
+
+         return active_cols1, active_cols2
+
+     def _create_segment_with_updated_columns(
+         self, original_segment: TableSegment, active_columns: List[str]
+     ) -> TableSegment:
+         """Create a copy of the segment whose extra columns are the given active columns, for checksumming."""
+         updated_segment = attrs.evolve(original_segment, extra_columns=active_columns)
+         return updated_segment
+
+     def _validate_and_adjust_columns(self, table1: TableSegment, table2: TableSegment) -> None:
+         for c1, c2 in safezip(table1.relevant_columns, table2.relevant_columns):
+             if c1 not in table1._schema:
+                 raise ValueError(f"Column '{c1}' not found in schema for table {table1}")
+             if c2 not in table2._schema:
+                 raise ValueError(f"Column '{c2}' not found in schema for table {table2}")
+
+             # Update schemas to the minimal mutual precision
+             col1 = table1._schema[c1]
+             col2 = table2._schema[c2]
+             if isinstance(col1, PrecisionType):
+                 if not isinstance(col2, PrecisionType):
+                     if self.strict:
+                         raise TypeError(f"Incompatible types for column '{c1}': {col1} <-> {col2}")
+                     else:
+                         continue
+
+                 lowest = min(col1, col2, key=lambda col: col.precision)
+
+                 if col1.precision != col2.precision:
+                     logger.warning(f"Using reduced precision {lowest} for column '{c1}'. Types={col1}, {col2}")
+
+                 table1._schema[c1] = attrs.evolve(col1, precision=lowest.precision, rounds=lowest.rounds)
+                 table2._schema[c2] = attrs.evolve(col2, precision=lowest.precision, rounds=lowest.rounds)
+
+             elif isinstance(col1, (NumericType, Boolean)):
+                 if not isinstance(col2, (NumericType, Boolean)):
+                     if self.strict:
+                         raise TypeError(f"Incompatible types for column '{c1}': {col1} <-> {col2}")
+                     else:
+                         continue
+
+                 lowest = min(col1, col2, key=lambda col: col.precision)
+
+                 if col1.precision != col2.precision:
+                     logger.warning(f"Using reduced precision {lowest} for column '{c1}'. Types={col1}, {col2}")
+
+                 if lowest.precision != col1.precision:
+                     table1._schema[c1] = attrs.evolve(col1, precision=lowest.precision)
+                 if lowest.precision != col2.precision:
+                     table2._schema[c2] = attrs.evolve(col2, precision=lowest.precision)
+
+         for t in [table1, table2]:
+             for c in t.relevant_columns:
+                 ctype = t._schema[c]
+                 if not ctype.supported:
+                     logger.warning(
+                         f"[{t.database.name if t.database.name.lower() != 'duckdb' else 'File'}] Column '{c}' of type '{ctype}' has no compatibility handling. "
+                         "If encoding/formatting differs between databases, it may result in false positives."
+                     )
+
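+     # --- Illustrative note (editor's addition, not part of the package source) ---
+     # Example of the precision reconciliation above: if table1 stores a column as
+     # timestamp(6) and table2 stores it as timestamp(3), both schemas are narrowed
+     # to precision 3 before hashing, so sub-millisecond digits cannot produce
+     # spurious diffs.
+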
438
+     def _diff_segments(
+         self,
+         ti: ThreadedYielder,
+         table1: TableSegment,
+         table2: TableSegment,
+         info_tree: InfoTree,
+         max_rows: int,
+         level=0,
+         segment_index=None,
+         segment_count=None,
+     ):
+         # Check if the level exceeds the maximum allowed recursion depth
+         if level > 15:
+             logger.warning(
+                 ". " * level
+                 + f"Maximum recursion level reached ({level}); switching to direct row comparison for segment {table1.min_key}..{table1.max_key}"
+             )
+             # Fallback: download rows and diff locally to prevent excessive recursion
+             rows1, rows2 = self._threaded_call("get_values", [table1, table2])
+             json_cols = {
+                 i: colname
+                 for i, colname in enumerate(table1.extra_columns)
+                 if isinstance(table1._schema[colname], JSON)
+             }
+             diff = list(
+                 diff_sets(
+                     rows1,
+                     rows2,
+                     json_cols=json_cols,
+                     columns1=table1.relevant_columns,
+                     columns2=table2.relevant_columns,
+                     key_columns1=table1.key_columns,
+                     key_columns2=table2.key_columns,
+                     ignored_columns1=self.ignored_columns1,
+                     ignored_columns2=self.ignored_columns2,
+                     diff_tracker=self._diff_tracker,
+                 )
+             )
+             info_tree.info.set_diff(diff)
+             info_tree.info.rowcounts = {1: len(rows1), 2: len(rows2)}
+             self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2))
+             logger.info(
+                 ". " * level
+                 + f"Diff found {len(diff)} different rows, {self.stats['rows_downloaded']} total rows downloaded."
+             )
+             return diff
+
+         # Initialize the diff tracker if not already done
+         self._initialize_diff_tracker(table1, table2)
+
+         logger.info(
+             ". " * level + f"Diffing segment {segment_index}/{segment_count}, "
+             f"key-range: {table1.min_key}..{table2.max_key}, "
+             f"size <= {max_rows}"
+         )
+         elapsed = time.monotonic() - self.start_time
+         if (
+             len(self._diff_tracker.get_stopped_columns()) > 0
+             and self.stats.get("rows_downloaded", 0) < self.egress_limit
+             and elapsed <= self.timeout_limit * 60
+         ):
+             self.update_comparison_tracker(
+                 reason_type="per_column_diff_limit",
+                 segment=f"{table1.min_key}..{table1.max_key}",
+             )
+         if not self._diff_tracker.has_active_targets(len(table1.relevant_columns)):
+             logger.info(
+                 "Diffing stopped because columns with potential differences have reached their configured diff limits."
+             )
+             info_tree.info.is_diff = False
+             self.update_comparison_tracker(
+                 reason_type="per_column_diff_limit",
+                 segment=f"{table1.min_key}..{table1.max_key}",
+             )
+             return
+         if self.stats.get("rows_downloaded", 0) >= self.egress_limit:
+             info_tree.info.is_diff = False
+             logger.info(
+                 ". " * level
+                 + f"Row download limit reached {self.stats.get('rows_downloaded')}, stopping bisection for segment {table1.min_key}..{table1.max_key}"
+             )
+             self.update_comparison_tracker(
+                 reason_type="egress_limit",
+                 segment=f"{table1.min_key}..{table1.max_key}",
+             )
+             return
+
+         elapsed = time.monotonic() - self.start_time
+         if elapsed > self.timeout_limit * 60:
+             info_tree.info.is_diff = False
+             logger.info(
+                 ". " * level + f"Timeout limit reached ({self.timeout_limit} min); "
+                 f"stopping bisection for segment {table1.min_key}..{table1.max_key}"
+             )
+             self.update_comparison_tracker(
+                 reason_type="timeout",
+                 segment=f"{table1.min_key}..{table1.max_key}",
+             )
+             return
+         # When benchmarking, we want the ability to skip checksumming. This
+         # allows us to download all rows for comparison when measuring
+         # performance. By default, dcs-diff will checksum the section first
+         # (when it's below the threshold) and _then_ download it.
+         if BENCHMARK:
+             if self.bisection_disabled or max_rows < self.bisection_threshold:
+                 return self._bisect_and_diff_segments(ti, table1, table2, info_tree, level=level, max_rows=max_rows)
+
+         # Get active columns for the checksum (excluding stopped columns)
+         active_cols1, active_cols2 = self._get_checksum_columns(table1, table2)
+
+         # Create segments with updated columns for the checksum
+         checksum_table1 = self._create_segment_with_updated_columns(table1, active_cols1)
+         checksum_table2 = self._create_segment_with_updated_columns(table2, active_cols2)
+
+         (count1, checksum1), (count2, checksum2) = self._threaded_call(
+             "count_and_checksum", [checksum_table1, checksum_table2]
+         )
+
+         assert not info_tree.info.rowcounts
+         info_tree.info.rowcounts = {1: count1, 2: count2}
+
+         if count1 == 0 and count2 == 0:
+             logger.debug(
+                 "Uneven distribution of keys detected in segment {}..{} (big gaps in the key column). "
+                 "For better performance, we recommend increasing the bisection-threshold.",
+                 table1.min_key,
+                 table1.max_key,
+             )
+             assert checksum1 is None and checksum2 is None
+             info_tree.info.is_diff = False
+             return
+
+         if checksum1 == checksum2:
+             info_tree.info.is_diff = False
+             return
+
+         info_tree.info.is_diff = True
+         return self._bisect_and_diff_segments(ti, table1, table2, info_tree, level=level, max_rows=max(count1, count2))
+
+     def _bisect_and_diff_segments(
+         self,
+         ti: ThreadedYielder,
+         table1: TableSegment,
+         table2: TableSegment,
+         info_tree: InfoTree,
+         level=0,
+         max_rows=None,
+     ):
+         # Check if the level exceeds the maximum allowed recursion depth
+         if level > 15:
+             logger.warning(
+                 ". " * level
+                 + f"Maximum recursion level reached ({level}); switching to direct row comparison for segment {table1.min_key}..{table1.max_key}"
+             )
+             # Fallback: download rows and diff locally to prevent excessive recursion
+             rows1, rows2 = self._threaded_call("get_values", [table1, table2])
+             json_cols = {
+                 i: colname
+                 for i, colname in enumerate(table1.extra_columns)
+                 if isinstance(table1._schema[colname], JSON)
+             }
+             diff = list(
+                 diff_sets(
+                     rows1,
+                     rows2,
+                     json_cols=json_cols,
+                     columns1=table1.relevant_columns,
+                     columns2=table2.relevant_columns,
+                     key_columns1=table1.key_columns,
+                     key_columns2=table2.key_columns,
+                     ignored_columns1=self.ignored_columns1,
+                     ignored_columns2=self.ignored_columns2,
+                     diff_tracker=self._diff_tracker,
+                 )
+             )
+             info_tree.info.set_diff(diff)
+             info_tree.info.rowcounts = {1: len(rows1), 2: len(rows2)}
+             self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2))
+             logger.info(
+                 ". " * level
+                 + f"Diff found {len(diff)} different rows, {self.stats['rows_downloaded']} total rows downloaded."
+             )
+             return diff
+
+         assert table1.is_bounded and table2.is_bounded
+
+         # Initialize the diff tracker if not already done
+         self._initialize_diff_tracker(table1, table2)
+         elapsed = time.monotonic() - self.start_time
+         if (
+             len(self._diff_tracker.get_stopped_columns()) > 0
+             and self.stats.get("rows_downloaded", 0) < self.egress_limit
+             and elapsed <= self.timeout_limit * 60
+         ):
+             self.update_comparison_tracker(
+                 reason_type="per_column_diff_limit",
+                 segment=f"{table1.min_key}..{table1.max_key}",
+             )
+
+         if not self._diff_tracker.has_active_targets(len(table1.relevant_columns)):
+             logger.info(
+                 "Diffing stopped because columns with potential differences have reached their configured diff limits."
+             )
+             info_tree.info.is_diff = False
+             self.update_comparison_tracker(
+                 reason_type="per_column_diff_limit",
+                 segment=f"{table1.min_key}..{table1.max_key}",
+             )
+             return
+         if self.stats.get("rows_downloaded", 0) >= self.egress_limit:
+             logger.info("Row download limit reached, stopping bisection")
+             logger.info(
+                 ". " * level
+                 + f"Row download limit reached {self.stats.get('rows_downloaded')}, stopping bisection for segment {table1.min_key}..{table1.max_key}"
+             )
+             self.update_comparison_tracker(
+                 reason_type="egress_limit",
+                 segment=f"{table1.min_key}..{table1.max_key}",
+             )
+             info_tree.info.is_diff = False
+             return
+
+         elapsed = time.monotonic() - self.start_time
+         if elapsed > self.timeout_limit * 60:
+             info_tree.info.is_diff = False
+             logger.info(
+                 ". " * level + f"Timeout limit reached ({self.timeout_limit} min); "
+                 f"stopping bisection for segment {table1.min_key}..{table1.max_key}"
+             )
+             self.update_comparison_tracker(
+                 reason_type="timeout",
+                 segment=f"{table1.min_key}..{table1.max_key}",
+             )
+             return
+
+         max_space_size = max(table1.approximate_size(self.t1_row_count), table2.approximate_size(self.t2_row_count))
+         if max_rows is None:
+             # We can be sure that row_count <= max_rows iff the table key is unique
+             max_rows = max_space_size
+         info_tree.info.max_rows = max_rows
+
+         # If the count is below the threshold, just download and compare the columns locally.
+         # This saves time, as bisection speed is limited by ping and query performance.
+         if self.bisection_disabled or max_rows < self.bisection_threshold or max_space_size < self.bisection_factor * 2:
+             rows1, rows2 = self._threaded_call("get_values", [table1, table2])
+             json_cols = {
+                 i: colname
+                 for i, colname in enumerate(table1.extra_columns)
+                 if isinstance(table1._schema[colname], JSON)
+             }
+             diff = list(
+                 diff_sets(
+                     rows1,
+                     rows2,
+                     json_cols=json_cols,
+                     columns1=table1.relevant_columns,
+                     columns2=table2.relevant_columns,
+                     key_columns1=table1.key_columns,
+                     key_columns2=table2.key_columns,
+                     ignored_columns1=self.ignored_columns1,
+                     ignored_columns2=self.ignored_columns2,
+                     diff_tracker=self._diff_tracker,
+                 )
+             )
+
+             info_tree.info.set_diff(diff)
+             info_tree.info.rowcounts = {1: len(rows1), 2: len(rows2)}
+
+             self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2))
+             logger.info(
+                 ". " * level
+                 + f"Diff found {len(diff)} different rows, {self.stats['rows_downloaded']} total rows downloaded."
+             )
+             return diff
+
+         return super()._bisect_and_diff_segments(ti, table1, table2, info_tree, level, max_rows)
+
+
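+ # --- Illustrative note (editor's addition, not part of the package source) ---
+ # A rough depth estimate for the bisection above, assuming the defaults
+ # (bisection_factor=32, bisection_threshold=16384) and a 1,000,000-row key space:
+ # level-1 segments hold ~31,250 rows (still above the threshold), and level-2
+ # segments hold ~977 rows, which are downloaded and compared locally. Only
+ # segments whose checksums differ are bisected further.
+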
716
+ @attrs.define(frozen=False)
+ class HashDiffer(HashDiffer):  # noqa: F811 - intentionally re-binds the name, extending the class defined above
+     """
+     Enhanced HashDiffer with in-memory mode support.
+
+     Additional Parameters:
+         in_memory_diff (bool): If True, skip checksums and download segments directly for in-memory comparison.
+             If False, use the standard checksum-based bisection (default behavior).
+         memory_segment_size (int): When in_memory_diff=True, the target number of rows per segment before downloading.
+     """
+
+     in_memory_diff: bool = False
+     memory_segment_size: int = 10000
+
+     def __attrs_post_init__(self) -> None:
+         super().__attrs_post_init__()
+
+         if self.in_memory_diff:
+             logger.info("=" * 70)
+             logger.info("IN-MEMORY DIFF MODE ENABLED")
+             logger.info("  - Checksum queries: DISABLED")
+             logger.info(f"  - Segment size: {self.memory_segment_size} rows")
+             logger.info(f"  - Threading: {'ENABLED' if self.threaded else 'DISABLED'}")
+             logger.info(f"  - Egress limit: {self.egress_limit} rows")
+             logger.info("=" * 70)
+
+             # Adjust the bisection threshold for in-memory mode
+             if self.memory_segment_size > 0:
+                 self.bisection_threshold = self.memory_segment_size
+
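+     # --- Illustrative note (editor's addition, not part of the package source) ---
+     # Enabling in-memory mode (hypothetical values): checksum queries are skipped
+     # and each segment is downloaded and compared locally instead.
+     #
+     #     differ = HashDiffer(in_memory_diff=True, memory_segment_size=5000)
+     #     # differ.bisection_threshold is now 5000 as well
+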
746
+     def _should_skip_checksum_and_download(self, max_rows: int) -> bool:
+         """
+         Determine whether to skip the checksum and directly download segment data.
+
+         Currently returns True iff the in_memory_diff flag is enabled; a size-based
+         variant is kept below for reference.
+         """
+         return self.in_memory_diff
+         # if self.in_memory_diff:
+         #     # In-memory mode: download if the segment is at or below the target size
+         #     return max_rows <= self.memory_segment_size
+         # else:
+         #     # Traditional mode: use the bisection threshold
+         #     return self.bisection_disabled or max_rows < self.bisection_threshold
+
+     def _diff_segments(
+         self,
+         ti,
+         table1: TableSegment,
+         table2: TableSegment,
+         info_tree: InfoTree,
+         max_rows: int,
+         level=0,
+         segment_index=None,
+         segment_count=None,
+     ):
+         """
+         Enhanced segment diffing with in-memory mode support.
+         """
+         # Check the recursion depth limit
+         if level > 15:
+             logger.warning(
+                 ". " * level + f"Maximum recursion level ({level}) reached; "
+                 f"downloading segment {table1.min_key}..{table1.max_key}"
+             )
+             return self._download_and_diff_locally(table1, table2, info_tree, level)
+
+         # Initialize the diff tracker
+         self._initialize_diff_tracker(table1, table2)
+
+         logger.info(
+             ". " * level + f"Diffing segment {segment_index}/{segment_count}, "
+             f"key-range: {table1.min_key}..{table2.max_key}, "
+             f"size <= {max_rows}"
+         )
+
+         # Check all stop conditions
+         if not self._check_continuation_conditions(table1, info_tree, level):
+             return
+
+         # IN-MEMORY MODE: skip the checksum if the flag is set
+         if self._should_skip_checksum_and_download(max_rows):
+             if self.in_memory_diff:
+                 logger.info(". " * level + f"[IN-MEMORY MODE] Downloading segment directly (size: {max_rows} rows)")
+
+             return self._download_and_diff_locally(table1, table2, info_tree, level)
+
+         # STANDARD MODE: perform a checksum-based comparison
+         return self._checksum_and_bisect_if_needed(
+             ti, table1, table2, info_tree, level, max_rows, segment_index, segment_count
+         )
+
+     def _check_continuation_conditions(self, table1: TableSegment, info_tree: InfoTree, level: int) -> bool:
+         """Check if we should continue diffing (respects limits)."""
+
+         # Check the per-column diff limit
+         elapsed = time.monotonic() - self.start_time
+         if (
+             len(self._diff_tracker.get_stopped_columns()) > 0
+             and self.stats.get("rows_downloaded", 0) < self.egress_limit
+             and elapsed <= self.timeout_limit * 60
+         ):
+             self.update_comparison_tracker(
+                 reason_type="per_column_diff_limit",
+                 segment=f"{table1.min_key}..{table1.max_key}",
+             )
+
+         if not self._diff_tracker.has_active_targets(len(table1.relevant_columns)):
+             logger.info(
+                 "Diffing stopped because columns with potential differences "
+                 "have reached their configured diff limits."
+             )
+             info_tree.info.is_diff = False
+             self.update_comparison_tracker(
+                 reason_type="per_column_diff_limit",
+                 segment=f"{table1.min_key}..{table1.max_key}",
+             )
+             return False
+
+         # Check the egress limit
+         if self.stats.get("rows_downloaded", 0) >= self.egress_limit:
+             info_tree.info.is_diff = False
+             logger.info(
+                 ". " * level + f"Row download limit reached "
+                 f"{self.stats.get('rows_downloaded')}, stopping bisection for "
+                 f"segment {table1.min_key}..{table1.max_key}"
+             )
+             self.update_comparison_tracker(
+                 reason_type="egress_limit",
+                 segment=f"{table1.min_key}..{table1.max_key}",
+             )
+             return False
+
+         # Check the timeout
+         elapsed = time.monotonic() - self.start_time
+         if elapsed > self.timeout_limit * 60:
+             info_tree.info.is_diff = False
+             logger.info(
+                 ". " * level + f"Timeout limit reached ({self.timeout_limit} min); "
+                 f"stopping bisection for segment {table1.min_key}..{table1.max_key}"
+             )
+             self.update_comparison_tracker(
+                 reason_type="timeout",
+                 segment=f"{table1.min_key}..{table1.max_key}",
+             )
+             return False
+
+         return True
+
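+     # --- Illustrative note (editor's addition, not part of the package source) ---
+     # The three stop conditions above, in the order they are checked:
+     #   1. per_column_diff_limit: every tracked target (columns, exclusive PKs,
+     #      duplicate PKs) has hit its limit                          -> stop
+     #   2. egress_limit: stats["rows_downloaded"] >= egress_limit    -> stop
+     #   3. timeout: elapsed seconds > timeout_limit * 60             -> stop
+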
866
+     def _download_and_diff_locally(
+         self,
+         table1: TableSegment,
+         table2: TableSegment,
+         info_tree: InfoTree,
+         level: int,
+     ) -> List:
+         """Download segment rows and perform an in-memory diff."""
+         start_time = time.monotonic()
+         mode_label = "[IN-MEMORY]" if self.in_memory_diff else "[STANDARD]"
+         logger.info(
+             ". " * level + f"{mode_label} Downloading rows for comparison: {table1.min_key}..{table1.max_key}"
+         )
+
+         # Download rows from both tables
+         rows1, rows2 = self._threaded_call("get_values", [table1, table2])
+
+         # Update statistics
+         downloaded = max(len(rows1), len(rows2))
+         self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + downloaded
+
+         logger.info(
+             ". " * level + f"{mode_label} Downloaded {len(rows1)} and {len(rows2)} rows. "
+             f"Total downloaded: {self.stats['rows_downloaded']}. "
+             f"Time taken: {int((time.monotonic() - start_time) * 1000)}ms"
+         )
+
+         # Perform the in-memory diff
+         json_cols = {
+             i: colname for i, colname in enumerate(table1.extra_columns) if isinstance(table1._schema[colname], JSON)
+         }
+
+         diff = list(
+             diff_sets(
+                 rows1,
+                 rows2,
+                 json_cols=json_cols,
+                 columns1=table1.relevant_columns,
+                 columns2=table2.relevant_columns,
+                 key_columns1=table1.key_columns,
+                 key_columns2=table2.key_columns,
+                 ignored_columns1=self.ignored_columns1,
+                 ignored_columns2=self.ignored_columns2,
+                 diff_tracker=self._diff_tracker,
+             )
+         )
+
+         # Update the info tree
+         info_tree.info.set_diff(diff)
+         info_tree.info.rowcounts = {1: len(rows1), 2: len(rows2)}
+
+         logger.info(". " * level + f"{mode_label} Found {len(diff)} different rows in this segment")
+
+         return diff
+
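+     # --- Illustrative note (editor's addition, not part of the package source) ---
+     # Egress accounting: each download adds max(len(rows1), len(rows2)) to
+     # stats["rows_downloaded"], so with the default egress_limit of 500,000,
+     # roughly 100 direct downloads of 5,000-row segments would exhaust the budget
+     # and trigger the egress_limit stop condition.
+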
921
+     def _checksum_and_bisect_if_needed(
+         self,
+         ti,
+         table1: TableSegment,
+         table2: TableSegment,
+         info_tree: InfoTree,
+         level: int,
+         max_rows: int,
+         segment_index: Optional[int],
+         segment_count: Optional[int],
+     ):
+         """Perform a checksum comparison and bisect if differences are found (standard mode)."""
+
+         logger.info(". " * level + "[CHECKSUM MODE] Comparing segment checksums")
+
+         # Get active columns for the checksum (excluding stopped columns)
+         active_cols1, active_cols2 = self._get_checksum_columns(table1, table2)
+
+         # Create segments with updated columns for the checksum
+         checksum_table1 = self._create_segment_with_updated_columns(table1, active_cols1)
+         checksum_table2 = self._create_segment_with_updated_columns(table2, active_cols2)
+
+         # Perform the checksum
+         (count1, checksum1), (count2, checksum2) = self._threaded_call(
+             "count_and_checksum", [checksum_table1, checksum_table2]
+         )
+
+         assert not info_tree.info.rowcounts
+         info_tree.info.rowcounts = {1: count1, 2: count2}
+
+         # Handle empty segments
+         if count1 == 0 and count2 == 0:
+             logger.debug(
+                 "Uneven distribution of keys detected in segment {}..{} "
+                 "(big gaps in the key column). For better performance, "
+                 "we recommend increasing the bisection-threshold.",
+                 table1.min_key,
+                 table1.max_key,
+             )
+             assert checksum1 is None and checksum2 is None
+             info_tree.info.is_diff = False
+             return
+
+         # Compare checksums
+         if checksum1 == checksum2:
+             logger.info(". " * level + "[CHECKSUM MODE] Checksums match - no differences")
+             info_tree.info.is_diff = False
+             return
+
+         logger.info(". " * level + "[CHECKSUM MODE] Checksums differ - bisecting segment")
+         info_tree.info.is_diff = True
+
+         # Bisect and continue
+         return self._bisect_and_diff_segments(ti, table1, table2, info_tree, level=level, max_rows=max(count1, count2))
+
+     def _bisect_and_diff_segments(
+         self,
+         ti,
+         table1: TableSegment,
+         table2: TableSegment,
+         info_tree: InfoTree,
+         level=0,
+         max_rows=None,
+     ):
+         """
+         Enhanced bisection with in-memory mode support.
+         """
+         # Check the recursion limit
+         if level > 15:
+             logger.warning(
+                 ". " * level + f"Maximum recursion level ({level}) reached; "
+                 f"downloading segment {table1.min_key}..{table1.max_key}"
+             )
+             return self._download_and_diff_locally(table1, table2, info_tree, level)
+
+         assert table1.is_bounded and table2.is_bounded
+
+         # Initialize the diff tracker
+         self._initialize_diff_tracker(table1, table2)
+
+         # Check continuation conditions
+         if not self._check_continuation_conditions(table1, info_tree, level):
+             return
+
+         # Calculate the max space size
+         max_space_size = max(table1.approximate_size(self.t1_row_count), table2.approximate_size(self.t2_row_count))
+
+         if max_rows is None:
+             max_rows = max_space_size
+
+         info_tree.info.max_rows = max_rows
+
+         # Check if we should download directly
+         should_download = (
+             self.bisection_disabled or max_rows < self.bisection_threshold or max_space_size < self.bisection_factor * 2
+         )
+
+         # In-memory mode: also download if at the target segment size
+         if self.in_memory_diff and max_rows <= self.memory_segment_size:
+             should_download = True
+
+         if should_download:
+             return self._download_and_diff_locally(table1, table2, info_tree, level)
+
+         # Otherwise, continue with the standard bisection
+         return super()._bisect_and_diff_segments(ti, table1, table2, info_tree, level, max_rows)