dcs-sdk 1.6.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_diff/__init__.py +221 -0
- data_diff/__main__.py +517 -0
- data_diff/abcs/__init__.py +13 -0
- data_diff/abcs/compiler.py +27 -0
- data_diff/abcs/database_types.py +402 -0
- data_diff/config.py +141 -0
- data_diff/databases/__init__.py +38 -0
- data_diff/databases/_connect.py +323 -0
- data_diff/databases/base.py +1417 -0
- data_diff/databases/bigquery.py +376 -0
- data_diff/databases/clickhouse.py +217 -0
- data_diff/databases/databricks.py +262 -0
- data_diff/databases/duckdb.py +207 -0
- data_diff/databases/mssql.py +343 -0
- data_diff/databases/mysql.py +189 -0
- data_diff/databases/oracle.py +238 -0
- data_diff/databases/postgresql.py +293 -0
- data_diff/databases/presto.py +222 -0
- data_diff/databases/redis.py +93 -0
- data_diff/databases/redshift.py +233 -0
- data_diff/databases/snowflake.py +222 -0
- data_diff/databases/sybase.py +720 -0
- data_diff/databases/trino.py +73 -0
- data_diff/databases/vertica.py +174 -0
- data_diff/diff_tables.py +489 -0
- data_diff/errors.py +17 -0
- data_diff/format.py +369 -0
- data_diff/hashdiff_tables.py +1026 -0
- data_diff/info_tree.py +76 -0
- data_diff/joindiff_tables.py +434 -0
- data_diff/lexicographic_space.py +253 -0
- data_diff/parse_time.py +88 -0
- data_diff/py.typed +0 -0
- data_diff/queries/__init__.py +13 -0
- data_diff/queries/api.py +213 -0
- data_diff/queries/ast_classes.py +811 -0
- data_diff/queries/base.py +38 -0
- data_diff/queries/extras.py +43 -0
- data_diff/query_utils.py +70 -0
- data_diff/schema.py +67 -0
- data_diff/table_segment.py +583 -0
- data_diff/thread_utils.py +112 -0
- data_diff/utils.py +1022 -0
- data_diff/version.py +15 -0
- dcs_core/__init__.py +13 -0
- dcs_core/__main__.py +17 -0
- dcs_core/__version__.py +15 -0
- dcs_core/cli/__init__.py +13 -0
- dcs_core/cli/cli.py +165 -0
- dcs_core/core/__init__.py +19 -0
- dcs_core/core/common/__init__.py +13 -0
- dcs_core/core/common/errors.py +50 -0
- dcs_core/core/common/models/__init__.py +13 -0
- dcs_core/core/common/models/configuration.py +284 -0
- dcs_core/core/common/models/dashboard.py +24 -0
- dcs_core/core/common/models/data_source_resource.py +75 -0
- dcs_core/core/common/models/metric.py +160 -0
- dcs_core/core/common/models/profile.py +75 -0
- dcs_core/core/common/models/validation.py +216 -0
- dcs_core/core/common/models/widget.py +44 -0
- dcs_core/core/configuration/__init__.py +13 -0
- dcs_core/core/configuration/config_loader.py +139 -0
- dcs_core/core/configuration/configuration_parser.py +262 -0
- dcs_core/core/configuration/configuration_parser_arc.py +328 -0
- dcs_core/core/datasource/__init__.py +13 -0
- dcs_core/core/datasource/base.py +62 -0
- dcs_core/core/datasource/manager.py +112 -0
- dcs_core/core/datasource/search_datasource.py +421 -0
- dcs_core/core/datasource/sql_datasource.py +1094 -0
- dcs_core/core/inspect.py +163 -0
- dcs_core/core/logger/__init__.py +13 -0
- dcs_core/core/logger/base.py +32 -0
- dcs_core/core/logger/default_logger.py +94 -0
- dcs_core/core/metric/__init__.py +13 -0
- dcs_core/core/metric/base.py +220 -0
- dcs_core/core/metric/combined_metric.py +98 -0
- dcs_core/core/metric/custom_metric.py +34 -0
- dcs_core/core/metric/manager.py +137 -0
- dcs_core/core/metric/numeric_metric.py +403 -0
- dcs_core/core/metric/reliability_metric.py +90 -0
- dcs_core/core/profiling/__init__.py +13 -0
- dcs_core/core/profiling/datasource_profiling.py +136 -0
- dcs_core/core/profiling/numeric_field_profiling.py +72 -0
- dcs_core/core/profiling/text_field_profiling.py +67 -0
- dcs_core/core/repository/__init__.py +13 -0
- dcs_core/core/repository/metric_repository.py +77 -0
- dcs_core/core/utils/__init__.py +13 -0
- dcs_core/core/utils/log.py +29 -0
- dcs_core/core/utils/tracking.py +105 -0
- dcs_core/core/utils/utils.py +44 -0
- dcs_core/core/validation/__init__.py +13 -0
- dcs_core/core/validation/base.py +230 -0
- dcs_core/core/validation/completeness_validation.py +153 -0
- dcs_core/core/validation/custom_query_validation.py +24 -0
- dcs_core/core/validation/manager.py +282 -0
- dcs_core/core/validation/numeric_validation.py +276 -0
- dcs_core/core/validation/reliability_validation.py +91 -0
- dcs_core/core/validation/uniqueness_validation.py +61 -0
- dcs_core/core/validation/validity_validation.py +738 -0
- dcs_core/integrations/__init__.py +13 -0
- dcs_core/integrations/databases/__init__.py +13 -0
- dcs_core/integrations/databases/bigquery.py +187 -0
- dcs_core/integrations/databases/databricks.py +51 -0
- dcs_core/integrations/databases/db2.py +652 -0
- dcs_core/integrations/databases/elasticsearch.py +61 -0
- dcs_core/integrations/databases/mssql.py +829 -0
- dcs_core/integrations/databases/mysql.py +409 -0
- dcs_core/integrations/databases/opensearch.py +64 -0
- dcs_core/integrations/databases/oracle.py +719 -0
- dcs_core/integrations/databases/postgres.py +482 -0
- dcs_core/integrations/databases/redshift.py +53 -0
- dcs_core/integrations/databases/snowflake.py +48 -0
- dcs_core/integrations/databases/spark_df.py +111 -0
- dcs_core/integrations/databases/sybase.py +1069 -0
- dcs_core/integrations/storage/__init__.py +13 -0
- dcs_core/integrations/storage/local_file.py +149 -0
- dcs_core/integrations/utils/__init__.py +13 -0
- dcs_core/integrations/utils/utils.py +36 -0
- dcs_core/report/__init__.py +13 -0
- dcs_core/report/dashboard.py +211 -0
- dcs_core/report/models.py +88 -0
- dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
- dcs_core/report/static/assets/images/docs.svg +6 -0
- dcs_core/report/static/assets/images/github.svg +4 -0
- dcs_core/report/static/assets/images/logo.svg +7 -0
- dcs_core/report/static/assets/images/slack.svg +13 -0
- dcs_core/report/static/index.js +2 -0
- dcs_core/report/static/index.js.LICENSE.txt +3971 -0
- dcs_sdk/__init__.py +13 -0
- dcs_sdk/__main__.py +18 -0
- dcs_sdk/__version__.py +15 -0
- dcs_sdk/cli/__init__.py +13 -0
- dcs_sdk/cli/cli.py +163 -0
- dcs_sdk/sdk/__init__.py +58 -0
- dcs_sdk/sdk/config/__init__.py +13 -0
- dcs_sdk/sdk/config/config_loader.py +491 -0
- dcs_sdk/sdk/data_diff/__init__.py +13 -0
- dcs_sdk/sdk/data_diff/data_differ.py +821 -0
- dcs_sdk/sdk/rules/__init__.py +15 -0
- dcs_sdk/sdk/rules/rules_mappping.py +31 -0
- dcs_sdk/sdk/rules/rules_repository.py +214 -0
- dcs_sdk/sdk/rules/schema_rules.py +65 -0
- dcs_sdk/sdk/utils/__init__.py +13 -0
- dcs_sdk/sdk/utils/serializer.py +25 -0
- dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
- dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
- dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
- dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
- dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
- dcs_sdk/sdk/utils/table.py +475 -0
- dcs_sdk/sdk/utils/themes.py +40 -0
- dcs_sdk/sdk/utils/utils.py +349 -0
- dcs_sdk-1.6.5.dist-info/METADATA +150 -0
- dcs_sdk-1.6.5.dist-info/RECORD +159 -0
- dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
- dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
data_diff/hashdiff_tables.py

@@ -0,0 +1,1026 @@
```python
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import time
from collections import defaultdict
from numbers import Number
from threading import Lock
from typing import Any, Collection, Dict, Iterator, List, Optional, Sequence, Set, Tuple

import attrs
from loguru import logger
from typing_extensions import Literal

from data_diff.abcs.database_types import (
    JSON,
    Boolean,
    ColType_UUID,
    NumericType,
    PrecisionType,
    StringType,
)
from data_diff.diff_tables import TableDiffer
from data_diff.info_tree import InfoTree
from data_diff.table_segment import TableSegment
from data_diff.thread_utils import ThreadedYielder
from data_diff.utils import diffs_are_equiv_jsons, safezip

BENCHMARK = os.environ.get("BENCHMARK", False)

DEFAULT_BISECTION_THRESHOLD = 1024 * 16
DEFAULT_BISECTION_FACTOR = 32
DEFAULT_PER_COLUMN_DIFF_LIMIT = 100
DEFAULT_ENGRESS_LIMIT = 5_00_000
DEAFULT_TIMEOUT = 60 * 5  # minutes

# logger = logging.getLogger("hashdiff_tables")

# Just for local readability: TODO: later switch to real type declarations of these.
_Op = Literal["+", "-"]
_PK = Sequence[Any]
_Row = Tuple[Any]

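# Editor's note (not part of the shipped file): with the defaults above,
# DEFAULT_PER_COLUMN_DIFF_LIMIT = 100 caps how many differing values are
# reported per column; DEFAULT_ENGRESS_LIMIT = 5_00_000 (Indian-style digit
# grouping, i.e. 500,000) caps the total rows downloaded; and DEAFULT_TIMEOUT
# (spelling as shipped) = 300 is interpreted as minutes, since HashDiffer
# below compares elapsed seconds against timeout_limit * 60.
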
class PerColumnDiffTracker:
    """Thread-safe tracker for differences per column and enforces limits"""

    def __init__(
        self,
        per_column_diff_limit: int,
        columns1: Sequence[str],
        columns2: Sequence[str],
        ignored_columns1: Collection[str],
        ignored_columns2: Collection[str],
    ):
        self.per_column_diff_limit = per_column_diff_limit
        self.column_diff_counts = defaultdict(int)
        self.stopped_columns = set()
        self.exclusive_pk_count = 0
        self.duplicate_pk_count = 0
        self._lock = Lock()

        # Store original column mappings
        self.original_columns1 = list(columns1)
        self.original_columns2 = list(columns2)
        self.original_ignored_columns1 = set(ignored_columns1)
        self.original_ignored_columns2 = set(ignored_columns2)

        # Create column name to index mapping for non-ignored columns
        self.active_columns1 = [col for col in columns1 if col not in ignored_columns1]
        self.active_columns2 = [col for col in columns2 if col not in ignored_columns2]
        self.column1_to_index = {col: idx for idx, col in enumerate(self.active_columns1)}
        self.column2_to_index = {col: idx for idx, col in enumerate(self.active_columns2)}

    def should_process_column_diff(self, column_index: int) -> bool:
        """Check if we should continue processing diffs for this column"""
        with self._lock:
            return column_index not in self.stopped_columns

    def record_column_diff(self, column_index: int) -> bool:
        """Record a diff for a column and return True if we should continue processing this column"""
        with self._lock:
            if column_index in self.stopped_columns:
                return False

            if self.column_diff_counts[column_index] >= self.per_column_diff_limit:
                self.stopped_columns.add(column_index)
                column_name = (
                    self.active_columns1[column_index]
                    if column_index < len(self.active_columns1)
                    else f"column_{column_index}"
                )
                logger.info(
                    f"Column '{column_name}' reached diff limit of {self.per_column_diff_limit}, stopping further diff tracking for this column"
                )
                return False
            self.column_diff_counts[column_index] += 1
            return True

    def record_exclusive_pk(self) -> bool:
        """Record an exclusive PK and return True if we should continue processing"""
        with self._lock:
            self.exclusive_pk_count += 1
            return self.exclusive_pk_count < self.per_column_diff_limit

    def record_duplicate_pk(self) -> bool:
        """Record a duplicate PK and return True if we should continue processing"""
        with self._lock:
            self.duplicate_pk_count += 1
            return self.duplicate_pk_count < self.per_column_diff_limit

    def has_active_targets(self, total_columns: int) -> bool:
        """Check if there are still columns being actively tracked"""
        with self._lock:
            return (
                len(self.stopped_columns) < total_columns
                or self.exclusive_pk_count < self.per_column_diff_limit
                or self.duplicate_pk_count < self.per_column_diff_limit
            )

    def get_updated_ignored_columns(self) -> Tuple[Set[str], Set[str]]:
        """Get updated ignored columns including stopped columns"""
        with self._lock:
            updated_ignored1 = set(self.original_ignored_columns1)
            updated_ignored2 = set(self.original_ignored_columns2)

            # Add stopped columns to ignored columns
            for col_idx in self.stopped_columns:
                if col_idx < len(self.active_columns1):
                    updated_ignored1.add(self.active_columns1[col_idx])
                if col_idx < len(self.active_columns2):
                    updated_ignored2.add(self.active_columns2[col_idx])

            return updated_ignored1, updated_ignored2

    def get_active_columns_for_checksum(self) -> Tuple[List[str], List[str]]:
        """Get columns that should be included in checksum (excluding stopped columns)"""
        with self._lock:
            active_checksum_columns1 = []
            active_checksum_columns2 = []

            for idx, col in enumerate(self.active_columns1):
                if idx not in self.stopped_columns:
                    active_checksum_columns1.append(col)

            for idx, col in enumerate(self.active_columns2):
                if idx not in self.stopped_columns:
                    active_checksum_columns2.append(col)

            return active_checksum_columns1, active_checksum_columns2

    def get_stopped_columns(self) -> Set[int]:
        with self._lock:
            return self.stopped_columns.copy()

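# Editor's note (illustration, not part of the shipped file): diff_sets()
# below yields ("-", row) / ("+", row) tuples, grouped per primary key.
# For example, with key_columns1 = key_columns2 = ("id",) and
#   a = [(1, "x")], b = [(1, "y")]
# it yields ("-", (1, "x")) then ("+", (1, "y")), provided the tracker
# still has budget for the differing column index.
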
def diff_sets(
    a: Sequence[_Row],
    b: Sequence[_Row],
    *,
    json_cols: dict = None,
    columns1: Sequence[str],
    columns2: Sequence[str],
    key_columns1: Sequence[str],
    key_columns2: Sequence[str],
    ignored_columns1: Collection[str],
    ignored_columns2: Collection[str],
    diff_tracker: PerColumnDiffTracker = None,
) -> Iterator:
    # Initialize per-column diff tracker if not provided
    if diff_tracker is None:
        diff_tracker = PerColumnDiffTracker(
            DEFAULT_PER_COLUMN_DIFF_LIMIT, columns1, columns2, ignored_columns1, ignored_columns2
        )

    # Get updated ignored columns (including stopped columns)
    updated_ignored1, updated_ignored2 = diff_tracker.get_updated_ignored_columns()

    # Group full rows by PKs on each side. The first items are the PK: TableSegment.relevant_columns
    rows_by_pks1: Dict[_PK, List[_Row]] = defaultdict(list)
    rows_by_pks2: Dict[_PK, List[_Row]] = defaultdict(list)
    for row in a:
        pk: _PK = tuple(val for col, val in zip(key_columns1, row))
        rows_by_pks1[pk].append(row)
    for row in b:
        pk: _PK = tuple(val for col, val in zip(key_columns2, row))
        rows_by_pks2[pk].append(row)

    # Calculate total active columns for tracking
    total_columns = len([col for col in columns1 if col not in updated_ignored1])

    # Mind that the same pk MUST go in full with all the -/+ rows all at once, for grouping.
    diffs_by_pks: Dict[_PK, List[Tuple[_Op, _Row]]] = defaultdict(list)

    warned_diff_cols = set()

    for pk in sorted(set(rows_by_pks1) | set(rows_by_pks2)):
        if not diff_tracker.has_active_targets(total_columns):
            logger.info(
                "Diffing stopped because columns with potential differences have reached their configured diff limits."
            )
            break

        cutrows1: List[_Row] = [tuple(row1) for row1 in rows_by_pks1[pk]]

        cutrows2: List[_Row] = [tuple(row2) for row2 in rows_by_pks2[pk]]

        # Handle exclusive rows (present in only one side)
        if len(rows_by_pks1[pk]) == 0 or len(rows_by_pks2[pk]) == 0:
            if not diff_tracker.record_exclusive_pk():
                continue

            for row1 in rows_by_pks1[pk]:
                diffs_by_pks[pk].append(("-", row1))
            for row2 in rows_by_pks2[pk]:
                diffs_by_pks[pk].append(("+", row2))
            continue

        # Handle duplicate PKs (2+ rows on either side)
        if len(cutrows1) > 1 or len(cutrows2) > 1:
            if not diff_tracker.record_duplicate_pk():
                continue

            for row1 in rows_by_pks1[pk]:
                diffs_by_pks[pk].append(("-", row1))
            for row2 in rows_by_pks2[pk]:
                diffs_by_pks[pk].append(("+", row2))
            continue

        if len(cutrows1) == 1 and len(cutrows2) == 1:
            row1, row2 = cutrows1[0], cutrows2[0]

            # Find all differing columns and attempt to record them
            has_recordable_diff = False

            for col_idx, (val1, val2) in enumerate(zip(row1, row2)):
                if val1 != val2:  # This column has a difference
                    # Try to record it if the column is still being tracked
                    if diff_tracker.should_process_column_diff(col_idx):
                        if diff_tracker.record_column_diff(col_idx):
                            has_recordable_diff = True
                    # Continue checking other columns even if this one just got exhausted

            # Include the row pair if we successfully recorded at least one difference
            if has_recordable_diff:
                for row1 in rows_by_pks1[pk]:
                    diffs_by_pks[pk].append(("-", row1))
                for row2 in rows_by_pks2[pk]:
                    diffs_by_pks[pk].append(("+", row2))

    # Process and yield the collected diffs
    for diffs in (diffs_by_pks[pk] for pk in sorted(diffs_by_pks)):
        if json_cols:
            parsed_match, overriden_diff_cols = diffs_are_equiv_jsons(diffs, json_cols)
            if parsed_match:
                to_warn = overriden_diff_cols - warned_diff_cols
                for w in to_warn:
                    logger.warning(
                        f"Equivalent JSON objects with different string representations detected "
                        f"in column '{w}'. These cases are NOT reported as differences."
                    )
                    warned_diff_cols.add(w)
                continue
        yield from diffs

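# Editor's note on the bisection arithmetic used by HashDiffer below: with the
# defaults (bisection_factor=32, bisection_threshold=16384), a mismatched
# segment of 1,000,000 rows is split into 32 sub-segments of ~31,250 rows;
# each is checksummed and matching sub-segments are pruned. 31,250 is still
# above 16,384, so a mismatched sub-segment is split once more (~977 rows)
# before its rows are downloaded and compared locally by diff_sets().
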
@attrs.define(frozen=False)
class HashDiffer(TableDiffer):
    """Finds the diff between two SQL tables

    The algorithm uses hashing to quickly check if the tables are different, and then applies a
    bisection search recursively to find the differences efficiently.

    Works best for comparing tables that are mostly the same, with minor discrepancies.

    Parameters:
        bisection_factor (int): Into how many segments to bisect per iteration.
        bisection_threshold (Number): When should we stop bisecting and compare locally (in row count).
        threaded (bool): Enable/disable threaded diffing. Needed to take advantage of database threads.
        max_threadpool_size (int): Maximum size of each threadpool. ``None`` means auto.
            Only relevant when `threaded` is ``True``.
            There may be many pools, so the number of actual threads can be a lot higher.
        per_column_diff_limit (int): Stop targeting a column after finding this many different values.
            The same limit applies to exclusive and duplicate PKs. If there are no targets left,
            diffing will stop.
        egress_limit (int): Maximum number of rows to download per segment.
        strict (bool): Enable strict type checking. If ``False``, will not raise errors on incompatible types.
    """

    bisection_factor: int = DEFAULT_BISECTION_FACTOR
    bisection_threshold: int = DEFAULT_BISECTION_THRESHOLD
    bisection_disabled: bool = False  # i.e. always download the rows (used in tests)
    strict: bool = True  # i.e. strict type check
    per_column_diff_limit: int = DEFAULT_PER_COLUMN_DIFF_LIMIT
    egress_limit: int = DEFAULT_ENGRESS_LIMIT  # Rows download limit
    stats: dict = attrs.field(factory=dict)
    t1_row_count: int = 0
    t2_row_count: int = 0
    start_time: float = attrs.Factory(lambda: time.monotonic())
    timeout_limit: int = DEAFULT_TIMEOUT

    # Thread-safe diff tracker instance
    _diff_tracker: PerColumnDiffTracker = attrs.field(default=None, init=False)

    def __attrs_post_init__(self) -> None:
        # Validate options
        if self.bisection_factor >= self.bisection_threshold:
            raise ValueError("Incorrect param values (bisection factor must be lower than threshold)")
        if self.bisection_factor < 2:
            raise ValueError("Must have at least two segments per iteration (i.e. bisection_factor >= 2)")
        if self.per_column_diff_limit <= 0:
            raise ValueError("per_column_diff_limit must be a positive integer")

    def _initialize_diff_tracker(self, table1: TableSegment, table2: TableSegment) -> None:
        """Initialize the diff tracker with table information"""
        if self._diff_tracker is None:
            self._diff_tracker = PerColumnDiffTracker(
                self.per_column_diff_limit,
                table1.relevant_columns,
                table2.relevant_columns,
                self.ignored_columns1,
                self.ignored_columns2,
            )

    def update_comparison_tracker(self, reason_type: str, segment: str) -> None:
        if "comparison_tracker" not in self.stats:
            self.stats["comparison_tracker"] = []

        if reason_type == "per_column_diff_limit":
            reason = (
                "Diffing stopped because columns with potential differences have reached their configured diff limits."
            )
        elif reason_type == "egress_limit":
            reason = f"Row download limit reached, {self.stats.get('rows_downloaded')}"
        elif reason_type == "timeout":
            reason = f"Timeout limit reached, {self.timeout_limit} min"

        tracker = self.stats["comparison_tracker"]
        reason_index_map = {
            entry.get("reason_type"): idx for idx, entry in enumerate(tracker) if "reason_type" in entry
        }

        new_entry = {"reason": reason, "segment": segment, "reason_type": reason_type}

        if reason_type in reason_index_map:
            tracker[reason_index_map[reason_type]] = new_entry
        else:
            tracker.append(new_entry)

        self.stats["comparison_tracker"] = tracker

    def _get_checksum_columns(self, table1: TableSegment, table2: TableSegment) -> Tuple[List[str], List[str]]:
        """Get columns to include in checksum, excluding stopped columns"""
        if self._diff_tracker is None:
            # If no diff tracker, use all relevant columns
            return list(table1.relevant_columns), list(table2.relevant_columns)

        # Get active columns for checksum (excluding stopped columns)
        active_cols1, active_cols2 = self._diff_tracker.get_active_columns_for_checksum()

        # If no active columns left, use key columns only
        if not active_cols1 or not active_cols2:
            return list(table1.key_columns), list(table2.key_columns)

        return active_cols1, active_cols2

    def _create_segment_with_updated_columns(
        self, original_segment: TableSegment, active_columns: List[str]
    ) -> TableSegment:
        """Create a new segment with updated relevant columns for checksum"""
        # Create a copy of the segment with updated relevant columns
        updated_segment = attrs.evolve(original_segment, extra_columns=active_columns)
        return updated_segment

    def _validate_and_adjust_columns(self, table1: TableSegment, table2: TableSegment) -> None:
        for c1, c2 in safezip(table1.relevant_columns, table2.relevant_columns):
            if c1 not in table1._schema:
                raise ValueError(f"Column '{c1}' not found in schema for table {table1}")
            if c2 not in table2._schema:
                raise ValueError(f"Column '{c2}' not found in schema for table {table2}")

            # Update schemas to minimal mutual precision
            col1 = table1._schema[c1]
            col2 = table2._schema[c2]
            if isinstance(col1, PrecisionType):
                if not isinstance(col2, PrecisionType):
                    if self.strict:
                        raise TypeError(f"Incompatible types for column '{c1}': {col1} <-> {col2}")
                    else:
                        continue

                lowest = min(col1, col2, key=lambda col: col.precision)

                if col1.precision != col2.precision:
                    logger.warning(f"Using reduced precision {lowest} for column '{c1}'. Types={col1}, {col2}")

                table1._schema[c1] = attrs.evolve(col1, precision=lowest.precision, rounds=lowest.rounds)
                table2._schema[c2] = attrs.evolve(col2, precision=lowest.precision, rounds=lowest.rounds)

            elif isinstance(col1, (NumericType, Boolean)):
                if not isinstance(col2, (NumericType, Boolean)):
                    if self.strict:
                        raise TypeError(f"Incompatible types for column '{c1}': {col1} <-> {col2}")
                    else:
                        continue

                lowest = min(col1, col2, key=lambda col: col.precision)

                if col1.precision != col2.precision:
                    logger.warning(f"Using reduced precision {lowest} for column '{c1}'. Types={col1}, {col2}")

                if lowest.precision != col1.precision:
                    table1._schema[c1] = attrs.evolve(col1, precision=lowest.precision)
                if lowest.precision != col2.precision:
                    table2._schema[c2] = attrs.evolve(col2, precision=lowest.precision)

        for t in [table1, table2]:
            for c in t.relevant_columns:
                ctype = t._schema[c]
                if not ctype.supported:
                    logger.warning(
                        f"[{t.database.name if t.database.name.lower() != 'duckdb' else 'File'}] Column '{c}' of type '{ctype}' has no compatibility handling. "
                        "If encoding/formatting differs between databases, it may result in false positives."
                    )

    def _diff_segments(
        self,
        ti: ThreadedYielder,
        table1: TableSegment,
        table2: TableSegment,
        info_tree: InfoTree,
        max_rows: int,
        level=0,
        segment_index=None,
        segment_count=None,
    ):
        # Check if level exceeds maximum allowed recursion depth
        if level > 15:
            logger.warning(
                ". " * level
                + f"Maximum recursion level reached ({level}); switching to direct row comparison for segment {table1.min_key}..{table1.max_key}"
            )
            # Fallback: download rows and diff locally to prevent excessive recursion
            rows1, rows2 = self._threaded_call("get_values", [table1, table2])
            json_cols = {
                i: colname
                for i, colname in enumerate(table1.extra_columns)
                if isinstance(table1._schema[colname], JSON)
            }
            diff = list(
                diff_sets(
                    rows1,
                    rows2,
                    json_cols=json_cols,
                    columns1=table1.relevant_columns,
                    columns2=table2.relevant_columns,
                    key_columns1=table1.key_columns,
                    key_columns2=table2.key_columns,
                    ignored_columns1=self.ignored_columns1,
                    ignored_columns2=self.ignored_columns2,
                    diff_tracker=self._diff_tracker,
                )
            )
            info_tree.info.set_diff(diff)
            info_tree.info.rowcounts = {1: len(rows1), 2: len(rows2)}
            self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2))
            logger.info(
                ". " * level
                + f"Diff found {len(diff)} different rows, {self.stats['rows_downloaded']} total rows downloaded."
            )
            return diff

        # Initialize diff tracker if not already done
        self._initialize_diff_tracker(table1, table2)

        logger.info(
            ". " * level + f"Diffing segment {segment_index}/{segment_count}, "
            f"key-range: {table1.min_key}..{table2.max_key}, "
            f"size <= {max_rows}"
        )
        elapsed = time.monotonic() - self.start_time
        if (
            len(self._diff_tracker.get_stopped_columns()) > 0
            and not self.stats.get("rows_downloaded", 0) >= self.egress_limit
            and not elapsed > self.timeout_limit * 60
        ):
            self.update_comparison_tracker(
                reason_type="per_column_diff_limit",
                segment=f"{table1.min_key}..{table1.max_key}",
            )
        if not self._diff_tracker.has_active_targets(len(table1.relevant_columns)):
            logger.info(
                "Diffing stopped because columns with potential differences have reached their configured diff limits."
            )
            info_tree.info.is_diff = False
            self.update_comparison_tracker(
                reason_type="per_column_diff_limit",
                segment=f"{table1.min_key}..{table1.max_key}",
            )
            return
        if self.stats.get("rows_downloaded", 0) >= self.egress_limit:
            info_tree.info.is_diff = False
            logger.info(
                ". " * level
                + f"Row download limit reached {self.stats.get('rows_downloaded')}, stopping bisection for segment {table1.min_key}..{table1.max_key}"
            )
            self.update_comparison_tracker(
                reason_type="egress_limit",
                segment=f"{table1.min_key}..{table1.max_key}",
            )
            return

        elapsed = time.monotonic() - self.start_time
        if elapsed > self.timeout_limit * 60:
            info_tree.info.is_diff = False
            logger.info(
                ". " * level + f"Timeout limit reached ({self.timeout_limit} min); "
                f"stopping bisection for segment {table1.min_key}..{table1.max_key}"
            )
            self.update_comparison_tracker(
                reason_type="timeout",
                segment=f"{table1.min_key}..{table1.max_key}",
            )
            return
        # When benchmarking, we want the ability to skip checksumming. This
        # allows us to download all rows for comparison when measuring
        # performance. By default, dcs-diff will checksum the section first
        # (when it's below the threshold) and _then_ download it.
        if BENCHMARK:
            if self.bisection_disabled or max_rows < self.bisection_threshold:
                return self._bisect_and_diff_segments(ti, table1, table2, info_tree, level=level, max_rows=max_rows)

        # Get active columns for checksum (excluding stopped columns)
        active_cols1, active_cols2 = self._get_checksum_columns(table1, table2)

        # Create segments with updated columns for checksum
        checksum_table1 = self._create_segment_with_updated_columns(table1, active_cols1)
        checksum_table2 = self._create_segment_with_updated_columns(table2, active_cols2)

        (count1, checksum1), (count2, checksum2) = self._threaded_call(
            "count_and_checksum", [checksum_table1, checksum_table2]
        )

        assert not info_tree.info.rowcounts
        info_tree.info.rowcounts = {1: count1, 2: count2}

        if count1 == 0 and count2 == 0:
            logger.debug(
                "Uneven distribution of keys detected in segment {}..{} (big gaps in the key column). "
                "For better performance, we recommend to increase the bisection-threshold.",
                table1.min_key,
                table1.max_key,
            )
            assert checksum1 is None and checksum2 is None
            info_tree.info.is_diff = False
            return

        if checksum1 == checksum2:
            info_tree.info.is_diff = False
            return

        info_tree.info.is_diff = True
        return self._bisect_and_diff_segments(ti, table1, table2, info_tree, level=level, max_rows=max(count1, count2))

    def _bisect_and_diff_segments(
        self,
        ti: ThreadedYielder,
        table1: TableSegment,
        table2: TableSegment,
        info_tree: InfoTree,
        level=0,
        max_rows=None,
    ):
        # Check if level exceeds maximum allowed recursion depth
        if level > 15:
            logger.warning(
                ". " * level
                + f"Maximum recursion level reached ({level}); switching to direct row comparison for segment {table1.min_key}..{table1.max_key}"
            )
            # Fallback: download rows and diff locally to prevent excessive recursion
            rows1, rows2 = self._threaded_call("get_values", [table1, table2])
            json_cols = {
                i: colname
                for i, colname in enumerate(table1.extra_columns)
                if isinstance(table1._schema[colname], JSON)
            }
            diff = list(
                diff_sets(
                    rows1,
                    rows2,
                    json_cols=json_cols,
                    columns1=table1.relevant_columns,
                    columns2=table2.relevant_columns,
                    key_columns1=table1.key_columns,
                    key_columns2=table2.key_columns,
                    ignored_columns1=self.ignored_columns1,
                    ignored_columns2=self.ignored_columns2,
                    diff_tracker=self._diff_tracker,
                )
            )
            info_tree.info.set_diff(diff)
            info_tree.info.rowcounts = {1: len(rows1), 2: len(rows2)}
            self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2))
            logger.info(
                ". " * level
                + f"Diff found {len(diff)} different rows, {self.stats['rows_downloaded']} total rows downloaded."
            )
            return diff

        assert table1.is_bounded and table2.is_bounded

        # Initialize diff tracker if not already done
        self._initialize_diff_tracker(table1, table2)
        elapsed = time.monotonic() - self.start_time
        if (
            len(self._diff_tracker.get_stopped_columns()) > 0
            and not self.stats.get("rows_downloaded", 0) >= self.egress_limit
            and not elapsed > self.timeout_limit * 60
        ):
            self.update_comparison_tracker(
                reason_type="per_column_diff_limit",
                segment=f"{table1.min_key}..{table1.max_key}",
            )

        if not self._diff_tracker.has_active_targets(len(table1.relevant_columns)):
            logger.info(
                "Diffing stopped because columns with potential differences have reached their configured diff limits."
            )
            info_tree.info.is_diff = False
            self.update_comparison_tracker(
                reason_type="per_column_diff_limit",
                segment=f"{table1.min_key}..{table1.max_key}",
            )
            return
        if self.stats.get("rows_downloaded", 0) >= self.egress_limit:
            logger.info("Row download limit reached, stopping bisection")
            logger.info(
                ". " * level
                + f"Row download limit reached {self.stats.get('rows_downloaded')}, stopping bisection for segment {table1.min_key}..{table1.max_key}"
            )
            self.update_comparison_tracker(
                reason_type="egress_limit",
                segment=f"{table1.min_key}..{table1.max_key}",
            )
            info_tree.info.is_diff = False
            return

        elapsed = time.monotonic() - self.start_time
        if elapsed > self.timeout_limit * 60:
            info_tree.info.is_diff = False
            logger.info(
                ". " * level + f"Timeout limit reached ({self.timeout_limit} min); "
                f"stopping bisection for segment {table1.min_key}..{table1.max_key}"
            )
            self.update_comparison_tracker(
                reason_type="timeout",
                segment=f"{table1.min_key}..{table1.max_key}",
            )
            return

        max_space_size = max(table1.approximate_size(self.t1_row_count), table2.approximate_size(self.t2_row_count))
        if max_rows is None:
            # We can be sure that row_count <= max_rows iff the table key is unique
            max_rows = max_space_size
        info_tree.info.max_rows = max_rows

        # If count is below the threshold, just download and compare the columns locally.
        # This saves time, as bisection speed is limited by ping and query performance.
        if self.bisection_disabled or max_rows < self.bisection_threshold or max_space_size < self.bisection_factor * 2:
            rows1, rows2 = self._threaded_call("get_values", [table1, table2])
            json_cols = {
                i: colname
                for i, colname in enumerate(table1.extra_columns)
                if isinstance(table1._schema[colname], JSON)
            }
            diff = list(
                diff_sets(
                    rows1,
                    rows2,
                    json_cols=json_cols,
                    columns1=table1.relevant_columns,
                    columns2=table2.relevant_columns,
                    key_columns1=table1.key_columns,
                    key_columns2=table2.key_columns,
                    ignored_columns1=self.ignored_columns1,
                    ignored_columns2=self.ignored_columns2,
                    diff_tracker=self._diff_tracker,
                )
            )

            info_tree.info.set_diff(diff)
            info_tree.info.rowcounts = {1: len(rows1), 2: len(rows2)}

            self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2))
            logger.info(
                ". " * level
                + f"Diff found {len(diff)} different rows, {self.stats['rows_downloaded']} total rows downloaded."
            )
            return diff

        return super()._bisect_and_diff_segments(ti, table1, table2, info_tree, level, max_rows)

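# Editor's note: the class below intentionally declares
# `class HashDiffer(HashDiffer)`. This is legal Python: the base-class
# expression is evaluated before the new class object is bound to the name,
# so importers of HashDiffer from this module receive the in-memory-capable
# subclass defined here.
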
@attrs.define(frozen=False)
class HashDiffer(HashDiffer):
    """
    Enhanced HashDiffer with in-memory mode support.

    Additional Parameters:
        in_memory_diff (bool): If True, skip checksums and download segments directly for in-memory comparison.
            If False, use standard checksum-based bisection (default behavior).
        memory_segment_size (int): When in_memory_diff=True, target number of rows per segment before downloading.
    """

    in_memory_diff: bool = False
    memory_segment_size: int = 10000

    def __attrs_post_init__(self) -> None:
        super().__attrs_post_init__()

        if self.in_memory_diff:
            logger.info("=" * 70)
            logger.info("IN-MEMORY DIFF MODE ENABLED")
            logger.info("  - Checksum queries: DISABLED")
            logger.info(f"  - Segment size: {self.memory_segment_size} rows")
            logger.info(f"  - Threading: {'ENABLED' if self.threaded else 'DISABLED'}")
            logger.info(f"  - Egress limit: {self.egress_limit} rows")
            logger.info("=" * 70)

            # Adjust bisection threshold for in-memory mode
            if self.memory_segment_size > 0:
                self.bisection_threshold = self.memory_segment_size

    def _should_skip_checksum_and_download(self, max_rows: int) -> bool:
        """
        Determine if we should skip checksum and directly download segment data.

        Currently returns True only when the in_memory_diff flag is enabled;
        the threshold-based variant is kept below, commented out.
        """
        return self.in_memory_diff
        # if self.in_memory_diff:
        #     # In memory mode: download if segment is at or below target size
        #     return max_rows <= self.memory_segment_size
        # else:
        #     # Traditional mode: use bisection threshold
        #     return self.bisection_disabled or max_rows < self.bisection_threshold

    def _diff_segments(
        self,
        ti,
        table1: TableSegment,
        table2: TableSegment,
        info_tree: InfoTree,
        max_rows: int,
        level=0,
        segment_index=None,
        segment_count=None,
    ):
        """
        Enhanced segment diffing with in-memory mode support.
        """
        # Check recursion depth limit
        if level > 15:
            logger.warning(
                ". " * level + f"Maximum recursion level ({level}) reached; "
                f"downloading segment {table1.min_key}..{table1.max_key}"
            )
            return self._download_and_diff_locally(table1, table2, info_tree, level)

        # Initialize diff tracker
        self._initialize_diff_tracker(table1, table2)

        logger.info(
            ". " * level + f"Diffing segment {segment_index}/{segment_count}, "
            f"key-range: {table1.min_key}..{table2.max_key}, "
            f"size <= {max_rows}"
        )

        # Check all stop conditions
        if not self._check_continuation_conditions(table1, info_tree, level):
            return

        # IN-MEMORY MODE: Skip checksum if flag is set or segment is small enough
        if self._should_skip_checksum_and_download(max_rows):
            if self.in_memory_diff:
                logger.info(". " * level + f"[IN-MEMORY MODE] Downloading segment directly (size: {max_rows} rows)")

            return self._download_and_diff_locally(table1, table2, info_tree, level)

        # STANDARD MODE: Perform checksum-based comparison
        return self._checksum_and_bisect_if_needed(
            ti, table1, table2, info_tree, level, max_rows, segment_index, segment_count
        )

    def _check_continuation_conditions(self, table1: TableSegment, info_tree: InfoTree, level: int) -> bool:
        """Check if we should continue diffing (respects limits)."""

        # Check per-column diff limit
        elapsed = time.monotonic() - self.start_time
        if (
            len(self._diff_tracker.get_stopped_columns()) > 0
            and not self.stats.get("rows_downloaded", 0) >= self.egress_limit
            and not elapsed > self.timeout_limit * 60
        ):
            self.update_comparison_tracker(
                reason_type="per_column_diff_limit",
                segment=f"{table1.min_key}..{table1.max_key}",
            )

        if not self._diff_tracker.has_active_targets(len(table1.relevant_columns)):
            logger.info(
                "Diffing stopped because columns with potential differences "
                "have reached their configured diff limits."
            )
            info_tree.info.is_diff = False
            self.update_comparison_tracker(
                reason_type="per_column_diff_limit",
                segment=f"{table1.min_key}..{table1.max_key}",
            )
            return False

        # Check egress limit
        if self.stats.get("rows_downloaded", 0) >= self.egress_limit:
            info_tree.info.is_diff = False
            logger.info(
                ". " * level + f"Row download limit reached "
                f"{self.stats.get('rows_downloaded')}, stopping bisection for "
                f"segment {table1.min_key}..{table1.max_key}"
            )
            self.update_comparison_tracker(
                reason_type="egress_limit",
                segment=f"{table1.min_key}..{table1.max_key}",
            )
            return False

        # Check timeout
        elapsed = time.monotonic() - self.start_time
        if elapsed > self.timeout_limit * 60:
            info_tree.info.is_diff = False
            logger.info(
                ". " * level + f"Timeout limit reached ({self.timeout_limit} min); "
                f"stopping bisection for segment {table1.min_key}..{table1.max_key}"
            )
            self.update_comparison_tracker(
                reason_type="timeout",
                segment=f"{table1.min_key}..{table1.max_key}",
            )
            return False

        return True

    def _download_and_diff_locally(
        self,
        table1: TableSegment,
        table2: TableSegment,
        info_tree: InfoTree,
        level: int,
    ) -> List:
        """Download segment rows and perform in-memory diff."""
        start_time = time.monotonic()
        mode_label = "[IN-MEMORY]" if self.in_memory_diff else "[STANDARD]"
        logger.info(
            ". " * level + f"{mode_label} Downloading rows for comparison: {table1.min_key}..{table1.max_key}"
        )

        # Download rows from both tables
        rows1, rows2 = self._threaded_call("get_values", [table1, table2])

        # Update statistics
        downloaded = max(len(rows1), len(rows2))
        self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + downloaded

        logger.info(
            ". " * level + f"{mode_label} Downloaded {len(rows1)} and {len(rows2)} rows. "
            f"Total downloaded: {self.stats['rows_downloaded']}. "
            f"Time taken: {int((time.monotonic() - start_time) * 1000)} ms"
        )

        # Perform in-memory diff
        json_cols = {
            i: colname for i, colname in enumerate(table1.extra_columns) if isinstance(table1._schema[colname], JSON)
        }

        diff = list(
            diff_sets(
                rows1,
                rows2,
                json_cols=json_cols,
                columns1=table1.relevant_columns,
                columns2=table2.relevant_columns,
                key_columns1=table1.key_columns,
                key_columns2=table2.key_columns,
                ignored_columns1=self.ignored_columns1,
                ignored_columns2=self.ignored_columns2,
                diff_tracker=self._diff_tracker,
            )
        )

        # Update info tree
        info_tree.info.set_diff(diff)
        info_tree.info.rowcounts = {1: len(rows1), 2: len(rows2)}

        logger.info(". " * level + f"{mode_label} Found {len(diff)} different rows in this segment")

        return diff

    def _checksum_and_bisect_if_needed(
        self,
        ti,
        table1: TableSegment,
        table2: TableSegment,
        info_tree: InfoTree,
        level: int,
        max_rows: int,
        segment_index: Optional[int],
        segment_count: Optional[int],
    ):
        """Perform checksum comparison and bisect if differences found (standard mode)."""

        logger.info(". " * level + "[CHECKSUM MODE] Comparing segment checksums")

        # Get active columns for checksum (excluding stopped columns)
        active_cols1, active_cols2 = self._get_checksum_columns(table1, table2)

        # Create segments with updated columns for checksum
        checksum_table1 = self._create_segment_with_updated_columns(table1, active_cols1)
        checksum_table2 = self._create_segment_with_updated_columns(table2, active_cols2)

        # Perform checksum
        (count1, checksum1), (count2, checksum2) = self._threaded_call(
            "count_and_checksum", [checksum_table1, checksum_table2]
        )

        assert not info_tree.info.rowcounts
        info_tree.info.rowcounts = {1: count1, 2: count2}

        # Handle empty segments
        if count1 == 0 and count2 == 0:
            logger.debug(
                "Uneven distribution of keys detected in segment {}..{} "
                "(big gaps in the key column). For better performance, "
                "we recommend to increase the bisection-threshold.",
                table1.min_key,
                table1.max_key,
            )
            assert checksum1 is None and checksum2 is None
            info_tree.info.is_diff = False
            return

        # Compare checksums
        if checksum1 == checksum2:
            logger.info(". " * level + "[CHECKSUM MODE] Checksums match - no differences")
            info_tree.info.is_diff = False
            return

        logger.info(". " * level + "[CHECKSUM MODE] Checksums differ - bisecting segment")
        info_tree.info.is_diff = True

        # Bisect and continue
        return self._bisect_and_diff_segments(ti, table1, table2, info_tree, level=level, max_rows=max(count1, count2))

    def _bisect_and_diff_segments(
        self,
        ti,
        table1: TableSegment,
        table2: TableSegment,
        info_tree: InfoTree,
        level=0,
        max_rows=None,
    ):
        """
        Enhanced bisection with in-memory mode support.
        """
        # Check recursion limit
        if level > 15:
            logger.warning(
                ". " * level + f"Maximum recursion level ({level}) reached; "
                f"downloading segment {table1.min_key}..{table1.max_key}"
            )
            return self._download_and_diff_locally(table1, table2, info_tree, level)

        assert table1.is_bounded and table2.is_bounded

        # Initialize diff tracker
        self._initialize_diff_tracker(table1, table2)

        # Check continuation conditions
        if not self._check_continuation_conditions(table1, info_tree, level):
            return

        # Calculate max space size
        max_space_size = max(table1.approximate_size(self.t1_row_count), table2.approximate_size(self.t2_row_count))

        if max_rows is None:
            max_rows = max_space_size

        info_tree.info.max_rows = max_rows

        # Check if we should download directly
        should_download = (
            self.bisection_disabled or max_rows < self.bisection_threshold or max_space_size < self.bisection_factor * 2
        )

        # In-memory mode: also download if at target segment size
        if self.in_memory_diff and max_rows <= self.memory_segment_size:
            should_download = True

        if should_download:
            return self._download_and_diff_locally(table1, table2, info_tree, level)

        # Otherwise, continue with standard bisection
        return super()._bisect_and_diff_segments(ti, table1, table2, info_tree, level, max_rows)
```
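
For orientation, here is a minimal usage sketch of the differ defined in this file. It is an editor's illustration under stated assumptions, not code from the package: `segment1` and `segment2` stand for pre-built, bounded `TableSegment` objects, and `diff_tables` is assumed to be the entry point inherited from `TableDiffer` (defined in `data_diff/diff_tables.py`, also shipped in this wheel).

```python
# Hypothetical usage sketch (editor's illustration, not from the package).
# Assumes segment1/segment2 are bounded TableSegment objects built elsewhere,
# and that diff_tables() is the inherited TableDiffer entry point.
from data_diff.hashdiff_tables import HashDiffer

differ = HashDiffer(
    bisection_factor=32,        # split each mismatched segment 32 ways
    bisection_threshold=16384,  # below this row count, download and compare locally
    per_column_diff_limit=100,  # stop tracking a column after 100 differing values
    in_memory_diff=False,       # True would skip checksums and download directly
)

# diff_sets() yields ("-", row) for side-1 rows and ("+", row) for side-2 rows.
for op, row in differ.diff_tables(segment1, segment2):
    print(op, row)
```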