dcs-sdk 1.6.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_diff/__init__.py +221 -0
- data_diff/__main__.py +517 -0
- data_diff/abcs/__init__.py +13 -0
- data_diff/abcs/compiler.py +27 -0
- data_diff/abcs/database_types.py +402 -0
- data_diff/config.py +141 -0
- data_diff/databases/__init__.py +38 -0
- data_diff/databases/_connect.py +323 -0
- data_diff/databases/base.py +1417 -0
- data_diff/databases/bigquery.py +376 -0
- data_diff/databases/clickhouse.py +217 -0
- data_diff/databases/databricks.py +262 -0
- data_diff/databases/duckdb.py +207 -0
- data_diff/databases/mssql.py +343 -0
- data_diff/databases/mysql.py +189 -0
- data_diff/databases/oracle.py +238 -0
- data_diff/databases/postgresql.py +293 -0
- data_diff/databases/presto.py +222 -0
- data_diff/databases/redis.py +93 -0
- data_diff/databases/redshift.py +233 -0
- data_diff/databases/snowflake.py +222 -0
- data_diff/databases/sybase.py +720 -0
- data_diff/databases/trino.py +73 -0
- data_diff/databases/vertica.py +174 -0
- data_diff/diff_tables.py +489 -0
- data_diff/errors.py +17 -0
- data_diff/format.py +369 -0
- data_diff/hashdiff_tables.py +1026 -0
- data_diff/info_tree.py +76 -0
- data_diff/joindiff_tables.py +434 -0
- data_diff/lexicographic_space.py +253 -0
- data_diff/parse_time.py +88 -0
- data_diff/py.typed +0 -0
- data_diff/queries/__init__.py +13 -0
- data_diff/queries/api.py +213 -0
- data_diff/queries/ast_classes.py +811 -0
- data_diff/queries/base.py +38 -0
- data_diff/queries/extras.py +43 -0
- data_diff/query_utils.py +70 -0
- data_diff/schema.py +67 -0
- data_diff/table_segment.py +583 -0
- data_diff/thread_utils.py +112 -0
- data_diff/utils.py +1022 -0
- data_diff/version.py +15 -0
- dcs_core/__init__.py +13 -0
- dcs_core/__main__.py +17 -0
- dcs_core/__version__.py +15 -0
- dcs_core/cli/__init__.py +13 -0
- dcs_core/cli/cli.py +165 -0
- dcs_core/core/__init__.py +19 -0
- dcs_core/core/common/__init__.py +13 -0
- dcs_core/core/common/errors.py +50 -0
- dcs_core/core/common/models/__init__.py +13 -0
- dcs_core/core/common/models/configuration.py +284 -0
- dcs_core/core/common/models/dashboard.py +24 -0
- dcs_core/core/common/models/data_source_resource.py +75 -0
- dcs_core/core/common/models/metric.py +160 -0
- dcs_core/core/common/models/profile.py +75 -0
- dcs_core/core/common/models/validation.py +216 -0
- dcs_core/core/common/models/widget.py +44 -0
- dcs_core/core/configuration/__init__.py +13 -0
- dcs_core/core/configuration/config_loader.py +139 -0
- dcs_core/core/configuration/configuration_parser.py +262 -0
- dcs_core/core/configuration/configuration_parser_arc.py +328 -0
- dcs_core/core/datasource/__init__.py +13 -0
- dcs_core/core/datasource/base.py +62 -0
- dcs_core/core/datasource/manager.py +112 -0
- dcs_core/core/datasource/search_datasource.py +421 -0
- dcs_core/core/datasource/sql_datasource.py +1094 -0
- dcs_core/core/inspect.py +163 -0
- dcs_core/core/logger/__init__.py +13 -0
- dcs_core/core/logger/base.py +32 -0
- dcs_core/core/logger/default_logger.py +94 -0
- dcs_core/core/metric/__init__.py +13 -0
- dcs_core/core/metric/base.py +220 -0
- dcs_core/core/metric/combined_metric.py +98 -0
- dcs_core/core/metric/custom_metric.py +34 -0
- dcs_core/core/metric/manager.py +137 -0
- dcs_core/core/metric/numeric_metric.py +403 -0
- dcs_core/core/metric/reliability_metric.py +90 -0
- dcs_core/core/profiling/__init__.py +13 -0
- dcs_core/core/profiling/datasource_profiling.py +136 -0
- dcs_core/core/profiling/numeric_field_profiling.py +72 -0
- dcs_core/core/profiling/text_field_profiling.py +67 -0
- dcs_core/core/repository/__init__.py +13 -0
- dcs_core/core/repository/metric_repository.py +77 -0
- dcs_core/core/utils/__init__.py +13 -0
- dcs_core/core/utils/log.py +29 -0
- dcs_core/core/utils/tracking.py +105 -0
- dcs_core/core/utils/utils.py +44 -0
- dcs_core/core/validation/__init__.py +13 -0
- dcs_core/core/validation/base.py +230 -0
- dcs_core/core/validation/completeness_validation.py +153 -0
- dcs_core/core/validation/custom_query_validation.py +24 -0
- dcs_core/core/validation/manager.py +282 -0
- dcs_core/core/validation/numeric_validation.py +276 -0
- dcs_core/core/validation/reliability_validation.py +91 -0
- dcs_core/core/validation/uniqueness_validation.py +61 -0
- dcs_core/core/validation/validity_validation.py +738 -0
- dcs_core/integrations/__init__.py +13 -0
- dcs_core/integrations/databases/__init__.py +13 -0
- dcs_core/integrations/databases/bigquery.py +187 -0
- dcs_core/integrations/databases/databricks.py +51 -0
- dcs_core/integrations/databases/db2.py +652 -0
- dcs_core/integrations/databases/elasticsearch.py +61 -0
- dcs_core/integrations/databases/mssql.py +829 -0
- dcs_core/integrations/databases/mysql.py +409 -0
- dcs_core/integrations/databases/opensearch.py +64 -0
- dcs_core/integrations/databases/oracle.py +719 -0
- dcs_core/integrations/databases/postgres.py +482 -0
- dcs_core/integrations/databases/redshift.py +53 -0
- dcs_core/integrations/databases/snowflake.py +48 -0
- dcs_core/integrations/databases/spark_df.py +111 -0
- dcs_core/integrations/databases/sybase.py +1069 -0
- dcs_core/integrations/storage/__init__.py +13 -0
- dcs_core/integrations/storage/local_file.py +149 -0
- dcs_core/integrations/utils/__init__.py +13 -0
- dcs_core/integrations/utils/utils.py +36 -0
- dcs_core/report/__init__.py +13 -0
- dcs_core/report/dashboard.py +211 -0
- dcs_core/report/models.py +88 -0
- dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
- dcs_core/report/static/assets/images/docs.svg +6 -0
- dcs_core/report/static/assets/images/github.svg +4 -0
- dcs_core/report/static/assets/images/logo.svg +7 -0
- dcs_core/report/static/assets/images/slack.svg +13 -0
- dcs_core/report/static/index.js +2 -0
- dcs_core/report/static/index.js.LICENSE.txt +3971 -0
- dcs_sdk/__init__.py +13 -0
- dcs_sdk/__main__.py +18 -0
- dcs_sdk/__version__.py +15 -0
- dcs_sdk/cli/__init__.py +13 -0
- dcs_sdk/cli/cli.py +163 -0
- dcs_sdk/sdk/__init__.py +58 -0
- dcs_sdk/sdk/config/__init__.py +13 -0
- dcs_sdk/sdk/config/config_loader.py +491 -0
- dcs_sdk/sdk/data_diff/__init__.py +13 -0
- dcs_sdk/sdk/data_diff/data_differ.py +821 -0
- dcs_sdk/sdk/rules/__init__.py +15 -0
- dcs_sdk/sdk/rules/rules_mappping.py +31 -0
- dcs_sdk/sdk/rules/rules_repository.py +214 -0
- dcs_sdk/sdk/rules/schema_rules.py +65 -0
- dcs_sdk/sdk/utils/__init__.py +13 -0
- dcs_sdk/sdk/utils/serializer.py +25 -0
- dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
- dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
- dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
- dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
- dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
- dcs_sdk/sdk/utils/table.py +475 -0
- dcs_sdk/sdk/utils/themes.py +40 -0
- dcs_sdk/sdk/utils/utils.py +349 -0
- dcs_sdk-1.6.5.dist-info/METADATA +150 -0
- dcs_sdk-1.6.5.dist-info/RECORD +159 -0
- dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
- dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,475 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import multiprocessing
|
|
16
|
+
from collections import defaultdict
|
|
17
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
18
|
+
from typing import Any, Dict, List, Optional, Union
|
|
19
|
+
|
|
20
|
+
from rich.panel import Panel
|
|
21
|
+
from rich.table import Table
|
|
22
|
+
from rich.text import Text
|
|
23
|
+
|
|
24
|
+
from dcs_sdk.sdk.rules.rules_repository import RulesRepository
|
|
25
|
+
from dcs_sdk.sdk.utils.utils import apply_custom_masking, apply_masking
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def create_legend():
    """Build the "Info" panel explaining the color coding used in diff output.

    Returns:
        A rich ``Panel`` wrapping a borderless two-column table that maps each
        color (Red/Cyan/Yellow) to its meaning (Mismatch/Match/Duplicate).
    """
    color_meanings = [
        ("Red", "Mismatch", "red"),
        ("Cyan", "Match", "cyan"),
        ("Yellow", "Duplicate", "yellow"),
    ]
    legend = Table(show_header=False, box=None)
    legend.add_column(style="bold")
    legend.add_column()
    for label, meaning, row_style in color_meanings:
        legend.add_row(label, meaning, style=row_style)
    return Panel(legend, title="Info", border_style="cyan bold", width=80)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def create_schema_table(response, console, is_source=True):
    """Print the schema table for one side (source or target) of the comparison.

    For each column of the chosen dataset, the mapped counterpart column on the
    other side is looked up via ``response["columns_mappings"]`` and the schema
    rules are applied; mismatched or unmapped columns are rendered in red with a
    reason. As a side effect, each column dict gains a ``mismatch_reason`` key
    ("" when the column matched).

    Args:
        response: Comparison result dict holding both datasets and the mappings.
        console: rich console used for output.
        is_source: True to render the source dataset's schema, False for target.
    """
    side_key = "source_dataset" if is_source else "target_dataset"
    other_key = "target_dataset" if is_source else "source_dataset"
    dataset = response[side_key]
    columns = dataset["columns"]
    other_columns = response[other_key]["columns"]
    mappings = response["columns_mappings"]
    own_field = "source_column" if is_source else "target_column"
    counterpart_field = "target_column" if is_source else "source_column"
    exclusive_reason = "Exclusive to source" if is_source else "Exclusive to target"
    rules_repo = RulesRepository.get_instance()

    title = f"Schema: {dataset['database']}.{dataset['schema']}.{dataset['table_name']}"
    table = Table(title=title, show_header=True, header_style="bold magenta")
    table.add_column("#")
    table.add_column("Column Name", style="cyan")
    table.add_column("Data Type", style="magenta")
    table.add_column("Reason", style="red")

    for position, col in enumerate(columns, start=1):
        col_name = col["column_name"]
        data_type = col["data_type"]
        max_len = col.get("character_maximum_length", None)

        # First mapping entry whose own-side column matches gives the
        # counterpart name; fall back to the same name when unmapped.
        counterpart_name = None
        for mapping in mappings:
            if mapping[own_field] == col_name:
                counterpart_name = mapping[counterpart_field]
                break

        lookup_name = counterpart_name or col_name
        counterpart_col = None
        for candidate in other_columns:
            if candidate["column_name"] == lookup_name:
                counterpart_col = candidate
                break

        mismatch_reason = ""
        if counterpart_col:
            matched, reason = rules_repo.apply_schema_rules(
                src_col=col,
                tgt_col=counterpart_col,
            )
            if not matched:
                # Rule engine may return no reason; fall back to generic text.
                mismatch_reason = reason or exclusive_reason
        else:
            mismatch_reason = exclusive_reason

        rendered_type = f"{data_type} {('('+ str(max_len) + ')') if max_len is not None else ''}"
        if mismatch_reason:
            table.add_row(
                str(position),
                Text(col_name, style="red"),
                Text(rendered_type, style="red"),
                mismatch_reason,
            )
        else:
            table.add_row(str(position), col_name, rendered_type, Text("-", style="green", justify="left"))
        col["mismatch_reason"] = mismatch_reason

    console.print(table)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def create_table_schema_row_count(response, row_diff_table, console):
    """Print the full comparison header: legend, row counts, both schemas.

    Output order: color legend panel, a "Row Counts" table (source column cyan,
    target column yellow), the source schema table, the target schema table,
    and finally ``row_diff_table`` if one was supplied.

    Args:
        response: Comparison result dict with ``source_dataset``/``target_dataset``.
        row_diff_table: Optional pre-built rich table of differing rows.
        console: rich console used for output.
    """
    src = response["source_dataset"]
    tgt = response["target_dataset"]

    console.print(create_legend())

    counts = Table(title="Row Counts", show_header=True, header_style="bold magenta")
    counts.add_column("")
    counts.add_column(
        f"{src['database']}.{src['schema']}.{src['table_name']}",
        style="cyan",
    )
    counts.add_column(
        f"{tgt['database']}.{tgt['schema']}.{tgt['table_name']}",
        style="yellow",
    )
    counts.add_row(
        "Row Count",
        str(src["row_count"]),
        str(tgt["row_count"]),
    )
    console.print(counts)

    # Source schema first, then target, matching the row-count column order.
    for side_is_source in (True, False):
        create_schema_table(response, console, is_source=side_is_source)

    if row_diff_table is not None:
        console.print(row_diff_table)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def process_batch(
    batch: List[Dict[str, Any]],
    provider_class,
    primary_keys,
    fields,
    similarity,
    src_masking_cols: List[str],
    tgt_masking_cols: List[str],
    masking_character: str,
) -> List[Dict[str, Any]]:
    """Score consecutive (source, target) record pairs in ``batch`` in place.

    Instantiates ``provider_class`` and feeds it the batch two records at a
    time via ``add_text_similarity``. A trailing unpaired record (odd batch
    length) is left untouched. Returns ``batch`` (possibly mutated by the
    provider) unchanged when no provider is given or fewer than two records
    exist.

    Args:
        batch: Records to score, ordered as alternating source/target pairs.
        provider_class: Similarity provider class, or a falsy value to skip.
        primary_keys: Key column name(s) forwarded to the provider.
        fields: Field names to compare, forwarded to the provider.
        similarity: Similarity configuration, forwarded to the provider.
        src_masking_cols: Source-side columns the provider should mask.
        tgt_masking_cols: Target-side columns the provider should mask.
        masking_character: Character used for masking.
    """
    if not provider_class or len(batch) < 2:
        return batch

    provider = provider_class()
    # Step through the batch pairwise; range stops before a dangling last item.
    for start in range(0, len(batch) - 1, 2):
        provider.add_text_similarity(
            data=batch[start : start + 2],
            key=primary_keys,
            fields=fields,
            similarity=similarity,
            source_masking_cols=src_masking_cols,
            target_masking_cols=tgt_masking_cols,
            mask_char=masking_character,
        )
    return batch
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def differ_rows(
    diff_iter,
    response,
    src_masking_cols: List[str],
    tgt_masking_cols: List[str],
    masking_character: str,
    limit: int | None = None,
    table_limit: int = 100,
    display_table: bool = False,
    similarity=None,
    similarity_providers=None,
    fields=None,
    batch_size: int = 2_000,
    max_workers: int = max(1, multiprocessing.cpu_count() - 2),
    quick_comparison: bool = False,
):
    """Consume a row-diff iterator and build the full comparison result dict.

    Classifies each diffed row as exclusive-to-source, exclusive-to-target,
    duplicate, or value-different (based on ID sets from
    ``diff_iter.get_stats_dict()``), applies masking, optionally computes
    text-similarity scores over the differing pairs in thread-pooled batches,
    and returns a dict of stats plus the collected (masked) record lists.

    Args:
        diff_iter: Diff iterator yielding ``(sign, row_values)`` pairs where
            sign "-" means source and "+" means target; must also expose
            ``get_stats_dict()``. (Project type — shape inferred from usage.)
        response: Comparison metadata (datasets, primary keys, column mappings).
        src_masking_cols: Source columns to mask in collected records.
        tgt_masking_cols: Target columns to mask in collected records.
        masking_character: Character used for masking.
        limit: Max records collected per category; None collects everything.
        table_limit: Max raw rows kept for the optional display table.
        display_table: When True, also build a rich "Diff Rows" table.
        similarity: Similarity configuration (enables the scoring pass).
        similarity_providers: Mapping of method name -> provider class.
        fields: Field names used for similarity scoring.
        batch_size: Records per similarity batch (rounded down to even).
        max_workers: Thread pool size for similarity scoring.
        quick_comparison: When True, only test whether ANY difference exists
            and return a minimal stats dict immediately.
    """
    if quick_comparison:
        # Fast path: pulling one item from the iterator is enough to know
        # whether any difference exists; counts are left at 0 intentionally.
        try:
            next(iter(diff_iter))
            return {
                "stats": {
                    "rows_A": 0,
                    "rows_B": 0,
                    "exclusive_A": 0,
                    "exclusive_B": 0,
                    "diff_pk_percent": 0,
                    "unchanged": 0,
                    "total_diff_count": 0,
                    "diff_rows_count": 0,
                    "total_duplicate_count_source": 0,
                    "total_duplicate_count_target": 0,
                    "diff_rows_percent": 0,
                    "has_differences": True,
                },
                "exclusive_pk_values_target": [],
                "exclusive_pk_values_source": [],
                "duplicate_pk_values_source": [],
                "duplicate_pk_values_target": [],
                "records_with_differences": [],
                "table": None,
            }
        except StopIteration:
            # Iterator was empty: the two tables are identical.
            return {
                "stats": {"has_differences": False},
                "exclusive_pk_values_target": [],
                "exclusive_pk_values_source": [],
                "duplicate_pk_values_source": [],
                "duplicate_pk_values_target": [],
                "records_with_differences": [],
                "table": None,
            }
    # PK-tuple ID sets used to classify each streamed row below.
    stats = diff_iter.get_stats_dict()
    exclusive_source_set = set(stats["exclusive_source_ids"])
    exclusive_target_set = set(stats["exclusive_target_ids"])
    diff_values_set = set(stats["diff_values_ids"])
    source_duplicates = set(stats["duplicate_source_ids"])
    target_duplicates = set(stats["duplicate_target_ids"])
    pk_key_cols = response["source_dataset"]["primary_keys"]

    exclusive_to_source = []
    exclusive_to_target = []
    duplicates_in_source = []
    duplicates_in_target = []

    # Guards so an exclusive PK is collected at most once per side.
    seen_ex_source = set()
    seen_ex_target = set()

    # NOTE(review): with a limit, which diff PKs get collected depends on set
    # iteration order — effectively arbitrary but deterministic per run.
    diff_pks_to_collect = set(diff_values_set) if limit is None else set(list(diff_values_set)[:limit])
    diff_records_dict = {}

    total_source_duplicates = 0
    total_target_duplicates = 0
    table_data = []
    table = None

    for diff in diff_iter:
        sign, rows = diff
        # "-" rows come from the source table, "+" rows from the target.
        obj = {"meta": {"origin": "source" if sign == "-" else "target", "sign": sign}}
        column_values = {}

        # Positional row values are keyed by the source-side column names.
        for idx, col_ in enumerate(rows):
            column_name = response["columns_mappings"][idx]["source_column"]
            obj[column_name] = col_
            column_values[column_name] = col_

        if len(table_data) < table_limit:
            table_data.append(obj)
        pk_value = tuple(column_values[col] for col in pk_key_cols)

        if sign == "-" and pk_value in exclusive_source_set:
            if pk_value not in seen_ex_source and (limit is None or len(exclusive_to_source) < limit):
                masked_obj = apply_masking(obj, src_masking_cols, masking_character)
                exclusive_to_source.append(masked_obj)
                seen_ex_source.add(pk_value)

        if sign == "+" and pk_value in exclusive_target_set:
            if pk_value not in seen_ex_target and (limit is None or len(exclusive_to_target) < limit):
                masked_obj = apply_masking(obj, tgt_masking_cols, masking_character)
                exclusive_to_target.append(masked_obj)
                seen_ex_target.add(pk_value)

        # Duplicate totals count every occurrence; collection is limit-capped.
        if sign == "-" and pk_value in source_duplicates:
            total_source_duplicates += 1
            if limit is None or len(duplicates_in_source) < limit:
                masked_obj = apply_masking(obj, src_masking_cols, masking_character)
                duplicates_in_source.append(masked_obj)

        if sign == "+" and pk_value in target_duplicates:
            total_target_duplicates += 1
            if limit is None or len(duplicates_in_target) < limit:
                masked_obj = apply_masking(obj, tgt_masking_cols, masking_character)
                duplicates_in_target.append(masked_obj)

        # Keep at most the (source, target) pair for each value-different PK.
        if pk_value in diff_pks_to_collect:
            if pk_value not in diff_records_dict:
                diff_records_dict[pk_value] = []
            if limit is None or len(diff_records_dict[pk_value]) < 2:
                diff_records_dict[pk_value].append(obj.copy())

    def sort_by_pk(obj):
        # Numeric-looking PKs sort numerically (group 0) before everything
        # else, which falls back to string ordering (group 1).
        sort_key = []
        for col in pk_key_cols:
            pk_value = obj[col]
            try:
                sort_key.append((0, int(pk_value)))
            except (ValueError, TypeError):
                sort_key.append((1, str(pk_value)))
        return tuple(sort_key)

    try:
        exclusive_to_source.sort(key=sort_by_pk)
        exclusive_to_target.sort(key=sort_by_pk)
        duplicates_in_source.sort(key=sort_by_pk)
        duplicates_in_target.sort(key=sort_by_pk)
    except:
        # NOTE(review): bare except silently leaves lists unsorted on any
        # failure (e.g. missing PK key) — deliberate best-effort ordering.
        pass

    records_with_differences = []
    masked_records = []

    for pk_value, records in diff_records_dict.items():
        # Sign sort ("-" after "+" reversed => source first, then target).
        records.sort(key=lambda x: x["meta"]["sign"], reverse=True)
        if not similarity:
            # No similarity pass: apply pairwise masking directly here.
            if len(records) == 2:
                source = records[0]
                target = records[1]

                masked_src, masked_tgt = apply_custom_masking(
                    source=source,
                    target=target,
                    source_masking_cols=src_masking_cols,
                    target_masking_cols=tgt_masking_cols,
                    mask_char=masking_character,
                )

                masked_record = [masked_src, masked_tgt]
                masked_records.extend(masked_record)
            else:
                masked_records.extend(records)

    records_with_differences.extend(masked_records)

    provider_class = None
    primary_keys = response["source_dataset"]["primary_keys"]
    if similarity and similarity_providers and fields and primary_keys:
        provider_class = similarity_providers.get(similarity.similarity_method.lower())
        if not provider_class:
            print(f"Unknown similarity method: {similarity.similarity_method}")

    if provider_class:
        # Similarity pass: score (source, target) pairs in even-sized batches
        # across a thread pool; provider masks records as a side effect.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            effective_batch_size = min(batch_size, len(records_with_differences))
            # Keep batches even so source/target pairs are never split.
            effective_batch_size = effective_batch_size if effective_batch_size % 2 == 0 else effective_batch_size - 1
            if effective_batch_size < 2:
                effective_batch_size = 2
            batches = [
                records_with_differences[i : i + effective_batch_size]
                for i in range(0, len(records_with_differences), effective_batch_size)
            ]
            futures = [
                executor.submit(
                    process_batch,
                    batch,
                    provider_class,
                    primary_keys,
                    fields,
                    similarity,
                    src_masking_cols,
                    tgt_masking_cols,
                    masking_character,
                )
                for batch in batches
            ]
            # NOTE(review): results are written back in completion order, not
            # submission order; the sort below restores PK ordering anyway.
            i = 0
            for future in as_completed(futures):
                try:
                    result = future.result()
                    records_with_differences[i : i + len(result)] = result
                    i += len(result)
                except Exception as e:
                    print(f"Error in batch processing: {str(e)}")

    try:
        records_with_differences.sort(key=sort_by_pk)
    except:
        # NOTE(review): best-effort sort, same rationale as above.
        pass

    # Finalize stats: replace the raw ID sets with counts and percentages.
    stats["total_diff_count"] = len(diff_values_set) + len(exclusive_source_set) + len(exclusive_target_set)
    stats["diff_rows_count"] = len(diff_values_set)
    stats.pop("exclusive_source_ids", None)
    stats.pop("exclusive_target_ids", None)
    stats.pop("diff_values_ids", None)
    stats.pop("duplicate_source_ids", None)
    stats.pop("duplicate_target_ids", None)
    stats["total_duplicate_count_source"] = total_source_duplicates
    stats["total_duplicate_count_target"] = total_target_duplicates
    try:
        # Share of value-different rows among all compared (diff + unchanged).
        diff_rows_percent = stats.get("diff_rows_count", 0) / (
            stats.get("diff_rows_count", 0) + stats.get("unchanged", 0)
        )
        diff_rows_percent = abs(diff_rows_percent)
    except ZeroDivisionError:
        diff_rows_percent = 0.0
    # Clamp to 1 in case the upstream percentage overshoots.
    stats["diff_pk_percent"] = min(stats.get("diff_pk_percent", 0), 1)
    stats["diff_rows_percent"] = diff_rows_percent
    has_differences = diff_rows_percent != 0 or stats["diff_pk_percent"] != 0
    stats["has_differences"] = has_differences
    stats["source_masked_columns"] = src_masking_cols
    stats["target_masked_columns"] = tgt_masking_cols

    if display_table:
        table = create_table_diff_rows(table_data, primary_keys, response["columns_mappings"], 100)

    return {
        "exclusive_pk_values_target": exclusive_to_target,
        "exclusive_pk_values_source": exclusive_to_source,
        "duplicate_pk_values_source": duplicates_in_source,
        "duplicate_pk_values_target": duplicates_in_target,
        "records_with_differences": records_with_differences,
        "stats": stats,
        "table": table,
    }
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
def create_table_diff_rows(data, primary_keys: Union[str, list[str]], columns_mappings, limit: int = 100):
    """Build the rich "Diff Rows" table from collected diff records.

    Rows are grouped by composite primary key: all rows sharing a key get the
    same serial number and sections separate consecutive keys. Coloring:
    yellow bold for duplicate keys, red bold for mismatched cell values
    (computed on the target row of a clean 1:1 pair), green/cyan tint for the
    Origin column.

    Args:
        data: Row dicts, each with a ``meta`` dict (``origin``, ``sign``) plus
            one entry per source column name.
        primary_keys: Primary key column name or list of names.
        columns_mappings: List of ``{"source_column", "target_column"}`` dicts.
        limit: Max number of distinct primary keys to render.

    Returns:
        The populated rich ``Table``.
    """
    table = Table(title="Diff Rows", show_header=True, header_style="bold magenta")
    column_mapping_dict = {mapping["source_column"]: mapping["target_column"] for mapping in columns_mappings}

    table.add_column("#")
    table.add_column("Origin")
    # One column per mapped pair; renamed columns show "target/source".
    for mapping in columns_mappings:
        source_col = mapping["source_column"]
        target_col = mapping["target_column"]
        if source_col == target_col:
            table.add_column(source_col, style="cyan")
        else:
            table.add_column(f"{target_col}/{source_col}", style="cyan")

    if isinstance(primary_keys, str):
        primary_keys = [primary_keys]

    def get_composite_key(row):
        # Tuple of this row's PK values, used as the grouping key.
        return tuple(row[key] for key in primary_keys)

    # First pass: bucket rows by composite key and origin (source/target).
    records = defaultdict(lambda: defaultdict(list))
    for row in data:
        composite_key = get_composite_key(row)
        origin = row["meta"]["origin"]
        records[composite_key][origin].append(row)

    previous_composite_key = None
    serial_number = 0
    unique_keys_processed = set()
    # Second pass: render every row, numbering per unique key.
    for row in data:
        composite_key = get_composite_key(row)
        if composite_key not in unique_keys_processed:
            # Stop before starting a new key once the key limit is reached.
            if len(unique_keys_processed) >= limit:
                break
            serial_number += 1
        meta_values = row["meta"]
        row_values = {key: row[key] for key in column_mapping_dict.keys()}
        origin = meta_values["origin"]
        # Any side having >1 row for this key marks the whole key duplicate.
        has_duplicates = any(len(records[composite_key][orig]) > 1 for orig in records[composite_key])

        mismatched_columns = set()
        if not has_duplicates:
            source_count = len(records[composite_key]["source"])
            target_count = len(records[composite_key]["target"])
            if source_count == 1 and target_count == 1:
                source_row = records[composite_key]["source"][0]
                target_row = records[composite_key]["target"][0]
                # Highlight value differences on the target row only.
                if origin == "target":
                    for col in column_mapping_dict.keys():
                        if source_row[col] != target_row[col]:
                            mismatched_columns.add(col)

        formatted_cells = [Text(str(serial_number))]
        # Render meta fields except "sign" (i.e. the origin), tinted by side.
        for col in meta_values:
            if col != "sign":
                formatted_cells.append(
                    Text(
                        str(meta_values[col]),
                        style=f"{'chartreuse2' if meta_values['origin'] == 'source' else 'cyan3'}",
                    )
                )

        for col in column_mapping_dict.keys():
            cell_value = row_values[col]
            if has_duplicates:
                formatted_cells.append(Text(str(cell_value), style="yellow bold"))
            elif col in mismatched_columns:
                formatted_cells.append(Text(str(cell_value), style="red bold"))
            else:
                formatted_cells.append(Text(str(cell_value)))

        # Visual separator whenever the composite key changes.
        if previous_composite_key is not None and previous_composite_key != composite_key:
            table.add_section()

        table.add_row(*formatted_cells)
        previous_composite_key = composite_key
        unique_keys_processed.add(composite_key)
    return table
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from rich.terminal_theme import TerminalTheme
|
|
16
|
+
|
|
17
|
+
# Custom dark terminal theme: near-black background, white foreground, the 8
# fully saturated ANSI colors as the normal palette, and a muted mid-tone set
# as the bright palette.
# NOTE(review): presumably used when exporting rich console output (HTML/SVG)
# — confirm against callers.
theme_1 = TerminalTheme(
    background=(20, 20, 20),  # RGB triplet for near-black grey
    foreground=(255, 255, 255),  # RGB triplet for white
    normal=[
        (0, 0, 0),  # black
        (255, 0, 0),  # red
        (0, 255, 0),  # green
        (255, 255, 0),  # yellow
        (0, 0, 255),  # blue
        (255, 0, 255),  # magenta
        (0, 255, 255),  # cyan
        (255, 255, 255),  # white
    ],
    bright=[
        (64, 64, 64),  # medium grey
        (128, 64, 64),  # medium red
        (64, 128, 64),  # medium green
        (128, 128, 64),  # medium yellow
        (64, 64, 128),  # medium blue
        (128, 64, 128),  # medium magenta
        (64, 128, 128),  # medium cyan
        (128, 128, 128),  # medium white (grey)
    ],
)
|