dcs_sdk-1.6.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_diff/__init__.py +221 -0
- data_diff/__main__.py +517 -0
- data_diff/abcs/__init__.py +13 -0
- data_diff/abcs/compiler.py +27 -0
- data_diff/abcs/database_types.py +402 -0
- data_diff/config.py +141 -0
- data_diff/databases/__init__.py +38 -0
- data_diff/databases/_connect.py +323 -0
- data_diff/databases/base.py +1417 -0
- data_diff/databases/bigquery.py +376 -0
- data_diff/databases/clickhouse.py +217 -0
- data_diff/databases/databricks.py +262 -0
- data_diff/databases/duckdb.py +207 -0
- data_diff/databases/mssql.py +343 -0
- data_diff/databases/mysql.py +189 -0
- data_diff/databases/oracle.py +238 -0
- data_diff/databases/postgresql.py +293 -0
- data_diff/databases/presto.py +222 -0
- data_diff/databases/redis.py +93 -0
- data_diff/databases/redshift.py +233 -0
- data_diff/databases/snowflake.py +222 -0
- data_diff/databases/sybase.py +720 -0
- data_diff/databases/trino.py +73 -0
- data_diff/databases/vertica.py +174 -0
- data_diff/diff_tables.py +489 -0
- data_diff/errors.py +17 -0
- data_diff/format.py +369 -0
- data_diff/hashdiff_tables.py +1026 -0
- data_diff/info_tree.py +76 -0
- data_diff/joindiff_tables.py +434 -0
- data_diff/lexicographic_space.py +253 -0
- data_diff/parse_time.py +88 -0
- data_diff/py.typed +0 -0
- data_diff/queries/__init__.py +13 -0
- data_diff/queries/api.py +213 -0
- data_diff/queries/ast_classes.py +811 -0
- data_diff/queries/base.py +38 -0
- data_diff/queries/extras.py +43 -0
- data_diff/query_utils.py +70 -0
- data_diff/schema.py +67 -0
- data_diff/table_segment.py +583 -0
- data_diff/thread_utils.py +112 -0
- data_diff/utils.py +1022 -0
- data_diff/version.py +15 -0
- dcs_core/__init__.py +13 -0
- dcs_core/__main__.py +17 -0
- dcs_core/__version__.py +15 -0
- dcs_core/cli/__init__.py +13 -0
- dcs_core/cli/cli.py +165 -0
- dcs_core/core/__init__.py +19 -0
- dcs_core/core/common/__init__.py +13 -0
- dcs_core/core/common/errors.py +50 -0
- dcs_core/core/common/models/__init__.py +13 -0
- dcs_core/core/common/models/configuration.py +284 -0
- dcs_core/core/common/models/dashboard.py +24 -0
- dcs_core/core/common/models/data_source_resource.py +75 -0
- dcs_core/core/common/models/metric.py +160 -0
- dcs_core/core/common/models/profile.py +75 -0
- dcs_core/core/common/models/validation.py +216 -0
- dcs_core/core/common/models/widget.py +44 -0
- dcs_core/core/configuration/__init__.py +13 -0
- dcs_core/core/configuration/config_loader.py +139 -0
- dcs_core/core/configuration/configuration_parser.py +262 -0
- dcs_core/core/configuration/configuration_parser_arc.py +328 -0
- dcs_core/core/datasource/__init__.py +13 -0
- dcs_core/core/datasource/base.py +62 -0
- dcs_core/core/datasource/manager.py +112 -0
- dcs_core/core/datasource/search_datasource.py +421 -0
- dcs_core/core/datasource/sql_datasource.py +1094 -0
- dcs_core/core/inspect.py +163 -0
- dcs_core/core/logger/__init__.py +13 -0
- dcs_core/core/logger/base.py +32 -0
- dcs_core/core/logger/default_logger.py +94 -0
- dcs_core/core/metric/__init__.py +13 -0
- dcs_core/core/metric/base.py +220 -0
- dcs_core/core/metric/combined_metric.py +98 -0
- dcs_core/core/metric/custom_metric.py +34 -0
- dcs_core/core/metric/manager.py +137 -0
- dcs_core/core/metric/numeric_metric.py +403 -0
- dcs_core/core/metric/reliability_metric.py +90 -0
- dcs_core/core/profiling/__init__.py +13 -0
- dcs_core/core/profiling/datasource_profiling.py +136 -0
- dcs_core/core/profiling/numeric_field_profiling.py +72 -0
- dcs_core/core/profiling/text_field_profiling.py +67 -0
- dcs_core/core/repository/__init__.py +13 -0
- dcs_core/core/repository/metric_repository.py +77 -0
- dcs_core/core/utils/__init__.py +13 -0
- dcs_core/core/utils/log.py +29 -0
- dcs_core/core/utils/tracking.py +105 -0
- dcs_core/core/utils/utils.py +44 -0
- dcs_core/core/validation/__init__.py +13 -0
- dcs_core/core/validation/base.py +230 -0
- dcs_core/core/validation/completeness_validation.py +153 -0
- dcs_core/core/validation/custom_query_validation.py +24 -0
- dcs_core/core/validation/manager.py +282 -0
- dcs_core/core/validation/numeric_validation.py +276 -0
- dcs_core/core/validation/reliability_validation.py +91 -0
- dcs_core/core/validation/uniqueness_validation.py +61 -0
- dcs_core/core/validation/validity_validation.py +738 -0
- dcs_core/integrations/__init__.py +13 -0
- dcs_core/integrations/databases/__init__.py +13 -0
- dcs_core/integrations/databases/bigquery.py +187 -0
- dcs_core/integrations/databases/databricks.py +51 -0
- dcs_core/integrations/databases/db2.py +652 -0
- dcs_core/integrations/databases/elasticsearch.py +61 -0
- dcs_core/integrations/databases/mssql.py +829 -0
- dcs_core/integrations/databases/mysql.py +409 -0
- dcs_core/integrations/databases/opensearch.py +64 -0
- dcs_core/integrations/databases/oracle.py +719 -0
- dcs_core/integrations/databases/postgres.py +482 -0
- dcs_core/integrations/databases/redshift.py +53 -0
- dcs_core/integrations/databases/snowflake.py +48 -0
- dcs_core/integrations/databases/spark_df.py +111 -0
- dcs_core/integrations/databases/sybase.py +1069 -0
- dcs_core/integrations/storage/__init__.py +13 -0
- dcs_core/integrations/storage/local_file.py +149 -0
- dcs_core/integrations/utils/__init__.py +13 -0
- dcs_core/integrations/utils/utils.py +36 -0
- dcs_core/report/__init__.py +13 -0
- dcs_core/report/dashboard.py +211 -0
- dcs_core/report/models.py +88 -0
- dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
- dcs_core/report/static/assets/images/docs.svg +6 -0
- dcs_core/report/static/assets/images/github.svg +4 -0
- dcs_core/report/static/assets/images/logo.svg +7 -0
- dcs_core/report/static/assets/images/slack.svg +13 -0
- dcs_core/report/static/index.js +2 -0
- dcs_core/report/static/index.js.LICENSE.txt +3971 -0
- dcs_sdk/__init__.py +13 -0
- dcs_sdk/__main__.py +18 -0
- dcs_sdk/__version__.py +15 -0
- dcs_sdk/cli/__init__.py +13 -0
- dcs_sdk/cli/cli.py +163 -0
- dcs_sdk/sdk/__init__.py +58 -0
- dcs_sdk/sdk/config/__init__.py +13 -0
- dcs_sdk/sdk/config/config_loader.py +491 -0
- dcs_sdk/sdk/data_diff/__init__.py +13 -0
- dcs_sdk/sdk/data_diff/data_differ.py +821 -0
- dcs_sdk/sdk/rules/__init__.py +15 -0
- dcs_sdk/sdk/rules/rules_mappping.py +31 -0
- dcs_sdk/sdk/rules/rules_repository.py +214 -0
- dcs_sdk/sdk/rules/schema_rules.py +65 -0
- dcs_sdk/sdk/utils/__init__.py +13 -0
- dcs_sdk/sdk/utils/serializer.py +25 -0
- dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
- dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
- dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
- dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
- dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
- dcs_sdk/sdk/utils/table.py +475 -0
- dcs_sdk/sdk/utils/themes.py +40 -0
- dcs_sdk/sdk/utils/utils.py +349 -0
- dcs_sdk-1.6.5.dist-info/METADATA +150 -0
- dcs_sdk-1.6.5.dist-info/RECORD +159 -0
- dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
- dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,821 @@
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import glob
import os
import time
from collections import defaultdict
from contextlib import suppress
from datetime import datetime, timezone
from typing import Dict, Optional

from loguru import logger
from rich.console import Console

from data_diff import TableSegment, connect, connect_to_table, diff_tables
from data_diff.databases import Database
from data_diff.databases.redis import RedisBackend
from dcs_sdk.sdk.config.config_loader import Comparison, SourceTargetConnection
from dcs_sdk.sdk.rules.rules_repository import RulesRepository
from dcs_sdk.sdk.utils.serializer import serialize_table_schema
from dcs_sdk.sdk.utils.table import create_table_schema_row_count, differ_rows
from dcs_sdk.sdk.utils.themes import theme_1
from dcs_sdk.sdk.utils.utils import (
    calculate_column_differences,
    convert_to_masked_if_required,
    duck_db_load_csv_to_table,
    find_identical_columns,
    generate_table_name,
    obfuscate_sensitive_data,
)

DYNAMIC_BISECTION_THRESHOLD_MAX_LIMIT = 500_000
DEFAULT_BISECTION_THRESHOLD = 50_000
ROW_COUNT_PER_SEGMENT = 100_000
MAX_EGRESS_LIMIT = 500_000
MIN_EGRESS_LIMIT = 50_000
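# How these limits are applied (see the _get_automatic_* helpers below): when
# the config leaves bisection_threshold, bisection_factor, or egress_limit at
# -1, diff_tables() derives them from the larger table's row count: one tenth
# of it for the threshold and the egress limit (capped at 500,000), and one
# segment per 100,000 rows for the factor, then clamps them to floors of
# 1,000, 10, and 50,000 respectively.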


class DBTableDiffer:
    def __init__(self, config: Comparison):
        self.config = config
        self.console = Console(record=True)
        self.created_at = datetime.now(tz=timezone.utc)
        self.start_time = time.monotonic()
        self.algorithm = "hashdiff"
        self.table1 = None
        self.table2 = None
        self.diff_iter = None
        self.response = {}
        self.source_file_path = self.config.source.filepath
        self.target_file_path = self.config.target.filepath
        self.limit = config.limit
        self.default_limit = 1000
        self.table_limit = 100
        self.source_db: Database = None
        self.target_db: Database = None
        self.similarity = self.config.similarity
        self.similarity_providers = None
        if self.similarity:
            from dcs_sdk.sdk.utils.similarity_score.base_provider import (
                ensure_nltk_data,
            )
            from dcs_sdk.sdk.utils.similarity_score.cosine_similarity_provider import (
                CosineSimilarityProvider,
            )
            from dcs_sdk.sdk.utils.similarity_score.jaccard_provider import (
                JaccardSimilarityProvider,
            )
            from dcs_sdk.sdk.utils.similarity_score.levenshtein_distance_provider import (
                LevenshteinDistanceProvider,
            )

            ensure_nltk_data()

            self.similarity_providers = {
                "jaccard": JaccardSimilarityProvider,
                "levenshtein": LevenshteinDistanceProvider,
                "cosine": CosineSimilarityProvider,
            }

    def create_dataset_dict(
        self,
        config: SourceTargetConnection,
        table: TableSegment,
        db_name: str,
        file_path: str,
        database_type: str,
    ) -> Dict:
        schema_list = [serialize_table_schema(v) for v in table.get_schema().values()]
        schema_list.sort(key=lambda x: x["column_name"].upper())

        return {
            "id": config.id,
            "name": config.name,
            "workspace": config.workspace,
            "database_type": database_type,
            "table_name": table.table_path[0],
            "schema": table.database.default_schema,
            "database": db_name,
            "primary_keys": list(table.key_columns),
            "file_path": file_path,
            "files": [] if file_path is None else [generate_table_name(csv, False) for csv in glob.glob(file_path)],
            "row_count": table.count(),
            "columns": schema_list,
            "exclusive_pk_cnt": 0,
            "duplicate_pk_cnt": 0,
            "null_pk_cnt": 0,
        }

    def connect_to_db_table(
        self,
        config: SourceTargetConnection,
        is_source: bool,
    ) -> TableSegment:
        if is_source:
            primary_keys = self.config.primary_keys_source
            columns = self.config.source_columns
            where = self.config.source_filter
        else:
            primary_keys = self.config.primary_keys_target
            columns = self.config.target_columns
            where = self.config.target_filter

        return connect_to_table(
            {
                "driver": config.driver,
                "host": config.host,
                "port": config.port,
                "http_path": config.http_path,
                "access_token": config.access_token,
                "user": config.username,
                "password": config.password,
                "database": config.database,
                "schema": config.schema_name,
                "filepath": config.filepath,
                "warehouse": config.warehouse,
                "role": config.role,
                "catalog": config.catalog,
                "account": config.account,
                "odbc_driver": config.odbc_driver,
                "server": config.server,
                "project": config.project,
                "dataset": config.dataset,
                "keyfile": config.keyfile,
                "impersonate_service_account": config.impersonate_service_account,
                "bigquery_credentials": config.bigquery_credentials,
            },
            config.table,
            tuple(primary_keys),
            extra_columns=tuple(columns),
            where=where,
            transform_columns=config.transform_columns,
            job_id=self.config.job_id,
        )
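    # Illustrative example (not from the package source): for a PostgreSQL
    # source, the connection dict above effectively reduces to
    #   {"driver": "postgresql", "host": "localhost", "port": 5432,
    #    "user": "dcs", "password": "...", "database": "analytics",
    #    "schema": "public", ...}
    # with driver-specific keys such as warehouse, keyfile, or dataset left as
    # None (presumably their defaults on SourceTargetConnection).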

    def connect_to_db(self, config: SourceTargetConnection, is_source: bool):
        if is_source:
            self.source_db: Database = connect(
                {
                    "driver": config.driver,
                    "host": config.host,
                    "port": config.port,
                    "http_path": config.http_path,
                    "access_token": config.access_token,
                    "user": config.username,
                    "password": config.password,
                    "database": config.database,
                    "warehouse": config.warehouse,
                    "schema": config.schema_name,
                    "role": config.role,
                    "catalog": config.catalog,
                    "account": config.account,
                    "odbc_driver": config.odbc_driver,
                    "server": config.server,
                    "project": config.project,
                    "keyfile": config.keyfile,
                    "impersonate_service_account": config.impersonate_service_account,
                    "bigquery_credentials": config.bigquery_credentials,
                    "dataset": config.dataset,
                }
            )
        else:
            self.target_db: Database = connect(
                {
                    "driver": config.driver,
                    "host": config.host,
                    "port": config.port,
                    "http_path": config.http_path,
                    "access_token": config.access_token,
                    "user": config.username,
                    "password": config.password,
                    "database": config.database,
                    "warehouse": config.warehouse,
                    "schema": config.schema_name,
                    "role": config.role,
                    "catalog": config.catalog,
                    "account": config.account,
                    "odbc_driver": config.odbc_driver,
                    "server": config.server,
                    "project": config.project,
                    "keyfile": config.keyfile,
                    "impersonate_service_account": config.impersonate_service_account,
                    "bigquery_credentials": config.bigquery_credentials,
                    "dataset": config.dataset,
                }
            )

    def process_duckdb(self, is_source: bool):
        if is_source:
            filepath = self.config.source.filepath
        else:
            filepath = self.config.target.filepath
        if filepath is None:
            raise ValueError("A file path is required for file-based connections")
        if filepath.endswith(".csv"):
            if not duck_db_load_csv_to_table(self.config, filepath, is_source):
                raise ValueError(f"Error loading CSV for the {'source' if is_source else 'target'}")

    def _prepare_source_table(self) -> Optional[str]:
        view_name = None
        if self.config.source.driver == "duckdb":
            return view_name
        if self.config.source_query is not None:
            self._process_database_as_schema(
                driver=self.config.source.driver,
                is_source=True,
            )
            self.connect_to_db(
                self.config.source,
                is_source=True,
            )
            view_name = self.source_db.create_view_from_query(
                query=self.config.source_query,
                schema=self.config.temporary_schema_source,
                view_name=self.config.view_name_source,
            )
            self.config.source.schema_name = self.config.temporary_schema_source
            self.config.source.table = view_name
        return view_name

    def _prepare_target_table(self) -> Optional[str]:
        view_name = None
        if self.config.target.driver == "duckdb":
            return view_name
        if self.config.target_query is not None:
            self._process_database_as_schema(
                driver=self.config.target.driver,
                is_source=False,
            )
            self.connect_to_db(
                self.config.target,
                is_source=False,
            )
            view_name = self.target_db.create_view_from_query(
                query=self.config.target_query,
                schema=self.config.temporary_schema_target,
                view_name=self.config.view_name_target,
            )
            self.config.target.schema_name = self.config.temporary_schema_target
            self.config.target.table = view_name

        return view_name

    def _process_database_as_schema(self, driver: str, is_source: bool):
        if driver in ["mysql"]:
            if is_source:
                self.config.source.database = self.config.temporary_schema_source
            else:
                self.config.target.database = self.config.temporary_schema_target

    def _process_duckdb_connections(self):
        if self.config.source.driver == "duckdb":
            self.process_duckdb(is_source=True)
        if self.config.target.driver == "duckdb":
            self.process_duckdb(is_source=False)

    def _get_automatic_bisection_threshold(self, max_row_count: int) -> int:
        val = max_row_count // 10

        if val > DYNAMIC_BISECTION_THRESHOLD_MAX_LIMIT:
            return DYNAMIC_BISECTION_THRESHOLD_MAX_LIMIT

        return val

    def _get_automatic_bisection_factor(self, max_row_count) -> int:
        return max_row_count // ROW_COUNT_PER_SEGMENT

    def _get_automatic_egress_limit(self, max_row_count: int) -> int:
        val = max_row_count // 10

        if val > MAX_EGRESS_LIMIT:
            return MAX_EGRESS_LIMIT

        return val
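    # Worked example (illustrative): with max_row_count = 3,000,000 the
    # automatic values are threshold = 3,000,000 // 10 = 300,000 (below the
    # 500,000 cap), factor = 3,000,000 // 100,000 = 30 segments, and
    # egress_limit = 300,000; diff_tables() below then applies the floors of
    # 1,000, 10, and 50,000.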

    def diff_tables(
        self,
        is_cli: bool = False,
        show_stats: bool = False,
        save_html: bool = False,
        html_path: str = "dcs_report.html",
        display_table: bool = False,
    ) -> Dict:
        view_name_source = None
        view_name_target = None
        duckdb_file_location_source = None
        duckdb_file_location_target = None

        try:
            self._process_duckdb_connections()
            view_name_source = self._prepare_source_table()
            view_name_target = self._prepare_target_table()

            self.table1 = self.connect_to_db_table(self.config.source, is_source=True)
            self.table2 = self.connect_to_db_table(self.config.target, is_source=False)
            table_1_sample_data = []
            table_2_sample_data = []
            db1_name = (
                self.config.source.database or self.config.source.catalog or self.config.source.project or "source"
            )
            db2_name = (
                self.config.target.database or self.config.target.catalog or self.config.target.project or "target"
            )

            columns_order_wise_src = self.config.primary_keys_source + self.config.source_columns
            columns_order_wise_target = self.config.primary_keys_target + self.config.target_columns

            src_masking_cols = self.config.source_masking_columns
            tgt_masking_cols = self.config.target_masking_columns
            masking_character = self.config.masking_character

            source_dataset = self.create_dataset_dict(
                self.config.source,
                self.table1,
                db1_name,
                self.source_file_path,
                "file" if self.config.source.driver == "duckdb" else self.config.source.driver,
            )
            target_dataset = self.create_dataset_dict(
                self.config.target,
                self.table2,
                db2_name,
                self.target_file_path,
                "file" if self.config.target.driver == "duckdb" else self.config.target.driver,
            )
            table_1_row_count = source_dataset.get("row_count", 0)
            table_2_row_count = target_dataset.get("row_count", 0)
            max_row_count = max(table_1_row_count, table_2_row_count)

            is_bisection_threshold_automatic = self.config.advanced_configuration.bisection_threshold == -1
            is_bisection_factor_automatic = self.config.advanced_configuration.bisection_factor == -1
            is_egress_limit_automatic = self.config.advanced_configuration.egress_limit == -1

            threshold = (
                self.config.advanced_configuration.bisection_threshold
                if not is_bisection_threshold_automatic
                else self._get_automatic_bisection_threshold(max_row_count)
            )

            factor = (
                self.config.advanced_configuration.bisection_factor
                if not is_bisection_factor_automatic
                else self._get_automatic_bisection_factor(max_row_count)
            )

            egress_limit = (
                self.config.advanced_configuration.egress_limit
                if not is_egress_limit_automatic
                else self._get_automatic_egress_limit(max_row_count)
            )

            self.config.advanced_configuration.bisection_threshold = max(threshold, 1000)
            self.config.advanced_configuration.bisection_factor = max(factor, 10)
            self.config.advanced_configuration.egress_limit = max(egress_limit, MIN_EGRESS_LIMIT)

            error_message = None
            is_table_empty = False
            if table_1_row_count == 0:
                error_message = f"Source table '{source_dataset.get('table_name')}' is empty"
                is_table_empty = True
            if table_2_row_count == 0:
                if error_message:
                    error_message += f" and target table '{target_dataset.get('table_name')}' is empty"
                else:
                    error_message = f"Target table '{target_dataset.get('table_name')}' is empty"
                is_table_empty = True
            if not is_table_empty and not self.config.schema_diff:
                pks_len = len(self.table1.key_columns)
                table_1_sample_data = self.table1.with_schema().get_sample_data(limit=100)
                sample_keys = [list(row[:pks_len]) for row in table_1_sample_data]
                table_2_sample_data = self.table2.with_schema().get_sample_data(limit=100, sample_keys=sample_keys)
                # if self.config.advanced_configuration.in_memory_diff:
                #     self.config.advanced_configuration.egress_limit = min(max_row_count, 5_000_000)
                self.diff_iter = diff_tables(
                    self.table1,
                    self.table2,
                    algorithm=self.algorithm,
                    bisection_factor=self.config.advanced_configuration.bisection_factor,
                    bisection_threshold=self.config.advanced_configuration.bisection_threshold,
                    max_threadpool_size=self.config.advanced_configuration.max_threadpool_size,
                    strict=self.config.strict,
                    per_column_diff_limit=self.config.advanced_configuration.per_column_diff_limit,
                    egress_limit=self.config.advanced_configuration.egress_limit,
                    timeout_limit=self.config.advanced_configuration.timeout_limit,
                    in_memory_diff=self.config.advanced_configuration.in_memory_diff,
                )

            columns_mappings = [
                {"source_column": src, "target_column": trg}
                for src, trg in zip(columns_order_wise_src, columns_order_wise_target)
            ]

            self.response = {
                "source_dataset": source_dataset,
                "target_dataset": target_dataset,
                "columns_mappings": columns_mappings,
            }

            self.process_limit(max_row_count)
            if not is_table_empty and not self.config.schema_diff:
                diff_res = differ_rows(
                    diff_iter=self.diff_iter,
                    response=self.response,
                    limit=self.limit,
                    table_limit=self.table_limit,
                    display_table=display_table,
                    similarity=self.similarity,
                    similarity_providers=self.similarity_providers,
                    fields=self.config.source_columns,
                    quick_comparison=self.config.quick_comparison,
                    src_masking_cols=src_masking_cols if src_masking_cols else [],
                    tgt_masking_cols=tgt_masking_cols if tgt_masking_cols else [],
                    masking_character=masking_character,
                )
            else:
                diff_res = {
                    "stats": {
                        "rows_A": 0,
                        "rows_B": 0,
                        "exclusive_A": 0,
                        "exclusive_B": 0,
                        "diff_pk_percent": 0,
                        "unchanged": 0,
                        "total_diff_count": 0,
                        "diff_rows_count": 0,
                        "total_duplicate_count_source": 0,
                        "total_duplicate_count_target": 0,
                        "diff_rows_percent": 0,
                        "has_differences": False,
                        "error": {},
                    },
                    "exclusive_pk_values_target": [],
                    "exclusive_pk_values_source": [],
                    "duplicate_pk_values_source": [],
                    "duplicate_pk_values_target": [],
                    "records_with_differences": [],
                    "table": None,
                }
                if is_table_empty:
                    diff_res["stats"]["has_differences"] = table_1_row_count != table_2_row_count
                    try:
                        diff_res["stats"]["diff_pk_percent"] = abs(
                            (table_1_row_count - table_2_row_count) / max(table_1_row_count, table_2_row_count)
                        )
                    except ZeroDivisionError:
                        diff_res["stats"]["diff_pk_percent"] = 0
                    diff_res["stats"]["error"] = {
                        "code": "empty_table",
                        "message": error_message,
                        "level": "WARNING",
                    }

            diff_res.setdefault("stats", {})["rows_A"] = table_1_row_count
            diff_res.setdefault("stats", {})["rows_B"] = table_2_row_count
            columns_with_unmatched_data_type, columns_not_compared, exc_to_src, exc_to_tgt = (
                calculate_column_differences(
                    source_columns=source_dataset["columns"],
                    target_columns=target_dataset["columns"],
                    columns_mappings=columns_mappings,
                )
            )

            diff_res.get("stats", {}).update(
                {
                    "identical_columns": find_identical_columns(
                        source_dataset["columns"],
                        target_dataset["columns"],
                    ),
                    "columns_with_unmatched_data_type": columns_with_unmatched_data_type,
                    "columns_not_compared": columns_not_compared,
                }
            )
            if self.config.schema_diff:
                if error_message:
                    diff_res["stats"]["error"]["level"] = "WARNING"

            source_dataset["exclusive_pk_cnt"] = diff_res.get("stats", {}).get("exclusive_A", 0)
            target_dataset["exclusive_pk_cnt"] = diff_res.get("stats", {}).get("exclusive_B", 0)
            table = diff_res.pop("table", None)
            if is_cli and display_table:
                create_table_schema_row_count(self.response, table, self.console)
                if save_html:
                    self.console.save_html(html_path, theme=theme_1, clear=True)

            duckdb_file_location_source = self.config.source.filepath
            duckdb_file_location_target = self.config.target.filepath
            self.config.source.filepath = self.source_file_path
            self.config.target.filepath = self.target_file_path
            if self.config.source.driver == "duckdb":
                self.config.source.driver = "file"
            if self.config.target.driver == "duckdb":
                self.config.target.driver = "file"

            self.response["source_dataset"]["duplicate_pk_cnt"] = diff_res.get("stats", {}).get(
                "total_duplicate_count_source", 0
            )
            self.response["target_dataset"]["duplicate_pk_cnt"] = diff_res.get("stats", {}).get(
                "total_duplicate_count_target", 0
            )
            self.response["source_dataset"]["null_pk_cnt"] = diff_res.get("stats", {}).get("null_pk_count_source", 0)
            self.response["target_dataset"]["null_pk_cnt"] = diff_res.get("stats", {}).get("null_pk_count_target", 0)

            self.response["source_dataset"]["pk_cnt"] = (
                self.response["source_dataset"]["row_count"]
                - self.response["source_dataset"]["duplicate_pk_cnt"]
                - self.response["source_dataset"]["null_pk_cnt"]
            )
            self.response["target_dataset"]["pk_cnt"] = (
                self.response["target_dataset"]["row_count"]
                - self.response["target_dataset"]["duplicate_pk_cnt"]
                - self.response["target_dataset"]["null_pk_cnt"]
            )
            self.response.update(diff_res)
            if show_stats:
                self.print_stats()
            table_1_stats = self.table1.query_stats
            table_2_stats = self.table2.query_stats
            for stats in [table_1_stats, table_2_stats]:
                for _, stats_dict in stats.items():
                    if isinstance(stats_dict, dict):
                        stats_dict.pop("_query_times", None)

            self.response.get("stats", {}).update(
                {
                    "source_query_stats": table_1_stats,
                    "target_query_stats": table_2_stats,
                    "comparison_tracker": diff_res.get("stats", {}).get("comparison_tracker", []),
                }
            )
            finished_at = datetime.now(tz=timezone.utc)
            end_time = time.monotonic()
            duration = end_time - self.start_time
            meta = {
                "meta": {
                    "created_at": self.created_at.isoformat(),
                    "seconds": round(duration, 2),
                    "finished_at": finished_at.isoformat(),
                    "status": "done",
                }
            }
            self.response.update(meta)
            rules_repo = RulesRepository.get_instance()
            column_transforms = rules_repo.value_rules
            schema_overrides = rules_repo.schema_rules

            # diff_res["stats"]["has_differences"] = (table_1_row_count != table_2_row_count) or diff_res["stats"].get(
            #     "total_diff_count", 0
            # ) > 0

            is_row_mismatch = table_1_row_count != table_2_row_count

            is_value_mismatch = diff_res["stats"].get("total_diff_count", 0) > 0

            is_schema_mismatch = any([len(exc_to_src) != 0, len(exc_to_tgt) != 0, columns_with_unmatched_data_type])

            diff_res["stats"]["has_differences"] = is_row_mismatch or is_value_mismatch or is_schema_mismatch
            diff_res["stats"]["is_row_count_mismatch"] = is_row_mismatch
            diff_res["stats"]["is_value_mismatch"] = is_value_mismatch
            diff_res["stats"]["is_schema_mismatch"] = is_schema_mismatch

            if not is_value_mismatch:
                table_1_sample_data = convert_to_masked_if_required(
                    table_sample_data=table_1_sample_data if table_1_sample_data else [],
                    masking_character=masking_character,
                    masking_columns=src_masking_cols if src_masking_cols else [],
                    columns_order_wise=columns_order_wise_src if columns_order_wise_src else [],
                )

                table_2_sample_data = convert_to_masked_if_required(
                    table_sample_data=table_2_sample_data if table_2_sample_data else [],
                    masking_character=masking_character,
                    masking_columns=tgt_masking_cols if tgt_masking_cols else [],
                    columns_order_wise=columns_order_wise_target if columns_order_wise_target else [],
                )

                sample_value_column_names_src = list(self.table1.key_columns) + list(self.table1.extra_columns)
                sample_value_column_names_tgt = list(self.table2.key_columns) + list(self.table2.extra_columns)
                sample_value_source_dicts = [
                    dict(zip(sample_value_column_names_src, row)) for row in table_1_sample_data
                ]
                sample_value_target_dicts = [
                    dict(zip(sample_value_column_names_tgt, row)) for row in table_2_sample_data
                ]

                def get_pk(row, key_columns):
                    return tuple(row[k] for k in key_columns)

                grouped_source = defaultdict(list)
                grouped_target = defaultdict(list)

                for row in sample_value_source_dicts:
                    grouped_source[get_pk(row, self.table1.key_columns)].append(row)

                for row in sample_value_target_dicts:
                    grouped_target[get_pk(row, self.table2.key_columns)].append(row)

                sample_values_record_list = []

                def safe_numeric_sort(keys: list[tuple[str]]) -> list[tuple[str]]:
                    def sort_key(tup):
                        key = []
                        for val in tup:
                            if isinstance(val, str) and val.isdigit():
                                key.append((0, int(val)))
                            else:
                                key.append((1, str(val)))
                        return tuple(key)

                    return sorted(keys, key=sort_key)
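                # Example (illustrative): all-digit key strings sort by numeric
                # value ahead of everything else, so [("10",), ("9",), ("a",)]
                # orders as [("9",), ("10",), ("a",)] rather than
                # lexicographically.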

                sorted_pks = safe_numeric_sort(list(grouped_source.keys() | grouped_target.keys()))

                for pk in sorted_pks:
                    source_rows = grouped_source.get(pk, [])
                    target_rows = grouped_target.get(pk, [])
                    used_targets = set()
                    used_sources = set()

                    for i, src_row in enumerate(source_rows):
                        for j, tgt_row in enumerate(target_rows):
                            if j in used_targets:
                                continue
                            if list(src_row.values()) == list(tgt_row.values()):
                                sample_values_record_list.append(src_row)
                                sample_values_record_list.append(tgt_row)
                                used_sources.add(i)
                                used_targets.add(j)
                                break

                    def sort_key(row, key_columns, extra_columns):
                        key_values = []
                        for k in key_columns + extra_columns:
                            if k in row:
                                value = row[k]
                                if value is None:
                                    key_values.append("None")
                                else:
                                    key_values.append(value)
                        return tuple(key_values)

                    remaining_sources = [row for i, row in enumerate(source_rows) if i not in used_sources]
                    remaining_targets = [row for j, row in enumerate(target_rows) if j not in used_targets]

                    remaining_sources_sorted = sorted(
                        remaining_sources,
                        key=lambda row: sort_key(row, self.table1.key_columns, self.table1.extra_columns),
                    )

                    remaining_targets_sorted = sorted(
                        remaining_targets,
                        key=lambda row: sort_key(row, self.table2.key_columns, self.table2.extra_columns),
                    )

                    for src_row, tgt_row in zip(remaining_sources_sorted, remaining_targets_sorted):
                        sample_values_record_list.append(src_row)
                        sample_values_record_list.append(tgt_row)

                self.response["sample_data_values"] = sample_values_record_list

            self.response.update({"column_transforms": column_transforms})
            self.response.update({"schema_overrides": schema_overrides})

            return self.response
        except Exception as e:
            logger.exception(f"Error during diff_tables: {e}")
            raise
        finally:
            self.drop_view_and_close_connection(view_name_source, view_name_target)
            self.cleanup_duckdb(
                src=duckdb_file_location_source,
                target=duckdb_file_location_target,
            )
            logger.info("Dropped views and closed database connections")

    def process_limit(self, max_row_count):
        if isinstance(self.limit, int):
            if self.limit > max_row_count:
                self.limit = max_row_count
                logger.info(f"Limit exceeds max row count, adjusted to {max_row_count}")
            return

        if isinstance(self.limit, str):
            if "%" in self.limit:
                cleaned_limit = self.limit.replace("%", "").strip()
                if cleaned_limit.isdigit():
                    percentage = float(cleaned_limit)
                    if percentage > 100:
                        self.limit = max_row_count
                        logger.info("Percentage exceeds 100%, set limit to maximum row count")
                    else:
                        calc_limit = int((percentage / 100) * max_row_count)
                        self.limit = max(1, calc_limit)
                        logger.info(f"Limit set to {self.limit} ({percentage}% of {max_row_count})")
                else:
                    logger.warning(
                        f"Invalid percentage format '{self.limit}', using default limit: {self.default_limit}"
                    )
                    self.limit = self.default_limit
            else:
                logger.warning(f"Invalid limit format '{self.limit}', using default limit: {self.default_limit}")
                self.limit = self.default_limit

    def drop_view_and_close_connection(self, view_name_source, view_name_target):

        def safe_close(db_connection):
            if db_connection:
                with suppress(Exception):
                    db_connection.close()

        if self.table1:
            safe_close(self.table1.database)
        if self.table2:
            safe_close(self.table2.database)

        if self.source_db:
            self.source_db.drop_view_from_db(
                view_name=view_name_source,
                schema=self.config.temporary_schema_source,
            )
        if self.target_db:
            self.target_db.drop_view_from_db(
                view_name=view_name_target,
                schema=self.config.temporary_schema_target,
            )

        safe_close(self.source_db)
        safe_close(self.target_db)
        if self.config.job_id:
            safe_close(RedisBackend.get_instance())

    def cleanup_duckdb(self, src: str, target: str):
        if src and src.endswith("duckdb"):
            with suppress(Exception):
                os.remove(src)
        if target and target.endswith("duckdb"):
            with suppress(Exception):
                os.remove(target)

    def print_stats(self):
        try:
            stats = self.response.get("stats", {})
            output = ""
            if stats:
                if self.config.quick_comparison:
                    output += f"Quick comparison: {self.config.quick_comparison}\n"
                    output += f"Has differences: {stats.get('has_differences', False)}\n"
                else:
                    output += f"{stats.get('exclusive_A', 0)} rows are exclusive to source\n"
                    output += f"{stats.get('exclusive_B', 0)} rows are exclusive to target\n"
                    output += f"{stats.get('total_duplicate_count_source', 0)} duplicate rows in source\n"
                    output += f"{stats.get('total_duplicate_count_target', 0)} duplicate rows in target\n"
                    # output += f"{stats.get('total_diff_count', 0)} rows are different\n"
                    # output += f"{stats.get('diff_rows_count', 0)} rows are different\n"
                    for k, v in stats.get("values", {}).items():
                        output += f"{v} rows with different values in column: {k}\n"
                    # output += f"{round(stats.get('diff_pk_percent', 0) * 100, 3)}% of primary keys are different\n"
                    # output += f"{round(stats.get('diff_rows_percent', 0) * 100, 3)}% of rows are different\n"
            print(output)
        except Exception as e:
            logger.exception(f"Error in printing stats: {e}")

    def slice_rows(self, rows, start, end):
        return rows[start:end]


def diff_db_tables(
    config: Comparison,
    is_cli: bool = False,
    show_stats: bool = False,
    save_html: bool = False,
    html_path: str = "dcs_report.html",
    display_table: bool = False,
) -> Dict:
    differ = DBTableDiffer(config)
    response = differ.diff_tables(
        is_cli=is_cli,
        show_stats=show_stats,
        save_html=save_html,
        html_path=html_path,
        display_table=display_table,
    )
    response["comparison_name"] = config.comparison_name
    configuration = config.model_dump()
    del configuration["source"]["id"]
    del configuration["target"]["id"]
    configuration["source"]["schema_name"] = response["source_dataset"]["schema"]
    configuration["target"]["schema_name"] = response["target_dataset"]["schema"]
    response["configuration"] = configuration
    if is_cli:
        response["configuration"]["source"] = obfuscate_sensitive_data(response["configuration"]["source"])
        response["configuration"]["target"] = obfuscate_sensitive_data(response["configuration"]["target"])
    return response
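# Usage sketch (illustrative, not part of the package source): given a
# Comparison object built by the SDK's config loader, a full run is a single
# call:
#
#     response = diff_db_tables(config, show_stats=True)
#     print(response["stats"]["has_differences"])
#
# The returned dict carries the source/target dataset summaries, diff stats,
# sample values, and the comparison configuration (obfuscated when is_cli=True).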