dcs_sdk-1.6.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. data_diff/__init__.py +221 -0
  2. data_diff/__main__.py +517 -0
  3. data_diff/abcs/__init__.py +13 -0
  4. data_diff/abcs/compiler.py +27 -0
  5. data_diff/abcs/database_types.py +402 -0
  6. data_diff/config.py +141 -0
  7. data_diff/databases/__init__.py +38 -0
  8. data_diff/databases/_connect.py +323 -0
  9. data_diff/databases/base.py +1417 -0
  10. data_diff/databases/bigquery.py +376 -0
  11. data_diff/databases/clickhouse.py +217 -0
  12. data_diff/databases/databricks.py +262 -0
  13. data_diff/databases/duckdb.py +207 -0
  14. data_diff/databases/mssql.py +343 -0
  15. data_diff/databases/mysql.py +189 -0
  16. data_diff/databases/oracle.py +238 -0
  17. data_diff/databases/postgresql.py +293 -0
  18. data_diff/databases/presto.py +222 -0
  19. data_diff/databases/redis.py +93 -0
  20. data_diff/databases/redshift.py +233 -0
  21. data_diff/databases/snowflake.py +222 -0
  22. data_diff/databases/sybase.py +720 -0
  23. data_diff/databases/trino.py +73 -0
  24. data_diff/databases/vertica.py +174 -0
  25. data_diff/diff_tables.py +489 -0
  26. data_diff/errors.py +17 -0
  27. data_diff/format.py +369 -0
  28. data_diff/hashdiff_tables.py +1026 -0
  29. data_diff/info_tree.py +76 -0
  30. data_diff/joindiff_tables.py +434 -0
  31. data_diff/lexicographic_space.py +253 -0
  32. data_diff/parse_time.py +88 -0
  33. data_diff/py.typed +0 -0
  34. data_diff/queries/__init__.py +13 -0
  35. data_diff/queries/api.py +213 -0
  36. data_diff/queries/ast_classes.py +811 -0
  37. data_diff/queries/base.py +38 -0
  38. data_diff/queries/extras.py +43 -0
  39. data_diff/query_utils.py +70 -0
  40. data_diff/schema.py +67 -0
  41. data_diff/table_segment.py +583 -0
  42. data_diff/thread_utils.py +112 -0
  43. data_diff/utils.py +1022 -0
  44. data_diff/version.py +15 -0
  45. dcs_core/__init__.py +13 -0
  46. dcs_core/__main__.py +17 -0
  47. dcs_core/__version__.py +15 -0
  48. dcs_core/cli/__init__.py +13 -0
  49. dcs_core/cli/cli.py +165 -0
  50. dcs_core/core/__init__.py +19 -0
  51. dcs_core/core/common/__init__.py +13 -0
  52. dcs_core/core/common/errors.py +50 -0
  53. dcs_core/core/common/models/__init__.py +13 -0
  54. dcs_core/core/common/models/configuration.py +284 -0
  55. dcs_core/core/common/models/dashboard.py +24 -0
  56. dcs_core/core/common/models/data_source_resource.py +75 -0
  57. dcs_core/core/common/models/metric.py +160 -0
  58. dcs_core/core/common/models/profile.py +75 -0
  59. dcs_core/core/common/models/validation.py +216 -0
  60. dcs_core/core/common/models/widget.py +44 -0
  61. dcs_core/core/configuration/__init__.py +13 -0
  62. dcs_core/core/configuration/config_loader.py +139 -0
  63. dcs_core/core/configuration/configuration_parser.py +262 -0
  64. dcs_core/core/configuration/configuration_parser_arc.py +328 -0
  65. dcs_core/core/datasource/__init__.py +13 -0
  66. dcs_core/core/datasource/base.py +62 -0
  67. dcs_core/core/datasource/manager.py +112 -0
  68. dcs_core/core/datasource/search_datasource.py +421 -0
  69. dcs_core/core/datasource/sql_datasource.py +1094 -0
  70. dcs_core/core/inspect.py +163 -0
  71. dcs_core/core/logger/__init__.py +13 -0
  72. dcs_core/core/logger/base.py +32 -0
  73. dcs_core/core/logger/default_logger.py +94 -0
  74. dcs_core/core/metric/__init__.py +13 -0
  75. dcs_core/core/metric/base.py +220 -0
  76. dcs_core/core/metric/combined_metric.py +98 -0
  77. dcs_core/core/metric/custom_metric.py +34 -0
  78. dcs_core/core/metric/manager.py +137 -0
  79. dcs_core/core/metric/numeric_metric.py +403 -0
  80. dcs_core/core/metric/reliability_metric.py +90 -0
  81. dcs_core/core/profiling/__init__.py +13 -0
  82. dcs_core/core/profiling/datasource_profiling.py +136 -0
  83. dcs_core/core/profiling/numeric_field_profiling.py +72 -0
  84. dcs_core/core/profiling/text_field_profiling.py +67 -0
  85. dcs_core/core/repository/__init__.py +13 -0
  86. dcs_core/core/repository/metric_repository.py +77 -0
  87. dcs_core/core/utils/__init__.py +13 -0
  88. dcs_core/core/utils/log.py +29 -0
  89. dcs_core/core/utils/tracking.py +105 -0
  90. dcs_core/core/utils/utils.py +44 -0
  91. dcs_core/core/validation/__init__.py +13 -0
  92. dcs_core/core/validation/base.py +230 -0
  93. dcs_core/core/validation/completeness_validation.py +153 -0
  94. dcs_core/core/validation/custom_query_validation.py +24 -0
  95. dcs_core/core/validation/manager.py +282 -0
  96. dcs_core/core/validation/numeric_validation.py +276 -0
  97. dcs_core/core/validation/reliability_validation.py +91 -0
  98. dcs_core/core/validation/uniqueness_validation.py +61 -0
  99. dcs_core/core/validation/validity_validation.py +738 -0
  100. dcs_core/integrations/__init__.py +13 -0
  101. dcs_core/integrations/databases/__init__.py +13 -0
  102. dcs_core/integrations/databases/bigquery.py +187 -0
  103. dcs_core/integrations/databases/databricks.py +51 -0
  104. dcs_core/integrations/databases/db2.py +652 -0
  105. dcs_core/integrations/databases/elasticsearch.py +61 -0
  106. dcs_core/integrations/databases/mssql.py +829 -0
  107. dcs_core/integrations/databases/mysql.py +409 -0
  108. dcs_core/integrations/databases/opensearch.py +64 -0
  109. dcs_core/integrations/databases/oracle.py +719 -0
  110. dcs_core/integrations/databases/postgres.py +482 -0
  111. dcs_core/integrations/databases/redshift.py +53 -0
  112. dcs_core/integrations/databases/snowflake.py +48 -0
  113. dcs_core/integrations/databases/spark_df.py +111 -0
  114. dcs_core/integrations/databases/sybase.py +1069 -0
  115. dcs_core/integrations/storage/__init__.py +13 -0
  116. dcs_core/integrations/storage/local_file.py +149 -0
  117. dcs_core/integrations/utils/__init__.py +13 -0
  118. dcs_core/integrations/utils/utils.py +36 -0
  119. dcs_core/report/__init__.py +13 -0
  120. dcs_core/report/dashboard.py +211 -0
  121. dcs_core/report/models.py +88 -0
  122. dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  123. dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  124. dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  125. dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  126. dcs_core/report/static/assets/images/docs.svg +6 -0
  127. dcs_core/report/static/assets/images/github.svg +4 -0
  128. dcs_core/report/static/assets/images/logo.svg +7 -0
  129. dcs_core/report/static/assets/images/slack.svg +13 -0
  130. dcs_core/report/static/index.js +2 -0
  131. dcs_core/report/static/index.js.LICENSE.txt +3971 -0
  132. dcs_sdk/__init__.py +13 -0
  133. dcs_sdk/__main__.py +18 -0
  134. dcs_sdk/__version__.py +15 -0
  135. dcs_sdk/cli/__init__.py +13 -0
  136. dcs_sdk/cli/cli.py +163 -0
  137. dcs_sdk/sdk/__init__.py +58 -0
  138. dcs_sdk/sdk/config/__init__.py +13 -0
  139. dcs_sdk/sdk/config/config_loader.py +491 -0
  140. dcs_sdk/sdk/data_diff/__init__.py +13 -0
  141. dcs_sdk/sdk/data_diff/data_differ.py +821 -0
  142. dcs_sdk/sdk/rules/__init__.py +15 -0
  143. dcs_sdk/sdk/rules/rules_mappping.py +31 -0
  144. dcs_sdk/sdk/rules/rules_repository.py +214 -0
  145. dcs_sdk/sdk/rules/schema_rules.py +65 -0
  146. dcs_sdk/sdk/utils/__init__.py +13 -0
  147. dcs_sdk/sdk/utils/serializer.py +25 -0
  148. dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
  149. dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
  150. dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
  151. dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
  152. dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
  153. dcs_sdk/sdk/utils/table.py +475 -0
  154. dcs_sdk/sdk/utils/themes.py +40 -0
  155. dcs_sdk/sdk/utils/utils.py +349 -0
  156. dcs_sdk-1.6.5.dist-info/METADATA +150 -0
  157. dcs_sdk-1.6.5.dist-info/RECORD +159 -0
  158. dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
  159. dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
dcs_sdk/sdk/data_diff/data_differ.py (new file, 821 lines)

# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import glob
import os
import time
from collections import defaultdict
from contextlib import suppress
from datetime import datetime, timezone
from typing import Dict, Optional

from loguru import logger
from rich.console import Console

from data_diff import TableSegment, connect, connect_to_table, diff_tables
from data_diff.databases import Database
from data_diff.databases.redis import RedisBackend
from dcs_sdk.sdk.config.config_loader import Comparison, SourceTargetConnection
from dcs_sdk.sdk.rules.rules_repository import RulesRepository
from dcs_sdk.sdk.utils.serializer import serialize_table_schema
from dcs_sdk.sdk.utils.table import create_table_schema_row_count, differ_rows
from dcs_sdk.sdk.utils.themes import theme_1
from dcs_sdk.sdk.utils.utils import (
    calculate_column_differences,
    convert_to_masked_if_required,
    duck_db_load_csv_to_table,
    find_identical_columns,
    generate_table_name,
    obfuscate_sensitive_data,
)

DYNAMIC_BISECTION_THRESHOLD_MAX_LIMIT = 500_000
DEFAULT_BISECTION_THRESHOLD = 50_000
ROW_COUNT_PER_SEGMENT = 100_000
MAX_EGRESS_LIMIT = 500_000
MIN_EGRESS_LIMIT = 50_000


class DBTableDiffer:
    def __init__(self, config: Comparison):
        self.config = config
        self.console = Console(record=True)
        self.created_at = datetime.now(tz=timezone.utc)
        self.start_time = time.monotonic()
        self.algorithm = "hashdiff"
        self.table1 = None
        self.table2 = None
        self.diff_iter = None
        self.response = {}
        self.source_file_path = self.config.source.filepath
        self.target_file_path = self.config.target.filepath
        self.limit = config.limit
        self.default_limit = 1000
        self.table_limit = 100
        self.source_db: Optional[Database] = None
        self.target_db: Optional[Database] = None
        self.similarity = self.config.similarity
        self.similarity_providers = None
        if self.similarity:
            # Imported lazily so NLTK data and the similarity providers are
            # only loaded when similarity scoring is requested.
            from dcs_sdk.sdk.utils.similarity_score.base_provider import (
                ensure_nltk_data,
            )
            from dcs_sdk.sdk.utils.similarity_score.cosine_similarity_provider import (
                CosineSimilarityProvider,
            )
            from dcs_sdk.sdk.utils.similarity_score.jaccard_provider import (
                JaccardSimilarityProvider,
            )
            from dcs_sdk.sdk.utils.similarity_score.levenshtein_distance_provider import (
                LevenshteinDistanceProvider,
            )

            ensure_nltk_data()

            self.similarity_providers = {
                "jaccard": JaccardSimilarityProvider,
                "levenshtein": LevenshteinDistanceProvider,
                "cosine": CosineSimilarityProvider,
            }

    def create_dataset_dict(
        self,
        config: SourceTargetConnection,
        table: TableSegment,
        db_name: str,
        file_path: str,
        database_type: str,
    ) -> Dict:
        schema_list = [serialize_table_schema(v) for v in table.get_schema().values()]
        schema_list.sort(key=lambda x: x["column_name"].upper())

        return {
            "id": config.id,
            "name": config.name,
            "workspace": config.workspace,
            "database_type": database_type,
            "table_name": table.table_path[0],
            "schema": table.database.default_schema,
            "database": db_name,
            "primary_keys": list(table.key_columns),
            "file_path": file_path,
            "files": [] if file_path is None else [generate_table_name(csv, False) for csv in glob.glob(file_path)],
            "row_count": table.count(),
            "columns": schema_list,
            "exclusive_pk_cnt": 0,
            "duplicate_pk_cnt": 0,
            "null_pk_cnt": 0,
        }

    def connect_to_db_table(
        self,
        config: SourceTargetConnection,
        is_source: bool,
    ) -> TableSegment:
        if is_source:
            primary_keys = self.config.primary_keys_source
            columns = self.config.source_columns
            where = self.config.source_filter
        else:
            primary_keys = self.config.primary_keys_target
            columns = self.config.target_columns
            where = self.config.target_filter

        return connect_to_table(
            {
                "driver": config.driver,
                "host": config.host,
                "port": config.port,
                "http_path": config.http_path,
                "access_token": config.access_token,
                "user": config.username,
                "password": config.password,
                "database": config.database,
                "schema": config.schema_name,
                "filepath": config.filepath,
                "warehouse": config.warehouse,
                "role": config.role,
                "catalog": config.catalog,
                "account": config.account,
                "odbc_driver": config.odbc_driver,
                "server": config.server,
                "project": config.project,
                "dataset": config.dataset,
                "keyfile": config.keyfile,
                "impersonate_service_account": config.impersonate_service_account,
                "bigquery_credentials": config.bigquery_credentials,
            },
            config.table,
            tuple(primary_keys),
            extra_columns=tuple(columns),
            where=where,
            transform_columns=config.transform_columns,
            job_id=self.config.job_id,
        )

    def connect_to_db(self, config: SourceTargetConnection, is_source: bool):
        # The source and target branches used an identical connection mapping;
        # build it once and assign to the matching attribute.
        db: Database = connect(
            {
                "driver": config.driver,
                "host": config.host,
                "port": config.port,
                "http_path": config.http_path,
                "access_token": config.access_token,
                "user": config.username,
                "password": config.password,
                "database": config.database,
                "warehouse": config.warehouse,
                "schema": config.schema_name,
                "role": config.role,
                "catalog": config.catalog,
                "account": config.account,
                "odbc_driver": config.odbc_driver,
                "server": config.server,
                "project": config.project,
                "keyfile": config.keyfile,
                "impersonate_service_account": config.impersonate_service_account,
                "bigquery_credentials": config.bigquery_credentials,
                "dataset": config.dataset,
            }
        )
        if is_source:
            self.source_db = db
        else:
            self.target_db = db

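    # Editorial sketch (not part of the package): connect() receives one flat
    # dict of settings, and this wrapper always passes the full key set, so a
    # driver is expected to ignore keys it does not use. A PostgreSQL-style
    # connection, for example, would only rely on a subset (values assumed):
    #
    #     connect({
    #         "driver": "postgresql",
    #         "host": "localhost",
    #         "port": 5432,
    #         "user": "dcs",
    #         "password": "secret",
    #         "database": "analytics",
    #         "schema": "public",
    #     })
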
    def process_duckdb(self, is_source: bool):
        if is_source:
            filepath = self.config.source.filepath
        else:
            filepath = self.config.target.filepath
        if filepath is None:
            raise ValueError("File path is required for file-based (duckdb) connections")
        if filepath.endswith(".csv"):
            if not duck_db_load_csv_to_table(self.config, filepath, is_source):
                raise ValueError(f"Error loading CSV for the {'source' if is_source else 'target'}")

    def _prepare_source_table(self) -> Optional[str]:
        view_name = None
        if self.config.source.driver == "duckdb":
            return view_name
        if self.config.source_query is not None:
            self._process_database_as_schema(
                driver=self.config.source.driver,
                is_source=True,
            )
            self.connect_to_db(
                self.config.source,
                is_source=True,
            )
            view_name = self.source_db.create_view_from_query(
                query=self.config.source_query,
                schema=self.config.temporary_schema_source,
                view_name=self.config.view_name_source,
            )
            self.config.source.schema_name = self.config.temporary_schema_source
            self.config.source.table = view_name
        return view_name

    def _prepare_target_table(self) -> Optional[str]:
        view_name = None
        if self.config.target.driver == "duckdb":
            return view_name
        if self.config.target_query is not None:
            self._process_database_as_schema(
                driver=self.config.target.driver,
                is_source=False,
            )
            self.connect_to_db(
                self.config.target,
                is_source=False,
            )
            view_name = self.target_db.create_view_from_query(
                query=self.config.target_query,
                schema=self.config.temporary_schema_target,
                view_name=self.config.view_name_target,
            )
            self.config.target.schema_name = self.config.temporary_schema_target
            self.config.target.table = view_name

        return view_name

    def _process_database_as_schema(self, driver: str, is_source: bool):
        if driver in ["mysql"]:
            if is_source:
                self.config.source.database = self.config.temporary_schema_source
            else:
                self.config.target.database = self.config.temporary_schema_target

    def _process_duckdb_connections(self):
        if self.config.source.driver == "duckdb":
            self.process_duckdb(is_source=True)
        if self.config.target.driver == "duckdb":
            self.process_duckdb(is_source=False)

    def _get_automatic_bisection_threshold(self, max_row_count: int) -> int:
        val = max_row_count // 10

        if val > DYNAMIC_BISECTION_THRESHOLD_MAX_LIMIT:
            return DYNAMIC_BISECTION_THRESHOLD_MAX_LIMIT

        return val

    def _get_automatic_bisection_factor(self, max_row_count) -> int:
        return max_row_count // ROW_COUNT_PER_SEGMENT

    def _get_automatic_egress_limit(self, max_row_count: int) -> int:
        val = max_row_count // 10

        if val > MAX_EGRESS_LIMIT:
            return MAX_EGRESS_LIMIT

        return val

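    # Editorial sketch (not part of the package): the three helpers above
    # auto-tune the diff whenever the config leaves bisection_threshold,
    # bisection_factor or egress_limit at -1. Each value scales with the
    # larger table's row count, capped by the module constants; diff_tables()
    # below then applies floors of 1000, 10 and MIN_EGRESS_LIMIT respectively.
    #
    #     def auto_tune(max_row_count: int) -> dict:
    #         return {
    #             "bisection_threshold": min(max_row_count // 10, DYNAMIC_BISECTION_THRESHOLD_MAX_LIMIT),
    #             "bisection_factor": max_row_count // ROW_COUNT_PER_SEGMENT,
    #             "egress_limit": min(max_row_count // 10, MAX_EGRESS_LIMIT),
    #         }
    #
    #     auto_tune(2_000_000)
    #     # -> {'bisection_threshold': 200000, 'bisection_factor': 20, 'egress_limit': 200000}
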
    def diff_tables(
        self,
        is_cli: bool = False,
        show_stats: bool = False,
        save_html: bool = False,
        html_path: str = "dcs_report.html",
        display_table: bool = False,
    ) -> Dict:
        view_name_source = None
        view_name_target = None
        duckdb_file_location_source = None
        duckdb_file_location_target = None

        try:
            self._process_duckdb_connections()
            view_name_source = self._prepare_source_table()
            view_name_target = self._prepare_target_table()

            self.table1 = self.connect_to_db_table(self.config.source, is_source=True)
            self.table2 = self.connect_to_db_table(self.config.target, is_source=False)
            table_1_sample_data = []
            table_2_sample_data = []
            db1_name = (
                self.config.source.database or self.config.source.catalog or self.config.source.project or "source"
            )
            db2_name = (
                self.config.target.database or self.config.target.catalog or self.config.target.project or "target"
            )

            columns_order_wise_src = self.config.primary_keys_source + self.config.source_columns
            columns_order_wise_target = self.config.primary_keys_target + self.config.target_columns

            src_masking_cols = self.config.source_masking_columns
            tgt_masking_cols = self.config.target_masking_columns
            masking_character = self.config.masking_character

            source_dataset = self.create_dataset_dict(
                self.config.source,
                self.table1,
                db1_name,
                self.source_file_path,
                "file" if self.config.source.driver == "duckdb" else self.config.source.driver,
            )
            target_dataset = self.create_dataset_dict(
                self.config.target,
                self.table2,
                db2_name,
                self.target_file_path,
                "file" if self.config.target.driver == "duckdb" else self.config.target.driver,
            )
            table_1_row_count = source_dataset.get("row_count", 0)
            table_2_row_count = target_dataset.get("row_count", 0)
            max_row_count = max(table_1_row_count, table_2_row_count)

            is_bisection_threshold_automatic = self.config.advanced_configuration.bisection_threshold == -1
            is_bisection_factor_automatic = self.config.advanced_configuration.bisection_factor == -1
            is_egress_limit_automatic = self.config.advanced_configuration.egress_limit == -1

            threshold = (
                self.config.advanced_configuration.bisection_threshold
                if not is_bisection_threshold_automatic
                else self._get_automatic_bisection_threshold(max_row_count)
            )

            factor = (
                self.config.advanced_configuration.bisection_factor
                if not is_bisection_factor_automatic
                else self._get_automatic_bisection_factor(max_row_count)
            )

            egress_limit = (
                self.config.advanced_configuration.egress_limit
                if not is_egress_limit_automatic
                else self._get_automatic_egress_limit(max_row_count)
            )

            self.config.advanced_configuration.bisection_threshold = max(threshold, 1000)
            self.config.advanced_configuration.bisection_factor = max(factor, 10)
            self.config.advanced_configuration.egress_limit = max(egress_limit, MIN_EGRESS_LIMIT)

            error_message = None
            is_table_empty = False
            if table_1_row_count == 0:
                error_message = f"Source table '{source_dataset.get('table_name')}' is empty"
                is_table_empty = True
            if table_2_row_count == 0:
                if error_message:
                    error_message += f" and target table '{target_dataset.get('table_name')}' is empty"
                else:
                    error_message = f"Target table '{target_dataset.get('table_name')}' is empty"
                is_table_empty = True
            if not is_table_empty and not self.config.schema_diff:
                pks_len = len(self.table1.key_columns)
                table_1_sample_data = self.table1.with_schema().get_sample_data(limit=100)
                sample_keys = [list(row[:pks_len]) for row in table_1_sample_data]
                table_2_sample_data = self.table2.with_schema().get_sample_data(limit=100, sample_keys=sample_keys)
                # if self.config.advanced_configuration.in_memory_diff:
                #     self.config.advanced_configuration.egress_limit = min(max_row_count, 5_000_000)
                self.diff_iter = diff_tables(
                    self.table1,
                    self.table2,
                    algorithm=self.algorithm,
                    bisection_factor=self.config.advanced_configuration.bisection_factor,
                    bisection_threshold=self.config.advanced_configuration.bisection_threshold,
                    max_threadpool_size=self.config.advanced_configuration.max_threadpool_size,
                    strict=self.config.strict,
                    per_column_diff_limit=self.config.advanced_configuration.per_column_diff_limit,
                    egress_limit=self.config.advanced_configuration.egress_limit,
                    timeout_limit=self.config.advanced_configuration.timeout_limit,
                    in_memory_diff=self.config.advanced_configuration.in_memory_diff,
                )

            columns_mappings = [
                {"source_column": src, "target_column": trg}
                for src, trg in zip(columns_order_wise_src, columns_order_wise_target)
            ]

            self.response = {
                "source_dataset": source_dataset,
                "target_dataset": target_dataset,
                "columns_mappings": columns_mappings,
            }

            self.process_limit(max_row_count)
            if not is_table_empty and not self.config.schema_diff:
                diff_res = differ_rows(
                    diff_iter=self.diff_iter,
                    response=self.response,
                    limit=self.limit,
                    table_limit=self.table_limit,
                    display_table=display_table,
                    similarity=self.similarity,
                    similarity_providers=self.similarity_providers,
                    fields=self.config.source_columns,
                    quick_comparison=self.config.quick_comparison,
                    src_masking_cols=src_masking_cols if src_masking_cols else [],
                    tgt_masking_cols=tgt_masking_cols if tgt_masking_cols else [],
                    masking_character=masking_character,
                )
            else:
                diff_res = {
                    "stats": {
                        "rows_A": 0,
                        "rows_B": 0,
                        "exclusive_A": 0,
                        "exclusive_B": 0,
                        "diff_pk_percent": 0,
                        "unchanged": 0,
                        "total_diff_count": 0,
                        "diff_rows_count": 0,
                        "total_duplicate_count_source": 0,
                        "total_duplicate_count_target": 0,
                        "diff_rows_percent": 0,
                        "has_differences": False,
                        "error": {},
                    },
                    "exclusive_pk_values_target": [],
                    "exclusive_pk_values_source": [],
                    "duplicate_pk_values_source": [],
                    "duplicate_pk_values_target": [],
                    "records_with_differences": [],
                    "table": None,
                }
            if is_table_empty:
                diff_res["stats"]["has_differences"] = table_1_row_count != table_2_row_count
                try:
                    diff_res["stats"]["diff_pk_percent"] = abs(
                        (table_1_row_count - table_2_row_count) / max(table_1_row_count, table_2_row_count)
                    )
                except ZeroDivisionError:
                    diff_res["stats"]["diff_pk_percent"] = 0
                diff_res["stats"]["error"] = {
                    "code": "empty_table",
                    "message": error_message,
                    "level": "WARNING",
                }

            diff_res.setdefault("stats", {})["rows_A"] = table_1_row_count
            diff_res.setdefault("stats", {})["rows_B"] = table_2_row_count
            columns_with_unmatched_data_type, columns_not_compared, exc_to_src, exc_to_tgt = (
                calculate_column_differences(
                    source_columns=source_dataset["columns"],
                    target_columns=target_dataset["columns"],
                    columns_mappings=columns_mappings,
                )
            )

            diff_res.get("stats", {}).update(
                {
                    "identical_columns": find_identical_columns(
                        source_dataset["columns"],
                        target_dataset["columns"],
                    ),
                    "columns_with_unmatched_data_type": columns_with_unmatched_data_type,
                    "columns_not_compared": columns_not_compared,
                }
            )
            if self.config.schema_diff:
                if error_message:
                    diff_res["stats"]["error"]["level"] = "WARNING"

            source_dataset["exclusive_pk_cnt"] = diff_res.get("stats", {}).get("exclusive_A", 0)
            target_dataset["exclusive_pk_cnt"] = diff_res.get("stats", {}).get("exclusive_B", 0)
            table = diff_res.pop("table", None)
            if is_cli and display_table:
                create_table_schema_row_count(self.response, table, self.console)
                if save_html:
                    self.console.save_html(html_path, theme=theme_1, clear=True)

            duckdb_file_location_source = self.config.source.filepath
            duckdb_file_location_target = self.config.target.filepath
            self.config.source.filepath = self.source_file_path
            self.config.target.filepath = self.target_file_path
            if self.config.source.driver == "duckdb":
                self.config.source.driver = "file"
            if self.config.target.driver == "duckdb":
                self.config.target.driver = "file"

            self.response["source_dataset"]["duplicate_pk_cnt"] = diff_res.get("stats", {}).get(
                "total_duplicate_count_source", 0
            )
            self.response["target_dataset"]["duplicate_pk_cnt"] = diff_res.get("stats", {}).get(
                "total_duplicate_count_target", 0
            )
            self.response["source_dataset"]["null_pk_cnt"] = diff_res.get("stats", {}).get("null_pk_count_source", 0)
            self.response["target_dataset"]["null_pk_cnt"] = diff_res.get("stats", {}).get("null_pk_count_target", 0)

            self.response["source_dataset"]["pk_cnt"] = (
                self.response["source_dataset"]["row_count"]
                - self.response["source_dataset"]["duplicate_pk_cnt"]
                - self.response["source_dataset"]["null_pk_cnt"]
            )
            self.response["target_dataset"]["pk_cnt"] = (
                self.response["target_dataset"]["row_count"]
                - self.response["target_dataset"]["duplicate_pk_cnt"]
                - self.response["target_dataset"]["null_pk_cnt"]
            )
            self.response.update(diff_res)
            if show_stats:
                self.print_stats()
            table_1_stats = self.table1.query_stats
            table_2_stats = self.table2.query_stats
            for stats in [table_1_stats, table_2_stats]:
                for _, stats_dict in stats.items():
                    if isinstance(stats_dict, dict):
                        stats_dict.pop("_query_times", None)

            self.response.get("stats", {}).update(
                {
                    "source_query_stats": table_1_stats,
                    "target_query_stats": table_2_stats,
                    "comparison_tracker": diff_res.get("stats", {}).get("comparison_tracker", []),
                }
            )
            finished_at = datetime.now(tz=timezone.utc)
            end_time = time.monotonic()
            duration = end_time - self.start_time
            meta = {
                "meta": {
                    "created_at": self.created_at.isoformat(),
                    "seconds": round(duration, 2),
                    "finished_at": finished_at.isoformat(),
                    "status": "done",
                }
            }
            self.response.update(meta)
            rules_repo = RulesRepository.get_instance()
            column_transforms = rules_repo.value_rules
            schema_overrides = rules_repo.schema_rules

            # diff_res["stats"]["has_differences"] = (table_1_row_count != table_2_row_count) or diff_res["stats"].get(
            #     "total_diff_count", 0
            # ) > 0

            is_row_mismatch = table_1_row_count != table_2_row_count
            is_value_mismatch = diff_res["stats"].get("total_diff_count", 0) > 0
            is_schema_mismatch = any([len(exc_to_src) != 0, len(exc_to_tgt) != 0, columns_with_unmatched_data_type])

            diff_res["stats"]["has_differences"] = is_row_mismatch or is_value_mismatch or is_schema_mismatch
            diff_res["stats"]["is_row_count_mismatch"] = is_row_mismatch
            diff_res["stats"]["is_value_mismatch"] = is_value_mismatch
            diff_res["stats"]["is_schema_mismatch"] = is_schema_mismatch

            if not is_value_mismatch:
                table_1_sample_data = convert_to_masked_if_required(
                    table_sample_data=table_1_sample_data if table_1_sample_data else [],
                    masking_character=masking_character,
                    masking_columns=src_masking_cols if src_masking_cols else [],
                    columns_order_wise=columns_order_wise_src if columns_order_wise_src else [],
                )

                table_2_sample_data = convert_to_masked_if_required(
                    table_sample_data=table_2_sample_data if table_2_sample_data else [],
                    masking_character=masking_character,
                    masking_columns=tgt_masking_cols if tgt_masking_cols else [],
                    columns_order_wise=columns_order_wise_target if columns_order_wise_target else [],
                )

                sample_value_column_names_src = list(self.table1.key_columns) + list(self.table1.extra_columns)
                sample_value_column_names_tgt = list(self.table2.key_columns) + list(self.table2.extra_columns)
                sample_value_source_dicts = [
                    dict(zip(sample_value_column_names_src, row)) for row in table_1_sample_data
                ]
                sample_value_target_dicts = [
                    dict(zip(sample_value_column_names_tgt, row)) for row in table_2_sample_data
                ]

                def get_pk(row, key_columns):
                    return tuple(row[k] for k in key_columns)

                grouped_source = defaultdict(list)
                grouped_target = defaultdict(list)

                for row in sample_value_source_dicts:
                    grouped_source[get_pk(row, self.table1.key_columns)].append(row)

                for row in sample_value_target_dicts:
                    grouped_target[get_pk(row, self.table2.key_columns)].append(row)

                sample_values_record_list = []

                def safe_numeric_sort(keys: list[tuple[str]]) -> list[tuple[str]]:
                    def sort_key(tup):
                        key = []
                        for val in tup:
                            if isinstance(val, str) and val.isdigit():
                                key.append((0, int(val)))
                            else:
                                key.append((1, str(val)))
                        return tuple(key)

                    return sorted(keys, key=sort_key)

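                # Editorial note (not part of the package): safe_numeric_sort
                # orders digit-only strings numerically ahead of everything
                # else, so primary keys like "2" and "10" compare as numbers:
                #
                #     safe_numeric_sort([("10",), ("2",), ("abc",)])
                #     # -> [('2',), ('10',), ('abc',)]
                #
                # Digit strings map to (0, int(val)); all other values map to
                # (1, str(val)), which keeps mixed-type keys comparable.
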
                sorted_pks = safe_numeric_sort(list(grouped_source.keys() | grouped_target.keys()))

                for pk in sorted_pks:
                    source_rows = grouped_source.get(pk, [])
                    target_rows = grouped_target.get(pk, [])
                    used_targets = set()
                    used_sources = set()

                    for i, src_row in enumerate(source_rows):
                        for j, tgt_row in enumerate(target_rows):
                            if j in used_targets:
                                continue
                            # Compare the dicts directly: two dict_values views
                            # are never equal to each other, so comparing
                            # src_row.values() to tgt_row.values() could not match.
                            if src_row == tgt_row:
                                sample_values_record_list.append(src_row)
                                sample_values_record_list.append(tgt_row)
                                used_sources.add(i)
                                used_targets.add(j)
                                break

                    def sort_key(row, key_columns, extra_columns):
                        key_values = []
                        for k in key_columns + extra_columns:
                            if k in row:
                                value = row[k]
                                if value is None:
                                    key_values.append("None")
                                else:
                                    key_values.append(value)
                        return tuple(key_values)

                    remaining_sources = [row for i, row in enumerate(source_rows) if i not in used_sources]
                    remaining_targets = [row for j, row in enumerate(target_rows) if j not in used_targets]

                    remaining_sources_sorted = sorted(
                        remaining_sources,
                        key=lambda row: sort_key(row, self.table1.key_columns, self.table1.extra_columns),
                    )

                    remaining_targets_sorted = sorted(
                        remaining_targets,
                        key=lambda row: sort_key(row, self.table2.key_columns, self.table2.extra_columns),
                    )

                    for src_row, tgt_row in zip(remaining_sources_sorted, remaining_targets_sorted):
                        sample_values_record_list.append(src_row)
                        sample_values_record_list.append(tgt_row)

                self.response["sample_data_values"] = sample_values_record_list

            self.response.update({"column_transforms": column_transforms})
            self.response.update({"schema_overrides": schema_overrides})

            return self.response
        except Exception as e:
            logger.exception(f"Error during diff_tables: {e}")
            raise
        finally:
            self.drop_view_and_close_connection(view_name_source, view_name_target)
            self.cleanup_duckdb(
                src=duckdb_file_location_source,
                target=duckdb_file_location_target,
            )
            logger.info("Dropped views and closed database connections")

    def process_limit(self, max_row_count):
        if isinstance(self.limit, int):
            if self.limit > max_row_count:
                self.limit = max_row_count
                logger.info(f"Limit exceeds max row count, adjusted to {max_row_count}")
            return

        if isinstance(self.limit, str):
            if "%" in self.limit:
                cleaned_limit = self.limit.replace("%", "").strip()
                if cleaned_limit.isdigit():
                    percentage = float(cleaned_limit)
                    if percentage > 100:
                        self.limit = max_row_count
                        logger.info("Percentage exceeds 100%, set limit to maximum row count")
                    else:
                        calc_limit = int((percentage / 100) * max_row_count)
                        self.limit = max(1, int(calc_limit))
                        logger.info(f"Limit set to {self.limit} ({percentage}% of {max_row_count})")
                else:
                    # Log the invalid value before resetting it to the default.
                    logger.warning(
                        f"Invalid percentage format '{self.limit}', using default limit: {self.default_limit}"
                    )
                    self.limit = self.default_limit
            else:
                # Log the invalid value before resetting it to the default.
                logger.warning(f"Invalid limit format '{self.limit}', using default limit: {self.default_limit}")
                self.limit = self.default_limit

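    # Editorial note (not part of the package): process_limit accepts an int
    # (clamped to the larger table's row count) or a percentage string.
    # With max_row_count = 10_000:
    #
    #     limit = 25_000  -> 10_000  (clamped to the row count)
    #     limit = "25%"   -> 2_500   (int(0.25 * 10_000))
    #     limit = "150%"  -> 10_000  (percentages above 100 cap at the row count)
    #     limit = "abc"   -> 1_000   (falls back to default_limit)
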
    def drop_view_and_close_connection(self, view_name_source, view_name_target):

        def safe_close(db_connection):
            if db_connection:
                with suppress(Exception):
                    db_connection.close()

        # Guard against failures before the tables were connected, in which
        # case table1 / table2 are still None.
        if self.table1 is not None:
            safe_close(self.table1.database)
        if self.table2 is not None:
            safe_close(self.table2.database)

        if self.source_db:
            self.source_db.drop_view_from_db(
                view_name=view_name_source,
                schema=self.config.temporary_schema_source,
            )
        if self.target_db:
            self.target_db.drop_view_from_db(
                view_name=view_name_target,
                schema=self.config.temporary_schema_target,
            )

        safe_close(self.source_db)
        safe_close(self.target_db)
        if self.config.job_id:
            safe_close(RedisBackend.get_instance())

    def cleanup_duckdb(self, src: Optional[str], target: Optional[str]):
        if src and src.endswith("duckdb"):
            with suppress(Exception):
                os.remove(src)
        if target and target.endswith("duckdb"):
            with suppress(Exception):
                os.remove(target)

    def print_stats(self):
        try:
            stats = self.response.get("stats", {})
            output = ""
            if stats:
                if self.config.quick_comparison:
                    output += f"Quick comparison: {self.config.quick_comparison}\n"
                    output += f"Has differences: {stats.get('has_differences', False)}\n"
                else:
                    output += f"{stats.get('exclusive_A', 0)} rows are exclusive to source\n"
                    output += f"{stats.get('exclusive_B', 0)} rows are exclusive to target\n"
                    output += f"{stats.get('total_duplicate_count_source', 0)} duplicate rows in source\n"
                    output += f"{stats.get('total_duplicate_count_target', 0)} duplicate rows in target\n"
                    # output += f"{stats.get('total_diff_count', 0)} rows are different\n"
                    # output += f"{stats.get('diff_rows_count', 0)} rows are different\n"
                    for k, v in stats.get("values", {}).items():
                        output += f"{v} rows with different values in column: {k}\n"
                    # output += f"{round((stats.get('diff_pk_percent', 0) * 100), 3)}% of primary keys are different\n"
                    # output += f"{round((stats.get('diff_rows_percent', 0) * 100), 3)}% of rows are different\n"
            print(output)
        except Exception as e:
            logger.exception(f"Error in printing stats: {e}")

    def slice_rows(self, rows, start, end):
        return rows[start:end]


def diff_db_tables(
    config: Comparison,
    is_cli: bool = False,
    show_stats: bool = False,
    save_html: bool = False,
    html_path: str = "dcs_report.html",
    display_table: bool = False,
) -> Dict:
    differ = DBTableDiffer(config)
    response = differ.diff_tables(
        is_cli=is_cli,
        show_stats=show_stats,
        save_html=save_html,
        html_path=html_path,
        display_table=display_table,
    )
    response["comparison_name"] = config.comparison_name
    configuration = config.model_dump()
    del configuration["source"]["id"]
    del configuration["target"]["id"]
    configuration["source"]["schema_name"] = response["source_dataset"]["schema"]
    configuration["target"]["schema_name"] = response["target_dataset"]["schema"]
    response["configuration"] = configuration
    if is_cli:
        response["configuration"]["source"] = obfuscate_sensitive_data(response["configuration"]["source"])
        response["configuration"]["target"] = obfuscate_sensitive_data(response["configuration"]["target"])
    return response
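
Usage sketch (editorial, not part of the package): diff_db_tables is the module-level entry point wrapping DBTableDiffer. Assuming a Comparison object built by dcs_sdk's config loader (the loader API itself is not shown in this diff), a minimal invocation would look like:

    from dcs_sdk.sdk.data_diff.data_differ import diff_db_tables

    # `config` is a Comparison describing the source/target connections,
    # primary keys and column mappings; how it is constructed depends on
    # dcs_sdk's config loader.
    result = diff_db_tables(config, is_cli=True, show_stats=True, display_table=True)
    print(result["stats"]["has_differences"])
    print(result["meta"]["seconds"])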