dcs-sdk 1.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. data_diff/__init__.py +221 -0
  2. data_diff/__main__.py +517 -0
  3. data_diff/abcs/__init__.py +13 -0
  4. data_diff/abcs/compiler.py +27 -0
  5. data_diff/abcs/database_types.py +402 -0
  6. data_diff/config.py +141 -0
  7. data_diff/databases/__init__.py +38 -0
  8. data_diff/databases/_connect.py +323 -0
  9. data_diff/databases/base.py +1417 -0
  10. data_diff/databases/bigquery.py +376 -0
  11. data_diff/databases/clickhouse.py +217 -0
  12. data_diff/databases/databricks.py +262 -0
  13. data_diff/databases/duckdb.py +207 -0
  14. data_diff/databases/mssql.py +343 -0
  15. data_diff/databases/mysql.py +189 -0
  16. data_diff/databases/oracle.py +238 -0
  17. data_diff/databases/postgresql.py +293 -0
  18. data_diff/databases/presto.py +222 -0
  19. data_diff/databases/redis.py +93 -0
  20. data_diff/databases/redshift.py +233 -0
  21. data_diff/databases/snowflake.py +222 -0
  22. data_diff/databases/sybase.py +720 -0
  23. data_diff/databases/trino.py +73 -0
  24. data_diff/databases/vertica.py +174 -0
  25. data_diff/diff_tables.py +489 -0
  26. data_diff/errors.py +17 -0
  27. data_diff/format.py +369 -0
  28. data_diff/hashdiff_tables.py +1026 -0
  29. data_diff/info_tree.py +76 -0
  30. data_diff/joindiff_tables.py +434 -0
  31. data_diff/lexicographic_space.py +253 -0
  32. data_diff/parse_time.py +88 -0
  33. data_diff/py.typed +0 -0
  34. data_diff/queries/__init__.py +13 -0
  35. data_diff/queries/api.py +213 -0
  36. data_diff/queries/ast_classes.py +811 -0
  37. data_diff/queries/base.py +38 -0
  38. data_diff/queries/extras.py +43 -0
  39. data_diff/query_utils.py +70 -0
  40. data_diff/schema.py +67 -0
  41. data_diff/table_segment.py +583 -0
  42. data_diff/thread_utils.py +112 -0
  43. data_diff/utils.py +1022 -0
  44. data_diff/version.py +15 -0
  45. dcs_core/__init__.py +13 -0
  46. dcs_core/__main__.py +17 -0
  47. dcs_core/__version__.py +15 -0
  48. dcs_core/cli/__init__.py +13 -0
  49. dcs_core/cli/cli.py +165 -0
  50. dcs_core/core/__init__.py +19 -0
  51. dcs_core/core/common/__init__.py +13 -0
  52. dcs_core/core/common/errors.py +50 -0
  53. dcs_core/core/common/models/__init__.py +13 -0
  54. dcs_core/core/common/models/configuration.py +284 -0
  55. dcs_core/core/common/models/dashboard.py +24 -0
  56. dcs_core/core/common/models/data_source_resource.py +75 -0
  57. dcs_core/core/common/models/metric.py +160 -0
  58. dcs_core/core/common/models/profile.py +75 -0
  59. dcs_core/core/common/models/validation.py +216 -0
  60. dcs_core/core/common/models/widget.py +44 -0
  61. dcs_core/core/configuration/__init__.py +13 -0
  62. dcs_core/core/configuration/config_loader.py +139 -0
  63. dcs_core/core/configuration/configuration_parser.py +262 -0
  64. dcs_core/core/configuration/configuration_parser_arc.py +328 -0
  65. dcs_core/core/datasource/__init__.py +13 -0
  66. dcs_core/core/datasource/base.py +62 -0
  67. dcs_core/core/datasource/manager.py +112 -0
  68. dcs_core/core/datasource/search_datasource.py +421 -0
  69. dcs_core/core/datasource/sql_datasource.py +1094 -0
  70. dcs_core/core/inspect.py +163 -0
  71. dcs_core/core/logger/__init__.py +13 -0
  72. dcs_core/core/logger/base.py +32 -0
  73. dcs_core/core/logger/default_logger.py +94 -0
  74. dcs_core/core/metric/__init__.py +13 -0
  75. dcs_core/core/metric/base.py +220 -0
  76. dcs_core/core/metric/combined_metric.py +98 -0
  77. dcs_core/core/metric/custom_metric.py +34 -0
  78. dcs_core/core/metric/manager.py +137 -0
  79. dcs_core/core/metric/numeric_metric.py +403 -0
  80. dcs_core/core/metric/reliability_metric.py +90 -0
  81. dcs_core/core/profiling/__init__.py +13 -0
  82. dcs_core/core/profiling/datasource_profiling.py +136 -0
  83. dcs_core/core/profiling/numeric_field_profiling.py +72 -0
  84. dcs_core/core/profiling/text_field_profiling.py +67 -0
  85. dcs_core/core/repository/__init__.py +13 -0
  86. dcs_core/core/repository/metric_repository.py +77 -0
  87. dcs_core/core/utils/__init__.py +13 -0
  88. dcs_core/core/utils/log.py +29 -0
  89. dcs_core/core/utils/tracking.py +105 -0
  90. dcs_core/core/utils/utils.py +44 -0
  91. dcs_core/core/validation/__init__.py +13 -0
  92. dcs_core/core/validation/base.py +230 -0
  93. dcs_core/core/validation/completeness_validation.py +153 -0
  94. dcs_core/core/validation/custom_query_validation.py +24 -0
  95. dcs_core/core/validation/manager.py +282 -0
  96. dcs_core/core/validation/numeric_validation.py +276 -0
  97. dcs_core/core/validation/reliability_validation.py +91 -0
  98. dcs_core/core/validation/uniqueness_validation.py +61 -0
  99. dcs_core/core/validation/validity_validation.py +738 -0
  100. dcs_core/integrations/__init__.py +13 -0
  101. dcs_core/integrations/databases/__init__.py +13 -0
  102. dcs_core/integrations/databases/bigquery.py +187 -0
  103. dcs_core/integrations/databases/databricks.py +51 -0
  104. dcs_core/integrations/databases/db2.py +652 -0
  105. dcs_core/integrations/databases/elasticsearch.py +61 -0
  106. dcs_core/integrations/databases/mssql.py +829 -0
  107. dcs_core/integrations/databases/mysql.py +409 -0
  108. dcs_core/integrations/databases/opensearch.py +64 -0
  109. dcs_core/integrations/databases/oracle.py +719 -0
  110. dcs_core/integrations/databases/postgres.py +482 -0
  111. dcs_core/integrations/databases/redshift.py +53 -0
  112. dcs_core/integrations/databases/snowflake.py +48 -0
  113. dcs_core/integrations/databases/spark_df.py +111 -0
  114. dcs_core/integrations/databases/sybase.py +1069 -0
  115. dcs_core/integrations/storage/__init__.py +13 -0
  116. dcs_core/integrations/storage/local_file.py +149 -0
  117. dcs_core/integrations/utils/__init__.py +13 -0
  118. dcs_core/integrations/utils/utils.py +36 -0
  119. dcs_core/report/__init__.py +13 -0
  120. dcs_core/report/dashboard.py +211 -0
  121. dcs_core/report/models.py +88 -0
  122. dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  123. dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  124. dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  125. dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  126. dcs_core/report/static/assets/images/docs.svg +6 -0
  127. dcs_core/report/static/assets/images/github.svg +4 -0
  128. dcs_core/report/static/assets/images/logo.svg +7 -0
  129. dcs_core/report/static/assets/images/slack.svg +13 -0
  130. dcs_core/report/static/index.js +2 -0
  131. dcs_core/report/static/index.js.LICENSE.txt +3971 -0
  132. dcs_sdk/__init__.py +13 -0
  133. dcs_sdk/__main__.py +18 -0
  134. dcs_sdk/__version__.py +15 -0
  135. dcs_sdk/cli/__init__.py +13 -0
  136. dcs_sdk/cli/cli.py +163 -0
  137. dcs_sdk/sdk/__init__.py +58 -0
  138. dcs_sdk/sdk/config/__init__.py +13 -0
  139. dcs_sdk/sdk/config/config_loader.py +491 -0
  140. dcs_sdk/sdk/data_diff/__init__.py +13 -0
  141. dcs_sdk/sdk/data_diff/data_differ.py +821 -0
  142. dcs_sdk/sdk/rules/__init__.py +15 -0
  143. dcs_sdk/sdk/rules/rules_mappping.py +31 -0
  144. dcs_sdk/sdk/rules/rules_repository.py +214 -0
  145. dcs_sdk/sdk/rules/schema_rules.py +65 -0
  146. dcs_sdk/sdk/utils/__init__.py +13 -0
  147. dcs_sdk/sdk/utils/serializer.py +25 -0
  148. dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
  149. dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
  150. dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
  151. dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
  152. dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
  153. dcs_sdk/sdk/utils/table.py +475 -0
  154. dcs_sdk/sdk/utils/themes.py +40 -0
  155. dcs_sdk/sdk/utils/utils.py +349 -0
  156. dcs_sdk-1.6.5.dist-info/METADATA +150 -0
  157. dcs_sdk-1.6.5.dist-info/RECORD +159 -0
  158. dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
  159. dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,475 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import multiprocessing
16
+ from collections import defaultdict
17
+ from concurrent.futures import ThreadPoolExecutor, as_completed
18
+ from typing import Any, Dict, List, Optional, Union
19
+
20
+ from rich.panel import Panel
21
+ from rich.table import Table
22
+ from rich.text import Text
23
+
24
+ from dcs_sdk.sdk.rules.rules_repository import RulesRepository
25
+ from dcs_sdk.sdk.utils.utils import apply_custom_masking, apply_masking
26
+
27
+
28
def create_legend():
    """Build the color-key panel explaining the diff output colors.

    Returns a rich ``Panel`` (fixed width 80) wrapping a borderless
    two-column table: color name on the left, its meaning on the right.
    """
    key = Table(show_header=False, box=None)
    key.add_column(style="bold")
    key.add_column()
    for color, meaning in (("Red", "Mismatch"), ("Cyan", "Match"), ("Yellow", "Duplicate")):
        key.add_row(color, meaning, style=color.lower())
    return Panel(key, title="Info", border_style="cyan bold", width=80)
36
+
37
+
38
def create_schema_table(response, console, is_source=True):
    """Render the schema of one side of the comparison as a rich table.

    A column is shown in red with a reason when it has no counterpart on
    the other side, or when the schema rules report a mismatch. Each column
    dict is also annotated in place with a ``mismatch_reason`` key (empty
    string when the column matches).
    """
    side = "source_dataset" if is_source else "target_dataset"
    dataset = response[side]
    mappings = response["columns_mappings"]
    counterpart_cols = response["target_dataset" if is_source else "source_dataset"]["columns"]
    rules = RulesRepository.get_instance()

    heading = f"Schema: {dataset['database']}.{dataset['schema']}.{dataset['table_name']}"
    schema_table = Table(title=heading, show_header=True, header_style="bold magenta")
    schema_table.add_column("#")
    schema_table.add_column("Column Name", style="cyan")
    schema_table.add_column("Data Type", style="magenta")
    schema_table.add_column("Reason", style="red")

    own_key = "source_column" if is_source else "target_column"
    other_key = "target_column" if is_source else "source_column"
    exclusive_reason = "Exclusive to source" if is_source else "Exclusive to target"

    for position, col in enumerate(dataset["columns"], start=1):
        col_name = col["column_name"]
        max_len = col.get("character_maximum_length", None)

        # Counterpart column name via the explicit mapping, falling back to
        # a same-name match when no mapping entry exists.
        mapped_name = next((m[other_key] for m in mappings if m[own_key] == col_name), None)
        counterpart = next(
            (c for c in counterpart_cols if c["column_name"] == (mapped_name or col_name)),
            None,
        )

        if counterpart is None:
            reason = exclusive_reason
        else:
            # NOTE: the rules API is always called as (src_col, tgt_col)
            # regardless of which side is being rendered, mirroring the
            # original call site.
            ok, rule_reason = rules.apply_schema_rules(
                src_col=col,
                tgt_col=counterpart,
            )
            reason = "" if ok else (rule_reason or exclusive_reason)

        # Preserves the original label format, including the trailing space
        # when no max length is present (e.g. "int " vs "varchar (10)").
        suffix = "(" + str(max_len) + ")" if max_len is not None else ""
        dtype_label = f"{col['data_type']} {suffix}"
        if reason:
            schema_table.add_row(
                str(position),
                Text(col_name, style="red"),
                Text(dtype_label, style="red"),
                reason,
            )
        else:
            schema_table.add_row(str(position), col_name, dtype_label, Text("-", style="green", justify="left"))
        col["mismatch_reason"] = reason
    console.print(schema_table)
92
+
93
+
94
def create_table_schema_row_count(response, row_diff_table, console):
    """Print the legend, the row-count table, both schema tables and,
    when provided, the pre-built row-diff table."""
    src = response["source_dataset"]
    tgt = response["target_dataset"]

    console.print(create_legend())

    counts = Table(title="Row Counts", show_header=True, header_style="bold magenta")
    counts.add_column("")
    counts.add_column(
        f"{src['database']}.{src['schema']}.{src['table_name']}",
        style="cyan",
    )
    counts.add_column(
        f"{tgt['database']}.{tgt['schema']}.{tgt['table_name']}",
        style="yellow",
    )
    counts.add_row("Row Count", str(src["row_count"]), str(tgt["row_count"]))
    console.print(counts)

    # Source schema first, then target, matching the column order above.
    for side_is_source in (True, False):
        create_schema_table(response, console, is_source=side_is_source)

    if row_diff_table is not None:
        console.print(row_diff_table)
121
+
122
+
123
def process_batch(
    batch: List[Dict[str, Any]],
    provider_class,
    primary_keys,
    fields,
    similarity,
    src_masking_cols: List[str],
    tgt_masking_cols: List[str],
    masking_character: str,
) -> List[Dict[str, Any]]:
    """Annotate a batch of diff records with similarity scores, pairwise.

    The batch is assumed to hold consecutive (source, target) record pairs;
    the provider mutates each pair in place. The batch is returned with its
    size and order unchanged. A falsy ``provider_class`` or a batch shorter
    than one pair passes through untouched; a trailing unpaired record is
    left unscored.
    """
    if not provider_class or len(batch) < 2:
        return batch

    scorer = provider_class()
    # Walk the batch two records at a time: indices (0,1), (2,3), ...
    for start in range(0, len(batch) - 1, 2):
        scorer.add_text_similarity(
            data=batch[start : start + 2],
            key=primary_keys,
            fields=fields,
            similarity=similarity,
            source_masking_cols=src_masking_cols,
            target_masking_cols=tgt_masking_cols,
            mask_char=masking_character,
        )

    return batch
152
+
153
+
154
def differ_rows(
    diff_iter,
    response,
    src_masking_cols: List[str],
    tgt_masking_cols: List[str],
    masking_character: str,
    limit: int | None = None,
    table_limit: int = 100,
    display_table: bool = False,
    similarity=None,
    similarity_providers=None,
    fields=None,
    batch_size: int = 2_000,
    max_workers: int = max(1, multiprocessing.cpu_count() - 2),
    quick_comparison: bool = False,
):
    """Consume a row-diff iterator and assemble the full diff result payload.

    ``diff_iter`` yields ``(sign, row)`` pairs where sign ``"-"`` marks a
    source-side row and ``"+"`` a target-side row; it also exposes
    ``get_stats_dict()`` with id sets for exclusives, duplicates and value
    diffs (presumably a dcs-sdk diff iterator — interface inferred from the
    calls below, confirm against the caller).

    Returns a dict with exclusive/duplicate PK record lists, paired records
    with differences, aggregate ``stats`` and an optional rich ``table``.

    NOTE: ``max_workers`` default is evaluated once at import time.
    """
    # Fast path: answer only "are there any differences at all?" by probing
    # the iterator for a single element. Counts in the returned stats are
    # placeholders (zeros), not real totals.
    if quick_comparison:
        try:
            next(iter(diff_iter))
            return {
                "stats": {
                    "rows_A": 0,
                    "rows_B": 0,
                    "exclusive_A": 0,
                    "exclusive_B": 0,
                    "diff_pk_percent": 0,
                    "unchanged": 0,
                    "total_diff_count": 0,
                    "diff_rows_count": 0,
                    "total_duplicate_count_source": 0,
                    "total_duplicate_count_target": 0,
                    "diff_rows_percent": 0,
                    "has_differences": True,
                },
                "exclusive_pk_values_target": [],
                "exclusive_pk_values_source": [],
                "duplicate_pk_values_source": [],
                "duplicate_pk_values_target": [],
                "records_with_differences": [],
                "table": None,
            }
        except StopIteration:
            # Iterator was empty: no diff rows at all.
            return {
                "stats": {"has_differences": False},
                "exclusive_pk_values_target": [],
                "exclusive_pk_values_source": [],
                "duplicate_pk_values_source": [],
                "duplicate_pk_values_target": [],
                "records_with_differences": [],
                "table": None,
            }
    stats = diff_iter.get_stats_dict()
    # Materialize the id collections as sets for O(1) membership tests in
    # the per-row loop below.
    exclusive_source_set = set(stats["exclusive_source_ids"])
    exclusive_target_set = set(stats["exclusive_target_ids"])
    diff_values_set = set(stats["diff_values_ids"])
    source_duplicates = set(stats["duplicate_source_ids"])
    target_duplicates = set(stats["duplicate_target_ids"])
    pk_key_cols = response["source_dataset"]["primary_keys"]

    exclusive_to_source = []
    exclusive_to_target = []
    duplicates_in_source = []
    duplicates_in_target = []

    seen_ex_source = set()
    seen_ex_target = set()

    # When a limit is given, only the first ``limit`` diff PKs are collected.
    # NOTE(review): set iteration order is arbitrary, so which PKs survive
    # the [:limit] slice is not deterministic — confirm this is acceptable.
    diff_pks_to_collect = set(diff_values_set) if limit is None else set(list(diff_values_set)[:limit])
    diff_records_dict = {}

    total_source_duplicates = 0
    total_target_duplicates = 0
    table_data = []
    table = None

    # Single pass over the diff stream, classifying each row.
    for diff in diff_iter:
        sign, rows = diff
        obj = {"meta": {"origin": "source" if sign == "-" else "target", "sign": sign}}
        column_values = {}

        # Row values are positional; column names come from the source side
        # of the column mappings.
        for idx, col_ in enumerate(rows):
            column_name = response["columns_mappings"][idx]["source_column"]
            obj[column_name] = col_
            column_values[column_name] = col_

        # Keep up to ``table_limit`` raw (unmasked) rows for display.
        if len(table_data) < table_limit:
            table_data.append(obj)
        pk_value = tuple(column_values[col] for col in pk_key_cols)

        # Rows whose PK exists only on the source side (deduped, limited).
        if sign == "-" and pk_value in exclusive_source_set:
            if pk_value not in seen_ex_source and (limit is None or len(exclusive_to_source) < limit):
                masked_obj = apply_masking(obj, src_masking_cols, masking_character)
                exclusive_to_source.append(masked_obj)
                seen_ex_source.add(pk_value)

        # Rows whose PK exists only on the target side (deduped, limited).
        if sign == "+" and pk_value in exclusive_target_set:
            if pk_value not in seen_ex_target and (limit is None or len(exclusive_to_target) < limit):
                masked_obj = apply_masking(obj, tgt_masking_cols, masking_character)
                exclusive_to_target.append(masked_obj)
                seen_ex_target.add(pk_value)

        # Duplicate PKs: every occurrence is counted, storage is limited.
        if sign == "-" and pk_value in source_duplicates:
            total_source_duplicates += 1
            if limit is None or len(duplicates_in_source) < limit:
                masked_obj = apply_masking(obj, src_masking_cols, masking_character)
                duplicates_in_source.append(masked_obj)

        if sign == "+" and pk_value in target_duplicates:
            total_target_duplicates += 1
            if limit is None or len(duplicates_in_target) < limit:
                masked_obj = apply_masking(obj, tgt_masking_cols, masking_character)
                duplicates_in_target.append(masked_obj)

        # Collect the (source, target) record pair for value-diff PKs.
        # NOTE(review): when ``limit`` is None the per-PK cap of 2 is
        # bypassed, so duplicated PKs can collect more than one pair —
        # confirm intended.
        if pk_value in diff_pks_to_collect:
            if pk_value not in diff_records_dict:
                diff_records_dict[pk_value] = []
            if limit is None or len(diff_records_dict[pk_value]) < 2:
                diff_records_dict[pk_value].append(obj.copy())

    def sort_by_pk(obj):
        # Mixed-type-safe PK sort key: numeric PKs (tagged 0) sort before
        # non-numeric ones (tagged 1, compared as strings).
        sort_key = []
        for col in pk_key_cols:
            pk_value = obj[col]
            try:
                sort_key.append((0, int(pk_value)))
            except (ValueError, TypeError):
                sort_key.append((1, str(pk_value)))
        return tuple(sort_key)

    try:
        exclusive_to_source.sort(key=sort_by_pk)
        exclusive_to_target.sort(key=sort_by_pk)
        duplicates_in_source.sort(key=sort_by_pk)
        duplicates_in_target.sort(key=sort_by_pk)
    except:  # noqa: E722 — best-effort sort; unsortable PKs keep stream order
        pass

    records_with_differences = []
    masked_records = []

    for pk_value, records in diff_records_dict.items():
        # reverse=True on the sign puts "-" (source) before "+" (target).
        records.sort(key=lambda x: x["meta"]["sign"], reverse=True)
        if not similarity:
            if len(records) == 2:
                source = records[0]
                target = records[1]

                masked_src, masked_tgt = apply_custom_masking(
                    source=source,
                    target=target,
                    source_masking_cols=src_masking_cols,
                    target_masking_cols=tgt_masking_cols,
                    mask_char=masking_character,
                )

                masked_record = [masked_src, masked_tgt]
                masked_records.extend(masked_record)
        else:
            # Similarity path: records go in unmasked here; masking is
            # applied inside process_batch (it receives the masking cols).
            masked_records.extend(records)

    records_with_differences.extend(masked_records)

    # Resolve the similarity provider class, if similarity scoring is on.
    provider_class = None
    primary_keys = response["source_dataset"]["primary_keys"]
    if similarity and similarity_providers and fields and primary_keys:
        provider_class = similarity_providers.get(similarity.similarity_method.lower())
        if not provider_class:
            print(f"Unknown similarity method: {similarity.similarity_method}")

    if provider_class:
        # Score record pairs in parallel batches. Batch size is forced even
        # so (source, target) pairs are never split across batches.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            effective_batch_size = min(batch_size, len(records_with_differences))
            effective_batch_size = effective_batch_size if effective_batch_size % 2 == 0 else effective_batch_size - 1
            if effective_batch_size < 2:
                effective_batch_size = 2
            batches = [
                records_with_differences[i : i + effective_batch_size]
                for i in range(0, len(records_with_differences), effective_batch_size)
            ]
            futures = [
                executor.submit(
                    process_batch,
                    batch,
                    provider_class,
                    primary_keys,
                    fields,
                    similarity,
                    src_masking_cols,
                    tgt_masking_cols,
                    masking_character,
                )
                for batch in batches
            ]
            i = 0
            # NOTE(review): as_completed yields in completion order, but
            # results are written back sequentially at offset ``i`` — batches
            # may be written to positions other than where they came from.
            # The final sort below likely hides this; confirm intended.
            for future in as_completed(futures):
                try:
                    result = future.result()
                    records_with_differences[i : i + len(result)] = result
                    i += len(result)
                except Exception as e:
                    print(f"Error in batch processing: {str(e)}")

    try:
        records_with_differences.sort(key=sort_by_pk)
    except:  # noqa: E722 — best-effort sort, keep unsorted on failure
        pass

    # Finalize aggregate stats; the raw id collections are dropped from the
    # returned payload.
    stats["total_diff_count"] = len(diff_values_set) + len(exclusive_source_set) + len(exclusive_target_set)
    stats["diff_rows_count"] = len(diff_values_set)
    stats.pop("exclusive_source_ids", None)
    stats.pop("exclusive_target_ids", None)
    stats.pop("diff_values_ids", None)
    stats.pop("duplicate_source_ids", None)
    stats.pop("duplicate_target_ids", None)
    stats["total_duplicate_count_source"] = total_source_duplicates
    stats["total_duplicate_count_target"] = total_target_duplicates
    try:
        # Fraction of compared rows (diff + unchanged) that differ.
        diff_rows_percent = stats.get("diff_rows_count", 0) / (
            stats.get("diff_rows_count", 0) + stats.get("unchanged", 0)
        )
        diff_rows_percent = abs(diff_rows_percent)
    except ZeroDivisionError:
        diff_rows_percent = 0.0
    # Clamp to 1 in case the upstream percentage exceeds 100%.
    stats["diff_pk_percent"] = min(stats.get("diff_pk_percent", 0), 1)
    stats["diff_rows_percent"] = diff_rows_percent
    has_differences = diff_rows_percent != 0 or stats["diff_pk_percent"] != 0
    stats["has_differences"] = has_differences
    stats["source_masked_columns"] = src_masking_cols
    stats["target_masked_columns"] = tgt_masking_cols

    if display_table:
        # Display always caps at 100 rows, independent of ``table_limit``.
        table = create_table_diff_rows(table_data, primary_keys, response["columns_mappings"], 100)

    return {
        "exclusive_pk_values_target": exclusive_to_target,
        "exclusive_pk_values_source": exclusive_to_source,
        "duplicate_pk_values_source": duplicates_in_source,
        "duplicate_pk_values_target": duplicates_in_target,
        "records_with_differences": records_with_differences,
        "stats": stats,
        "table": table,
    }
396
+
397
+
398
def create_table_diff_rows(data, primary_keys: Union[str, list[str]], columns_mappings, limit: int = 100):
    """Render diff rows as a rich table, one row per record.

    Rows from the same composite primary key are grouped into a table
    section and share a serial number; duplicated keys are highlighted in
    yellow and, for a clean 1:1 source/target pair, cells whose values
    differ are highlighted in red. At most ``limit`` distinct keys are
    rendered.
    """
    table = Table(title="Diff Rows", show_header=True, header_style="bold magenta")
    column_mapping_dict = {mapping["source_column"]: mapping["target_column"] for mapping in columns_mappings}

    table.add_column("#")
    table.add_column("Origin")
    # One column per mapping; renamed columns are labeled "target/source".
    for mapping in columns_mappings:
        source_col = mapping["source_column"]
        target_col = mapping["target_column"]
        if source_col == target_col:
            table.add_column(source_col, style="cyan")
        else:
            table.add_column(f"{target_col}/{source_col}", style="cyan")

    if isinstance(primary_keys, str):
        primary_keys = [primary_keys]

    def get_composite_key(row):
        # Tuple of the row's PK values, usable as a dict key.
        return tuple(row[key] for key in primary_keys)

    # Group rows by composite key, then by origin ("source"/"target").
    records = defaultdict(lambda: defaultdict(list))
    for row in data:
        composite_key = get_composite_key(row)
        origin = row["meta"]["origin"]
        records[composite_key][origin].append(row)

    previous_composite_key = None
    serial_number = 0
    unique_keys_processed = set()
    for row in data:
        composite_key = get_composite_key(row)
        # First sighting of a key: enforce the key limit and bump the
        # serial number shared by all of this key's rows.
        if composite_key not in unique_keys_processed:
            if len(unique_keys_processed) >= limit:
                break
            serial_number += 1
        meta_values = row["meta"]
        row_values = {key: row[key] for key in column_mapping_dict.keys()}
        origin = meta_values["origin"]
        has_duplicates = any(len(records[composite_key][orig]) > 1 for orig in records[composite_key])

        # Cell-level mismatch detection only for a clean 1:1 pair, and only
        # computed on the target-origin row.
        # NOTE(review): mismatched_columns is then empty for the source row
        # of the pair, so only the target row gets red cells — confirm this
        # asymmetry is intended.
        mismatched_columns = set()
        if not has_duplicates:
            source_count = len(records[composite_key]["source"])
            target_count = len(records[composite_key]["target"])
            if source_count == 1 and target_count == 1:
                source_row = records[composite_key]["source"][0]
                target_row = records[composite_key]["target"][0]
                if origin == "target":
                    for col in column_mapping_dict.keys():
                        if source_row[col] != target_row[col]:
                            mismatched_columns.add(col)

        # Serial + meta cells; origin is colored green-ish for source rows
        # and cyan-ish for target rows. The "sign" meta field is not shown.
        formatted_cells = [Text(str(serial_number))]
        for col in meta_values:
            if col != "sign":
                formatted_cells.append(
                    Text(
                        str(meta_values[col]),
                        style=f"{'chartreuse2' if meta_values['origin'] == 'source' else 'cyan3'}",
                    )
                )

        # Data cells: yellow for duplicated keys, red for mismatched values,
        # default otherwise.
        for col in column_mapping_dict.keys():
            cell_value = row_values[col]
            if has_duplicates:
                formatted_cells.append(Text(str(cell_value), style="yellow bold"))
            elif col in mismatched_columns:
                formatted_cells.append(Text(str(cell_value), style="red bold"))
            else:
                formatted_cells.append(Text(str(cell_value)))

        # Visual separator between different composite keys.
        if previous_composite_key is not None and previous_composite_key != composite_key:
            table.add_section()

        table.add_row(*formatted_cells)
        previous_composite_key = composite_key
        unique_keys_processed.add(composite_key)
    return table
@@ -0,0 +1,40 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from rich.terminal_theme import TerminalTheme
16
+
17
# Dark terminal color theme for rich console export: near-black background,
# white foreground, fully saturated primaries for the normal ANSI palette and
# muted mid-tone variants for the "bright" palette.
theme_1 = TerminalTheme(
    background=(20, 20, 20),  # RGB triplet for grey
    foreground=(255, 255, 255),  # RGB triplet for white
    # Standard 8-color ANSI palette (indices 0-7).
    normal=[
        (0, 0, 0),  # black
        (255, 0, 0),  # red
        (0, 255, 0),  # green
        (255, 255, 0),  # yellow
        (0, 0, 255),  # blue
        (255, 0, 255),  # magenta
        (0, 255, 255),  # cyan
        (255, 255, 255),  # white
    ],
    # "Bright" ANSI palette (indices 8-15) — deliberately dimmer mid-tones
    # rather than the conventional brighter variants.
    bright=[
        (64, 64, 64),  # medium grey
        (128, 64, 64),  # medium red
        (64, 128, 64),  # medium green
        (128, 128, 64),  # medium yellow
        (64, 64, 128),  # medium blue
        (128, 64, 128),  # medium magenta
        (64, 128, 128),  # medium cyan
        (128, 128, 128),  # medium white (grey)
    ],
)