dcs-sdk 1.6.5 (dcs_sdk-1.6.5-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (159)
  1. data_diff/__init__.py +221 -0
  2. data_diff/__main__.py +517 -0
  3. data_diff/abcs/__init__.py +13 -0
  4. data_diff/abcs/compiler.py +27 -0
  5. data_diff/abcs/database_types.py +402 -0
  6. data_diff/config.py +141 -0
  7. data_diff/databases/__init__.py +38 -0
  8. data_diff/databases/_connect.py +323 -0
  9. data_diff/databases/base.py +1417 -0
  10. data_diff/databases/bigquery.py +376 -0
  11. data_diff/databases/clickhouse.py +217 -0
  12. data_diff/databases/databricks.py +262 -0
  13. data_diff/databases/duckdb.py +207 -0
  14. data_diff/databases/mssql.py +343 -0
  15. data_diff/databases/mysql.py +189 -0
  16. data_diff/databases/oracle.py +238 -0
  17. data_diff/databases/postgresql.py +293 -0
  18. data_diff/databases/presto.py +222 -0
  19. data_diff/databases/redis.py +93 -0
  20. data_diff/databases/redshift.py +233 -0
  21. data_diff/databases/snowflake.py +222 -0
  22. data_diff/databases/sybase.py +720 -0
  23. data_diff/databases/trino.py +73 -0
  24. data_diff/databases/vertica.py +174 -0
  25. data_diff/diff_tables.py +489 -0
  26. data_diff/errors.py +17 -0
  27. data_diff/format.py +369 -0
  28. data_diff/hashdiff_tables.py +1026 -0
  29. data_diff/info_tree.py +76 -0
  30. data_diff/joindiff_tables.py +434 -0
  31. data_diff/lexicographic_space.py +253 -0
  32. data_diff/parse_time.py +88 -0
  33. data_diff/py.typed +0 -0
  34. data_diff/queries/__init__.py +13 -0
  35. data_diff/queries/api.py +213 -0
  36. data_diff/queries/ast_classes.py +811 -0
  37. data_diff/queries/base.py +38 -0
  38. data_diff/queries/extras.py +43 -0
  39. data_diff/query_utils.py +70 -0
  40. data_diff/schema.py +67 -0
  41. data_diff/table_segment.py +583 -0
  42. data_diff/thread_utils.py +112 -0
  43. data_diff/utils.py +1022 -0
  44. data_diff/version.py +15 -0
  45. dcs_core/__init__.py +13 -0
  46. dcs_core/__main__.py +17 -0
  47. dcs_core/__version__.py +15 -0
  48. dcs_core/cli/__init__.py +13 -0
  49. dcs_core/cli/cli.py +165 -0
  50. dcs_core/core/__init__.py +19 -0
  51. dcs_core/core/common/__init__.py +13 -0
  52. dcs_core/core/common/errors.py +50 -0
  53. dcs_core/core/common/models/__init__.py +13 -0
  54. dcs_core/core/common/models/configuration.py +284 -0
  55. dcs_core/core/common/models/dashboard.py +24 -0
  56. dcs_core/core/common/models/data_source_resource.py +75 -0
  57. dcs_core/core/common/models/metric.py +160 -0
  58. dcs_core/core/common/models/profile.py +75 -0
  59. dcs_core/core/common/models/validation.py +216 -0
  60. dcs_core/core/common/models/widget.py +44 -0
  61. dcs_core/core/configuration/__init__.py +13 -0
  62. dcs_core/core/configuration/config_loader.py +139 -0
  63. dcs_core/core/configuration/configuration_parser.py +262 -0
  64. dcs_core/core/configuration/configuration_parser_arc.py +328 -0
  65. dcs_core/core/datasource/__init__.py +13 -0
  66. dcs_core/core/datasource/base.py +62 -0
  67. dcs_core/core/datasource/manager.py +112 -0
  68. dcs_core/core/datasource/search_datasource.py +421 -0
  69. dcs_core/core/datasource/sql_datasource.py +1094 -0
  70. dcs_core/core/inspect.py +163 -0
  71. dcs_core/core/logger/__init__.py +13 -0
  72. dcs_core/core/logger/base.py +32 -0
  73. dcs_core/core/logger/default_logger.py +94 -0
  74. dcs_core/core/metric/__init__.py +13 -0
  75. dcs_core/core/metric/base.py +220 -0
  76. dcs_core/core/metric/combined_metric.py +98 -0
  77. dcs_core/core/metric/custom_metric.py +34 -0
  78. dcs_core/core/metric/manager.py +137 -0
  79. dcs_core/core/metric/numeric_metric.py +403 -0
  80. dcs_core/core/metric/reliability_metric.py +90 -0
  81. dcs_core/core/profiling/__init__.py +13 -0
  82. dcs_core/core/profiling/datasource_profiling.py +136 -0
  83. dcs_core/core/profiling/numeric_field_profiling.py +72 -0
  84. dcs_core/core/profiling/text_field_profiling.py +67 -0
  85. dcs_core/core/repository/__init__.py +13 -0
  86. dcs_core/core/repository/metric_repository.py +77 -0
  87. dcs_core/core/utils/__init__.py +13 -0
  88. dcs_core/core/utils/log.py +29 -0
  89. dcs_core/core/utils/tracking.py +105 -0
  90. dcs_core/core/utils/utils.py +44 -0
  91. dcs_core/core/validation/__init__.py +13 -0
  92. dcs_core/core/validation/base.py +230 -0
  93. dcs_core/core/validation/completeness_validation.py +153 -0
  94. dcs_core/core/validation/custom_query_validation.py +24 -0
  95. dcs_core/core/validation/manager.py +282 -0
  96. dcs_core/core/validation/numeric_validation.py +276 -0
  97. dcs_core/core/validation/reliability_validation.py +91 -0
  98. dcs_core/core/validation/uniqueness_validation.py +61 -0
  99. dcs_core/core/validation/validity_validation.py +738 -0
  100. dcs_core/integrations/__init__.py +13 -0
  101. dcs_core/integrations/databases/__init__.py +13 -0
  102. dcs_core/integrations/databases/bigquery.py +187 -0
  103. dcs_core/integrations/databases/databricks.py +51 -0
  104. dcs_core/integrations/databases/db2.py +652 -0
  105. dcs_core/integrations/databases/elasticsearch.py +61 -0
  106. dcs_core/integrations/databases/mssql.py +829 -0
  107. dcs_core/integrations/databases/mysql.py +409 -0
  108. dcs_core/integrations/databases/opensearch.py +64 -0
  109. dcs_core/integrations/databases/oracle.py +719 -0
  110. dcs_core/integrations/databases/postgres.py +482 -0
  111. dcs_core/integrations/databases/redshift.py +53 -0
  112. dcs_core/integrations/databases/snowflake.py +48 -0
  113. dcs_core/integrations/databases/spark_df.py +111 -0
  114. dcs_core/integrations/databases/sybase.py +1069 -0
  115. dcs_core/integrations/storage/__init__.py +13 -0
  116. dcs_core/integrations/storage/local_file.py +149 -0
  117. dcs_core/integrations/utils/__init__.py +13 -0
  118. dcs_core/integrations/utils/utils.py +36 -0
  119. dcs_core/report/__init__.py +13 -0
  120. dcs_core/report/dashboard.py +211 -0
  121. dcs_core/report/models.py +88 -0
  122. dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  123. dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  124. dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  125. dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  126. dcs_core/report/static/assets/images/docs.svg +6 -0
  127. dcs_core/report/static/assets/images/github.svg +4 -0
  128. dcs_core/report/static/assets/images/logo.svg +7 -0
  129. dcs_core/report/static/assets/images/slack.svg +13 -0
  130. dcs_core/report/static/index.js +2 -0
  131. dcs_core/report/static/index.js.LICENSE.txt +3971 -0
  132. dcs_sdk/__init__.py +13 -0
  133. dcs_sdk/__main__.py +18 -0
  134. dcs_sdk/__version__.py +15 -0
  135. dcs_sdk/cli/__init__.py +13 -0
  136. dcs_sdk/cli/cli.py +163 -0
  137. dcs_sdk/sdk/__init__.py +58 -0
  138. dcs_sdk/sdk/config/__init__.py +13 -0
  139. dcs_sdk/sdk/config/config_loader.py +491 -0
  140. dcs_sdk/sdk/data_diff/__init__.py +13 -0
  141. dcs_sdk/sdk/data_diff/data_differ.py +821 -0
  142. dcs_sdk/sdk/rules/__init__.py +15 -0
  143. dcs_sdk/sdk/rules/rules_mappping.py +31 -0
  144. dcs_sdk/sdk/rules/rules_repository.py +214 -0
  145. dcs_sdk/sdk/rules/schema_rules.py +65 -0
  146. dcs_sdk/sdk/utils/__init__.py +13 -0
  147. dcs_sdk/sdk/utils/serializer.py +25 -0
  148. dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
  149. dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
  150. dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
  151. dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
  152. dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
  153. dcs_sdk/sdk/utils/table.py +475 -0
  154. dcs_sdk/sdk/utils/themes.py +40 -0
  155. dcs_sdk/sdk/utils/utils.py +349 -0
  156. dcs_sdk-1.6.5.dist-info/METADATA +150 -0
  157. dcs_sdk-1.6.5.dist-info/RECORD +159 -0
  158. dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
  159. dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
dcs_sdk/sdk/config/config_loader.py
@@ -0,0 +1,491 @@
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import os
+ import uuid
+ from typing import Dict, List, Literal, Optional, Union
+
+ import yaml
+ from dotenv import load_dotenv
+ from pydantic import BaseModel
+
+ from dcs_sdk.sdk.rules import RulesRepository
+
+
+ class InvalidUUIDError(ValueError):
+     pass
+
+
+ class MissingRequiredFieldError(ValueError):
+     pass
+
+
+ class InvalidConnectionTypeError(ValueError):
+     pass
+
+
+ class InvalidSimilarityMethodError(ValueError):
+     pass
+
+
+ class SourceTargetConnection(BaseModel):
+     id: Optional[Union[str, None]] = None
+     name: str
+     workspace: Optional[str] = "default"
+     host: Optional[str] = None
+     port: Optional[Union[int, str]] = None
+     driver: str
+     table: Optional[str] = None
+     database: Optional[str] = None
+     filepath: Optional[str] = None
+     catalog: Optional[str] = None
+     schema_name: Optional[str] = None
+     warehouse: Optional[str] = None
+     role: Optional[str] = None
+     account: Optional[str] = None
+     username: Optional[str] = None
+     password: Optional[str] = None
+     http_path: Optional[str] = None
+     access_token: Optional[str] = None
+     odbc_driver: Optional[str] = None
+     server: Optional[str] = None
+     project: Optional[str] = None  # bigquery specific
+     dataset: Optional[str] = None  # bigquery specific
+     keyfile: Optional[str] = None  # bigquery specific
+     impersonate_service_account: Optional[str] = None  # bigquery specific
+     bigquery_credentials: Optional[str] = None  # bigquery specific
+     transform_columns: Dict[str, str] | None = None
+
+
+ class SimilarityConfig(BaseModel):
+     pre_processing: List[str]
+     similarity_method: str
+     threshold: float
+
+
+ class DiffAdvancedConfig(BaseModel):
+     bisection_factor: int = 10
+     bisection_threshold: int = 50_000
+     max_threadpool_size: int = 2
+     egress_limit: int = 5_00_000
+     per_column_diff_limit: int = 100
+     timeout_limit: int = 60 * 5  # minutes
+     in_memory_diff: bool = False  # Whether to perform diff in memory (may use more RAM)
+
+
+ class Comparison(BaseModel):
+     comparison_name: str
+     job_id: Optional[int] = None
+     source: SourceTargetConnection
+     target: SourceTargetConnection
+     source_columns: Optional[List[str]] = None
+     target_columns: Optional[List[str]] = None
+     primary_keys_source: List[str] = []
+     primary_keys_target: List[str] = []
+     source_filter: Optional[str] = None
+     target_filter: Optional[str] = None
+     source_query: Optional[str] = None
+     target_query: Optional[str] = None
+     temporary_schema_source: Optional[str] = None
+     temporary_schema_target: Optional[str] = None
+     similarity: Optional[SimilarityConfig] = None
+     view_name_source: Optional[str] = None
+     view_name_target: Optional[str] = None
+     advanced_configuration: DiffAdvancedConfig
+     limit: Union[int, None, str] = "10%"
+     strict: bool = True  # Used for strict comparison with matching column data types
+     quick_comparison: bool = False  # Used for quick overview of the comparison
+     source_masking_columns: Optional[List[str]] = None
+     target_masking_columns: Optional[List[str]] = None
+     masking_character: str = "*"
+     schema_diff: bool = False  # Used for schema diff
+
+
+ class EnvYamlLoader(yaml.SafeLoader):
+     """YAML Loader with `!ENV` constructor."""
+
+     def __init__(self, stream):
+         super(EnvYamlLoader, self).__init__(stream)
+         self.add_constructor("!ENV", self.env_constructor)
+
+     @classmethod
+     def env_constructor(cls, loader, node):
+         value = loader.construct_scalar(node)
+         env_var = value.strip("${} ")
+         return os.environ.get(env_var, "")
+
+
+ class DataDiffConfig:
+     DRIVER_MAP = {
+         "file": "duckdb",
+         "duckdb": "duckdb",
+         "postgres": "postgres",
+         "postgresql": "postgres",
+         "snowflake": "snowflake",
+         "trino": "trino",
+         "databricks": "databricks",
+         "oracle": "oracle",
+         "mssql": "mssql",
+         "mysql": "mysql",
+         "sybase": "sybase",
+         "bigquery": "bigquery",
+     }
+
+     def __init__(
+         self,
+         yaml_file_path: Optional[str] = None,
+         yaml_string: Optional[str] = None,
+         config_json: Optional[dict] = None,
+     ):
+         load_dotenv()
+         if yaml_file_path:
+             self.data = self.read_yaml_file(yaml_file_path)
+         elif yaml_string:
+             self.data = self.read_yaml_string(yaml_string)
+         elif config_json:
+             self.data = config_json
+         else:
+             raise ValueError("No configuration provided")
+         self.rules_repo = RulesRepository.get_instance()
+
+     @staticmethod
+     def read_yaml_file(file_path: str) -> dict:
+         with open(file_path, "r") as file:
+             return yaml.load(file, Loader=EnvYamlLoader)
+
+     @staticmethod
+     def read_yaml_string(yaml_string: str) -> dict:
+         return yaml.load(yaml_string, Loader=EnvYamlLoader)
+
+     @staticmethod
+     def is_valid_uuid(val: str) -> bool:
+         try:
+             uuid.UUID(str(val))
+             return True
+         except ValueError:
+             return False
+
+     def validate_uuid(self, uuid_str: str | None, field_name: str) -> None:
+         if uuid_str is not None and not self.is_valid_uuid(uuid_str):
+             raise InvalidUUIDError(f"{field_name} is not a valid UUID")
+
+     @staticmethod
+     def validate_required_field(value: Union[str, None], field_name: str, source_name: str) -> None:
+         if value is None:
+             raise MissingRequiredFieldError(f"{field_name} is required for datasource {source_name}")
+
+     @staticmethod
+     def validate_file_connection(connection: dict) -> None:
+         if connection.get("type") == "file" and connection.get("filepath") is None:
+             raise MissingRequiredFieldError("file path is required for file connection")
+
+     @staticmethod
+     def validate_databricks_connection(connection: dict) -> None:
+         if connection.get("type") == "databricks":
+             if connection.get("connection", {}).get("http_path") is None:
+                 raise MissingRequiredFieldError("http_path is required for databricks connection")
+             if connection.get("connection", {}).get("access_token") is None:
+                 raise MissingRequiredFieldError("access_token is required for databricks connection")
+
+     @staticmethod
+     def validate_host_or_server(connection: dict) -> None:
+         if connection.get("type") == "sybase":
+             if not connection.get("connection", {}).get("host") and not connection.get("connection", {}).get("server"):
+                 raise MissingRequiredFieldError("host or server is required for connection")
+
+     @staticmethod
+     def validate_comparison_by_query(
+         comparison_data: dict,
+         field_name: Literal["source", "target"],
+         temporary_schema: str | None,
+         database_type: str,
+         view_name: str | None,
+     ) -> None:
+         if comparison_data.get(field_name, {}).get("query") is not None:
+             if comparison_data.get(field_name, {}).get("table") is not None:
+                 raise ValueError(f"table and query cannot be used together in {field_name} connection")
+             if comparison_data.get(field_name, {}).get("filter") is not None:
+                 raise ValueError(f"filter and query cannot be used together in {field_name} connection")
+             if database_type in ["file", "oracle"]:
+                 return
+             if temporary_schema is None:
+                 raise ValueError("temporary_schema is required for query based comparison")
+             if view_name is not None and len(view_name.split(".")) > 1:
+                 raise ValueError("view_name should not contain schema name")
+
+     @staticmethod
+     def validate_similarity_threshold(threshold: float) -> None:
+         if threshold is None:
+             raise MissingRequiredFieldError("threshold is required for similarity")
+         if not 0 <= threshold <= 1:
+             raise ValueError("Similarity threshold must be between 0 and 1")
+         return threshold
+
+     def get_driver(self, connection: dict) -> str:
+         connection_type = connection.get("type")
+         if connection_type not in self.DRIVER_MAP:
+             raise InvalidConnectionTypeError(f"Invalid connection type: {connection_type}")
+         return self.DRIVER_MAP[connection_type]
+
+     def get_similarity_method(self, similarity_method: str) -> str:
+         if similarity_method is None:
+             raise MissingRequiredFieldError("similarity_method is required for similarity")
+         similarity_methods = ["jaccard", "cosine", "levenshtein"]
+         if similarity_method not in similarity_methods:
+             raise InvalidSimilarityMethodError(f"Invalid similarity method: {similarity_method}")
+         return similarity_method
+
+     def get_pre_processing_methods(self, pre_processing: List[str]) -> List[str]:
+         if pre_processing is None:
+             raise MissingRequiredFieldError("pre_processing is required for similarity")
+         pre_processing_methods = ["lower_case", "remove_punctuation", "remove_stop_words", "remove_extra_whitespaces"]
+         for method in pre_processing:
+             if method not in pre_processing_methods:
+                 raise ValueError(f"Invalid pre_processing method: {method}")
+         return pre_processing
+
+     def create_connection_config(
+         self,
+         connection: dict,
+         comparison_data: dict,
+         is_source: bool,
+         temporary_schema: str | None,
+         view_name: str | None,
+         transform_columns: Dict[str, str] | None = None,
+     ) -> dict:
+         self.validate_uuid(connection.get("id", None), "Datasource id")
+         self.validate_required_field(connection.get("name"), "connection name", source_name=connection.get("name"))
+         self.validate_required_field(connection.get("type"), "connection type", source_name=connection.get("name"))
+         self.validate_file_connection(connection)
+         self.validate_databricks_connection(connection)
+         self.validate_host_or_server(connection)
+         self.validate_comparison_by_query(
+             comparison_data,
+             "source" if is_source else "target",
+             temporary_schema,
+             connection.get("type"),
+             view_name,
+         )
+
+         driver = self.get_driver(connection)
+
+         return {
+             "id": connection.get("id", None),
+             "name": connection.get("name"),
+             "workspace": connection.get("workspace", "default"),
+             "host": connection.get("connection", {}).get("host", ""),
+             "port": connection.get("connection", {}).get("port", None),
+             "account": connection.get("connection", {}).get("account"),
+             "warehouse": connection.get("connection", {}).get("warehouse"),
+             "role": connection.get("connection", {}).get("role"),
+             "driver": driver,
+             "table": comparison_data.get("source" if is_source else "target", {}).get("table"),
+             "database": connection.get("connection", {}).get("database"),
+             "catalog": connection.get("connection", {}).get("catalog"),
+             "schema_name": connection.get("connection", {}).get("schema"),
+             "username": connection.get("connection", {}).get("username"),
+             "password": connection.get("connection", {}).get("password"),
+             "http_path": connection.get("connection", {}).get("http_path"),
+             "access_token": connection.get("connection", {}).get("access_token"),
+             "filepath": connection.get("filepath"),
+             "odbc_driver": connection.get("connection", {}).get("odbc_driver"),
+             "server": connection.get("connection", {}).get("server"),
+             "project": connection.get("connection", {}).get("project"),
+             "dataset": connection.get("connection", {}).get("dataset"),
+             "keyfile": connection.get("connection", {}).get("keyfile"),
+             "impersonate_service_account": connection.get("connection", {}).get("impersonate_service_account"),
+             "bigquery_credentials": connection.get("connection", {}).get("bigquery_credentials"),
+             "transform_columns": transform_columns,
+         }
+
+     def get_data_diff_configs(self) -> List[Comparison]:
+         data_sources = {
+             ds["name"]: {
+                 "name": ds.get("name"),
+                 "id": ds.get("id", None),
+                 "type": ds.get("type"),
+                 "workspace": ds.get("workspace", "default"),
+                 "connection": ds.get("connection", {}),
+                 "filepath": ds.get("file_path"),
+                 "temporary_schema": ds.get("temporary_schema"),
+                 "view_name": ds.get("view_name"),
+             }
+             for ds in self.data["data_sources"]
+         }
+
+         rules = self.data.get("rules", []) or []
+
+         for rule in rules:
+             rule_id = rule.get("id")
+             if rule_id:
+                 self.rules_repo.register(rule_id, rule)
+
+         new_structure = []
+
+         for comparison_name, comparison_data in self.data["comparisons"].items():
+             source_connection = data_sources[comparison_data["source"]["data_source"]]
+             target_connection = data_sources[comparison_data["target"]["data_source"]]
+
+             source_masking_cols = comparison_data.get("source", {}).get("masking_columns")
+             target_masking_cols = comparison_data.get("target", {}).get("masking_columns")
+
+             masking_character = comparison_data.get("masking_configuration", {}).get("mask_character", "*") or "*"
+
+             schema_overrides = comparison_data.get("schema_overrides", []) or []
+             self.rules_repo.register_schema_rules(schema_rules=schema_overrides)
+
+             transform_columns = comparison_data.get("transform_columns", {}) or {}
+             self.rules_repo.register_value_rules(value_rules=transform_columns)
+
+             source_transform_columns = {}
+             target_transform_columns = {}
+
+             source_transform_configs = transform_columns.get("source", []) or []
+             if source_transform_configs:
+                 for source_transform_config in source_transform_configs:
+                     column = source_transform_config.get("name")
+                     rule_id = source_transform_config.get("rule")
+                     rule = self.rules_repo.get(rule_id)
+
+                     if not rule:
+                         raise ValueError(f"Rule with '{rule_id}' not found in rules repository")
+
+                     transformation_template = rule["transformation"]
+                     transformation_query = self._build_query(column, transformation_template)
+                     source_transform_columns[column] = transformation_template
+
+             target_transform_configs = transform_columns.get("target", []) or []
+             if target_transform_configs:
+                 for target_transform_config in target_transform_configs:
+                     column = target_transform_config.get("name")
+                     rule_id = target_transform_config.get("rule")
+                     rule = self.rules_repo.get(rule_id)
+
+                     if not rule:
+                         raise ValueError(f"Rule with '{rule_id}' not found in rules repository")
+
+                     transformation_template = rule["transformation"]
+                     transformation_query = self._build_query(column, transformation_template)
+                     target_transform_columns[column] = transformation_template
+
+             temporary_schema_source = source_connection.get("temporary_schema")
+             temporary_schema_target = target_connection.get("temporary_schema")
+
+             view_name_source = comparison_data.get("source", {}).get("view_name", None)
+             view_name_target = comparison_data.get("target", {}).get("view_name", None)
+
+             source_to_target = {
+                 item["source_column"]: item["target_column"] for item in comparison_data.get("columns_mappings", {})
+             }
+
+             source_columns = comparison_data.get("columns", [])
+             limit = comparison_data.get("limit", None)
+             strict = comparison_data.get("strict", True)
+             quick_comparison = comparison_data.get("quick_comparison", False)
+             target_columns = [source_to_target.get(col, col) for col in source_columns]
+             schema_diff = comparison_data.get("schema_diff", False)
+             if quick_comparison and schema_diff:
+                 raise ValueError("quick_comparison and schema_diff cannot be used together")
+             assert len(source_columns) == len(
+                 target_columns
+             ), "source_columns and target_columns must have the same length"
+             if not schema_diff and not (source_columns or target_columns):
+                 raise MissingRequiredFieldError("source_columns and target_columns are required for comparison")
+
+             primary_keys_source = comparison_data.get("key_columns", [])
+             if not primary_keys_source and not schema_diff:
+                 raise MissingRequiredFieldError("key_columns are required for comparison")
+             primary_keys_target = [source_to_target.get(pk, pk) for pk in primary_keys_source]
+
+             similarity_data = comparison_data.get("similarity")
+             similarity = (
+                 SimilarityConfig(
+                     pre_processing=self.get_pre_processing_methods(similarity_data.get("pre_processing", None)),
+                     similarity_method=self.get_similarity_method(similarity_data.get("similarity_method", None)),
+                     threshold=self.validate_similarity_threshold(similarity_data.get("threshold", None)),
+                 )
+                 if similarity_data
+                 else None
+             )
+             advanced_diff_config = comparison_data.get("advanced_configuration", {})
+             advanced_configuration = DiffAdvancedConfig(
+                 bisection_factor=advanced_diff_config.get("bisection_factor", 10),
+                 bisection_threshold=advanced_diff_config.get("bisection_threshold", 50_000),
+                 max_threadpool_size=advanced_diff_config.get("max_threadpool_size", 2),
+                 egress_limit=advanced_diff_config.get("egress_limit", 5_00_000),
+                 per_column_diff_limit=advanced_diff_config.get("per_column_diff_limit", 100),
+                 timeout_limit=advanced_diff_config.get("timeout_limit", 60 * 5),
+                 in_memory_diff=advanced_diff_config.get("in_memory_diff", False),
+             )
+             new_comparison = {
+                 "comparison_name": comparison_name,
+                 "job_id": comparison_data.get("job_id", None),
+                 "source": self.create_connection_config(
+                     source_connection,
+                     comparison_data,
+                     True,
+                     temporary_schema_source,
+                     view_name_source,
+                     transform_columns=source_transform_columns,
+                 ),
+                 "target": self.create_connection_config(
+                     target_connection,
+                     comparison_data,
+                     False,
+                     temporary_schema_target,
+                     view_name_target,
+                     transform_columns=target_transform_columns,
+                 ),
+                 "source_columns": source_columns,
+                 "target_columns": target_columns,
+                 "primary_keys_source": primary_keys_source,
+                 "primary_keys_target": primary_keys_target,
+                 "source_filter": comparison_data.get("source", {}).get("filter", None),
+                 "target_filter": comparison_data.get("target", {}).get("filter", None),
+                 "source_query": comparison_data.get("source", {}).get("query", None),
+                 "target_query": comparison_data.get("target", {}).get("query", None),
+                 "temporary_schema_source": temporary_schema_source,
+                 "temporary_schema_target": temporary_schema_target,
+                 "similarity": similarity,
+                 "view_name_source": view_name_source,
+                 "view_name_target": view_name_target,
+                 "advanced_configuration": advanced_configuration,
+                 "limit": limit,
+                 "strict": strict,
+                 "quick_comparison": quick_comparison,
+                 "source_masking_columns": source_masking_cols,
+                 "target_masking_columns": target_masking_cols,
+                 "masking_character": masking_character,
+                 "schema_diff": schema_diff,
+             }
+             new_structure.append(Comparison(**new_comparison))
+
+         return new_structure
+
+     def _build_query(self, column, transformation_template):
+         transformation_query = transformation_template.format(column=column)
+         return transformation_query
+
+
+ def data_diff_config_loader(
+     config_path: Optional[str] = None,
+     config_yaml: Optional[str] = None,
+     config_json: Optional[dict] = None,
+ ) -> List[Comparison]:
+     config = DataDiffConfig(
+         yaml_file_path=config_path,
+         yaml_string=config_yaml,
+         config_json=config_json,
+     )
+     return config.get_data_diff_configs()
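
The hunk above (dcs_sdk/sdk/config/config_loader.py) turns a YAML file, YAML string, or JSON dict into a list of Comparison models. The sketch below is illustrative and not part of the package: the import path is inferred from the file listing, and the data_sources / comparisons field names mirror what get_data_diff_configs() reads, so the exact schema accepted by the published wheel may differ.

import os

from dcs_sdk.sdk.config.config_loader import data_diff_config_loader

# Hypothetical credential; the !ENV constructor in EnvYamlLoader resolves it from the environment.
os.environ.setdefault("SOURCE_DB_PASSWORD", "example")

CONFIG_YAML = """
data_sources:
  - name: source_db
    type: postgres
    connection:
      host: localhost
      port: 5432
      database: analytics
      schema: public
      username: dbuser
      password: !ENV ${SOURCE_DB_PASSWORD}
  - name: target_db
    type: postgres
    connection:
      host: localhost
      port: 5433
      database: analytics_replica
      schema: public
      username: dbuser
      password: !ENV ${SOURCE_DB_PASSWORD}
comparisons:
  orders_diff:
    source:
      data_source: source_db
      table: orders
    target:
      data_source: target_db
      table: orders
    columns: [order_id, status, amount]
    key_columns: [order_id]
"""

# Parsing and validation only; no database connection is opened at this point.
comparisons = data_diff_config_loader(config_yaml=CONFIG_YAML)
for comparison in comparisons:
    print(comparison.comparison_name, comparison.source.driver, comparison.target.table)
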
@@ -0,0 +1,13 @@
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.