dcs_sdk-1.6.5-py3-none-any.whl
This diff shows the contents of a publicly available package version released to one of the supported registries. It is provided for informational purposes only and reflects the package as it appears in its public registry.
- data_diff/__init__.py +221 -0
- data_diff/__main__.py +517 -0
- data_diff/abcs/__init__.py +13 -0
- data_diff/abcs/compiler.py +27 -0
- data_diff/abcs/database_types.py +402 -0
- data_diff/config.py +141 -0
- data_diff/databases/__init__.py +38 -0
- data_diff/databases/_connect.py +323 -0
- data_diff/databases/base.py +1417 -0
- data_diff/databases/bigquery.py +376 -0
- data_diff/databases/clickhouse.py +217 -0
- data_diff/databases/databricks.py +262 -0
- data_diff/databases/duckdb.py +207 -0
- data_diff/databases/mssql.py +343 -0
- data_diff/databases/mysql.py +189 -0
- data_diff/databases/oracle.py +238 -0
- data_diff/databases/postgresql.py +293 -0
- data_diff/databases/presto.py +222 -0
- data_diff/databases/redis.py +93 -0
- data_diff/databases/redshift.py +233 -0
- data_diff/databases/snowflake.py +222 -0
- data_diff/databases/sybase.py +720 -0
- data_diff/databases/trino.py +73 -0
- data_diff/databases/vertica.py +174 -0
- data_diff/diff_tables.py +489 -0
- data_diff/errors.py +17 -0
- data_diff/format.py +369 -0
- data_diff/hashdiff_tables.py +1026 -0
- data_diff/info_tree.py +76 -0
- data_diff/joindiff_tables.py +434 -0
- data_diff/lexicographic_space.py +253 -0
- data_diff/parse_time.py +88 -0
- data_diff/py.typed +0 -0
- data_diff/queries/__init__.py +13 -0
- data_diff/queries/api.py +213 -0
- data_diff/queries/ast_classes.py +811 -0
- data_diff/queries/base.py +38 -0
- data_diff/queries/extras.py +43 -0
- data_diff/query_utils.py +70 -0
- data_diff/schema.py +67 -0
- data_diff/table_segment.py +583 -0
- data_diff/thread_utils.py +112 -0
- data_diff/utils.py +1022 -0
- data_diff/version.py +15 -0
- dcs_core/__init__.py +13 -0
- dcs_core/__main__.py +17 -0
- dcs_core/__version__.py +15 -0
- dcs_core/cli/__init__.py +13 -0
- dcs_core/cli/cli.py +165 -0
- dcs_core/core/__init__.py +19 -0
- dcs_core/core/common/__init__.py +13 -0
- dcs_core/core/common/errors.py +50 -0
- dcs_core/core/common/models/__init__.py +13 -0
- dcs_core/core/common/models/configuration.py +284 -0
- dcs_core/core/common/models/dashboard.py +24 -0
- dcs_core/core/common/models/data_source_resource.py +75 -0
- dcs_core/core/common/models/metric.py +160 -0
- dcs_core/core/common/models/profile.py +75 -0
- dcs_core/core/common/models/validation.py +216 -0
- dcs_core/core/common/models/widget.py +44 -0
- dcs_core/core/configuration/__init__.py +13 -0
- dcs_core/core/configuration/config_loader.py +139 -0
- dcs_core/core/configuration/configuration_parser.py +262 -0
- dcs_core/core/configuration/configuration_parser_arc.py +328 -0
- dcs_core/core/datasource/__init__.py +13 -0
- dcs_core/core/datasource/base.py +62 -0
- dcs_core/core/datasource/manager.py +112 -0
- dcs_core/core/datasource/search_datasource.py +421 -0
- dcs_core/core/datasource/sql_datasource.py +1094 -0
- dcs_core/core/inspect.py +163 -0
- dcs_core/core/logger/__init__.py +13 -0
- dcs_core/core/logger/base.py +32 -0
- dcs_core/core/logger/default_logger.py +94 -0
- dcs_core/core/metric/__init__.py +13 -0
- dcs_core/core/metric/base.py +220 -0
- dcs_core/core/metric/combined_metric.py +98 -0
- dcs_core/core/metric/custom_metric.py +34 -0
- dcs_core/core/metric/manager.py +137 -0
- dcs_core/core/metric/numeric_metric.py +403 -0
- dcs_core/core/metric/reliability_metric.py +90 -0
- dcs_core/core/profiling/__init__.py +13 -0
- dcs_core/core/profiling/datasource_profiling.py +136 -0
- dcs_core/core/profiling/numeric_field_profiling.py +72 -0
- dcs_core/core/profiling/text_field_profiling.py +67 -0
- dcs_core/core/repository/__init__.py +13 -0
- dcs_core/core/repository/metric_repository.py +77 -0
- dcs_core/core/utils/__init__.py +13 -0
- dcs_core/core/utils/log.py +29 -0
- dcs_core/core/utils/tracking.py +105 -0
- dcs_core/core/utils/utils.py +44 -0
- dcs_core/core/validation/__init__.py +13 -0
- dcs_core/core/validation/base.py +230 -0
- dcs_core/core/validation/completeness_validation.py +153 -0
- dcs_core/core/validation/custom_query_validation.py +24 -0
- dcs_core/core/validation/manager.py +282 -0
- dcs_core/core/validation/numeric_validation.py +276 -0
- dcs_core/core/validation/reliability_validation.py +91 -0
- dcs_core/core/validation/uniqueness_validation.py +61 -0
- dcs_core/core/validation/validity_validation.py +738 -0
- dcs_core/integrations/__init__.py +13 -0
- dcs_core/integrations/databases/__init__.py +13 -0
- dcs_core/integrations/databases/bigquery.py +187 -0
- dcs_core/integrations/databases/databricks.py +51 -0
- dcs_core/integrations/databases/db2.py +652 -0
- dcs_core/integrations/databases/elasticsearch.py +61 -0
- dcs_core/integrations/databases/mssql.py +829 -0
- dcs_core/integrations/databases/mysql.py +409 -0
- dcs_core/integrations/databases/opensearch.py +64 -0
- dcs_core/integrations/databases/oracle.py +719 -0
- dcs_core/integrations/databases/postgres.py +482 -0
- dcs_core/integrations/databases/redshift.py +53 -0
- dcs_core/integrations/databases/snowflake.py +48 -0
- dcs_core/integrations/databases/spark_df.py +111 -0
- dcs_core/integrations/databases/sybase.py +1069 -0
- dcs_core/integrations/storage/__init__.py +13 -0
- dcs_core/integrations/storage/local_file.py +149 -0
- dcs_core/integrations/utils/__init__.py +13 -0
- dcs_core/integrations/utils/utils.py +36 -0
- dcs_core/report/__init__.py +13 -0
- dcs_core/report/dashboard.py +211 -0
- dcs_core/report/models.py +88 -0
- dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
- dcs_core/report/static/assets/images/docs.svg +6 -0
- dcs_core/report/static/assets/images/github.svg +4 -0
- dcs_core/report/static/assets/images/logo.svg +7 -0
- dcs_core/report/static/assets/images/slack.svg +13 -0
- dcs_core/report/static/index.js +2 -0
- dcs_core/report/static/index.js.LICENSE.txt +3971 -0
- dcs_sdk/__init__.py +13 -0
- dcs_sdk/__main__.py +18 -0
- dcs_sdk/__version__.py +15 -0
- dcs_sdk/cli/__init__.py +13 -0
- dcs_sdk/cli/cli.py +163 -0
- dcs_sdk/sdk/__init__.py +58 -0
- dcs_sdk/sdk/config/__init__.py +13 -0
- dcs_sdk/sdk/config/config_loader.py +491 -0
- dcs_sdk/sdk/data_diff/__init__.py +13 -0
- dcs_sdk/sdk/data_diff/data_differ.py +821 -0
- dcs_sdk/sdk/rules/__init__.py +15 -0
- dcs_sdk/sdk/rules/rules_mappping.py +31 -0
- dcs_sdk/sdk/rules/rules_repository.py +214 -0
- dcs_sdk/sdk/rules/schema_rules.py +65 -0
- dcs_sdk/sdk/utils/__init__.py +13 -0
- dcs_sdk/sdk/utils/serializer.py +25 -0
- dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
- dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
- dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
- dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
- dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
- dcs_sdk/sdk/utils/table.py +475 -0
- dcs_sdk/sdk/utils/themes.py +40 -0
- dcs_sdk/sdk/utils/utils.py +349 -0
- dcs_sdk-1.6.5.dist-info/METADATA +150 -0
- dcs_sdk-1.6.5.dist-info/RECORD +159 -0
- dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
- dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
dcs_sdk/sdk/config/config_loader.py
@@ -0,0 +1,491 @@
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import uuid
from typing import Dict, List, Literal, Optional, Union

import yaml
from dotenv import load_dotenv
from pydantic import BaseModel

from dcs_sdk.sdk.rules import RulesRepository


class InvalidUUIDError(ValueError):
    pass


class MissingRequiredFieldError(ValueError):
    pass


class InvalidConnectionTypeError(ValueError):
    pass


class InvalidSimilarityMethodError(ValueError):
    pass


class SourceTargetConnection(BaseModel):
    id: Optional[Union[str, None]] = None
    name: str
    workspace: Optional[str] = "default"
    host: Optional[str] = None
    port: Optional[Union[int, str]] = None
    driver: str
    table: Optional[str] = None
    database: Optional[str] = None
    filepath: Optional[str] = None
    catalog: Optional[str] = None
    schema_name: Optional[str] = None
    warehouse: Optional[str] = None
    role: Optional[str] = None
    account: Optional[str] = None
    username: Optional[str] = None
    password: Optional[str] = None
    http_path: Optional[str] = None
    access_token: Optional[str] = None
    odbc_driver: Optional[str] = None
    server: Optional[str] = None
    project: Optional[str] = None  # bigquery specific
    dataset: Optional[str] = None  # bigquery specific
    keyfile: Optional[str] = None  # bigquery specific
    impersonate_service_account: Optional[str] = None  # bigquery specific
    bigquery_credentials: Optional[str] = None  # bigquery specific
    transform_columns: Dict[str, str] | None = None


class SimilarityConfig(BaseModel):
    pre_processing: List[str]
    similarity_method: str
    threshold: float


class DiffAdvancedConfig(BaseModel):
    bisection_factor: int = 10
    bisection_threshold: int = 50_000
    max_threadpool_size: int = 2
    egress_limit: int = 5_00_000
    per_column_diff_limit: int = 100
    timeout_limit: int = 60 * 5  # minutes
    in_memory_diff: bool = False  # Whether to perform diff in memory (may use more RAM)


class Comparison(BaseModel):
    comparison_name: str
    job_id: Optional[int] = None
    source: SourceTargetConnection
    target: SourceTargetConnection
    source_columns: Optional[List[str]] = None
    target_columns: Optional[List[str]] = None
    primary_keys_source: List[str] = []
    primary_keys_target: List[str] = []
    source_filter: Optional[str] = None
    target_filter: Optional[str] = None
    source_query: Optional[str] = None
    target_query: Optional[str] = None
    temporary_schema_source: Optional[str] = None
    temporary_schema_target: Optional[str] = None
    similarity: Optional[SimilarityConfig] = None
    view_name_source: Optional[str] = None
    view_name_target: Optional[str] = None
    advanced_configuration: DiffAdvancedConfig
    limit: Union[int, None, str] = "10%"
    strict: bool = True  # Used for strict comparison with matching column data types
    quick_comparison: bool = False  # Used for quick overview of the comparison
    source_masking_columns: Optional[List[str]] = None
    target_masking_columns: Optional[List[str]] = None
    masking_character: str = "*"
    schema_diff: bool = False  # Used for schema diff
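
# Editor's sketch (illustration only, not part of config_loader.py): of the
# models above, only `name` and `driver` are required on SourceTargetConnection,
# and a Comparison needs a name, both connections, and an advanced_configuration;
# everything else falls back to the defaults shown. All values here are invented.
source = SourceTargetConnection(name="orders_prod", driver="postgres")
target = SourceTargetConnection(name="orders_replica", driver="postgres")
comparison = Comparison(
    comparison_name="orders_prod_vs_replica",
    source=source,
    target=target,
    advanced_configuration=DiffAdvancedConfig(),  # all tuning knobs at their defaults
)
print(comparison.limit)   # "10%" by default
print(comparison.strict)  # True by default
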

class EnvYamlLoader(yaml.SafeLoader):
    """YAML Loader with `!ENV` constructor."""

    def __init__(self, stream):
        super(EnvYamlLoader, self).__init__(stream)
        self.add_constructor("!ENV", self.env_constructor)

    @classmethod
    def env_constructor(cls, loader, node):
        value = loader.construct_scalar(node)
        env_var = value.strip("${} ")
        return os.environ.get(env_var, "")
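
# Editor's sketch (illustration only): the `!ENV` constructor above strips the
# "${...}" wrapper from the tagged scalar and reads the bare variable name from
# the environment, falling back to "" when unset. DB_PASSWORD is a hypothetical
# variable used only for this demo.
os.environ["DB_PASSWORD"] = "s3cr3t"
doc = yaml.load("password: !ENV ${DB_PASSWORD}", Loader=EnvYamlLoader)
print(doc)  # {'password': 's3cr3t'}
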

class DataDiffConfig:
    DRIVER_MAP = {
        "file": "duckdb",
        "duckdb": "duckdb",
        "postgres": "postgres",
        "postgresql": "postgres",
        "snowflake": "snowflake",
        "trino": "trino",
        "databricks": "databricks",
        "oracle": "oracle",
        "mssql": "mssql",
        "mysql": "mysql",
        "sybase": "sybase",
        "bigquery": "bigquery",
    }

    def __init__(
        self,
        yaml_file_path: Optional[str] = None,
        yaml_string: Optional[str] = None,
        config_json: Optional[dict] = None,
    ):
        load_dotenv()
        if yaml_file_path:
            self.data = self.read_yaml_file(yaml_file_path)
        elif yaml_string:
            self.data = self.read_yaml_string(yaml_string)
        elif config_json:
            self.data = config_json
        else:
            raise ValueError("No configuration provided")
        self.rules_repo = RulesRepository.get_instance()

    @staticmethod
    def read_yaml_file(file_path: str) -> dict:
        with open(file_path, "r") as file:
            return yaml.load(file, Loader=EnvYamlLoader)

    @staticmethod
    def read_yaml_string(yaml_string: str) -> dict:
        return yaml.load(yaml_string, Loader=EnvYamlLoader)

    @staticmethod
    def is_valid_uuid(val: str) -> bool:
        try:
            uuid.UUID(str(val))
            return True
        except ValueError:
            return False

    def validate_uuid(self, uuid_str: str | None, field_name: str) -> None:
        if uuid_str is not None and not self.is_valid_uuid(uuid_str):
            raise InvalidUUIDError(f"{field_name} is not a valid UUID")

    @staticmethod
    def validate_required_field(value: Union[str, None], field_name: str, source_name: str) -> None:
        if value is None:
            raise MissingRequiredFieldError(f"{field_name} is required for datasource {source_name}")

    @staticmethod
    def validate_file_connection(connection: dict) -> None:
        if connection.get("type") == "file" and connection.get("filepath") is None:
            raise MissingRequiredFieldError("file path is required for file connection")

    @staticmethod
    def validate_databricks_connection(connection: dict) -> None:
        if connection.get("type") == "databricks":
            if connection.get("connection", {}).get("http_path") is None:
                raise MissingRequiredFieldError("http_path is required for databricks connection")
            if connection.get("connection", {}).get("access_token") is None:
                raise MissingRequiredFieldError("access_token is required for databricks connection")

    @staticmethod
    def validate_host_or_server(connection: dict) -> None:
        if connection.get("type") == "sybase":
            if not connection.get("connection", {}).get("host") and not connection.get("connection", {}).get("server"):
                raise MissingRequiredFieldError("host or server is required for connection")

    @staticmethod
    def validate_comparison_by_query(
        comparison_data: dict,
        field_name: Literal["source", "target"],
        temporary_schema: str | None,
        database_type: str,
        view_name: str | None,
    ) -> None:
        if comparison_data.get(field_name, {}).get("query") is not None:
            if comparison_data.get(field_name, {}).get("table") is not None:
                raise ValueError(f"table and query cannot be used together in {field_name} connection")
            if comparison_data.get(field_name, {}).get("filter") is not None:
                raise ValueError(f"filter and query cannot be used together in {field_name} connection")
            if database_type in ["file", "oracle"]:
                return
            if temporary_schema is None:
                raise ValueError("temporary_schema is required for query based comparison")
            if view_name is not None and len(view_name.split(".")) > 1:
                raise ValueError("view_name should not contain schema name")

    @staticmethod
    def validate_similarity_threshold(threshold: float) -> float:
        if threshold is None:
            raise MissingRequiredFieldError("threshold is required for similarity")
        if not 0 <= threshold <= 1:
            raise ValueError("Similarity threshold must be between 0 and 1")
        return threshold

    def get_driver(self, connection: dict) -> str:
        connection_type = connection.get("type")
        if connection_type not in self.DRIVER_MAP:
            raise InvalidConnectionTypeError(f"Invalid connection type: {connection_type}")
        return self.DRIVER_MAP[connection_type]

    def get_similarity_method(self, similarity_method: str) -> str:
        if similarity_method is None:
            raise MissingRequiredFieldError("similarity_method is required for similarity")
        similarity_methods = ["jaccard", "cosine", "levenshtein"]
        if similarity_method not in similarity_methods:
            raise InvalidSimilarityMethodError(f"Invalid similarity method: {similarity_method}")
        return similarity_method

    def get_pre_processing_methods(self, pre_processing: List[str]) -> List[str]:
        if pre_processing is None:
            raise MissingRequiredFieldError("pre_processing is required for similarity")
        pre_processing_methods = ["lower_case", "remove_punctuation", "remove_stop_words", "remove_extra_whitespaces"]
        for method in pre_processing:
            if method not in pre_processing_methods:
                raise ValueError(f"Invalid pre_processing method: {method}")
        return pre_processing
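
    # Editor's sketch (illustration only): taken together, the validators above
    # whitelist three similarity methods and four pre-processing steps, and
    # require the threshold to sit in [0, 1]. The config dict passed here is a
    # hypothetical stand-in.
    #
    #   cfg = DataDiffConfig(config_json={"data_sources": [], "comparisons": {}})
    #   cfg.get_similarity_method("cosine")             # returns "cosine"
    #   cfg.get_pre_processing_methods(["lower_case"])  # returns ["lower_case"]
    #   cfg.validate_similarity_threshold(0.8)          # returns 0.8
    #   cfg.get_similarity_method("soundex")            # raises InvalidSimilarityMethodError
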
    def create_connection_config(
        self,
        connection: dict,
        comparison_data: dict,
        is_source: bool,
        temporary_schema: str | None,
        view_name: str | None,
        transform_columns: Dict[str, str] | None = None,
    ) -> dict:
        self.validate_uuid(connection.get("id", None), "Datasource id")
        self.validate_required_field(connection.get("name"), "connection name", source_name=connection.get("name"))
        self.validate_required_field(connection.get("type"), "connection type", source_name=connection.get("name"))
        self.validate_file_connection(connection)
        self.validate_databricks_connection(connection)
        self.validate_host_or_server(connection)
        self.validate_comparison_by_query(
            comparison_data,
            "source" if is_source else "target",
            temporary_schema,
            connection.get("type"),
            view_name,
        )

        driver = self.get_driver(connection)

        return {
            "id": connection.get("id", None),
            "name": connection.get("name"),
            "workspace": connection.get("workspace", "default"),
            "host": connection.get("connection", {}).get("host", ""),
            "port": connection.get("connection", {}).get("port", None),
            "account": connection.get("connection", {}).get("account"),
            "warehouse": connection.get("connection", {}).get("warehouse"),
            "role": connection.get("connection", {}).get("role"),
            "driver": driver,
            "table": comparison_data.get("source" if is_source else "target", {}).get("table"),
            "database": connection.get("connection", {}).get("database"),
            "catalog": connection.get("connection", {}).get("catalog"),
            "schema_name": connection.get("connection", {}).get("schema"),
            "username": connection.get("connection", {}).get("username"),
            "password": connection.get("connection", {}).get("password"),
            "http_path": connection.get("connection", {}).get("http_path"),
            "access_token": connection.get("connection", {}).get("access_token"),
            "filepath": connection.get("filepath"),
            "odbc_driver": connection.get("connection", {}).get("odbc_driver"),
            "server": connection.get("connection", {}).get("server"),
            "project": connection.get("connection", {}).get("project"),
            "dataset": connection.get("connection", {}).get("dataset"),
            "keyfile": connection.get("connection", {}).get("keyfile"),
            "impersonate_service_account": connection.get("connection", {}).get("impersonate_service_account"),
            "bigquery_credentials": connection.get("connection", {}).get("bigquery_credentials"),
            "transform_columns": transform_columns,
        }

    def get_data_diff_configs(self) -> List[Comparison]:
        data_sources = {
            ds["name"]: {
                "name": ds.get("name"),
                "id": ds.get("id", None),
                "type": ds.get("type"),
                "workspace": ds.get("workspace", "default"),
                "connection": ds.get("connection", {}),
                "filepath": ds.get("file_path"),
                "temporary_schema": ds.get("temporary_schema"),
                "view_name": ds.get("view_name"),
            }
            for ds in self.data["data_sources"]
        }

        rules = self.data.get("rules", []) or []

        for rule in rules:
            rule_id = rule.get("id")
            if rule_id:
                self.rules_repo.register(rule_id, rule)

        new_structure = []

        for comparison_name, comparison_data in self.data["comparisons"].items():
            source_connection = data_sources[comparison_data["source"]["data_source"]]
            target_connection = data_sources[comparison_data["target"]["data_source"]]

            source_masking_cols = comparison_data.get("source", {}).get("masking_columns")
            target_masking_cols = comparison_data.get("target", {}).get("masking_columns")

            masking_character = comparison_data.get("masking_configuration", {}).get("mask_character", "*") or "*"

            schema_overrides = comparison_data.get("schema_overrides", []) or []
            self.rules_repo.register_schema_rules(schema_rules=schema_overrides)

            transform_columns = comparison_data.get("transform_columns", {}) or {}
            self.rules_repo.register_value_rules(value_rules=transform_columns)

            source_transform_columns = {}
            target_transform_columns = {}

            source_transform_configs = transform_columns.get("source", []) or []
            if source_transform_configs:
                for source_transform_config in source_transform_configs:
                    column = source_transform_config.get("name")
                    rule_id = source_transform_config.get("rule")
                    rule = self.rules_repo.get(rule_id)

                    if not rule:
                        raise ValueError(f"Rule with '{rule_id}' not found in rules repository")

                    transformation_template = rule["transformation"]
                    transformation_query = self._build_query(column, transformation_template)
                    source_transform_columns[column] = transformation_template

            target_transform_configs = transform_columns.get("target", []) or []
            if target_transform_configs:
                for target_transform_config in target_transform_configs:
                    column = target_transform_config.get("name")
                    rule_id = target_transform_config.get("rule")
                    rule = self.rules_repo.get(rule_id)

                    if not rule:
                        raise ValueError(f"Rule with '{rule_id}' not found in rules repository")

                    transformation_template = rule["transformation"]
                    transformation_query = self._build_query(column, transformation_template)
                    target_transform_columns[column] = transformation_template

            temporary_schema_source = source_connection.get("temporary_schema")
            temporary_schema_target = target_connection.get("temporary_schema")

            view_name_source = comparison_data.get("source", {}).get("view_name", None)
            view_name_target = comparison_data.get("target", {}).get("view_name", None)

            source_to_target = {
                item["source_column"]: item["target_column"] for item in comparison_data.get("columns_mappings", {})
            }

            source_columns = comparison_data.get("columns", [])
            limit = comparison_data.get("limit", None)
            strict = comparison_data.get("strict", True)
            quick_comparison = comparison_data.get("quick_comparison", False)
            target_columns = [source_to_target.get(col, col) for col in source_columns]
            schema_diff = comparison_data.get("schema_diff", False)
            if quick_comparison and schema_diff:
                raise ValueError("quick_comparison and schema_diff cannot be used together")
            assert len(source_columns) == len(
                target_columns
            ), "source_columns and target_columns must have the same length"
            if not schema_diff and not (source_columns or target_columns):
                raise MissingRequiredFieldError("source_columns and target_columns are required for comparison")

            primary_keys_source = comparison_data.get("key_columns", [])
            if not primary_keys_source and not schema_diff:
                raise MissingRequiredFieldError("key_columns are required for comparison")
            primary_keys_target = [source_to_target.get(pk, pk) for pk in primary_keys_source]

            similarity_data = comparison_data.get("similarity")
            similarity = (
                SimilarityConfig(
                    pre_processing=self.get_pre_processing_methods(similarity_data.get("pre_processing", None)),
                    similarity_method=self.get_similarity_method(similarity_data.get("similarity_method", None)),
                    threshold=self.validate_similarity_threshold(similarity_data.get("threshold", None)),
                )
                if similarity_data
                else None
            )
            advanced_diff_config = comparison_data.get("advanced_configuration", {})
            advanced_configuration = DiffAdvancedConfig(
                bisection_factor=advanced_diff_config.get("bisection_factor", 10),
                bisection_threshold=advanced_diff_config.get("bisection_threshold", 50_000),
                max_threadpool_size=advanced_diff_config.get("max_threadpool_size", 2),
                egress_limit=advanced_diff_config.get("egress_limit", 5_00_000),
                per_column_diff_limit=advanced_diff_config.get("per_column_diff_limit", 100),
                timeout_limit=advanced_diff_config.get("timeout_limit", 60 * 5),
                in_memory_diff=advanced_diff_config.get("in_memory_diff", False),
            )
            new_comparison = {
                "comparison_name": comparison_name,
                "job_id": comparison_data.get("job_id", None),
                "source": self.create_connection_config(
                    source_connection,
                    comparison_data,
                    True,
                    temporary_schema_source,
                    view_name_source,
                    transform_columns=source_transform_columns,
                ),
                "target": self.create_connection_config(
                    target_connection,
                    comparison_data,
                    False,
                    temporary_schema_target,
                    view_name_target,
                    transform_columns=target_transform_columns,
                ),
                "source_columns": source_columns,
                "target_columns": target_columns,
                "primary_keys_source": primary_keys_source,
                "primary_keys_target": primary_keys_target,
                "source_filter": comparison_data.get("source", {}).get("filter", None),
                "target_filter": comparison_data.get("target", {}).get("filter", None),
                "source_query": comparison_data.get("source", {}).get("query", None),
                "target_query": comparison_data.get("target", {}).get("query", None),
                "temporary_schema_source": temporary_schema_source,
                "temporary_schema_target": temporary_schema_target,
                "similarity": similarity,
                "view_name_source": view_name_source,
                "view_name_target": view_name_target,
                "advanced_configuration": advanced_configuration,
                "limit": limit,
                "strict": strict,
                "quick_comparison": quick_comparison,
                "source_masking_columns": source_masking_cols,
                "target_masking_columns": target_masking_cols,
                "masking_character": masking_character,
                "schema_diff": schema_diff,
            }
            new_structure.append(Comparison(**new_comparison))

        return new_structure

    def _build_query(self, column, transformation_template):
        transformation_query = transformation_template.format(column=column)
        return transformation_query


def data_diff_config_loader(
    config_path: Optional[str] = None,
    config_yaml: Optional[str] = None,
    config_json: Optional[dict] = None,
) -> List[Comparison]:
    config = DataDiffConfig(
        yaml_file_path=config_path,
        yaml_string=config_yaml,
        config_json=config_json,
    )
    return config.get_data_diff_configs()
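
Editor's sketch of end-to-end usage (every datasource name, host, credential, and table below is invented for illustration). The YAML layout mirrors what `get_data_diff_configs` reads: a `data_sources` list plus a `comparisons` mapping whose entries reference those sources by name and must supply `key_columns` and `columns` unless `schema_diff` is set; query-based comparisons additionally require a `temporary_schema` on the datasource, per `validate_comparison_by_query`.

import os

from dcs_sdk.sdk.config.config_loader import data_diff_config_loader

os.environ.setdefault("PG_PASSWORD", "example-password")  # hypothetical secret

config_yaml = """
data_sources:
  - name: pg_prod
    type: postgres
    connection:
      host: prod.db.internal
      port: 5432
      username: app
      password: !ENV ${PG_PASSWORD}
      database: shop
      schema: public
  - name: pg_replica
    type: postgres
    connection:
      host: replica.db.internal
      port: 5432
      username: app
      password: !ENV ${PG_PASSWORD}
      database: shop
      schema: public

comparisons:
  orders_prod_vs_replica:
    source:
      data_source: pg_prod
      table: orders
    target:
      data_source: pg_replica
      table: orders
    key_columns: [order_id]
    columns: [status, amount]
"""

comparisons = data_diff_config_loader(config_yaml=config_yaml)
print(comparisons[0].source.driver)        # "postgres"
print(comparisons[0].primary_keys_target)  # ["order_id"]
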
@@ -0,0 +1,13 @@
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.