dcs-sdk 1.6.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_diff/__init__.py +221 -0
- data_diff/__main__.py +517 -0
- data_diff/abcs/__init__.py +13 -0
- data_diff/abcs/compiler.py +27 -0
- data_diff/abcs/database_types.py +402 -0
- data_diff/config.py +141 -0
- data_diff/databases/__init__.py +38 -0
- data_diff/databases/_connect.py +323 -0
- data_diff/databases/base.py +1417 -0
- data_diff/databases/bigquery.py +376 -0
- data_diff/databases/clickhouse.py +217 -0
- data_diff/databases/databricks.py +262 -0
- data_diff/databases/duckdb.py +207 -0
- data_diff/databases/mssql.py +343 -0
- data_diff/databases/mysql.py +189 -0
- data_diff/databases/oracle.py +238 -0
- data_diff/databases/postgresql.py +293 -0
- data_diff/databases/presto.py +222 -0
- data_diff/databases/redis.py +93 -0
- data_diff/databases/redshift.py +233 -0
- data_diff/databases/snowflake.py +222 -0
- data_diff/databases/sybase.py +720 -0
- data_diff/databases/trino.py +73 -0
- data_diff/databases/vertica.py +174 -0
- data_diff/diff_tables.py +489 -0
- data_diff/errors.py +17 -0
- data_diff/format.py +369 -0
- data_diff/hashdiff_tables.py +1026 -0
- data_diff/info_tree.py +76 -0
- data_diff/joindiff_tables.py +434 -0
- data_diff/lexicographic_space.py +253 -0
- data_diff/parse_time.py +88 -0
- data_diff/py.typed +0 -0
- data_diff/queries/__init__.py +13 -0
- data_diff/queries/api.py +213 -0
- data_diff/queries/ast_classes.py +811 -0
- data_diff/queries/base.py +38 -0
- data_diff/queries/extras.py +43 -0
- data_diff/query_utils.py +70 -0
- data_diff/schema.py +67 -0
- data_diff/table_segment.py +583 -0
- data_diff/thread_utils.py +112 -0
- data_diff/utils.py +1022 -0
- data_diff/version.py +15 -0
- dcs_core/__init__.py +13 -0
- dcs_core/__main__.py +17 -0
- dcs_core/__version__.py +15 -0
- dcs_core/cli/__init__.py +13 -0
- dcs_core/cli/cli.py +165 -0
- dcs_core/core/__init__.py +19 -0
- dcs_core/core/common/__init__.py +13 -0
- dcs_core/core/common/errors.py +50 -0
- dcs_core/core/common/models/__init__.py +13 -0
- dcs_core/core/common/models/configuration.py +284 -0
- dcs_core/core/common/models/dashboard.py +24 -0
- dcs_core/core/common/models/data_source_resource.py +75 -0
- dcs_core/core/common/models/metric.py +160 -0
- dcs_core/core/common/models/profile.py +75 -0
- dcs_core/core/common/models/validation.py +216 -0
- dcs_core/core/common/models/widget.py +44 -0
- dcs_core/core/configuration/__init__.py +13 -0
- dcs_core/core/configuration/config_loader.py +139 -0
- dcs_core/core/configuration/configuration_parser.py +262 -0
- dcs_core/core/configuration/configuration_parser_arc.py +328 -0
- dcs_core/core/datasource/__init__.py +13 -0
- dcs_core/core/datasource/base.py +62 -0
- dcs_core/core/datasource/manager.py +112 -0
- dcs_core/core/datasource/search_datasource.py +421 -0
- dcs_core/core/datasource/sql_datasource.py +1094 -0
- dcs_core/core/inspect.py +163 -0
- dcs_core/core/logger/__init__.py +13 -0
- dcs_core/core/logger/base.py +32 -0
- dcs_core/core/logger/default_logger.py +94 -0
- dcs_core/core/metric/__init__.py +13 -0
- dcs_core/core/metric/base.py +220 -0
- dcs_core/core/metric/combined_metric.py +98 -0
- dcs_core/core/metric/custom_metric.py +34 -0
- dcs_core/core/metric/manager.py +137 -0
- dcs_core/core/metric/numeric_metric.py +403 -0
- dcs_core/core/metric/reliability_metric.py +90 -0
- dcs_core/core/profiling/__init__.py +13 -0
- dcs_core/core/profiling/datasource_profiling.py +136 -0
- dcs_core/core/profiling/numeric_field_profiling.py +72 -0
- dcs_core/core/profiling/text_field_profiling.py +67 -0
- dcs_core/core/repository/__init__.py +13 -0
- dcs_core/core/repository/metric_repository.py +77 -0
- dcs_core/core/utils/__init__.py +13 -0
- dcs_core/core/utils/log.py +29 -0
- dcs_core/core/utils/tracking.py +105 -0
- dcs_core/core/utils/utils.py +44 -0
- dcs_core/core/validation/__init__.py +13 -0
- dcs_core/core/validation/base.py +230 -0
- dcs_core/core/validation/completeness_validation.py +153 -0
- dcs_core/core/validation/custom_query_validation.py +24 -0
- dcs_core/core/validation/manager.py +282 -0
- dcs_core/core/validation/numeric_validation.py +276 -0
- dcs_core/core/validation/reliability_validation.py +91 -0
- dcs_core/core/validation/uniqueness_validation.py +61 -0
- dcs_core/core/validation/validity_validation.py +738 -0
- dcs_core/integrations/__init__.py +13 -0
- dcs_core/integrations/databases/__init__.py +13 -0
- dcs_core/integrations/databases/bigquery.py +187 -0
- dcs_core/integrations/databases/databricks.py +51 -0
- dcs_core/integrations/databases/db2.py +652 -0
- dcs_core/integrations/databases/elasticsearch.py +61 -0
- dcs_core/integrations/databases/mssql.py +829 -0
- dcs_core/integrations/databases/mysql.py +409 -0
- dcs_core/integrations/databases/opensearch.py +64 -0
- dcs_core/integrations/databases/oracle.py +719 -0
- dcs_core/integrations/databases/postgres.py +482 -0
- dcs_core/integrations/databases/redshift.py +53 -0
- dcs_core/integrations/databases/snowflake.py +48 -0
- dcs_core/integrations/databases/spark_df.py +111 -0
- dcs_core/integrations/databases/sybase.py +1069 -0
- dcs_core/integrations/storage/__init__.py +13 -0
- dcs_core/integrations/storage/local_file.py +149 -0
- dcs_core/integrations/utils/__init__.py +13 -0
- dcs_core/integrations/utils/utils.py +36 -0
- dcs_core/report/__init__.py +13 -0
- dcs_core/report/dashboard.py +211 -0
- dcs_core/report/models.py +88 -0
- dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
- dcs_core/report/static/assets/images/docs.svg +6 -0
- dcs_core/report/static/assets/images/github.svg +4 -0
- dcs_core/report/static/assets/images/logo.svg +7 -0
- dcs_core/report/static/assets/images/slack.svg +13 -0
- dcs_core/report/static/index.js +2 -0
- dcs_core/report/static/index.js.LICENSE.txt +3971 -0
- dcs_sdk/__init__.py +13 -0
- dcs_sdk/__main__.py +18 -0
- dcs_sdk/__version__.py +15 -0
- dcs_sdk/cli/__init__.py +13 -0
- dcs_sdk/cli/cli.py +163 -0
- dcs_sdk/sdk/__init__.py +58 -0
- dcs_sdk/sdk/config/__init__.py +13 -0
- dcs_sdk/sdk/config/config_loader.py +491 -0
- dcs_sdk/sdk/data_diff/__init__.py +13 -0
- dcs_sdk/sdk/data_diff/data_differ.py +821 -0
- dcs_sdk/sdk/rules/__init__.py +15 -0
- dcs_sdk/sdk/rules/rules_mappping.py +31 -0
- dcs_sdk/sdk/rules/rules_repository.py +214 -0
- dcs_sdk/sdk/rules/schema_rules.py +65 -0
- dcs_sdk/sdk/utils/__init__.py +13 -0
- dcs_sdk/sdk/utils/serializer.py +25 -0
- dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
- dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
- dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
- dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
- dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
- dcs_sdk/sdk/utils/table.py +475 -0
- dcs_sdk/sdk/utils/themes.py +40 -0
- dcs_sdk/sdk/utils/utils.py +349 -0
- dcs_sdk-1.6.5.dist-info/METADATA +150 -0
- dcs_sdk-1.6.5.dist-info/RECORD +159 -0
- dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
- dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from .rules_repository import RulesRepository
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from dcs_sdk.sdk.rules.schema_rules import (
|
|
16
|
+
allow_equivalent_data_types,
|
|
17
|
+
ignore_column_length_difference,
|
|
18
|
+
ignore_datetime_precision_difference,
|
|
19
|
+
ignore_numeric_precision_difference,
|
|
20
|
+
ignore_numeric_scale_difference,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def get_rules_to_func_mapping():
    """Return the lookup table from schema-rule name to the function implementing it.

    A new dict is built on every call, so a caller may mutate the result
    without affecting anyone else.
    """
    named_rules = (
        ("ignore_column_length_difference", ignore_column_length_difference),
        ("allow_equivalent_data_types", allow_equivalent_data_types),
        ("ignore_numeric_precision_difference", ignore_numeric_precision_difference),
        ("ignore_numeric_scale_difference", ignore_numeric_scale_difference),
        ("ignore_datetime_precision_difference", ignore_datetime_precision_difference),
    )
    return dict(named_rules)
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import copy
|
|
16
|
+
from collections import defaultdict
|
|
17
|
+
from typing import Optional
|
|
18
|
+
from uuid import UUID
|
|
19
|
+
|
|
20
|
+
from loguru import logger
|
|
21
|
+
|
|
22
|
+
from dcs_sdk.sdk.rules.rules_mappping import get_rules_to_func_mapping
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def map_str_type_to_generic_type(c_type):
    """Collapse a database column type name into a coarse generic family.

    Matching is case-insensitive and prefix based, so parameterised names such
    as ``varchar(20)`` or ``decimal(10,2)`` are recognised.

    Args:
        c_type: Type name as reported by the database. A falsy value
            (``None``, ``""``) is converted with ``str()`` and therefore
            comes back as ``"None"`` / ``""``.

    Returns:
        One of ``"numeric"``, ``"string"``, ``"uuid"``, ``"datetime"``,
        ``"json"``, ``"array"``, ``"binary"``, ``"bytea"``, ``"boolean"``,
        or the lower-cased input itself when no family matches.
    """
    c_type = c_type.lower() if c_type else str(c_type)

    # These names merely share a prefix with a family below ("interval" with
    # "int", "struct" with "str") but belong to neither, so they must be
    # ruled out before the prefix table is consulted.
    if c_type.startswith(("interval", "struct")):
        return c_type

    # Order matters only for readability here: no prefix of one family is a
    # prefix of a name in another once the exclusions above are applied.
    families = (
        (
            "numeric",
            (
                "int",
                "integer",
                "bigint",
                "smallint",
                "tinyint",
                "float",
                "double",
                "real",
                "numeric",
                "decimal",
                "money",
                "smallmoney",
                "number",
            ),
        ),
        (
            "string",
            (
                "string",
                "varchar",
                "char",
                "text",
                "str",
                "character varying",
                "character",
                "nvar",
                "nchar",
            ),
        ),
        ("uuid", ("uuid", "uniqueidentifier", "guid")),
        (
            "datetime",
            (
                "date",
                "time",
                "timestamp",
                "datetime",
                "timestamp with time zone",
                "timestamp without time zone",
                "smalldatetime",
            ),
        ),
        ("json", ("json", "jsonb", "dict", "map")),
        ("array", ("array", "list")),
        ("binary", ("binary", "blob")),
        ("bytea", ("bytea",)),
        ("boolean", ("boolean", "bool")),
    )

    for generic_type, prefixes in families:
        # str.startswith accepts a tuple and tests every prefix in one call.
        if c_type.startswith(prefixes):
            return generic_type

    return c_type
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
# CENTRALIZED REPO FOR ALL THE RULES
|
|
92
|
+
# RULE NAME -> ALL ITS PROPERTIES
|
|
93
|
+
class RulesRepository:
|
|
94
|
+
_INSTANCE = None
|
|
95
|
+
|
|
96
|
+
@classmethod
|
|
97
|
+
def get_instance(cls):
|
|
98
|
+
if cls._INSTANCE is None:
|
|
99
|
+
cls._INSTANCE = cls()
|
|
100
|
+
return cls._INSTANCE
|
|
101
|
+
|
|
102
|
+
def __init__(self):
|
|
103
|
+
self.rules = {}
|
|
104
|
+
self.rules_mapping = get_rules_to_func_mapping()
|
|
105
|
+
self.value_rules = []
|
|
106
|
+
self.schema_rules = {}
|
|
107
|
+
|
|
108
|
+
def register(self, id: UUID, rule_dict: dict):
|
|
109
|
+
"""
|
|
110
|
+
Registers a rule in the centralized repository.
|
|
111
|
+
Supports both 'schema' and 'value' rules.
|
|
112
|
+
|
|
113
|
+
- For value rules: transformation is a string template
|
|
114
|
+
- For schema rules: transformation is a function name (to be resolved)
|
|
115
|
+
"""
|
|
116
|
+
rule_type = rule_dict.get("type")
|
|
117
|
+
|
|
118
|
+
if not rule_type:
|
|
119
|
+
return
|
|
120
|
+
|
|
121
|
+
if rule_type == "schema_override":
|
|
122
|
+
func_name = rule_dict.get("transformation")
|
|
123
|
+
if func_name:
|
|
124
|
+
func = self.rules_mapping.get(func_name)
|
|
125
|
+
if not func:
|
|
126
|
+
raise ValueError(f"Function '{func_name}' not found in registry")
|
|
127
|
+
rule_dict["function"] = func
|
|
128
|
+
rule_dict["function_name"] = func_name
|
|
129
|
+
|
|
130
|
+
self.rules[id] = rule_dict
|
|
131
|
+
|
|
132
|
+
def register_schema_rules(self, schema_rules: list):
|
|
133
|
+
self.schema_rules = schema_rules
|
|
134
|
+
|
|
135
|
+
def register_value_rules(self, value_rules: list):
|
|
136
|
+
self.value_rules = value_rules
|
|
137
|
+
|
|
138
|
+
def get(self, id: UUID) -> Optional[dict]:
|
|
139
|
+
return self.rules.get(id)
|
|
140
|
+
|
|
141
|
+
def apply_schema_rules(self, src_col: dict, tgt_col: dict) -> tuple[bool, str | None]:
|
|
142
|
+
"""
|
|
143
|
+
Performs baseline schema checks and overrides them if corresponding rules are configured.
|
|
144
|
+
Returns (True, None) if all checks pass.
|
|
145
|
+
Returns (False, reason) if any check fails.
|
|
146
|
+
"""
|
|
147
|
+
|
|
148
|
+
schema_rule_function_mapping = defaultdict(list)
|
|
149
|
+
|
|
150
|
+
for schema_rule in self.schema_rules:
|
|
151
|
+
rule_obj = self.get(schema_rule)
|
|
152
|
+
if rule_obj:
|
|
153
|
+
func_name = rule_obj.get("function_name")
|
|
154
|
+
params = rule_obj.get("params")
|
|
155
|
+
function = rule_obj.get("function")
|
|
156
|
+
|
|
157
|
+
schema_rule_function_mapping[func_name].append({"params": params, "func": function})
|
|
158
|
+
|
|
159
|
+
# def is_rule_allowed(rule_name: str, fallback_check: bool) -> bool:
|
|
160
|
+
# if rule_name not in schema_rule_function_mapping:
|
|
161
|
+
# return fallback_check
|
|
162
|
+
# rule_obj = schema_rule_function_mapping[rule_name]
|
|
163
|
+
# func = rule_obj.get("func")
|
|
164
|
+
# params = rule_obj.get("params")
|
|
165
|
+
# return func(src_col, tgt_col, params) if func else fallback_check
|
|
166
|
+
|
|
167
|
+
def is_rule_allowed(rule_name: str, fallback_check: bool) -> bool:
|
|
168
|
+
if rule_name not in schema_rule_function_mapping:
|
|
169
|
+
return fallback_check
|
|
170
|
+
|
|
171
|
+
rule_objs = schema_rule_function_mapping[rule_name]
|
|
172
|
+
if not rule_objs:
|
|
173
|
+
return fallback_check
|
|
174
|
+
|
|
175
|
+
for rule_obj in rule_objs:
|
|
176
|
+
func = rule_obj.get("func")
|
|
177
|
+
params = rule_obj.get("params")
|
|
178
|
+
|
|
179
|
+
if func:
|
|
180
|
+
result = func(src_col, tgt_col, params)
|
|
181
|
+
if result:
|
|
182
|
+
return True
|
|
183
|
+
|
|
184
|
+
return fallback_check
|
|
185
|
+
|
|
186
|
+
source_generic_data_type = map_str_type_to_generic_type(src_col["data_type"].lower())
|
|
187
|
+
tgt_generic_data_type = map_str_type_to_generic_type(tgt_col["data_type"].lower())
|
|
188
|
+
|
|
189
|
+
if src_col["data_type"].lower() != tgt_col["data_type"].lower():
|
|
190
|
+
if not is_rule_allowed("allow_equivalent_data_types", False):
|
|
191
|
+
return False, f"Data type mismatch"
|
|
192
|
+
|
|
193
|
+
return True, None
|
|
194
|
+
|
|
195
|
+
if source_generic_data_type == "string" and tgt_generic_data_type == "string":
|
|
196
|
+
if src_col.get("character_maximum_length") != tgt_col.get("character_maximum_length"):
|
|
197
|
+
if not is_rule_allowed("ignore_column_length_difference", False):
|
|
198
|
+
return False, f"Length mismatch"
|
|
199
|
+
|
|
200
|
+
if source_generic_data_type == "numeric" and tgt_generic_data_type == "numeric":
|
|
201
|
+
if src_col.get("numeric_precision") != tgt_col.get("numeric_precision"):
|
|
202
|
+
if not is_rule_allowed("ignore_numeric_precision_difference", False):
|
|
203
|
+
return False, f"Numeric precision mismatch"
|
|
204
|
+
|
|
205
|
+
if src_col.get("numeric_scale") != tgt_col.get("numeric_scale"):
|
|
206
|
+
if not is_rule_allowed("ignore_numeric_scale_difference", False):
|
|
207
|
+
return False, f"Numeric scale mismatch"
|
|
208
|
+
|
|
209
|
+
if source_generic_data_type == "datetime" and tgt_generic_data_type == "datetime":
|
|
210
|
+
if src_col.get("datetime_precision") != tgt_col.get("datetime_precision"):
|
|
211
|
+
if not is_rule_allowed("ignore_datetime_precision_difference", False):
|
|
212
|
+
return False, f"Datetime precision mismatch"
|
|
213
|
+
|
|
214
|
+
return True, None
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from typing import Dict, Optional
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def ignore_column_length_difference(
    src_col: Dict,
    tgt_col: Dict,
    params: Optional[Dict] = None,
) -> bool:
    """Tell whether a character-length difference between two columns is tolerated.

    Without params the difference is always ignored. Parameterised tolerances
    (for example a maximum allowed length difference) are not implemented yet,
    so any non-empty ``params`` yields ``False``.
    """
    return not params
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def allow_equivalent_data_types(src_col: Dict, tgt_col: Dict, params: Optional[Dict] = None) -> bool:
    """Tell whether two differing column types are declared interchangeable.

    Args:
        src_col: Source column description; ``src_col["data_type"]`` is read.
        tgt_col: Target column description; ``tgt_col["data_type"]`` is read.
        params: Optional rule parameters. ``params["equivalent_groups"]`` is
            an iterable of groups of type names; names are compared
            case-insensitively.

    Returns:
        ``True`` when both type names appear together in at least one group,
        ``False`` otherwise (including when no groups are configured).
    """
    # `or ""` lets a NULL type name compare as empty instead of raising.
    src_type = str(src_col["data_type"] or "").lower()
    tgt_type = str(tgt_col["data_type"] or "").lower()

    if not params:
        return False

    # A present-but-null "equivalent_groups" is treated like an empty list.
    for group in params.get("equivalent_groups") or ():
        members = {name.lower() for name in group}
        if src_type in members and tgt_type in members:
            return True

    return False
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def ignore_numeric_precision_difference(src_col: Dict, tgt_col: Dict, params: Optional[Dict] = None) -> bool:
    """Tell whether a numeric-precision difference between two columns is tolerated.

    The difference is ignored only when no params are supplied; a non-empty
    ``params`` currently always results in ``False``.
    """
    return False if params else True
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def ignore_numeric_scale_difference(src_col: Dict, tgt_col: Dict, params: Optional[Dict] = None) -> bool:
    """Tell whether a numeric-scale difference between two columns is tolerated.

    The difference is ignored only when no params are supplied; a non-empty
    ``params`` currently always results in ``False``.
    """
    return False if params else True
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def ignore_datetime_precision_difference(src_col: Dict, tgt_col: Dict, params: Optional[Dict] = None) -> bool:
    """Tell whether a datetime-precision difference between two columns is tolerated.

    The difference is ignored only when no params are supplied; a non-empty
    ``params`` currently always results in ``False``.
    """
    return False if params else True
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def serialize_table_schema(info) -> dict:
    """Copy the schema attributes of a column-info object into a plain dict.

    Each listed attribute is read from ``info`` and stored under a key of the
    same name, in the order given below.
    """
    exported_attributes = (
        "column_name",
        "data_type",
        "datetime_precision",
        "numeric_precision",
        "numeric_scale",
        "collation_name",
        "character_maximum_length",
    )
    return {attribute: getattr(info, attribute) for attribute in exported_attributes}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import os
|
|
16
|
+
import string
|
|
17
|
+
from abc import ABC, abstractmethod
|
|
18
|
+
from collections import defaultdict
|
|
19
|
+
from typing import Any, Dict, List
|
|
20
|
+
|
|
21
|
+
import nltk
|
|
22
|
+
from dotenv import load_dotenv
|
|
23
|
+
from nltk.corpus import stopwords
|
|
24
|
+
from nltk.tokenize import word_tokenize
|
|
25
|
+
|
|
26
|
+
from dcs_sdk.sdk.config.config_loader import SimilarityConfig
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def ensure_nltk_data():
    """Make sure the NLTK resources used for text similarity are on disk.

    The data directory comes from the ``NLTK_DATA_DIR`` environment variable
    (after loading a ``.env`` file); when unset, an ``nltk_data`` folder next
    to this module is used and a notice is printed. Each of ``punkt``,
    ``stopwords`` and ``punkt_tab`` is downloaded only if its folder is
    missing, and the directory is finally appended to ``nltk.data.path``.
    """
    load_dotenv()
    nltk_data_dir = os.getenv("NLTK_DATA_DIR")

    if not nltk_data_dir:
        module_dir = os.path.dirname(os.path.abspath(__file__))
        nltk_data_dir = os.path.join(module_dir, "nltk_data")
        print(f"NLTK_DATA_DIR ENV variable not set. Using default path: {nltk_data_dir}")

    # (resource id, sub-folder of the data dir in which it is expected)
    required_resources = (
        ("punkt", "tokenizers"),
        ("stopwords", "corpora"),
        ("punkt_tab", "tokenizers"),
    )
    for resource, category in required_resources:
        if os.path.exists(os.path.join(nltk_data_dir, category, resource)):
            continue
        nltk.download(
            resource,
            download_dir=nltk_data_dir,
            halt_on_error=True,
            raise_on_error=True,
        )

    nltk.data.path.append(nltk_data_dir)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class SimilarityScoreProvider(ABC):
    """Base class for token-based text similarity scoring.

    Subclasses supply :meth:`fuzzy_match`; this class provides the shared
    text preprocessing and the routine that annotates paired records with
    scores and applies masking.
    """

    def preprocess_text(self, text: str, methods: list[str]) -> set:
        """Applies preprocessing steps dynamically before tokenization.

        Args:
            text: Raw field value. ``None`` is treated as an empty string and
                any other non-string value is converted with ``str()``.
            methods: Names of the steps to apply; recognised names are
                ``lower_case``, ``remove_punctuation``, ``remove_stop_words``
                and ``remove_extra_whitespaces``. ``None`` means no steps.

        Returns:
            The set of tokens produced by ``word_tokenize`` on the processed text.
        """
        if text is None:
            text = ""
        elif not isinstance(text, str):
            text = str(text)
        methods = methods or ()

        if "lower_case" in methods:
            text = text.lower()
        if "remove_punctuation" in methods:
            text = text.translate(str.maketrans("", "", string.punctuation))
        if "remove_stop_words" in methods:
            stop_words = set(stopwords.words("english"))
            text = " ".join(word for word in text.split() if word not in stop_words)
        if "remove_extra_whitespaces" in methods:
            text = " ".join(text.split())

        return set(word_tokenize(text))

    @abstractmethod
    def fuzzy_match(self, str1: set, str2: set) -> float:
        """Computes a similarity score between two sets of tokens."""
        pass

    def add_text_similarity(
        self,
        key: List[str],
        data: List[Dict[str, Any]],
        fields: List[str],
        similarity: "SimilarityConfig",
        source_masking_cols: List[str],
        target_masking_cols: List[str],
        mask_char: str,
    ) -> List[Dict[str, Any]]:
        """Adds text similarity scores for the given fields inside the meta dictionary
        and determines if they are a match based on the threshold.

        Records are grouped by the values of the ``key`` columns. Only groups
        of exactly two records are processed; the first is taken as the
        source and the second as the target. Records are modified in place.

        Args:
            key (List[str]): List of primary key column names to form a composite key.
            data (List[Dict[str, Any]]): List of records to process.
            fields (List[str]): List of fields to compute similarity scores for.
            similarity (SimilarityConfig): Configuration object with pre_processing and threshold.
            source_masking_cols (List[str]): List of cols from the source to be masked
            target_masking_cols (List[str]): List of cols from the target to be masked
            mask_char (str): Character to be used for masking

        Returns:
            List[Dict[str, Any]]: The same ``data`` list, with similarity scores added.
        """
        grouped_data = defaultdict(list)
        for item in data:
            # A tuple of stringified parts cannot collide the way a
            # "_"-joined string can (("a_b", "c") vs ("a", "b_c")).
            composite_key = tuple(str(item[k]) for k in key)
            grouped_data[composite_key].append(item)

        for items in grouped_data.values():
            if len(items) != 2:
                continue
            source, target = items

            source.setdefault("meta", {}).setdefault("scores", {})
            target.setdefault("meta", {}).setdefault("scores", {})

            for field in fields:
                source_text = self.preprocess_text(source.get(field, ""), similarity.pre_processing)
                target_text = self.preprocess_text(target.get(field, ""), similarity.pre_processing)
                similarity_score = self.fuzzy_match(source_text, target_text)

                match_status = "match" if similarity_score >= similarity.threshold else "not matched"

                score_key = str(field)
                source["meta"]["scores"][score_key] = {"score": similarity_score, "status": match_status}
                target["meta"]["scores"][score_key] = {"score": similarity_score, "status": match_status}

                source_val = str(source.get(field, ""))
                target_val = str(target.get(field, ""))

                mask_source = field in source_masking_cols
                mask_target = field in target_masking_cols

                if mask_source and mask_target:
                    # Differing values of equal length would mask to identical
                    # strings; the source mask gets one extra character,
                    # presumably to keep the difference visible. This must not
                    # be overwritten by the single-side branches below.
                    same_length_mismatch = len(source_val) == len(target_val) and match_status == "not matched"
                    extra = 1 if same_length_mismatch else 0
                    source[field] = mask_char * (len(source_val) + extra)
                    target[field] = mask_char * len(target_val)
                elif mask_source:
                    source[field] = mask_char * len(source_val)
                elif mask_target:
                    target[field] = mask_char * len(target_val)

        return data
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import math
|
|
16
|
+
from collections import Counter
|
|
17
|
+
|
|
18
|
+
from dcs_sdk.sdk.utils.similarity_score.base_provider import SimilarityScoreProvider
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class CosineSimilarityProvider(SimilarityScoreProvider):
    """Similarity provider that scores token collections by cosine similarity."""

    def fuzzy_match(self, tokens1: set, tokens2: set) -> float:
        """Return the cosine similarity of the two token collections, rounded to 4 places.

        Each collection is turned into a token-count vector. An empty input
        on either side, or a zero-length vector, gives ``0.0``.
        """
        if not (tokens1 and tokens2):
            return 0.0

        counts_a = Counter(tokens1)
        counts_b = Counter(tokens2)

        # Tokens missing from either side contribute zero to the dot product,
        # so only the shared ones need to be visited.
        shared_tokens = counts_a.keys() & counts_b.keys()
        dot_product = sum(counts_a[token] * counts_b[token] for token in shared_tokens)

        norm_a = math.sqrt(sum(count * count for count in counts_a.values()))
        norm_b = math.sqrt(sum(count * count for count in counts_b.values()))

        if norm_a == 0 or norm_b == 0:
            return 0.0

        return round(dot_product / (norm_a * norm_b), 4)
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
from dcs_sdk.sdk.utils.similarity_score.base_provider import SimilarityScoreProvider
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class JaccardSimilarityProvider(SimilarityScoreProvider):
    """Similarity provider that scores token sets by Jaccard index."""

    def fuzzy_match(self, set1: set, set2: set) -> float:
        """Return ``|set1 ∩ set2| / |set1 ∪ set2|``, or ``0.0`` when both sets are empty."""
        common = set1 & set2
        combined = set1 | set2
        if not combined:
            return 0.0
        return len(common) / len(combined)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from nltk.metrics import edit_distance
|
|
16
|
+
|
|
17
|
+
from dcs_sdk.sdk.utils.similarity_score.base_provider import SimilarityScoreProvider
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class LevenshteinDistanceProvider(SimilarityScoreProvider):
    """Similarity provider based on normalised Levenshtein (edit) distance."""

    def fuzzy_match(self, tokens1: set, tokens2: set) -> float:
        """Computes similarity score using Levenshtein distance.

        The tokens of each side are joined into one space-separated string
        and the score is ``1 - distance / max(len(str1), len(str2))``,
        rounded to 4 places. Two empty inputs score ``1.0``.

        Tokens are sorted before joining: set iteration order is arbitrary,
        so joining the raw sets could place the same tokens in different
        positions on each side (or in each run) and change the distance.
        """
        str1 = " ".join(sorted(tokens1))
        str2 = " ".join(sorted(tokens2))

        max_len = max(len(str1), len(str2))
        if max_len == 0:
            return 1.0

        distance = edit_distance(str1, str2)
        return round(1 - (distance / max_len), 4)
|