dcs-sdk 1.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159) hide show
  1. data_diff/__init__.py +221 -0
  2. data_diff/__main__.py +517 -0
  3. data_diff/abcs/__init__.py +13 -0
  4. data_diff/abcs/compiler.py +27 -0
  5. data_diff/abcs/database_types.py +402 -0
  6. data_diff/config.py +141 -0
  7. data_diff/databases/__init__.py +38 -0
  8. data_diff/databases/_connect.py +323 -0
  9. data_diff/databases/base.py +1417 -0
  10. data_diff/databases/bigquery.py +376 -0
  11. data_diff/databases/clickhouse.py +217 -0
  12. data_diff/databases/databricks.py +262 -0
  13. data_diff/databases/duckdb.py +207 -0
  14. data_diff/databases/mssql.py +343 -0
  15. data_diff/databases/mysql.py +189 -0
  16. data_diff/databases/oracle.py +238 -0
  17. data_diff/databases/postgresql.py +293 -0
  18. data_diff/databases/presto.py +222 -0
  19. data_diff/databases/redis.py +93 -0
  20. data_diff/databases/redshift.py +233 -0
  21. data_diff/databases/snowflake.py +222 -0
  22. data_diff/databases/sybase.py +720 -0
  23. data_diff/databases/trino.py +73 -0
  24. data_diff/databases/vertica.py +174 -0
  25. data_diff/diff_tables.py +489 -0
  26. data_diff/errors.py +17 -0
  27. data_diff/format.py +369 -0
  28. data_diff/hashdiff_tables.py +1026 -0
  29. data_diff/info_tree.py +76 -0
  30. data_diff/joindiff_tables.py +434 -0
  31. data_diff/lexicographic_space.py +253 -0
  32. data_diff/parse_time.py +88 -0
  33. data_diff/py.typed +0 -0
  34. data_diff/queries/__init__.py +13 -0
  35. data_diff/queries/api.py +213 -0
  36. data_diff/queries/ast_classes.py +811 -0
  37. data_diff/queries/base.py +38 -0
  38. data_diff/queries/extras.py +43 -0
  39. data_diff/query_utils.py +70 -0
  40. data_diff/schema.py +67 -0
  41. data_diff/table_segment.py +583 -0
  42. data_diff/thread_utils.py +112 -0
  43. data_diff/utils.py +1022 -0
  44. data_diff/version.py +15 -0
  45. dcs_core/__init__.py +13 -0
  46. dcs_core/__main__.py +17 -0
  47. dcs_core/__version__.py +15 -0
  48. dcs_core/cli/__init__.py +13 -0
  49. dcs_core/cli/cli.py +165 -0
  50. dcs_core/core/__init__.py +19 -0
  51. dcs_core/core/common/__init__.py +13 -0
  52. dcs_core/core/common/errors.py +50 -0
  53. dcs_core/core/common/models/__init__.py +13 -0
  54. dcs_core/core/common/models/configuration.py +284 -0
  55. dcs_core/core/common/models/dashboard.py +24 -0
  56. dcs_core/core/common/models/data_source_resource.py +75 -0
  57. dcs_core/core/common/models/metric.py +160 -0
  58. dcs_core/core/common/models/profile.py +75 -0
  59. dcs_core/core/common/models/validation.py +216 -0
  60. dcs_core/core/common/models/widget.py +44 -0
  61. dcs_core/core/configuration/__init__.py +13 -0
  62. dcs_core/core/configuration/config_loader.py +139 -0
  63. dcs_core/core/configuration/configuration_parser.py +262 -0
  64. dcs_core/core/configuration/configuration_parser_arc.py +328 -0
  65. dcs_core/core/datasource/__init__.py +13 -0
  66. dcs_core/core/datasource/base.py +62 -0
  67. dcs_core/core/datasource/manager.py +112 -0
  68. dcs_core/core/datasource/search_datasource.py +421 -0
  69. dcs_core/core/datasource/sql_datasource.py +1094 -0
  70. dcs_core/core/inspect.py +163 -0
  71. dcs_core/core/logger/__init__.py +13 -0
  72. dcs_core/core/logger/base.py +32 -0
  73. dcs_core/core/logger/default_logger.py +94 -0
  74. dcs_core/core/metric/__init__.py +13 -0
  75. dcs_core/core/metric/base.py +220 -0
  76. dcs_core/core/metric/combined_metric.py +98 -0
  77. dcs_core/core/metric/custom_metric.py +34 -0
  78. dcs_core/core/metric/manager.py +137 -0
  79. dcs_core/core/metric/numeric_metric.py +403 -0
  80. dcs_core/core/metric/reliability_metric.py +90 -0
  81. dcs_core/core/profiling/__init__.py +13 -0
  82. dcs_core/core/profiling/datasource_profiling.py +136 -0
  83. dcs_core/core/profiling/numeric_field_profiling.py +72 -0
  84. dcs_core/core/profiling/text_field_profiling.py +67 -0
  85. dcs_core/core/repository/__init__.py +13 -0
  86. dcs_core/core/repository/metric_repository.py +77 -0
  87. dcs_core/core/utils/__init__.py +13 -0
  88. dcs_core/core/utils/log.py +29 -0
  89. dcs_core/core/utils/tracking.py +105 -0
  90. dcs_core/core/utils/utils.py +44 -0
  91. dcs_core/core/validation/__init__.py +13 -0
  92. dcs_core/core/validation/base.py +230 -0
  93. dcs_core/core/validation/completeness_validation.py +153 -0
  94. dcs_core/core/validation/custom_query_validation.py +24 -0
  95. dcs_core/core/validation/manager.py +282 -0
  96. dcs_core/core/validation/numeric_validation.py +276 -0
  97. dcs_core/core/validation/reliability_validation.py +91 -0
  98. dcs_core/core/validation/uniqueness_validation.py +61 -0
  99. dcs_core/core/validation/validity_validation.py +738 -0
  100. dcs_core/integrations/__init__.py +13 -0
  101. dcs_core/integrations/databases/__init__.py +13 -0
  102. dcs_core/integrations/databases/bigquery.py +187 -0
  103. dcs_core/integrations/databases/databricks.py +51 -0
  104. dcs_core/integrations/databases/db2.py +652 -0
  105. dcs_core/integrations/databases/elasticsearch.py +61 -0
  106. dcs_core/integrations/databases/mssql.py +829 -0
  107. dcs_core/integrations/databases/mysql.py +409 -0
  108. dcs_core/integrations/databases/opensearch.py +64 -0
  109. dcs_core/integrations/databases/oracle.py +719 -0
  110. dcs_core/integrations/databases/postgres.py +482 -0
  111. dcs_core/integrations/databases/redshift.py +53 -0
  112. dcs_core/integrations/databases/snowflake.py +48 -0
  113. dcs_core/integrations/databases/spark_df.py +111 -0
  114. dcs_core/integrations/databases/sybase.py +1069 -0
  115. dcs_core/integrations/storage/__init__.py +13 -0
  116. dcs_core/integrations/storage/local_file.py +149 -0
  117. dcs_core/integrations/utils/__init__.py +13 -0
  118. dcs_core/integrations/utils/utils.py +36 -0
  119. dcs_core/report/__init__.py +13 -0
  120. dcs_core/report/dashboard.py +211 -0
  121. dcs_core/report/models.py +88 -0
  122. dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  123. dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  124. dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  125. dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  126. dcs_core/report/static/assets/images/docs.svg +6 -0
  127. dcs_core/report/static/assets/images/github.svg +4 -0
  128. dcs_core/report/static/assets/images/logo.svg +7 -0
  129. dcs_core/report/static/assets/images/slack.svg +13 -0
  130. dcs_core/report/static/index.js +2 -0
  131. dcs_core/report/static/index.js.LICENSE.txt +3971 -0
  132. dcs_sdk/__init__.py +13 -0
  133. dcs_sdk/__main__.py +18 -0
  134. dcs_sdk/__version__.py +15 -0
  135. dcs_sdk/cli/__init__.py +13 -0
  136. dcs_sdk/cli/cli.py +163 -0
  137. dcs_sdk/sdk/__init__.py +58 -0
  138. dcs_sdk/sdk/config/__init__.py +13 -0
  139. dcs_sdk/sdk/config/config_loader.py +491 -0
  140. dcs_sdk/sdk/data_diff/__init__.py +13 -0
  141. dcs_sdk/sdk/data_diff/data_differ.py +821 -0
  142. dcs_sdk/sdk/rules/__init__.py +15 -0
  143. dcs_sdk/sdk/rules/rules_mappping.py +31 -0
  144. dcs_sdk/sdk/rules/rules_repository.py +214 -0
  145. dcs_sdk/sdk/rules/schema_rules.py +65 -0
  146. dcs_sdk/sdk/utils/__init__.py +13 -0
  147. dcs_sdk/sdk/utils/serializer.py +25 -0
  148. dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
  149. dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
  150. dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
  151. dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
  152. dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
  153. dcs_sdk/sdk/utils/table.py +475 -0
  154. dcs_sdk/sdk/utils/themes.py +40 -0
  155. dcs_sdk/sdk/utils/utils.py +349 -0
  156. dcs_sdk-1.6.5.dist-info/METADATA +150 -0
  157. dcs_sdk-1.6.5.dist-info/RECORD +159 -0
  158. dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
  159. dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,15 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .rules_repository import RulesRepository
@@ -0,0 +1,31 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from dcs_sdk.sdk.rules.schema_rules import (
16
+ allow_equivalent_data_types,
17
+ ignore_column_length_difference,
18
+ ignore_datetime_precision_difference,
19
+ ignore_numeric_precision_difference,
20
+ ignore_numeric_scale_difference,
21
+ )
22
+
23
+
24
def get_rules_to_func_mapping():
    """Return the lookup table from schema-rule name to the function implementing it.

    A new dict is built on every call, so callers may modify the result
    without affecting each other.
    """
    named_rules = (
        ("ignore_column_length_difference", ignore_column_length_difference),
        ("allow_equivalent_data_types", allow_equivalent_data_types),
        ("ignore_numeric_precision_difference", ignore_numeric_precision_difference),
        ("ignore_numeric_scale_difference", ignore_numeric_scale_difference),
        ("ignore_datetime_precision_difference", ignore_datetime_precision_difference),
    )
    return dict(named_rules)
@@ -0,0 +1,214 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import copy
16
+ from collections import defaultdict
17
+ from typing import Optional
18
+ from uuid import UUID
19
+
20
+ from loguru import logger
21
+
22
+ from dcs_sdk.sdk.rules.rules_mappping import get_rules_to_func_mapping
23
+
24
+
25
def map_str_type_to_generic_type(c_type):
    """Collapse a database column type name into a coarse type family.

    Matching is case-insensitive and prefix based, so length/precision
    suffixes such as ``varchar(20)`` or ``decimal(10, 2)`` are tolerated.

    Args:
        c_type: Column type name. Falsy values (``None``, ``""``) are
            converted with ``str()`` and fall through unchanged.

    Returns:
        One of ``"numeric"``, ``"string"``, ``"uuid"``, ``"datetime"``,
        ``"json"``, ``"array"``, ``"binary"``, ``"bytea"`` or ``"boolean"``;
        otherwise the (lower-cased) input itself when no family matches.
    """
    c_type = c_type.lower() if c_type else str(c_type)

    # "interval" shares the "int" prefix but is a duration, not a number;
    # without this guard it would be reported as numeric.
    if c_type.startswith("interval"):
        return c_type

    # Families are tested in this order; the first prefix hit wins.
    families = (
        (
            "numeric",
            (
                "int",
                "integer",
                "bigint",
                "smallint",
                "tinyint",
                "float",
                "double",
                "real",
                "numeric",
                "decimal",
                "money",
                "smallmoney",
                "number",
            ),
        ),
        (
            "string",
            (
                "string",
                "varchar",
                "char",
                "text",
                "str",
                "character varying",
                "character",
                "nvar",
                "nchar",
            ),
        ),
        ("uuid", ("uuid", "uniqueidentifier", "guid")),
        (
            "datetime",
            (
                "date",
                "time",
                "timestamp",
                "datetime",
                "timestamp with time zone",
                "timestamp without time zone",
                "smalldatetime",
            ),
        ),
        ("json", ("json", "jsonb", "dict", "map")),
        ("array", ("array", "list")),
        ("binary", ("binary", "blob")),
        ("bytea", ("bytea",)),
        ("boolean", ("boolean", "bool")),
    )

    for family, prefixes in families:
        # str.startswith accepts a tuple and checks every prefix in one call.
        if c_type.startswith(prefixes):
            return family

    return c_type
89
+
90
+
91
+ # CENTRALIZED REPO FOR ALL THE RULES
92
+ # RULE ID -> ALL ITS PROPERTIES
93
+ class RulesRepository:
94
+ _INSTANCE = None
95
+
96
+ @classmethod
97
+ def get_instance(cls):
98
+ if cls._INSTANCE is None:
99
+ cls._INSTANCE = cls()
100
+ return cls._INSTANCE
101
+
102
+ def __init__(self):
103
+ self.rules = {}
104
+ self.rules_mapping = get_rules_to_func_mapping()
105
+ self.value_rules = []
106
+ self.schema_rules = {}
107
+
108
+ def register(self, id: UUID, rule_dict: dict):
109
+ """
110
+ Registers a rule in the centralized repository.
111
+ Supports both 'schema' and 'value' rules.
112
+
113
+ - For value rules: transformation is a string template
114
+ - For schema rules: transformation is a function name (to be resolved)
115
+ """
116
+ rule_type = rule_dict.get("type")
117
+
118
+ if not rule_type:
119
+ return
120
+
121
+ if rule_type == "schema_override":
122
+ func_name = rule_dict.get("transformation")
123
+ if func_name:
124
+ func = self.rules_mapping.get(func_name)
125
+ if not func:
126
+ raise ValueError(f"Function '{func_name}' not found in registry")
127
+ rule_dict["function"] = func
128
+ rule_dict["function_name"] = func_name
129
+
130
+ self.rules[id] = rule_dict
131
+
132
+ def register_schema_rules(self, schema_rules: list):
133
+ self.schema_rules = schema_rules
134
+
135
+ def register_value_rules(self, value_rules: list):
136
+ self.value_rules = value_rules
137
+
138
+ def get(self, id: UUID) -> Optional[dict]:
139
+ return self.rules.get(id)
140
+
141
+ def apply_schema_rules(self, src_col: dict, tgt_col: dict) -> tuple[bool, str | None]:
142
+ """
143
+ Performs baseline schema checks and overrides them if corresponding rules are configured.
144
+ Returns (True, None) if all checks pass.
145
+ Returns (False, reason) if any check fails.
146
+ """
147
+
148
+ schema_rule_function_mapping = defaultdict(list)
149
+
150
+ for schema_rule in self.schema_rules:
151
+ rule_obj = self.get(schema_rule)
152
+ if rule_obj:
153
+ func_name = rule_obj.get("function_name")
154
+ params = rule_obj.get("params")
155
+ function = rule_obj.get("function")
156
+
157
+ schema_rule_function_mapping[func_name].append({"params": params, "func": function})
158
+
159
+ # def is_rule_allowed(rule_name: str, fallback_check: bool) -> bool:
160
+ # if rule_name not in schema_rule_function_mapping:
161
+ # return fallback_check
162
+ # rule_obj = schema_rule_function_mapping[rule_name]
163
+ # func = rule_obj.get("func")
164
+ # params = rule_obj.get("params")
165
+ # return func(src_col, tgt_col, params) if func else fallback_check
166
+
167
+ def is_rule_allowed(rule_name: str, fallback_check: bool) -> bool:
168
+ if rule_name not in schema_rule_function_mapping:
169
+ return fallback_check
170
+
171
+ rule_objs = schema_rule_function_mapping[rule_name]
172
+ if not rule_objs:
173
+ return fallback_check
174
+
175
+ for rule_obj in rule_objs:
176
+ func = rule_obj.get("func")
177
+ params = rule_obj.get("params")
178
+
179
+ if func:
180
+ result = func(src_col, tgt_col, params)
181
+ if result:
182
+ return True
183
+
184
+ return fallback_check
185
+
186
+ source_generic_data_type = map_str_type_to_generic_type(src_col["data_type"].lower())
187
+ tgt_generic_data_type = map_str_type_to_generic_type(tgt_col["data_type"].lower())
188
+
189
+ if src_col["data_type"].lower() != tgt_col["data_type"].lower():
190
+ if not is_rule_allowed("allow_equivalent_data_types", False):
191
+ return False, f"Data type mismatch"
192
+
193
+ return True, None
194
+
195
+ if source_generic_data_type == "string" and tgt_generic_data_type == "string":
196
+ if src_col.get("character_maximum_length") != tgt_col.get("character_maximum_length"):
197
+ if not is_rule_allowed("ignore_column_length_difference", False):
198
+ return False, f"Length mismatch"
199
+
200
+ if source_generic_data_type == "numeric" and tgt_generic_data_type == "numeric":
201
+ if src_col.get("numeric_precision") != tgt_col.get("numeric_precision"):
202
+ if not is_rule_allowed("ignore_numeric_precision_difference", False):
203
+ return False, f"Numeric precision mismatch"
204
+
205
+ if src_col.get("numeric_scale") != tgt_col.get("numeric_scale"):
206
+ if not is_rule_allowed("ignore_numeric_scale_difference", False):
207
+ return False, f"Numeric scale mismatch"
208
+
209
+ if source_generic_data_type == "datetime" and tgt_generic_data_type == "datetime":
210
+ if src_col.get("datetime_precision") != tgt_col.get("datetime_precision"):
211
+ if not is_rule_allowed("ignore_datetime_precision_difference", False):
212
+ return False, f"Datetime precision mismatch"
213
+
214
+ return True, None
@@ -0,0 +1,65 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Dict, Optional
16
+
17
+
18
def ignore_column_length_difference(
    src_col: Dict,
    tgt_col: Dict,
    params: Optional[Dict] = None,
) -> bool:
    """Decide whether a column-length mismatch may be ignored.

    With no (or empty) params the difference is always tolerated.
    Parameterised tolerance, e.g. a maximum allowed length difference, is
    not implemented yet, so any non-empty params yields ``False``.
    """
    return not params
31
+
32
+
33
def allow_equivalent_data_types(src_col: Dict, tgt_col: Dict, params: Optional[Dict] = None) -> bool:
    """Report whether the two column types belong to one configured equivalence group.

    ``params["equivalent_groups"]`` is read as an iterable of groups of type
    names; the comparison is case-insensitive. Without that key the result
    is ``False``.
    """
    source_type = src_col["data_type"].lower()
    target_type = tgt_col["data_type"].lower()

    if not (params and "equivalent_groups" in params):
        return False

    wanted = {source_type, target_type}
    # Both types must appear in the same group.
    return any(wanted <= {name.lower() for name in group} for group in params["equivalent_groups"])
45
+
46
+
47
def ignore_numeric_precision_difference(src_col: Dict, tgt_col: Dict, params: Optional[Dict] = None) -> bool:
    """Tolerate a numeric-precision mismatch unless non-empty params are supplied."""
    return False if params else True
52
+
53
+
54
def ignore_numeric_scale_difference(src_col: Dict, tgt_col: Dict, params: Optional[Dict] = None) -> bool:
    """Tolerate a numeric-scale mismatch unless non-empty params are supplied."""
    params_given = bool(params)
    return not params_given
59
+
60
+
61
def ignore_datetime_precision_difference(src_col: Dict, tgt_col: Dict, params: Optional[Dict] = None) -> bool:
    """Tolerate a datetime-precision mismatch unless non-empty params are supplied."""
    if params:
        return False
    return True
@@ -0,0 +1,13 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
@@ -0,0 +1,25 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
def serialize_table_schema(info) -> dict:
    """Copy the schema attributes of a column-description object into a plain dict.

    ``info`` must expose every attribute listed below; a missing one raises
    ``AttributeError``.
    """
    exported_attributes = (
        "column_name",
        "data_type",
        "datetime_precision",
        "numeric_precision",
        "numeric_scale",
        "collation_name",
        "character_maximum_length",
    )
    return {attribute: getattr(info, attribute) for attribute in exported_attributes}
@@ -0,0 +1,13 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
@@ -0,0 +1,153 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import string
17
+ from abc import ABC, abstractmethod
18
+ from collections import defaultdict
19
+ from typing import Any, Dict, List
20
+
21
+ import nltk
22
+ from dotenv import load_dotenv
23
+ from nltk.corpus import stopwords
24
+ from nltk.tokenize import word_tokenize
25
+
26
+ from dcs_sdk.sdk.config.config_loader import SimilarityConfig
27
+
28
+
29
+ def ensure_nltk_data():
30
+ load_dotenv()
31
+ nltk_data_dir = os.getenv("NLTK_DATA_DIR")
32
+
33
+ if not nltk_data_dir:
34
+ default_root = os.path.dirname(os.path.abspath(__file__))
35
+ nltk_data_dir = os.path.join(default_root, "nltk_data")
36
+ print(f"NLTK_DATA_DIR ENV variable not set. Using default path: {nltk_data_dir}")
37
+
38
+ punkt_path = os.path.join(nltk_data_dir, "tokenizers", "punkt")
39
+ stopwords_path = os.path.join(nltk_data_dir, "corpora", "stopwords")
40
+ punkt_tab_path = os.path.join(nltk_data_dir, "tokenizers", "punkt_tab")
41
+
42
+ if not os.path.exists(punkt_path):
43
+ nltk.download(
44
+ "punkt",
45
+ download_dir=nltk_data_dir,
46
+ halt_on_error=True,
47
+ raise_on_error=True,
48
+ )
49
+ if not os.path.exists(stopwords_path):
50
+ nltk.download(
51
+ "stopwords",
52
+ download_dir=nltk_data_dir,
53
+ halt_on_error=True,
54
+ raise_on_error=True,
55
+ )
56
+ if not os.path.exists(punkt_tab_path):
57
+ nltk.download(
58
+ "punkt_tab",
59
+ download_dir=nltk_data_dir,
60
+ halt_on_error=True,
61
+ raise_on_error=True,
62
+ )
63
+
64
+ nltk.data.path.append(nltk_data_dir)
65
+
66
+
67
class SimilarityScoreProvider(ABC):
    """Base class for token-based text similarity scorers.

    Subclasses implement ``fuzzy_match``; this class supplies the shared
    preprocessing and the per-record scoring/masking loop.
    """

    def preprocess_text(self, text: str, methods: list[str]) -> set:
        """Normalise ``text`` according to ``methods`` and return its token set.

        Args:
            text: Raw field value. ``None`` is treated as empty text and
                non-string values are converted with ``str()``.
            methods: Any of ``"lower_case"``, ``"remove_punctuation"``,
                ``"remove_stop_words"``, ``"remove_extra_whitespaces"``.

        Returns:
            The set of tokens produced by ``word_tokenize``.
        """
        # Record fields are not guaranteed to be strings (NULLs, numbers);
        # normalise first so .lower()/.translate() and the tokenizer do not fail.
        text = "" if text is None else str(text)

        if "lower_case" in methods:
            text = text.lower()
        if "remove_punctuation" in methods:
            text = text.translate(str.maketrans("", "", string.punctuation))
        if "remove_stop_words" in methods:
            stop_words = set(stopwords.words("english"))
            text = " ".join(word for word in text.split() if word not in stop_words)
        if "remove_extra_whitespaces" in methods:
            text = " ".join(text.split())

        return set(word_tokenize(text))

    @abstractmethod
    def fuzzy_match(self, str1: set, str2: set) -> float:
        """Computes a similarity score between two sets of tokens."""
        pass

    def add_text_similarity(
        self,
        key: List[str],
        data: List[Dict[str, Any]],
        fields: List[str],
        similarity: "SimilarityConfig",
        source_masking_cols: List[str],
        target_masking_cols: List[str],
        mask_char: str,
    ) -> List[Dict[str, Any]]:
        """Score the given fields of paired records and mask configured columns.

        Records are grouped by their composite key. Only groups of exactly
        two records are processed; the first is treated as the source and
        the second as the target. Records are modified in place.

        Args:
            key: Primary key column names forming the composite key.
            data: Records to process.
            fields: Fields to compute similarity scores for.
            similarity: Object providing ``pre_processing`` and ``threshold``.
            source_masking_cols: Source fields whose values must be masked.
            target_masking_cols: Target fields whose values must be masked.
            mask_char: Character repeated to build the mask.

        Returns:
            The same ``data`` list, with ``meta["scores"][field]`` set on
            each processed record and masked values replaced.
        """
        grouped_data = defaultdict(list)
        for item in data:
            composite_key = "_".join(str(item[k]) for k in key)
            grouped_data[composite_key].append(item)

        for items in grouped_data.values():
            if len(items) != 2:
                continue
            source, target = items

            source.setdefault("meta", {}).setdefault("scores", {})
            target.setdefault("meta", {}).setdefault("scores", {})

            for field in fields:
                source_text = self.preprocess_text(source.get(field, ""), similarity.pre_processing)
                target_text = self.preprocess_text(target.get(field, ""), similarity.pre_processing)
                similarity_score = self.fuzzy_match(source_text, target_text)

                match_status = "match" if similarity_score >= similarity.threshold else "not matched"

                score_key = f"{field}"
                source["meta"]["scores"][score_key] = {"score": similarity_score, "status": match_status}
                target["meta"]["scores"][score_key] = {"score": similarity_score, "status": match_status}

                source_val = str(source.get(field, ""))
                target_val = str(target.get(field, ""))

                mask_source = field in source_masking_cols
                mask_target = field in target_masking_cols

                # The three cases are mutually exclusive. Handling the
                # one-sided cases unconditionally afterwards would overwrite
                # the padded source mask built in the first branch.
                if mask_source and mask_target:
                    # Equal-length values that did not match get one extra
                    # mask character on the source side; presumably so the
                    # masked pair still looks different.
                    same_length_mismatch = len(source_val) == len(target_val) and match_status == "not matched"
                    padding = 1 if same_length_mismatch else 0
                    source[field] = mask_char * (len(source_val) + padding)
                    target[field] = mask_char * len(target_val)
                elif mask_source:
                    source[field] = mask_char * len(source_val)
                elif mask_target:
                    target[field] = mask_char * len(target_val)

        return data
@@ -0,0 +1,39 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from collections import Counter
17
+
18
+ from dcs_sdk.sdk.utils.similarity_score.base_provider import SimilarityScoreProvider
19
+
20
+
21
class CosineSimilarityProvider(SimilarityScoreProvider):
    """Similarity provider scoring two token collections by cosine similarity."""

    def fuzzy_match(self, tokens1: set, tokens2: set) -> float:
        """Return the cosine similarity of the token counts, rounded to 4 places.

        Returns 0.0 when either side is empty.
        """
        if not (tokens1 and tokens2):
            return 0.0

        counts_a = Counter(tokens1)
        counts_b = Counter(tokens2)

        # Tokens present on only one side contribute zero to the dot product.
        shared_tokens = counts_a.keys() & counts_b.keys()
        numerator = sum(counts_a[token] * counts_b[token] for token in shared_tokens)

        norm_a = math.sqrt(sum(count * count for count in counts_a.values()))
        norm_b = math.sqrt(sum(count * count for count in counts_b.values()))

        if norm_a == 0 or norm_b == 0:
            return 0.0

        return round(numerator / (norm_a * norm_b), 4)
@@ -0,0 +1,24 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from dcs_sdk.sdk.utils.similarity_score.base_provider import SimilarityScoreProvider
17
+
18
+
19
class JaccardSimilarityProvider(SimilarityScoreProvider):
    """Similarity provider scoring two token sets by their Jaccard index."""

    def fuzzy_match(self, set1: set, set2: set) -> float:
        """Return |set1 & set2| / |set1 | set2|, or 0.0 when both sets are empty."""
        shared_count = len(set1 & set2)
        combined_count = len(set1 | set2)
        if combined_count == 0:
            return 0.0
        return shared_count / combined_count
@@ -0,0 +1,31 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from nltk.metrics import edit_distance
16
+
17
+ from dcs_sdk.sdk.utils.similarity_score.base_provider import SimilarityScoreProvider
18
+
19
+
20
class LevenshteinDistanceProvider(SimilarityScoreProvider):
    """Similarity provider based on the Levenshtein edit distance."""

    def fuzzy_match(self, tokens1: set, tokens2: set) -> float:
        """Return a similarity score derived from the edit distance.

        Each token collection is sorted and joined with single spaces; the
        score is ``1 - distance / max_len`` rounded to 4 places, where
        ``max_len`` is the longer joined string's length. Two empty inputs
        score 1.0.
        """
        # The iteration order of a set of strings varies between interpreter
        # runs (string hash randomisation), so joining the sets directly can
        # give different distances for the same data. Sorting fixes the order.
        str1 = " ".join(sorted(tokens1))
        str2 = " ".join(sorted(tokens2))

        max_len = max(len(str1), len(str2))
        if max_len == 0:
            return 1.0

        distance = edit_distance(str1, str2)
        return round(1 - (distance / max_len), 4)