dcs-sdk 1.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159) hide show
  1. data_diff/__init__.py +221 -0
  2. data_diff/__main__.py +517 -0
  3. data_diff/abcs/__init__.py +13 -0
  4. data_diff/abcs/compiler.py +27 -0
  5. data_diff/abcs/database_types.py +402 -0
  6. data_diff/config.py +141 -0
  7. data_diff/databases/__init__.py +38 -0
  8. data_diff/databases/_connect.py +323 -0
  9. data_diff/databases/base.py +1417 -0
  10. data_diff/databases/bigquery.py +376 -0
  11. data_diff/databases/clickhouse.py +217 -0
  12. data_diff/databases/databricks.py +262 -0
  13. data_diff/databases/duckdb.py +207 -0
  14. data_diff/databases/mssql.py +343 -0
  15. data_diff/databases/mysql.py +189 -0
  16. data_diff/databases/oracle.py +238 -0
  17. data_diff/databases/postgresql.py +293 -0
  18. data_diff/databases/presto.py +222 -0
  19. data_diff/databases/redis.py +93 -0
  20. data_diff/databases/redshift.py +233 -0
  21. data_diff/databases/snowflake.py +222 -0
  22. data_diff/databases/sybase.py +720 -0
  23. data_diff/databases/trino.py +73 -0
  24. data_diff/databases/vertica.py +174 -0
  25. data_diff/diff_tables.py +489 -0
  26. data_diff/errors.py +17 -0
  27. data_diff/format.py +369 -0
  28. data_diff/hashdiff_tables.py +1026 -0
  29. data_diff/info_tree.py +76 -0
  30. data_diff/joindiff_tables.py +434 -0
  31. data_diff/lexicographic_space.py +253 -0
  32. data_diff/parse_time.py +88 -0
  33. data_diff/py.typed +0 -0
  34. data_diff/queries/__init__.py +13 -0
  35. data_diff/queries/api.py +213 -0
  36. data_diff/queries/ast_classes.py +811 -0
  37. data_diff/queries/base.py +38 -0
  38. data_diff/queries/extras.py +43 -0
  39. data_diff/query_utils.py +70 -0
  40. data_diff/schema.py +67 -0
  41. data_diff/table_segment.py +583 -0
  42. data_diff/thread_utils.py +112 -0
  43. data_diff/utils.py +1022 -0
  44. data_diff/version.py +15 -0
  45. dcs_core/__init__.py +13 -0
  46. dcs_core/__main__.py +17 -0
  47. dcs_core/__version__.py +15 -0
  48. dcs_core/cli/__init__.py +13 -0
  49. dcs_core/cli/cli.py +165 -0
  50. dcs_core/core/__init__.py +19 -0
  51. dcs_core/core/common/__init__.py +13 -0
  52. dcs_core/core/common/errors.py +50 -0
  53. dcs_core/core/common/models/__init__.py +13 -0
  54. dcs_core/core/common/models/configuration.py +284 -0
  55. dcs_core/core/common/models/dashboard.py +24 -0
  56. dcs_core/core/common/models/data_source_resource.py +75 -0
  57. dcs_core/core/common/models/metric.py +160 -0
  58. dcs_core/core/common/models/profile.py +75 -0
  59. dcs_core/core/common/models/validation.py +216 -0
  60. dcs_core/core/common/models/widget.py +44 -0
  61. dcs_core/core/configuration/__init__.py +13 -0
  62. dcs_core/core/configuration/config_loader.py +139 -0
  63. dcs_core/core/configuration/configuration_parser.py +262 -0
  64. dcs_core/core/configuration/configuration_parser_arc.py +328 -0
  65. dcs_core/core/datasource/__init__.py +13 -0
  66. dcs_core/core/datasource/base.py +62 -0
  67. dcs_core/core/datasource/manager.py +112 -0
  68. dcs_core/core/datasource/search_datasource.py +421 -0
  69. dcs_core/core/datasource/sql_datasource.py +1094 -0
  70. dcs_core/core/inspect.py +163 -0
  71. dcs_core/core/logger/__init__.py +13 -0
  72. dcs_core/core/logger/base.py +32 -0
  73. dcs_core/core/logger/default_logger.py +94 -0
  74. dcs_core/core/metric/__init__.py +13 -0
  75. dcs_core/core/metric/base.py +220 -0
  76. dcs_core/core/metric/combined_metric.py +98 -0
  77. dcs_core/core/metric/custom_metric.py +34 -0
  78. dcs_core/core/metric/manager.py +137 -0
  79. dcs_core/core/metric/numeric_metric.py +403 -0
  80. dcs_core/core/metric/reliability_metric.py +90 -0
  81. dcs_core/core/profiling/__init__.py +13 -0
  82. dcs_core/core/profiling/datasource_profiling.py +136 -0
  83. dcs_core/core/profiling/numeric_field_profiling.py +72 -0
  84. dcs_core/core/profiling/text_field_profiling.py +67 -0
  85. dcs_core/core/repository/__init__.py +13 -0
  86. dcs_core/core/repository/metric_repository.py +77 -0
  87. dcs_core/core/utils/__init__.py +13 -0
  88. dcs_core/core/utils/log.py +29 -0
  89. dcs_core/core/utils/tracking.py +105 -0
  90. dcs_core/core/utils/utils.py +44 -0
  91. dcs_core/core/validation/__init__.py +13 -0
  92. dcs_core/core/validation/base.py +230 -0
  93. dcs_core/core/validation/completeness_validation.py +153 -0
  94. dcs_core/core/validation/custom_query_validation.py +24 -0
  95. dcs_core/core/validation/manager.py +282 -0
  96. dcs_core/core/validation/numeric_validation.py +276 -0
  97. dcs_core/core/validation/reliability_validation.py +91 -0
  98. dcs_core/core/validation/uniqueness_validation.py +61 -0
  99. dcs_core/core/validation/validity_validation.py +738 -0
  100. dcs_core/integrations/__init__.py +13 -0
  101. dcs_core/integrations/databases/__init__.py +13 -0
  102. dcs_core/integrations/databases/bigquery.py +187 -0
  103. dcs_core/integrations/databases/databricks.py +51 -0
  104. dcs_core/integrations/databases/db2.py +652 -0
  105. dcs_core/integrations/databases/elasticsearch.py +61 -0
  106. dcs_core/integrations/databases/mssql.py +829 -0
  107. dcs_core/integrations/databases/mysql.py +409 -0
  108. dcs_core/integrations/databases/opensearch.py +64 -0
  109. dcs_core/integrations/databases/oracle.py +719 -0
  110. dcs_core/integrations/databases/postgres.py +482 -0
  111. dcs_core/integrations/databases/redshift.py +53 -0
  112. dcs_core/integrations/databases/snowflake.py +48 -0
  113. dcs_core/integrations/databases/spark_df.py +111 -0
  114. dcs_core/integrations/databases/sybase.py +1069 -0
  115. dcs_core/integrations/storage/__init__.py +13 -0
  116. dcs_core/integrations/storage/local_file.py +149 -0
  117. dcs_core/integrations/utils/__init__.py +13 -0
  118. dcs_core/integrations/utils/utils.py +36 -0
  119. dcs_core/report/__init__.py +13 -0
  120. dcs_core/report/dashboard.py +211 -0
  121. dcs_core/report/models.py +88 -0
  122. dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  123. dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  124. dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  125. dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  126. dcs_core/report/static/assets/images/docs.svg +6 -0
  127. dcs_core/report/static/assets/images/github.svg +4 -0
  128. dcs_core/report/static/assets/images/logo.svg +7 -0
  129. dcs_core/report/static/assets/images/slack.svg +13 -0
  130. dcs_core/report/static/index.js +2 -0
  131. dcs_core/report/static/index.js.LICENSE.txt +3971 -0
  132. dcs_sdk/__init__.py +13 -0
  133. dcs_sdk/__main__.py +18 -0
  134. dcs_sdk/__version__.py +15 -0
  135. dcs_sdk/cli/__init__.py +13 -0
  136. dcs_sdk/cli/cli.py +163 -0
  137. dcs_sdk/sdk/__init__.py +58 -0
  138. dcs_sdk/sdk/config/__init__.py +13 -0
  139. dcs_sdk/sdk/config/config_loader.py +491 -0
  140. dcs_sdk/sdk/data_diff/__init__.py +13 -0
  141. dcs_sdk/sdk/data_diff/data_differ.py +821 -0
  142. dcs_sdk/sdk/rules/__init__.py +15 -0
  143. dcs_sdk/sdk/rules/rules_mappping.py +31 -0
  144. dcs_sdk/sdk/rules/rules_repository.py +214 -0
  145. dcs_sdk/sdk/rules/schema_rules.py +65 -0
  146. dcs_sdk/sdk/utils/__init__.py +13 -0
  147. dcs_sdk/sdk/utils/serializer.py +25 -0
  148. dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
  149. dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
  150. dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
  151. dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
  152. dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
  153. dcs_sdk/sdk/utils/table.py +475 -0
  154. dcs_sdk/sdk/utils/themes.py +40 -0
  155. dcs_sdk/sdk/utils/utils.py +349 -0
  156. dcs_sdk-1.6.5.dist-info/METADATA +150 -0
  157. dcs_sdk-1.6.5.dist-info/RECORD +159 -0
  158. dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
  159. dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,829 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import datetime
16
+ from decimal import Decimal
17
+ from typing import Any, Dict, List, Optional, Tuple, Union
18
+ from uuid import UUID
19
+
20
+ import pyodbc
21
+ from loguru import logger
22
+ from sqlalchemy import text
23
+
24
+ from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
25
+ from dcs_core.core.common.models.data_source_resource import RawColumnInfo
26
+ from dcs_core.core.datasource.sql_datasource import SQLDataSource
27
+
28
+
29
+ class MssqlDataSource(SQLDataSource):
30
    def __init__(self, data_source_name: str, data_connection: Dict):
        """Create an MSSQL data source.

        :param data_source_name: logical name for this data source
        :param data_connection: connection settings (driver, host, port,
            database, username, password, server)
        """
        super().__init__(data_source_name, data_connection)
        # Patterns for the predefined validity checks. NOTE(review): these
        # are a mix of SQL Server LIKE patterns (e.g. "uuid", "email" use
        # % / _ wildcards) and true regexes (e.g. "usa_phone", "ssn");
        # regex_to_sql_condition converts the regex-style entries into
        # IIF/LIKE conditions, and some keys (perm_id, ssn, usa_phone,
        # usa_zip_code) are special-cased in query_string_pattern_validity.
        self.regex_patterns = {
            "uuid": r"[0-9a-fA-F]%-%[0-9a-fA-F]%-%[0-9a-fA-F]%-%[0-9a-fA-F]%-%[0-9a-fA-F]%",
            "usa_phone": r"^(\+1[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}$",
            "email": r"%[a-zA-Z0-9._%+-]@[a-zA-Z0-9.-]%.[a-zA-Z]%",
            "usa_zip_code": r"^[0-9]{5}(?:-[0-9]{4})?$",
            "ssn": r"^(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}$",
            "sedol": r"[B-DF-HJ-NP-TV-XZ0-9][B-DF-HJ-NP-TV-XZ0-9][B-DF-HJ-NP-TV-XZ0-9][B-DF-HJ-NP-TV-XZ0-9][B-DF-HJ-NP-TV-XZ0-9][B-DF-HJ-NP-TV-XZ0-9][0-9]",
            "lei": r"[A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][0-9][0-9]",
            "cusip": r"[0-9A-Z][0-9A-Z][0-9A-Z][0-9A-Z][0-9A-Z][0-9A-Z][0-9A-Z][0-9A-Z][0-9A-Z]",
            "figi": r"BBG[A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9]",
            "isin": r"[A-Z][A-Z][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][0-9]",
            "perm_id": r"^\d{4}([- ]?)\d{4}\1\d{4}\1\d{4}([- ]?)\d{3}$",
        }
45
+
46
+ def connect(self) -> Any:
47
+ """
48
+ Connect to the data source
49
+ """
50
+ driver = self.data_connection.get("driver") or "ODBC Driver 18 for SQL Server"
51
+ host = self.data_connection.get("host")
52
+ port = self.data_connection.get("port")
53
+ database = self.data_connection.get("database")
54
+ username = self.data_connection.get("username")
55
+ password = self.data_connection.get("password")
56
+ server = self.data_connection.get("server")
57
+
58
+ connection_params = self._build_connection_params(
59
+ driver=driver, database=database, username=username, password=password
60
+ )
61
+
62
+ return self._establish_connection(connection_params, host, server, port)
63
+
64
+ def _prepare_driver_string(self, driver: str) -> str:
65
+ """Ensure driver string is properly formatted with braces."""
66
+ return f"{{{driver}}}" if not driver.startswith("{") else driver
67
+
68
+ def _build_connection_params(self, driver: str, database: str, username: str, password: str) -> dict:
69
+ return {
70
+ "DRIVER": self._prepare_driver_string(driver),
71
+ "DATABASE": database,
72
+ "UID": username,
73
+ "PWD": password,
74
+ "TrustServerCertificate": "yes",
75
+ }
76
+
77
+ def _establish_connection(self, conn_dict: dict, host: str, server: str, port: str) -> Any:
78
+ connection_attempts = [
79
+ (host, True), # host with port
80
+ (host, False), # host without port
81
+ (server, True), # server with port
82
+ (server, False), # server without port
83
+ ]
84
+
85
+ for _, (server_value, use_port) in enumerate(connection_attempts, 1):
86
+ if not server_value:
87
+ continue
88
+
89
+ try:
90
+ conn_dict["SERVER"] = f"{server_value},{port}" if use_port and port else server_value
91
+ self.connection = pyodbc.connect(**conn_dict)
92
+ logger.info(f"Connected to MSSQL database using {conn_dict['SERVER']}")
93
+ return self.connection
94
+ except Exception:
95
+ continue
96
+
97
+ raise DataChecksDataSourcesConnectionError(
98
+ message="Failed to connect to Mssql data source: [All connection attempts failed]"
99
+ )
100
+
101
+ def fetchall(self, query):
102
+ return self.connection.cursor().execute(query).fetchall()
103
+
104
+ def fetchone(self, query):
105
+ return self.connection.cursor().execute(query).fetchone()
106
+
107
+ def qualified_table_name(self, table_name: str) -> str:
108
+ """
109
+ Get the qualified table name
110
+ :param table_name: name of the table
111
+ :return: qualified table name
112
+ """
113
+ if self.schema_name:
114
+ return f"[{self.schema_name}].[{table_name}]"
115
+ return f"[{table_name}]"
116
+
117
+ def quote_column(self, column: str) -> str:
118
+ """
119
+ Quote the column name
120
+ :param column: name of the column
121
+ :return: quoted column name
122
+ """
123
+ return f"[{column}]"
124
+
125
+ def query_get_table_names(self, schema: str | None = None, with_view: bool = False) -> dict:
126
+ """
127
+ Get the list of tables in the database.
128
+ :param schema: optional schema name
129
+ :param with_view: whether to include views
130
+ :return: dictionary with table names and optionally view names
131
+ """
132
+ schema = schema or self.schema_name
133
+
134
+ if with_view:
135
+ object_types = "IN ('U', 'V')"
136
+ else:
137
+ object_types = "= 'U'"
138
+
139
+ query = f"SELECT o.name AS table_name, o.type FROM sys.objects o JOIN sys.schemas s ON o.schema_id = s.schema_id WHERE o.type {object_types} AND s.name = '{schema}' ORDER BY o.name"
140
+
141
+ rows = self.fetchall(query)
142
+
143
+ if with_view:
144
+ result = {"table": [], "view": []}
145
+ if rows:
146
+ for row in rows:
147
+ object_name = row[0]
148
+ object_type = row[1].strip() if row[1] else row[1]
149
+
150
+ if object_type == "U":
151
+ result["table"].append(object_name)
152
+ elif object_type == "V":
153
+ result["view"].append(object_name)
154
+ else:
155
+ result = {"table": []}
156
+ if rows:
157
+ result["table"] = [row[0] for row in rows]
158
+
159
+ return result
160
+
161
    def query_get_table_indexes(self, table: str, schema: str | None = None) -> dict[str, dict]:
        """
        Get index information for a table in MSSQL DB.
        :param table: Table name
        :param schema: Optional schema name
        :return: Dictionary keyed by index name; each value holds "columns"
            (list of {"column_name", "column_order"}), "index_type", and
            "is_primary_key"
        :raises RuntimeError: if no index metadata is found for the table
        """
        schema = schema or self.schema_name
        # NOTE(review): upper-casing assumes object names are stored in upper
        # case or that the catalog collation is case-insensitive — confirm
        # against the target server's collation.
        table = table.upper()
        schema = schema.upper()

        # One row per (index, column), ordered so columns appear in key
        # order within each index; hypothetical indexes are excluded.
        query = f"""
        SELECT
            i.name AS index_name,
            i.type_desc AS index_type,
            c.name AS column_name,
            ic.key_ordinal AS column_order
        FROM
            sys.indexes i
        JOIN
            sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id
        JOIN
            sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id
        JOIN
            sys.tables t ON t.object_id = i.object_id
        JOIN
            sys.schemas s ON t.schema_id = s.schema_id
        WHERE
            t.name = '{table}'
            AND s.name = '{schema}'
            AND i.is_hypothetical = 0
        ORDER BY
            i.name, ic.key_ordinal
        """

        rows = self.fetchall(query)

        if not rows:
            raise RuntimeError(f"No index information found for table '{table}' in schema '{schema}'.")

        # Columns of the PRIMARY KEY constraint, in key order; used below to
        # mark which index backs the primary key.
        pk_query = f"""
        SELECT c.name AS column_name
        FROM
            sys.key_constraints kc
        JOIN
            sys.index_columns ic ON kc.parent_object_id = ic.object_id AND kc.unique_index_id = ic.index_id
        JOIN
            sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id
        JOIN
            sys.tables t ON t.object_id = kc.parent_object_id
        JOIN
            sys.schemas s ON t.schema_id = s.schema_id
        WHERE
            kc.type = 'PK'
            AND t.name = '{table}'
            AND s.name = '{schema}'
        ORDER BY ic.key_ordinal
        """
        pk_rows = self.fetchall(pk_query)
        pk_columns = [row[0].strip() for row in pk_rows] if pk_rows else []
        pk_columns_set = set(pk_columns)

        # Group the per-column rows into one entry per index.
        indexes = {}
        for row in rows:
            index_name = row[0]
            index_type = row[1]
            column_info = {
                "column_name": self.safe_get(row, 2),
                "column_order": self.safe_get(row, 3),
            }
            if index_name not in indexes:
                indexes[index_name] = {"columns": [], "index_type": index_type}
            indexes[index_name]["columns"].append(column_info)

        # An index is flagged as the primary-key index when its column set
        # matches the PK constraint's columns exactly (same members and count).
        for index_name, idx in indexes.items():
            index_columns = [col["column_name"].strip() for col in idx["columns"]]
            index_columns_set = set(index_columns)
            idx["is_primary_key"] = pk_columns_set == index_columns_set and len(index_columns) == len(pk_columns)
        return indexes
240
+
241
+ def query_get_table_columns(self, table: str, schema: str | None = None) -> RawColumnInfo:
242
+ """
243
+ Get the schema of a table.
244
+ :param table: table name
245
+ :return: RawColumnInfo object containing column information
246
+ """
247
+ schema = schema or self.schema_name
248
+ database = self.quote_database(self.database)
249
+ query = (
250
+ "SELECT column_name, data_type, ISNULL(datetime_precision, 0) AS datetime_precision, ISNULL(numeric_precision, 0) AS numeric_precision, ISNULL(numeric_scale, 0) AS numeric_scale, collation_name, ISNULL(character_maximum_length, 0) AS character_maximum_length "
251
+ f"FROM {database}.information_schema.columns "
252
+ f"WHERE table_name = '{table}' AND table_schema = '{schema}'"
253
+ )
254
+ rows = self.fetchall(query)
255
+ if not rows:
256
+ raise RuntimeError(f"{table}: Table, {schema}: Schema, does not exist, or has no columns")
257
+
258
+ column_info = {
259
+ r[0]: RawColumnInfo(
260
+ column_name=self.safe_get(r, 0),
261
+ data_type=self.safe_get(r, 1),
262
+ datetime_precision=self.safe_get(r, 2),
263
+ numeric_precision=self.safe_get(r, 3),
264
+ numeric_scale=self.safe_get(r, 4),
265
+ collation_name=self.safe_get(r, 5),
266
+ character_maximum_length=self.safe_get(r, 6),
267
+ )
268
+ for r in rows
269
+ }
270
+ return column_info
271
+
272
+ def fetch_rows(
273
+ self,
274
+ query: str,
275
+ limit: int = 1,
276
+ with_column_names: bool = False,
277
+ complete_query: Optional[str] = None,
278
+ ) -> Tuple[List, Optional[List[str]]]:
279
+ """
280
+ Fetch rows from the database using pyodbc.
281
+
282
+ :param query: SQL query to execute.
283
+ :param limit: Number of rows to fetch.
284
+ :param with_column_names: Whether to include column names in the result.
285
+ :return: Tuple of (rows, column_names or None)
286
+ """
287
+ query = (
288
+ complete_query
289
+ or f"SELECT * FROM ({query}) AS subquery ORDER BY 1 OFFSET 0 ROWS FETCH NEXT {limit} ROWS ONLY"
290
+ )
291
+ cursor = self.connection.cursor()
292
+ cursor.execute(query)
293
+ rows = cursor.fetchmany(limit)
294
+
295
+ if with_column_names:
296
+ column_names = [column[0] for column in cursor.description]
297
+ return rows, column_names
298
+ else:
299
+ return rows, None
300
+
301
+ def regex_to_sql_condition(self, regex_pattern: str, field: str) -> str:
302
+ """
303
+ Convert regex patterns to SQL Server conditions
304
+ """
305
+ if (regex_pattern.startswith("^") and regex_pattern.endswith("$")) or "|" in regex_pattern:
306
+ pattern = regex_pattern.strip("^$")
307
+ if pattern.startswith("(") and pattern.endswith(")"):
308
+ pattern = pattern[1:-1]
309
+
310
+ if "|" in pattern:
311
+ values = [f"'{val.strip()}'" for val in pattern.split("|")]
312
+ return f"IIF({field} IN ({', '.join(values)}), 1, 0)"
313
+
314
+ pattern = regex_pattern
315
+ if pattern.startswith("^"):
316
+ pattern = pattern[1:]
317
+ if pattern.endswith("$"):
318
+ pattern = pattern[:-1]
319
+
320
+ pattern = pattern.replace(".*", "%").replace(".+", "%").replace(".", "_")
321
+
322
+ return f"IIF({field} LIKE '{pattern}', 1, 0)"
323
+
324
+ def query_get_variance(self, table: str, field: str, filters: str = None) -> int:
325
+ """
326
+ Get the variance value
327
+ :param table: table name
328
+ :param field: column name
329
+ :param filters: filter condition
330
+ :return:
331
+ """
332
+ qualified_table_name = self.qualified_table_name(table)
333
+ field = self.quote_column(field)
334
+ query = "SELECT VAR({}) FROM {}".format(field, qualified_table_name)
335
+ if filters:
336
+ query += " WHERE {}".format(filters)
337
+
338
+ return round(self.fetchone(query)[0], 2)
339
+
340
+ def query_get_stddev(self, table: str, field: str, filters: str = None) -> int:
341
+ """
342
+ Get the standard deviation value
343
+ :param table: table name
344
+ :param field: column name
345
+ :param filters: filter condition
346
+ :return:
347
+ """
348
+ qualified_table_name = self.qualified_table_name(table)
349
+ field = self.quote_column(field)
350
+ query = "SELECT STDEV({}) FROM {}".format(field, qualified_table_name)
351
+ if filters:
352
+ query += " WHERE {}".format(filters)
353
+
354
+ return round(self.fetchone(query)[0], 2)
355
+
356
+ def query_get_percentile(self, table: str, field: str, percentile: float, filters: str = None) -> float:
357
+ """
358
+ Get the specified percentile value of a numeric column in a table.
359
+ :param table: table name
360
+ :param field: column name
361
+ :param percentile: percentile to calculate (e.g., 0.2 for 20th percentile)
362
+ :param filters: filter condition
363
+ :return: the value at the specified percentile
364
+ """
365
+ qualified_table_name = self.qualified_table_name(table)
366
+ field = self.quote_column(field)
367
+ query = f"""
368
+ SELECT PERCENTILE_CONT({percentile}) WITHIN GROUP (ORDER BY {field})
369
+ OVER () AS percentile_value
370
+ FROM {qualified_table_name}
371
+ """
372
+ if filters:
373
+ query += f" WHERE {filters}"
374
+
375
+ result = self.fetchone(query)
376
+ return round(result[0], 2) if result and result[0] is not None else None
377
+
378
+ def query_get_null_keyword_count(
379
+ self, table: str, field: str, operation: str, filters: str = None
380
+ ) -> Union[int, float]:
381
+ """
382
+ Get the count of NULL-like values (specific keywords) in the specified column for MSSQL.
383
+ :param table: table name
384
+ :param field: column name
385
+ :param operation: type of operation ('count' or 'percent')
386
+ :param filters: filter condition
387
+ :return: count (int) or percentage (float) of NULL-like keyword values
388
+ """
389
+ qualified_table_name = self.qualified_table_name(table)
390
+ field = self.quote_column(field)
391
+
392
+ query = f"""
393
+ SELECT
394
+ SUM(CASE
395
+ WHEN {field} IS NULL
396
+ OR LTRIM(RTRIM(LOWER(ISNULL({field}, '')))) IN ('nothing', 'nil', 'null', 'none', 'n/a', '')
397
+ THEN 1
398
+ ELSE 0
399
+ END) AS null_count,
400
+ COUNT(*) AS total_count
401
+ FROM {qualified_table_name}
402
+ """
403
+
404
+ if filters:
405
+ query += f" AND {filters}"
406
+
407
+ result = self.fetchone(query)
408
+
409
+ if not result or not result[1]:
410
+ return 0
411
+
412
+ null_count = int(result[0] if result[0] is not None else 0)
413
+ total_count = int(result[1])
414
+
415
+ if operation == "percent":
416
+ return round((null_count / total_count) * 100, 2) if total_count > 0 else 0.0
417
+
418
+ return null_count
419
+
420
+ def query_get_string_length_metric(
421
+ self, table: str, field: str, metric: str, filters: str = None
422
+ ) -> Union[int, float]:
423
+ """
424
+ Get the string length metric (max, min, avg) in a column of a table.
425
+
426
+ :param table: table name
427
+ :param field: column name
428
+ :param metric: the metric to calculate ('max', 'min', 'avg')
429
+ :param filters: filter condition
430
+ :return: the calculated metric as int for 'max' and 'min', float for 'avg'
431
+ """
432
+ qualified_table_name = self.qualified_table_name(table)
433
+ field = self.quote_column(field)
434
+
435
+ if metric.lower() == "max":
436
+ sql_function = "MAX(LEN"
437
+ elif metric.lower() == "min":
438
+ sql_function = "MIN(LEN"
439
+ elif metric.lower() == "avg":
440
+ sql_function = "AVG(LEN"
441
+ else:
442
+ raise ValueError(f"Invalid metric '{metric}'. Choose from 'max', 'min', or 'avg'.")
443
+
444
+ if metric.lower() == "avg":
445
+ query = f'SELECT AVG(CAST(LEN("{field}") AS FLOAT)) FROM {qualified_table_name}'
446
+ else:
447
+ query = f'SELECT {sql_function}("{field}")) FROM {qualified_table_name}'
448
+
449
+ if filters:
450
+ query += f" WHERE {filters}"
451
+
452
+ result = self.fetchone(query)[0]
453
+ return round(result, 2) if metric.lower() == "avg" else result
454
+
455
    def query_string_pattern_validity(
        self,
        table: str,
        field: str,
        regex_pattern: str = None,
        predefined_regex_pattern: str = None,
        filters: str = None,
    ) -> Tuple[int, int]:
        """
        Get the count of valid values based on the regex pattern.
        :param table: table name
        :param field: column name
        :param regex_pattern: custom regex pattern
        :param predefined_regex_pattern: predefined regex pattern (a key of
            ``self.regex_patterns``)
        :param filters: filter condition
        :return: count of valid values, count of total row count
        :raises ValueError: if neither regex_pattern nor
            predefined_regex_pattern is provided
        """
        filters = f"WHERE {filters}" if filters else ""
        qualified_table_name = self.qualified_table_name(table)
        field = self.quote_column(field)
        if not regex_pattern and not predefined_regex_pattern:
            raise ValueError("Either regex_pattern or predefined_regex_pattern should be provided")
        if regex_pattern:
            regex = regex_pattern
        else:
            regex = self.regex_patterns[predefined_regex_pattern]

        # Translate the (limited) regex into a T-SQL IIF/LIKE expression.
        regex = self.regex_to_sql_condition(regex, field)

        # Generic query; replaced wholesale below for patterns that need
        # hand-written LIKE logic (perm_id, ssn, usa_phone, usa_zip_code).
        # NOTE(review): those special-case queries do not apply `filters`
        # and always scan the whole table — confirm this is intended.
        query = f"""
        SELECT SUM(CAST({regex} AS BIGINT)) AS valid_count,
            COUNT(*) AS total_count
        FROM {qualified_table_name}
        {filters}
        """
        if predefined_regex_pattern == "perm_id":
            query = f"""
            SELECT
                SUM(CASE
                        WHEN {field} LIKE '[0-9][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]-[0-9][0-9][0-9]'
                            OR {field} LIKE '[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]'
                        THEN 1
                        ELSE 0
                    END) AS valid_count,
                COUNT(*) AS total_count
            FROM {qualified_table_name};
            """
        elif predefined_regex_pattern == "ssn":
            # LIKE enforces the digit layout; the extra predicates reject the
            # invalid SSN ranges (000/666/9xx area, 00 group, 0000 serial).
            query = f"""
            SELECT
                SUM(CASE
                        WHEN {field} LIKE '[0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9][0-9][0-9]'
                            AND LEFT({field}, 3) NOT IN ('000', '666')
                            AND LEFT({field}, 1) != '9'
                            AND SUBSTRING({field}, 5, 2) != '00'
                            AND RIGHT({field}, 4) != '0000'
                        THEN 1
                        ELSE 0
                    END) AS valid_count,
                COUNT(*) AS total_count
            FROM {qualified_table_name}
            """
        elif predefined_regex_pattern == "usa_phone":
            # Enumerates the accepted US phone layouts (separators, optional
            # +1 prefix, optional area-code parentheses) as LIKE patterns.
            query = f"""
            SELECT
                SUM(CASE
                        WHEN ({field} LIKE '+1 [0-9][0-9][0-9] [0-9][0-9][0-9] [0-9][0-9][0-9][0-9]'
                            OR {field} LIKE '+1-[0-9][0-9][0-9]-[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
                            OR {field} LIKE '+1.[0-9][0-9][0-9].[0-9][0-9][0-9].[0-9][0-9][0-9][0-9]'
                            OR {field} LIKE '+1[0-9][0-9][0-9]-[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
                            OR {field} LIKE '([0-9][0-9][0-9]) [0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
                            OR {field} LIKE '[0-9][0-9][0-9] [0-9][0-9][0-9] [0-9][0-9][0-9][0-9]'
                            OR {field} LIKE '[0-9][0-9][0-9].[0-9][0-9][0-9].[0-9][0-9][0-9][0-9]'
                            OR {field} LIKE '[0-9][0-9][0-9]-[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
                            OR {field} LIKE '+1 ([0-9][0-9][0-9]) [0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
                            OR {field} LIKE '[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]'
                            OR {field} LIKE '+1[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]'
                            OR {field} LIKE '([0-9][0-9][0-9])[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
                            OR {field} LIKE '+1 ([0-9][0-9][0-9])[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
                            OR {field} LIKE '+1 ([0-9][0-9][0-9]).[0-9][0-9][0-9].[0-9][0-9][0-9][0-9]'
                            OR {field} LIKE '([0-9][0-9][0-9]).[0-9][0-9][0-9].[0-9][0-9][0-9][0-9]'
                            OR {field} LIKE '([0-9][0-9][0-9])-[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
                            OR {field} LIKE '[0-9][0-9][0-9] [0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
                            OR {field} LIKE '[0-9][0-9][0-9].[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]')
                        THEN 1
                        ELSE 0
                    END) AS valid_count,
                COUNT(*) AS total_count
            FROM {qualified_table_name};

            """
        elif predefined_regex_pattern == "usa_zip_code":
            query = f"""
            SELECT
                SUM(CASE
                        WHEN PATINDEX('%[0-9][0-9][0-9][0-9][0-9]%[-][0-9][0-9][0-9][0-9]%', CAST({field} AS VARCHAR)) > 0
                            OR PATINDEX('%[0-9][0-9][0-9][0-9][0-9]%', CAST({field} AS VARCHAR)) > 0
                        THEN 1 ELSE 0 END) AS valid_count,
                COUNT(*) AS total_count
            FROM {qualified_table_name};
            """
        result = self.fetchone(query)
        return result[0], result[1]
558
+
559
+ def query_valid_invalid_values_validity(
560
+ self,
561
+ table: str,
562
+ field: str,
563
+ regex_pattern: str = None,
564
+ filters: str = None,
565
+ values: List[str] = None,
566
+ ) -> Tuple[int, int]:
567
+ """
568
+ Get the count of valid and invalid values for a specified column.
569
+ :param table: table name
570
+ :param field: column name
571
+ :param values: list of valid values
572
+ :param regex_pattern: regex pattern (will be converted to SQL Server pattern)
573
+ :param filters: filter condition
574
+ :return: count of valid values and total count of rows.
575
+ """
576
+ filters = f"WHERE {filters}" if filters else ""
577
+ qualified_table_name = self.qualified_table_name(table)
578
+ field = self.quote_column(field)
579
+
580
+ if values:
581
+ values_str = ", ".join([f"'{value}'" for value in values])
582
+ validity_condition = f"IIF({field} IN ({values_str}), 1, 0)"
583
+ elif regex_pattern:
584
+ validity_condition = self.regex_to_sql_condition(regex_pattern, field)
585
+ else:
586
+ raise ValueError("Either 'values' or 'regex_pattern' must be provided.")
587
+
588
+ query = f"""
589
+ SELECT SUM(CAST({validity_condition} AS BIGINT)) AS valid_count,
590
+ COUNT(*) AS total_count
591
+ FROM {qualified_table_name}
592
+ {filters}
593
+ """
594
+
595
+ result = self.fetchone(query)
596
+ return result[0], result[1]
597
+
598
+ def query_get_usa_state_code_validity(self, table: str, field: str, filters: str = None) -> Tuple[int, int]:
599
+ """
600
+ Get the count of valid USA state codes
601
+ :param table: table name
602
+ :param field: column name
603
+ :param filters: filter condition
604
+ :return: count of valid state codes, count of total row count
605
+ """
606
+ valid_state_codes_str = ", ".join(f"'{code}'" for code in self.valid_state_codes)
607
+
608
+ filters = f"WHERE {filters}" if filters else ""
609
+ qualified_table_name = self.qualified_table_name(table)
610
+ field = self.quote_column(field)
611
+
612
+ regex_query = f"""
613
+ CASE
614
+ WHEN {field} IS NULL THEN 0
615
+ WHEN {field} IN ({valid_state_codes_str})
616
+ THEN 1
617
+ ELSE 0
618
+ END"""
619
+
620
+ query = f"""
621
+ SELECT
622
+ SUM(CAST({regex_query} AS BIGINT)) AS valid_count,
623
+ COUNT(*) AS total_count
624
+ FROM {qualified_table_name}
625
+ {filters}
626
+ """
627
+ result = self.fetchone(query)
628
+ return result[0], result[1]
629
+
630
    def query_timestamp_metric(self):
        # Timestamp validity metrics are not supported for MSSQL yet.
        raise NotImplementedError("Method not implemented for MssqlDataSource")
632
+
633
    def query_timestamp_not_in_future_metric(self):
        # Timestamp not-in-future metric is not supported for MSSQL yet.
        raise NotImplementedError("Method not implemented for MssqlDataSource")
635
+
636
    def query_timestamp_date_not_in_future_metric(self):
        # Timestamp date not-in-future metric is not supported for MSSQL yet.
        raise NotImplementedError("Method not implemented for MssqlDataSource")
638
+
639
def query_get_time_diff(self, table: str, field: str) -> int:
    """
    Get the number of seconds elapsed since the most recent value in a
    timestamp column (a data-freshness check).

    :param table: table name
    :param field: field name of updated time column
    :return: time difference in seconds; 0 when the table is empty or the
        latest value is NULL
    """
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)
    query = f"""
        SELECT TOP 1 {field} FROM {qualified_table_name} ORDER BY {field} DESC;
    """
    result = self.fetchone(query)
    if not result:
        return 0

    updated_time = result[0]
    # ORDER BY ... DESC puts NULLs last in MSSQL, but a column that is
    # entirely NULL still yields NULL here — guard against it.
    if updated_time is None:
        return 0
    if isinstance(updated_time, str):
        # Drivers may return the timestamp as text with or without a
        # fractional-seconds part; the old single-format parse raised
        # ValueError on values like "2023-01-01 12:00:00".
        try:
            updated_time = datetime.datetime.strptime(updated_time, "%Y-%m-%d %H:%M:%S.%f")
        except ValueError:
            updated_time = datetime.datetime.strptime(updated_time, "%Y-%m-%d %H:%M:%S")
    # Naive-UTC comparison kept for backward compatibility with existing
    # callers (DB values are assumed to be naive UTC).
    return int((datetime.datetime.utcnow() - updated_time).total_seconds())
659
def build_table_metrics_query(
    self,
    table_name: str,
    column_info: list[dict],
    additional_queries: Optional[List[str]] = None,
) -> list[dict]:
    """
    Compute per-column profiling metrics for a table in a single scan.

    Builds one SELECT containing aggregate expressions for every column
    (distinct / duplicate / NULL counts for all columns; min / max /
    average for numeric types; max character length for string types),
    executes it once, then pivots the single result row back into a
    per-column structure.

    :param table_name: table to profile (qualified via qualified_table_name)
    :param column_info: list of dicts with "column_name" and "data_type" keys
    :param additional_queries: extra SELECT expressions appended verbatim;
        aliases of the form "<column>_<metric>" are picked up in the
        per-column output below
    :return: list of {"column_name": ..., "metrics": {...}} dicts
    """
    query_parts = []
    if not column_info:
        # Nothing to profile — avoid emitting an empty SELECT.
        return []

    for col in column_info:
        name = col["column_name"]
        dtype = col["data_type"].lower()

        quoted_name = self.quote_column(name)

        # Metrics computed for every column regardless of its type.
        # Result aliases use the "<name>_<metric>" convention that the
        # pivot loop at the bottom relies on.
        query_parts.append(f"COUNT(DISTINCT {quoted_name}) AS [{name}_distinct]")
        query_parts.append(f"COUNT({quoted_name}) - COUNT(DISTINCT {quoted_name}) AS [{name}_duplicate]")
        query_parts.append(f"SUM(CASE WHEN {quoted_name} IS NULL THEN 1 ELSE 0 END) AS [{name}_is_null]")

        # Numeric columns additionally get min / max / average.
        if dtype in (
            "int",
            "integer",
            "bigint",
            "smallint",
            "tinyint",
            "decimal",
            "numeric",
            "float",
            "real",
            "money",
            "smallmoney",
        ):
            query_parts.append(f"MIN({quoted_name}) AS [{name}_min]")
            query_parts.append(f"MAX({quoted_name}) AS [{name}_max]")
            # CAST to FLOAT so integer AVG doesn't truncate.
            query_parts.append(f"AVG(CAST({quoted_name} AS FLOAT)) AS [{name}_average]")

        # String columns get the maximum character length.
        elif dtype in ("varchar", "nvarchar", "char", "nchar", "text", "ntext"):
            query_parts.append(f"MAX(LEN({quoted_name})) AS [{name}_max_character_length]")

    if additional_queries:
        query_parts.extend(additional_queries)

    qualified_table = self.qualified_table_name(table_name)
    query_body = ",\n ".join(query_parts)
    query = f"SELECT\n {query_body}\nFROM {qualified_table};"

    # One round trip: the aggregate query always returns exactly one row.
    cursor = self.connection.cursor()
    try:
        cursor.execute(query)
        columns = [column[0] for column in cursor.description]
        result_row = cursor.fetchone()
    finally:
        cursor.close()

    row = dict(zip(columns, result_row))

    def _normalize_metrics(value):
        """Safely normalize DB metric values for JSON serialization."""
        if value is None:
            return None
        if isinstance(value, Decimal):
            return float(value)
        if isinstance(value, (int, float, bool)):
            return value
        if isinstance(value, (datetime.datetime, datetime.date)):
            return value.isoformat()
        if isinstance(value, UUID):
            return str(value)
        if isinstance(value, list):
            return [_normalize_metrics(v) for v in value]
        if isinstance(value, dict):
            return {k: _normalize_metrics(v) for k, v in value.items()}
        # Fallback: stringify anything the driver returns that we don't
        # recognize, rather than failing serialization later.
        return str(value)

    # Pivot the single aggregate row into per-column metric dicts by
    # stripping the "<name>_" alias prefix.
    # NOTE(review): this is a prefix match — if one column name is a
    # prefix of another (e.g. "a" and "a_b"), column "a" would also
    # collect "a_b"'s metrics. Confirm column names can't collide this way.
    column_wise = []
    for col in column_info:
        name = col["column_name"]
        col_metrics = {}

        for key, value in row.items():
            if key.startswith(f"{name}_"):
                metric_name = key[len(name) + 1 :]
                col_metrics[metric_name] = _normalize_metrics(value)

        column_wise.append({"column_name": name, "metrics": col_metrics})
    return column_wise
747
def fetch_sample_values_from_database(
    self,
    table_name: str,
    column_names: list[str],
    limit: int = 5,
) -> Tuple[List[Tuple], List[str]]:
    """
    Fetch sample rows for specific columns from the given table (MSSQL version).

    :param table_name: The name of the table.
    :param column_names: List of column names to fetch; ["*"] selects all columns.
    :param limit: Number of rows to fetch.
    :return: Tuple of (list of row tuples, list of column names)
    :raises ValueError: if no column names are provided
    """
    if not column_names:
        raise ValueError("At least one column name must be provided")

    target = self.qualified_table_name(table_name)

    # A lone "*" means "all columns"; otherwise quote each requested name.
    if len(column_names) == 1 and column_names[0] == "*":
        select_list = "*"
    else:
        select_list = ", ".join(self.quote_column(col) for col in column_names)

    query = f"SELECT TOP {limit} {select_list} FROM {target}"

    cursor = self.connection.cursor()
    try:
        cursor.execute(query)
        header = [desc[0] for desc in cursor.description]
        rows = cursor.fetchall()
    finally:
        cursor.close()
    return rows, header
781
+ def get_table_foreign_key_info(self, table_name: str, schema: str | None = None):
782
+ schema = schema or self.schema_name
783
+
784
+ query = f"""
785
+ SELECT
786
+ fk.name AS constraint_name,
787
+ t.name AS table_name,
788
+ c.name AS fk_column,
789
+ rt.name AS referenced_table,
790
+ rc.name AS referenced_column
791
+ FROM sys.foreign_keys fk
792
+ INNER JOIN sys.foreign_key_columns fkc
793
+ ON fk.object_id = fkc.constraint_object_id
794
+ INNER JOIN sys.tables t
795
+ ON fk.parent_object_id = t.object_id
796
+ INNER JOIN sys.schemas s
797
+ ON t.schema_id = s.schema_id
798
+ INNER JOIN sys.columns c
799
+ ON fkc.parent_object_id = c.object_id
800
+ AND fkc.parent_column_id = c.column_id
801
+ INNER JOIN sys.tables rt
802
+ ON fk.referenced_object_id = rt.object_id
803
+ INNER JOIN sys.schemas rs
804
+ ON rt.schema_id = rs.schema_id
805
+ INNER JOIN sys.columns rc
806
+ ON fkc.referenced_object_id = rc.object_id
807
+ AND fkc.referenced_column_id = rc.column_id
808
+ WHERE t.name = '{table_name}'
809
+ AND s.name = '{schema}';
810
+ """
811
+ try:
812
+ cursor = self.connection.cursor()
813
+ cursor.execute(query)
814
+ rows = cursor.fetchall()
815
+ except Exception as e:
816
+ print(f"Failed to fetch fk info for dataset: {table_name}")
817
+ return []
818
+
819
+ data = [
820
+ {
821
+ "constraint_name": row[0],
822
+ "table_name": row[1],
823
+ "fk_column": row[2],
824
+ "referenced_table": row[3],
825
+ "referenced_column": row[4],
826
+ }
827
+ for row in rows
828
+ ]
829
+ return data