dcs-sdk 1.6.4__py3-none-any.whl → 1.6.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. dcs_core/__init__.py +13 -0
  2. dcs_core/__main__.py +17 -0
  3. dcs_core/__version__.py +15 -0
  4. dcs_core/cli/__init__.py +13 -0
  5. dcs_core/cli/cli.py +165 -0
  6. dcs_core/core/__init__.py +19 -0
  7. dcs_core/core/common/__init__.py +13 -0
  8. dcs_core/core/common/errors.py +50 -0
  9. dcs_core/core/common/models/__init__.py +13 -0
  10. dcs_core/core/common/models/configuration.py +284 -0
  11. dcs_core/core/common/models/dashboard.py +24 -0
  12. dcs_core/core/common/models/data_source_resource.py +75 -0
  13. dcs_core/core/common/models/metric.py +160 -0
  14. dcs_core/core/common/models/profile.py +75 -0
  15. dcs_core/core/common/models/validation.py +216 -0
  16. dcs_core/core/common/models/widget.py +44 -0
  17. dcs_core/core/configuration/__init__.py +13 -0
  18. dcs_core/core/configuration/config_loader.py +139 -0
  19. dcs_core/core/configuration/configuration_parser.py +262 -0
  20. dcs_core/core/configuration/configuration_parser_arc.py +328 -0
  21. dcs_core/core/datasource/__init__.py +13 -0
  22. dcs_core/core/datasource/base.py +62 -0
  23. dcs_core/core/datasource/manager.py +112 -0
  24. dcs_core/core/datasource/search_datasource.py +421 -0
  25. dcs_core/core/datasource/sql_datasource.py +1094 -0
  26. dcs_core/core/inspect.py +163 -0
  27. dcs_core/core/logger/__init__.py +13 -0
  28. dcs_core/core/logger/base.py +32 -0
  29. dcs_core/core/logger/default_logger.py +94 -0
  30. dcs_core/core/metric/__init__.py +13 -0
  31. dcs_core/core/metric/base.py +220 -0
  32. dcs_core/core/metric/combined_metric.py +98 -0
  33. dcs_core/core/metric/custom_metric.py +34 -0
  34. dcs_core/core/metric/manager.py +137 -0
  35. dcs_core/core/metric/numeric_metric.py +403 -0
  36. dcs_core/core/metric/reliability_metric.py +90 -0
  37. dcs_core/core/profiling/__init__.py +13 -0
  38. dcs_core/core/profiling/datasource_profiling.py +136 -0
  39. dcs_core/core/profiling/numeric_field_profiling.py +72 -0
  40. dcs_core/core/profiling/text_field_profiling.py +67 -0
  41. dcs_core/core/repository/__init__.py +13 -0
  42. dcs_core/core/repository/metric_repository.py +77 -0
  43. dcs_core/core/utils/__init__.py +13 -0
  44. dcs_core/core/utils/log.py +29 -0
  45. dcs_core/core/utils/tracking.py +105 -0
  46. dcs_core/core/utils/utils.py +44 -0
  47. dcs_core/core/validation/__init__.py +13 -0
  48. dcs_core/core/validation/base.py +230 -0
  49. dcs_core/core/validation/completeness_validation.py +153 -0
  50. dcs_core/core/validation/custom_query_validation.py +24 -0
  51. dcs_core/core/validation/manager.py +282 -0
  52. dcs_core/core/validation/numeric_validation.py +276 -0
  53. dcs_core/core/validation/reliability_validation.py +91 -0
  54. dcs_core/core/validation/uniqueness_validation.py +61 -0
  55. dcs_core/core/validation/validity_validation.py +738 -0
  56. dcs_core/integrations/__init__.py +13 -0
  57. dcs_core/integrations/databases/__init__.py +13 -0
  58. dcs_core/integrations/databases/bigquery.py +187 -0
  59. dcs_core/integrations/databases/databricks.py +51 -0
  60. dcs_core/integrations/databases/db2.py +652 -0
  61. dcs_core/integrations/databases/elasticsearch.py +61 -0
  62. dcs_core/integrations/databases/mssql.py +979 -0
  63. dcs_core/integrations/databases/mysql.py +409 -0
  64. dcs_core/integrations/databases/opensearch.py +64 -0
  65. dcs_core/integrations/databases/oracle.py +719 -0
  66. dcs_core/integrations/databases/postgres.py +570 -0
  67. dcs_core/integrations/databases/redshift.py +53 -0
  68. dcs_core/integrations/databases/snowflake.py +48 -0
  69. dcs_core/integrations/databases/spark_df.py +111 -0
  70. dcs_core/integrations/databases/sybase.py +1069 -0
  71. dcs_core/integrations/storage/__init__.py +13 -0
  72. dcs_core/integrations/storage/local_file.py +149 -0
  73. dcs_core/integrations/utils/__init__.py +13 -0
  74. dcs_core/integrations/utils/utils.py +36 -0
  75. dcs_core/report/__init__.py +13 -0
  76. dcs_core/report/dashboard.py +211 -0
  77. dcs_core/report/models.py +88 -0
  78. dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  79. dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  80. dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  81. dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  82. dcs_core/report/static/assets/images/docs.svg +6 -0
  83. dcs_core/report/static/assets/images/github.svg +4 -0
  84. dcs_core/report/static/assets/images/logo.svg +7 -0
  85. dcs_core/report/static/assets/images/slack.svg +13 -0
  86. dcs_core/report/static/index.js +2 -0
  87. dcs_core/report/static/index.js.LICENSE.txt +3971 -0
  88. dcs_sdk/__version__.py +1 -1
  89. dcs_sdk/cli/cli.py +3 -0
  90. {dcs_sdk-1.6.4.dist-info → dcs_sdk-1.6.6.dist-info}/METADATA +24 -2
  91. dcs_sdk-1.6.6.dist-info/RECORD +159 -0
  92. {dcs_sdk-1.6.4.dist-info → dcs_sdk-1.6.6.dist-info}/entry_points.txt +1 -0
  93. dcs_sdk-1.6.4.dist-info/RECORD +0 -72
  94. {dcs_sdk-1.6.4.dist-info → dcs_sdk-1.6.6.dist-info}/WHEEL +0 -0
dcs_core/integrations/databases/mssql.py
@@ -0,0 +1,979 @@
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import datetime
+ import math
+ from decimal import Decimal
+ from typing import Any, Dict, List, Optional, Tuple, Union
+ from uuid import UUID
+
+ import pyodbc
+ from loguru import logger
+ from sqlalchemy import text
+
+ from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
+ from dcs_core.core.common.models.data_source_resource import RawColumnInfo
+ from dcs_core.core.datasource.sql_datasource import SQLDataSource
+
+
+ class MssqlDataSource(SQLDataSource):
+     def __init__(self, data_source_name: str, data_connection: Dict):
+         super().__init__(data_source_name, data_connection)
+         self.regex_patterns = {
+             "uuid": r"[0-9a-fA-F]%-%[0-9a-fA-F]%-%[0-9a-fA-F]%-%[0-9a-fA-F]%-%[0-9a-fA-F]%",
+             "usa_phone": r"^(\+1[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}$",
+             "email": r"%[a-zA-Z0-9._%+-]@[a-zA-Z0-9.-]%.[a-zA-Z]%",
+             "usa_zip_code": r"^[0-9]{5}(?:-[0-9]{4})?$",
+             "ssn": r"^(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}$",
+             "sedol": r"[B-DF-HJ-NP-TV-XZ0-9][B-DF-HJ-NP-TV-XZ0-9][B-DF-HJ-NP-TV-XZ0-9][B-DF-HJ-NP-TV-XZ0-9][B-DF-HJ-NP-TV-XZ0-9][B-DF-HJ-NP-TV-XZ0-9][0-9]",
+             "lei": r"[A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][0-9][0-9]",
+             "cusip": r"[0-9A-Z][0-9A-Z][0-9A-Z][0-9A-Z][0-9A-Z][0-9A-Z][0-9A-Z][0-9A-Z][0-9A-Z]",
+             "figi": r"BBG[A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9]",
+             "isin": r"[A-Z][A-Z][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][0-9]",
+             "perm_id": r"^\d{4}([- ]?)\d{4}\1\d{4}\1\d{4}([- ]?)\d{3}$",
+         }
+
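As a rough usage sketch (not part of the diff): the connection keys mirror those read in connect() below; the server, credentials, and database here are hypothetical.

    from dcs_core.integrations.databases.mssql import MssqlDataSource

    # Hypothetical connection settings; either "host" or "server" may be supplied.
    ds = MssqlDataSource(
        "my_mssql",
        {
            "driver": "ODBC Driver 18 for SQL Server",
            "host": "localhost",
            "port": 1433,
            "database": "demo_db",
            "username": "sa",
            "password": "change-me",
        },
    )
    ds.connect()  # raises DataChecksDataSourcesConnectionError if every attempt fails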
+     def connect(self) -> Any:
+         """
+         Connect to the data source
+         """
+         driver = self.data_connection.get("driver") or "ODBC Driver 18 for SQL Server"
+         host = self.data_connection.get("host")
+         port = self.data_connection.get("port")
+         database = self.data_connection.get("database")
+         username = self.data_connection.get("username")
+         password = self.data_connection.get("password")
+         server = self.data_connection.get("server")
+
+         connection_params = self._build_connection_params(
+             driver=driver, database=database, username=username, password=password
+         )
+
+         return self._establish_connection(connection_params, host, server, port)
+
+     def _prepare_driver_string(self, driver: str) -> str:
+         """Ensure driver string is properly formatted with braces."""
+         return f"{{{driver}}}" if not driver.startswith("{") else driver
+
+     def _build_connection_params(self, driver: str, database: str, username: str, password: str) -> dict:
+         return {
+             "DRIVER": self._prepare_driver_string(driver),
+             "DATABASE": database,
+             "UID": username,
+             "PWD": password,
+             "TrustServerCertificate": "yes",
+         }
+
+     def _establish_connection(self, conn_dict: dict, host: str, server: str, port: str) -> Any:
+         connection_attempts = [
+             (host, True),  # host with port
+             (host, False),  # host without port
+             (server, True),  # server with port
+             (server, False),  # server without port
+         ]
+
+         for server_value, use_port in connection_attempts:
+             if not server_value:
+                 continue
+
+             try:
+                 conn_dict["SERVER"] = f"{server_value},{port}" if use_port and port else server_value
+                 self.connection = pyodbc.connect(**conn_dict)
+                 logger.info(f"Connected to MSSQL database using {conn_dict['SERVER']}")
+                 return self.connection
+             except Exception:
+                 continue
+
+         raise DataChecksDataSourcesConnectionError(
+             message="Failed to connect to Mssql data source: [All connection attempts failed]"
+         )
+
+     def fetchall(self, query):
+         return self.connection.cursor().execute(query).fetchall()
+
+     def fetchone(self, query):
+         return self.connection.cursor().execute(query).fetchone()
+
+     def qualified_table_name(self, table_name: str) -> str:
+         """
+         Get the qualified table name
+         :param table_name: name of the table
+         :return: qualified table name
+         """
+         if self.schema_name:
+             return f"[{self.schema_name}].[{table_name}]"
+         return f"[{table_name}]"
+
+     def quote_column(self, column: str) -> str:
+         """
+         Quote the column name
+         :param column: name of the column
+         :return: quoted column name
+         """
+         return f"[{column}]"
+
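Continuing the sketch above, the bracket quoting follows T-SQL identifier rules; the schema name here is assumed.

    ds.schema_name = "dbo"              # assumed schema
    ds.qualified_table_name("orders")   # -> '[dbo].[orders]'
    ds.quote_column("order id")         # -> '[order id]'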
+     def query_get_table_names(self, schema: str | None = None, with_view: bool = False) -> dict:
+         """
+         Get the list of tables in the database.
+         :param schema: optional schema name
+         :param with_view: whether to include views
+         :return: dictionary with table names and optionally view names
+         """
+         schema = schema or self.schema_name
+
+         if with_view:
+             object_types = "IN ('U', 'V')"
+         else:
+             object_types = "= 'U'"
+
+         query = f"SELECT o.name AS table_name, o.type FROM sys.objects o JOIN sys.schemas s ON o.schema_id = s.schema_id WHERE o.type {object_types} AND s.name = '{schema}' ORDER BY o.name"
+
+         rows = self.fetchall(query)
+
+         if with_view:
+             result = {"table": [], "view": []}
+             if rows:
+                 for row in rows:
+                     object_name = row[0]
+                     object_type = row[1].strip() if row[1] else row[1]
+
+                     if object_type == "U":
+                         result["table"].append(object_name)
+                     elif object_type == "V":
+                         result["view"].append(object_name)
+         else:
+             result = {"table": []}
+             if rows:
+                 result["table"] = [row[0] for row in rows]
+
+         return result
+
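For example, listing tables and views in a hypothetical "dbo" schema:

    objects = ds.query_get_table_names(schema="dbo", with_view=True)
    # e.g. {"table": ["orders", "customers"], "view": ["v_daily_sales"]}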
+     def query_get_table_indexes(self, table: str, schema: str | None = None) -> dict[str, dict]:
+         """
+         Get index information for a table in MSSQL DB.
+         :param table: Table name
+         :param schema: Optional schema name
+         :return: Dictionary with index details
+         """
+         schema = schema or self.schema_name
+         table = table.upper()
+         schema = schema.upper()
+
+         query = f"""
+             SELECT
+                 i.name AS index_name,
+                 i.type_desc AS index_type,
+                 c.name AS column_name,
+                 ic.key_ordinal AS column_order
+             FROM
+                 sys.indexes i
+             JOIN
+                 sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id
+             JOIN
+                 sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id
+             JOIN
+                 sys.tables t ON t.object_id = i.object_id
+             JOIN
+                 sys.schemas s ON t.schema_id = s.schema_id
+             WHERE
+                 t.name = '{table}'
+                 AND s.name = '{schema}'
+                 AND i.is_hypothetical = 0
+             ORDER BY
+                 i.name, ic.key_ordinal
+         """
+
+         rows = self.fetchall(query)
+
+         if not rows:
+             raise RuntimeError(f"No index information found for table '{table}' in schema '{schema}'.")
+
+         pk_query = f"""
+             SELECT c.name AS column_name
+             FROM
+                 sys.key_constraints kc
+             JOIN
+                 sys.index_columns ic ON kc.parent_object_id = ic.object_id AND kc.unique_index_id = ic.index_id
+             JOIN
+                 sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id
+             JOIN
+                 sys.tables t ON t.object_id = kc.parent_object_id
+             JOIN
+                 sys.schemas s ON t.schema_id = s.schema_id
+             WHERE
+                 kc.type = 'PK'
+                 AND t.name = '{table}'
+                 AND s.name = '{schema}'
+             ORDER BY ic.key_ordinal
+         """
+         pk_rows = self.fetchall(pk_query)
+         pk_columns = [row[0].strip() for row in pk_rows] if pk_rows else []
+         pk_columns_set = set(pk_columns)
+
+         indexes = {}
+         for row in rows:
+             index_name = row[0]
+             index_type = row[1]
+             column_info = {
+                 "column_name": self.safe_get(row, 2),
+                 "column_order": self.safe_get(row, 3),
+             }
+             if index_name not in indexes:
+                 indexes[index_name] = {"columns": [], "index_type": index_type}
+             indexes[index_name]["columns"].append(column_info)
+
+         for index_name, idx in indexes.items():
+             index_columns = [col["column_name"].strip() for col in idx["columns"]]
+             index_columns_set = set(index_columns)
+             idx["is_primary_key"] = pk_columns_set == index_columns_set and len(index_columns) == len(pk_columns)
+         return indexes
+
+     def query_get_table_columns(self, table: str, schema: str | None = None) -> Dict[str, RawColumnInfo]:
+         """
+         Get the schema of a table.
+         :param table: table name
+         :return: dictionary mapping column names to RawColumnInfo
+         """
+         schema = schema or self.schema_name
+         database = self.quote_database(self.database)
+         query = (
+             "SELECT column_name, data_type, ISNULL(datetime_precision, 0) AS datetime_precision, ISNULL(numeric_precision, 0) AS numeric_precision, ISNULL(numeric_scale, 0) AS numeric_scale, collation_name, ISNULL(character_maximum_length, 0) AS character_maximum_length "
+             f"FROM {database}.information_schema.columns "
+             f"WHERE table_name = '{table}' AND table_schema = '{schema}'"
+         )
+         rows = self.fetchall(query)
+         if not rows:
+             raise RuntimeError(f"Table '{table}' in schema '{schema}' does not exist or has no columns")
+
+         column_info = {
+             r[0]: RawColumnInfo(
+                 column_name=self.safe_get(r, 0),
+                 data_type=self.safe_get(r, 1),
+                 datetime_precision=self.safe_get(r, 2),
+                 numeric_precision=self.safe_get(r, 3),
+                 numeric_scale=self.safe_get(r, 4),
+                 collation_name=self.safe_get(r, 5),
+                 character_maximum_length=self.safe_get(r, 6),
+             )
+             for r in rows
+         }
+         return column_info
+
+     def fetch_rows(
+         self,
+         query: str,
+         limit: int = 1,
+         with_column_names: bool = False,
+         complete_query: Optional[str] = None,
+     ) -> Tuple[List, Optional[List[str]]]:
+         """
+         Fetch rows from the database using pyodbc.
+
+         :param query: SQL query to execute.
+         :param limit: Number of rows to fetch.
+         :param with_column_names: Whether to include column names in the result.
+         :return: Tuple of (rows, column_names or None)
+         """
+         query = (
+             complete_query
+             or f"SELECT * FROM ({query}) AS subquery ORDER BY 1 OFFSET 0 ROWS FETCH NEXT {limit} ROWS ONLY"
+         )
+         cursor = self.connection.cursor()
+         cursor.execute(query)
+         rows = cursor.fetchmany(limit)
+
+         if with_column_names:
+             column_names = [column[0] for column in cursor.description]
+             return rows, column_names
+         else:
+             return rows, None
+
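A sketch of fetching a few rows with headers; the query and column names are hypothetical:

    rows, cols = ds.fetch_rows(
        "SELECT order_id, total FROM dbo.orders",
        limit=5,
        with_column_names=True,
    )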
+     def regex_to_sql_condition(self, regex_pattern: str, field: str) -> str:
+         """
+         Convert regex patterns to SQL Server conditions
+         """
+         if (regex_pattern.startswith("^") and regex_pattern.endswith("$")) or "|" in regex_pattern:
+             pattern = regex_pattern.strip("^$")
+             if pattern.startswith("(") and pattern.endswith(")"):
+                 pattern = pattern[1:-1]
+
+             if "|" in pattern:
+                 values = [f"'{val.strip()}'" for val in pattern.split("|")]
+                 return f"IIF({field} IN ({', '.join(values)}), 1, 0)"
+
+         pattern = regex_pattern
+         if pattern.startswith("^"):
+             pattern = pattern[1:]
+         if pattern.endswith("$"):
+             pattern = pattern[:-1]
+
+         pattern = pattern.replace(".*", "%").replace(".+", "%").replace(".", "_")
+
+         return f"IIF({field} LIKE '{pattern}', 1, 0)"
+
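To make the translation concrete, two hypothetical conversions; the outputs follow directly from the branches above:

    ds.regex_to_sql_condition("^(yes|no)$", "[status]")
    # -> "IIF([status] IN ('yes', 'no'), 1, 0)"

    ds.regex_to_sql_condition("^ORD.*$", "[order_code]")
    # -> "IIF([order_code] LIKE 'ORD%', 1, 0)"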
+     def query_get_variance(self, table: str, field: str, filters: str = None) -> float:
+         """
+         Get the variance value
+         :param table: table name
+         :param field: column name
+         :param filters: filter condition
+         :return: variance rounded to two decimal places
+         """
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+         query = "SELECT VAR({}) FROM {}".format(field, qualified_table_name)
+         if filters:
+             query += " WHERE {}".format(filters)
+
+         return round(self.fetchone(query)[0], 2)
+
+     def query_get_stddev(self, table: str, field: str, filters: str = None) -> float:
+         """
+         Get the standard deviation value
+         :param table: table name
+         :param field: column name
+         :param filters: filter condition
+         :return: standard deviation rounded to two decimal places
+         """
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+         query = "SELECT STDEV({}) FROM {}".format(field, qualified_table_name)
+         if filters:
+             query += " WHERE {}".format(filters)
+
+         return round(self.fetchone(query)[0], 2)
+
+     def query_get_percentile(self, table: str, field: str, percentile: float, filters: str = None) -> Optional[float]:
+         """
+         Get the specified percentile value of a numeric column in a table.
+         :param table: table name
+         :param field: column name
+         :param percentile: percentile to calculate (e.g., 0.2 for 20th percentile)
+         :param filters: filter condition
+         :return: the value at the specified percentile, or None if no rows match
+         """
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+         query = f"""
+             SELECT PERCENTILE_CONT({percentile}) WITHIN GROUP (ORDER BY {field})
+             OVER () AS percentile_value
+             FROM {qualified_table_name}
+         """
+         if filters:
+             query += f" WHERE {filters}"
+
+         result = self.fetchone(query)
+         return round(result[0], 2) if result and result[0] is not None else None
+
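For instance, a hypothetical 90th-percentile lookup. PERCENTILE_CONT ... OVER () repeats the same value on every row, and fetchone() simply takes the first:

    p90 = ds.query_get_percentile("orders", "total", percentile=0.9)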
+     def query_get_null_keyword_count(
+         self, table: str, field: str, operation: str, filters: str = None
+     ) -> Union[int, float]:
+         """
+         Get the count of NULL-like values (specific keywords) in the specified column for MSSQL.
+         :param table: table name
+         :param field: column name
+         :param operation: type of operation ('count' or 'percent')
+         :param filters: filter condition
+         :return: count (int) or percentage (float) of NULL-like keyword values
+         """
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+
+         query = f"""
+             SELECT
+                 SUM(CASE
+                     WHEN {field} IS NULL
+                         OR LTRIM(RTRIM(LOWER(ISNULL({field}, '')))) IN ('nothing', 'nil', 'null', 'none', 'n/a', '')
+                     THEN 1
+                     ELSE 0
+                 END) AS null_count,
+                 COUNT(*) AS total_count
+             FROM {qualified_table_name}
+         """
+
+         if filters:
+             query += f" WHERE {filters}"
+
+         result = self.fetchone(query)
+
+         if not result or not result[1]:
+             return 0
+
+         null_count = int(result[0] if result[0] is not None else 0)
+         total_count = int(result[1])
+
+         if operation == "percent":
+             return round((null_count / total_count) * 100, 2) if total_count > 0 else 0.0
+
+         return null_count
+
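A sketch of the null-keyword check against a hypothetical column:

    ds.query_get_null_keyword_count("customers", "middle_name", operation="count")
    ds.query_get_null_keyword_count("customers", "middle_name", operation="percent")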
+     def query_get_string_length_metric(
+         self, table: str, field: str, metric: str, filters: str = None
+     ) -> Union[int, float]:
+         """
+         Get the string length metric (max, min, avg) in a column of a table.
+
+         :param table: table name
+         :param field: column name
+         :param metric: the metric to calculate ('max', 'min', 'avg')
+         :param filters: filter condition
+         :return: the calculated metric as int for 'max' and 'min', float for 'avg'
+         """
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+
+         if metric.lower() == "max":
+             sql_function = "MAX(LEN"
+         elif metric.lower() == "min":
+             sql_function = "MIN(LEN"
+         elif metric.lower() == "avg":
+             sql_function = "AVG(LEN"
+         else:
+             raise ValueError(f"Invalid metric '{metric}'. Choose from 'max', 'min', or 'avg'.")
+
+         if metric.lower() == "avg":
+             query = f"SELECT AVG(CAST(LEN({field}) AS FLOAT)) FROM {qualified_table_name}"
+         else:
+             query = f"SELECT {sql_function}({field})) FROM {qualified_table_name}"
+
+         if filters:
+             query += f" WHERE {filters}"
+
+         result = self.fetchone(query)[0]
+         return round(result, 2) if metric.lower() == "avg" else result
+
+     def query_string_pattern_validity(
+         self,
+         table: str,
+         field: str,
+         regex_pattern: str = None,
+         predefined_regex_pattern: str = None,
+         filters: str = None,
+     ) -> Tuple[int, int]:
+         """
+         Get the count of valid values based on the regex pattern.
+         :param table: table name
+         :param field: column name
+         :param regex_pattern: custom regex pattern
+         :param predefined_regex_pattern: predefined regex pattern
+         :param filters: filter condition
+         :return: count of valid values, count of total row count
+         """
+         filters = f"WHERE {filters}" if filters else ""
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+         if not regex_pattern and not predefined_regex_pattern:
+             raise ValueError("Either regex_pattern or predefined_regex_pattern should be provided")
+         if regex_pattern:
+             regex = regex_pattern
+         else:
+             regex = self.regex_patterns[predefined_regex_pattern]
+
+         regex = self.regex_to_sql_condition(regex, field)
+
+         query = f"""
+             SELECT SUM(CAST({regex} AS BIGINT)) AS valid_count,
+                 COUNT(*) AS total_count
+             FROM {qualified_table_name}
+             {filters}
+         """
+         if predefined_regex_pattern == "perm_id":
+             query = f"""
+                 SELECT
+                     SUM(CASE
+                         WHEN {field} LIKE '[0-9][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]-[0-9][0-9][0-9]'
+                             OR {field} LIKE '[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]'
+                         THEN 1
+                         ELSE 0
+                     END) AS valid_count,
+                     COUNT(*) AS total_count
+                 FROM {qualified_table_name};
+             """
+         elif predefined_regex_pattern == "ssn":
+             query = f"""
+                 SELECT
+                     SUM(CASE
+                         WHEN {field} LIKE '[0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9][0-9][0-9]'
+                             AND LEFT({field}, 3) NOT IN ('000', '666')
+                             AND LEFT({field}, 1) != '9'
+                             AND SUBSTRING({field}, 5, 2) != '00'
+                             AND RIGHT({field}, 4) != '0000'
+                         THEN 1
+                         ELSE 0
+                     END) AS valid_count,
+                     COUNT(*) AS total_count
+                 FROM {qualified_table_name}
+             """
+         elif predefined_regex_pattern == "usa_phone":
+             query = f"""
+                 SELECT
+                     SUM(CASE
+                         WHEN ({field} LIKE '+1 [0-9][0-9][0-9] [0-9][0-9][0-9] [0-9][0-9][0-9][0-9]'
+                             OR {field} LIKE '+1-[0-9][0-9][0-9]-[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
+                             OR {field} LIKE '+1.[0-9][0-9][0-9].[0-9][0-9][0-9].[0-9][0-9][0-9][0-9]'
+                             OR {field} LIKE '+1[0-9][0-9][0-9]-[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
+                             OR {field} LIKE '([0-9][0-9][0-9]) [0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
+                             OR {field} LIKE '[0-9][0-9][0-9] [0-9][0-9][0-9] [0-9][0-9][0-9][0-9]'
+                             OR {field} LIKE '[0-9][0-9][0-9].[0-9][0-9][0-9].[0-9][0-9][0-9][0-9]'
+                             OR {field} LIKE '[0-9][0-9][0-9]-[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
+                             OR {field} LIKE '+1 ([0-9][0-9][0-9]) [0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
+                             OR {field} LIKE '[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]'
+                             OR {field} LIKE '+1[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]'
+                             OR {field} LIKE '([0-9][0-9][0-9])[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
+                             OR {field} LIKE '+1 ([0-9][0-9][0-9])[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
+                             OR {field} LIKE '+1 ([0-9][0-9][0-9]).[0-9][0-9][0-9].[0-9][0-9][0-9][0-9]'
+                             OR {field} LIKE '([0-9][0-9][0-9]).[0-9][0-9][0-9].[0-9][0-9][0-9][0-9]'
+                             OR {field} LIKE '([0-9][0-9][0-9])-[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
+                             OR {field} LIKE '[0-9][0-9][0-9] [0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
+                             OR {field} LIKE '[0-9][0-9][0-9].[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]')
+                         THEN 1
+                         ELSE 0
+                     END) AS valid_count,
+                     COUNT(*) AS total_count
+                 FROM {qualified_table_name};
+
+             """
+         elif predefined_regex_pattern == "usa_zip_code":
+             query = f"""
+                 SELECT
+                     SUM(CASE
+                         WHEN PATINDEX('%[0-9][0-9][0-9][0-9][0-9]%[-][0-9][0-9][0-9][0-9]%', CAST({field} AS VARCHAR)) > 0
+                             OR PATINDEX('%[0-9][0-9][0-9][0-9][0-9]%', CAST({field} AS VARCHAR)) > 0
+                         THEN 1 ELSE 0 END) AS valid_count,
+                     COUNT(*) AS total_count
+                 FROM {qualified_table_name};
+             """
+         result = self.fetchone(query)
+         return result[0], result[1]
+
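For example, validating a hypothetical email column against the predefined pattern:

    valid, total = ds.query_string_pattern_validity(
        "customers", "email", predefined_regex_pattern="email"
    )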
+     def query_valid_invalid_values_validity(
+         self,
+         table: str,
+         field: str,
+         regex_pattern: str = None,
+         filters: str = None,
+         values: List[str] = None,
+     ) -> Tuple[int, int]:
+         """
+         Get the count of valid and invalid values for a specified column.
+         :param table: table name
+         :param field: column name
+         :param values: list of valid values
+         :param regex_pattern: regex pattern (will be converted to SQL Server pattern)
+         :param filters: filter condition
+         :return: count of valid values and total count of rows.
+         """
+         filters = f"WHERE {filters}" if filters else ""
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+
+         if values:
+             values_str = ", ".join([f"'{value}'" for value in values])
+             validity_condition = f"IIF({field} IN ({values_str}), 1, 0)"
+         elif regex_pattern:
+             validity_condition = self.regex_to_sql_condition(regex_pattern, field)
+         else:
+             raise ValueError("Either 'values' or 'regex_pattern' must be provided.")
+
+         query = f"""
+             SELECT SUM(CAST({validity_condition} AS BIGINT)) AS valid_count,
+                 COUNT(*) AS total_count
+             FROM {qualified_table_name}
+             {filters}
+         """
+
+         result = self.fetchone(query)
+         return result[0], result[1]
+
+     def query_get_usa_state_code_validity(self, table: str, field: str, filters: str = None) -> Tuple[int, int]:
+         """
+         Get the count of valid USA state codes
+         :param table: table name
+         :param field: column name
+         :param filters: filter condition
+         :return: count of valid state codes, count of total row count
+         """
+         valid_state_codes_str = ", ".join(f"'{code}'" for code in self.valid_state_codes)
+
+         filters = f"WHERE {filters}" if filters else ""
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+
+         regex_query = f"""
+             CASE
+                 WHEN {field} IS NULL THEN 0
+                 WHEN {field} IN ({valid_state_codes_str})
+                 THEN 1
+                 ELSE 0
+             END"""
+
+         query = f"""
+             SELECT
+                 SUM(CAST({regex_query} AS BIGINT)) AS valid_count,
+                 COUNT(*) AS total_count
+             FROM {qualified_table_name}
+             {filters}
+         """
+         result = self.fetchone(query)
+         return result[0], result[1]
+
+     def query_timestamp_metric(self):
+         raise NotImplementedError("Method not implemented for MssqlDataSource")
+
+     def query_timestamp_not_in_future_metric(self):
+         raise NotImplementedError("Method not implemented for MssqlDataSource")
+
+     def query_timestamp_date_not_in_future_metric(self):
+         raise NotImplementedError("Method not implemented for MssqlDataSource")
+
+     def query_get_time_diff(self, table: str, field: str) -> int:
+         """
+         Get the time difference
+         :param table: name of the table
+         :param field: field name of updated time column
+         :return: time difference in seconds
+         """
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+         query = f"""
+             SELECT TOP 1 {field} FROM {qualified_table_name} ORDER BY {field} DESC;
+         """
+         result = self.fetchone(query)
+         if result:
+             updated_time = result[0]
+             if isinstance(updated_time, str):
+                 updated_time = datetime.datetime.strptime(updated_time, "%Y-%m-%d %H:%M:%S.%f")
+             return int((datetime.datetime.utcnow() - updated_time).total_seconds())
+         return 0
+
+     def build_table_metrics_query(
+         self,
+         table_name: str,
+         column_info: list[dict],
+         additional_queries: Optional[List[str]] = None,
+     ) -> list[dict]:
+         query_parts = []
+         if not column_info:
+             return []
+
+         for col in column_info:
+             name = col["column_name"]
+             dtype = col["data_type"].lower()
+
+             quoted_name = self.quote_column(name)
+
+             query_parts.append(f"COUNT(DISTINCT {quoted_name}) AS [{name}_distinct]")
+             query_parts.append(f"COUNT({quoted_name}) - COUNT(DISTINCT {quoted_name}) AS [{name}_duplicate]")
+             query_parts.append(f"SUM(CASE WHEN {quoted_name} IS NULL THEN 1 ELSE 0 END) AS [{name}_is_null]")
+
+             if dtype in (
+                 "int",
+                 "integer",
+                 "bigint",
+                 "smallint",
+                 "tinyint",
+                 "decimal",
+                 "numeric",
+                 "float",
+                 "real",
+                 "money",
+                 "smallmoney",
+             ):
+                 query_parts.append(f"MIN({quoted_name}) AS [{name}_min]")
+                 query_parts.append(f"MAX({quoted_name}) AS [{name}_max]")
+                 query_parts.append(f"AVG(CAST({quoted_name} AS FLOAT)) AS [{name}_average]")
+
+             elif dtype in ("varchar", "nvarchar", "char", "nchar", "text", "ntext"):
+                 query_parts.append(f"MAX(LEN({quoted_name})) AS [{name}_max_character_length]")
+
+         if additional_queries:
+             query_parts.extend(additional_queries)
+
+         qualified_table = self.qualified_table_name(table_name)
+         query_body = ",\n ".join(query_parts)
+         query = f"SELECT\n {query_body}\nFROM {qualified_table};"
+
+         cursor = self.connection.cursor()
+         try:
+             cursor.execute(query)
+             if cursor.description:
+                 columns = [column[0] for column in cursor.description]
+                 result_row = cursor.fetchone()
+                 row = dict(zip(columns, result_row)) if result_row else {}
+             else:
+                 row = {}
+         finally:
+             cursor.close()
+
+         def _normalize_metrics(value):
+             """Safely normalize DB metric values for JSON serialization."""
+             if value is None:
+                 return None
+             if isinstance(value, Decimal):
+                 return float(value)
+             if isinstance(value, (int, float, bool)):
+                 return value
+             if isinstance(value, (datetime.datetime, datetime.date)):
+                 return value.isoformat()
+             if isinstance(value, UUID):
+                 return str(value)
+             if isinstance(value, list):
+                 return [_normalize_metrics(v) for v in value]
+             if isinstance(value, dict):
+                 return {k: _normalize_metrics(v) for k, v in value.items()}
+             return str(value)
+
+         column_wise = []
+         for col in column_info:
+             name = col["column_name"]
+             col_metrics = {}
+
+             for key, value in row.items():
+                 clean_key = key.replace("[", "").replace("]", "")
+                 if clean_key.startswith(f"{name}_"):
+                     metric_name = clean_key[len(name) + 1 :]
+                     col_metrics[metric_name] = _normalize_metrics(value)
+
+             column_wise.append({"column_name": name, "metrics": col_metrics})
+
+         for col_data in column_wise:
+             metrics = col_data["metrics"]
+             distinct_count = metrics.get("distinct")
+             col_name = col_data["column_name"]
+
+             dtype = next(c["data_type"].lower() for c in column_info if c["column_name"] == col_name)
+
+             quoted = self.quote_column(col_name)
+
+             is_dtype_numeric = (
+                 True
+                 if dtype
+                 in (
+                     "int",
+                     "integer",
+                     "bigint",
+                     "smallint",
+                     "tinyint",
+                     "decimal",
+                     "numeric",
+                     "float",
+                     "real",
+                     "money",
+                     "smallmoney",
+                 )
+                 else False
+             )
+
+             if is_dtype_numeric:
+                 col_min = metrics.get("min")
+                 col_max = metrics.get("max")
+
+                 if col_min is not None and col_max is not None and col_min != col_max:
+                     bucket_count = 20
+                     bucket_size = (float(col_max) - float(col_min)) / bucket_count
+
+                     bucket_queries = []
+                     for i in range(bucket_count):
+                         start = float(col_min) + i * bucket_size
+                         end = float(col_min) + (i + 1) * bucket_size
+
+                         bucket_queries.append(
+                             f"SUM(CASE WHEN {quoted} >= {start} AND {quoted} < {end} THEN 1 ELSE 0 END) AS bucket_{i}"
+                         )
+
+                     bucket_sql = f"SELECT {', '.join(bucket_queries)} FROM {qualified_table}"
+
+                     try:
+                         bucket_result = self.fetchone(bucket_sql)
+                         distribution = []
+
+                         for i in range(bucket_count):
+                             start_raw = float(col_min) + i * bucket_size
+                             end_raw = float(col_min) + (i + 1) * bucket_size
+
+                             if dtype in ("int", "integer", "bigint", "smallint", "tinyint"):
+                                 start = math.floor(start_raw)
+                                 end = math.ceil(end_raw)
+                             else:
+                                 start = round(start_raw, 2)
+                                 end = round(end_raw, 2)
+
+                             count = bucket_result[i] if bucket_result and bucket_result[i] is not None else 0
+
+                             distribution.append(
+                                 {
+                                     "col_val": f"{start} - {end}",
+                                     "count": count,
+                                 }
+                             )
+
+                         metrics["distribution_graph"] = distribution
+
+                     except Exception as e:
+                         print(f"Failed to generate numeric distribution for {col_name}: {e}")
+
+                 continue
+
+             if isinstance(distinct_count, (int, float)) and distinct_count <= 20:
+                 if dtype in ("text", "ntext", "xml"):
+                     group_expr = f"CAST({quoted} AS NVARCHAR(MAX))"
+                 else:
+                     group_expr = quoted
+
+                 dist_query = (
+                     f"SELECT {group_expr}, COUNT(*) "
+                     f"FROM {qualified_table} GROUP BY {group_expr} ORDER BY COUNT(*) DESC"
+                 )
+
+                 try:
+                     dist_cursor = self.connection.cursor()
+                     dist_cursor.execute(dist_query)
+                     dist_result = dist_cursor.fetchall()
+                     dist_cursor.close()
+
+                     distribution = []
+
+                     for r in dist_result:
+                         val = _normalize_metrics(r[0])
+                         distribution.append(
+                             {
+                                 "col_val": val,
+                                 "count": r[1],
+                             }
+                         )
+
+                     metrics["distribution_graph"] = distribution
+
+                 except Exception as e:
+                     print(f"Failed to generate distribution graph for column {col_name}: {e}")
+
+         for col_data in column_wise:
+             metrics = col_data["metrics"]
+             distinct_count = metrics.get("distinct")
+             col_name = col_data["column_name"]
+             dtype = next(c["data_type"].lower() for c in column_info if c["column_name"] == col_name)
+
+             quoted = self.quote_column(col_name)
+
+             is_dtype_numeric = (
+                 True
+                 if dtype
+                 in (
+                     "int",
+                     "integer",
+                     "bigint",
+                     "smallint",
+                     "tinyint",
+                     "decimal",
+                     "numeric",
+                     "float",
+                     "real",
+                     "money",
+                     "smallmoney",
+                 )
+                 else False
+             )
+
+             formatted_metrics_data = {
+                 "general_data": {key: value for key, value in metrics.items() if key != "distribution_graph"},
+                 "is_dtype_numeric": is_dtype_numeric,
+                 "distribution_data": metrics.get("distribution_graph", []),
+             }
+             col_data["metrics"] = formatted_metrics_data
+
+         return column_wise
+
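A sketch of driving the metrics builder for one hypothetical numeric column; the commented shape illustrates the payload produced above:

    result = ds.build_table_metrics_query(
        "orders", [{"column_name": "total", "data_type": "decimal"}]
    )
    # Illustrative entry:
    # {"column_name": "total",
    #  "metrics": {"general_data": {"distinct": 42, "min": 1.0, "max": 99.5, ...},
    #              "is_dtype_numeric": True,
    #              "distribution_data": [{"col_val": "1.0 - 5.93", "count": 7}, ...]}}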
+     def fetch_sample_values_from_database(
+         self,
+         table_name: str,
+         column_names: list[str],
+         limit: int = 5,
+     ) -> Tuple[List[Tuple], List[str]]:
+         """
+         Fetch sample rows for specific columns from the given table (MSSQL version).
+
+         :param table_name: The name of the table.
+         :param column_names: List of column names to fetch.
+         :param limit: Number of rows to fetch.
+         :return: Tuple of (list of row tuples, list of column names)
+         """
+         qualified_table_name = self.qualified_table_name(table_name)
+
+         if not column_names:
+             raise ValueError("At least one column name must be provided")
+
+         if len(column_names) == 1 and column_names[0] == "*":
+             query = f"SELECT TOP {limit} * FROM {qualified_table_name}"
+         else:
+             columns = ", ".join([self.quote_column(col) for col in column_names])
+             query = f"SELECT TOP {limit} {columns} FROM {qualified_table_name}"
+
+         cursor = self.connection.cursor()
+         try:
+             cursor.execute(query)
+             column_names = [desc[0] for desc in cursor.description]
+             rows = cursor.fetchall()
+         finally:
+             cursor.close()
+         return rows, column_names
+
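For example, sampling two hypothetical columns:

    rows, cols = ds.fetch_sample_values_from_database("orders", ["order_id", "total"], limit=5)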
+     def get_table_foreign_key_info(self, table_name: str, schema: str | None = None):
+         schema = schema or self.schema_name
+
+         query = f"""
+             SELECT
+                 fk.name AS constraint_name,
+                 t.name AS table_name,
+                 c.name AS fk_column,
+                 rt.name AS referenced_table,
+                 rc.name AS referenced_column
+             FROM sys.foreign_keys fk
+             INNER JOIN sys.foreign_key_columns fkc
+                 ON fk.object_id = fkc.constraint_object_id
+             INNER JOIN sys.tables t
+                 ON fk.parent_object_id = t.object_id
+             INNER JOIN sys.schemas s
+                 ON t.schema_id = s.schema_id
+             INNER JOIN sys.columns c
+                 ON fkc.parent_object_id = c.object_id
+                 AND fkc.parent_column_id = c.column_id
+             INNER JOIN sys.tables rt
+                 ON fk.referenced_object_id = rt.object_id
+             INNER JOIN sys.schemas rs
+                 ON rt.schema_id = rs.schema_id
+             INNER JOIN sys.columns rc
+                 ON fkc.referenced_object_id = rc.object_id
+                 AND fkc.referenced_column_id = rc.column_id
+             WHERE t.name = '{table_name}'
+                 AND s.name = '{schema}';
+         """
+         try:
+             cursor = self.connection.cursor()
+             cursor.execute(query)
+             rows = cursor.fetchall()
+         except Exception as e:
+             print(f"Failed to fetch fk info for dataset {table_name}: {e}")
+             return []
+
+         data = [
+             {
+                 "constraint_name": row[0],
+                 "table_name": row[1],
+                 "fk_column": row[2],
+                 "referenced_table": row[3],
+                 "referenced_column": row[4],
+             }
+             for row in rows
+         ]
+         return data
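Finally, the foreign-key helper returns plain dicts; an illustrative call and entry (names hypothetical):

    fks = ds.get_table_foreign_key_info("orders")
    # e.g. [{"constraint_name": "FK_orders_customers", "table_name": "orders",
    #        "fk_column": "customer_id", "referenced_table": "customers",
    #        "referenced_column": "id"}]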