dcs-sdk 1.6.4-py3-none-any.whl → 1.6.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dcs_core/__init__.py +13 -0
- dcs_core/__main__.py +17 -0
- dcs_core/__version__.py +15 -0
- dcs_core/cli/__init__.py +13 -0
- dcs_core/cli/cli.py +165 -0
- dcs_core/core/__init__.py +19 -0
- dcs_core/core/common/__init__.py +13 -0
- dcs_core/core/common/errors.py +50 -0
- dcs_core/core/common/models/__init__.py +13 -0
- dcs_core/core/common/models/configuration.py +284 -0
- dcs_core/core/common/models/dashboard.py +24 -0
- dcs_core/core/common/models/data_source_resource.py +75 -0
- dcs_core/core/common/models/metric.py +160 -0
- dcs_core/core/common/models/profile.py +75 -0
- dcs_core/core/common/models/validation.py +216 -0
- dcs_core/core/common/models/widget.py +44 -0
- dcs_core/core/configuration/__init__.py +13 -0
- dcs_core/core/configuration/config_loader.py +139 -0
- dcs_core/core/configuration/configuration_parser.py +262 -0
- dcs_core/core/configuration/configuration_parser_arc.py +328 -0
- dcs_core/core/datasource/__init__.py +13 -0
- dcs_core/core/datasource/base.py +62 -0
- dcs_core/core/datasource/manager.py +112 -0
- dcs_core/core/datasource/search_datasource.py +421 -0
- dcs_core/core/datasource/sql_datasource.py +1094 -0
- dcs_core/core/inspect.py +163 -0
- dcs_core/core/logger/__init__.py +13 -0
- dcs_core/core/logger/base.py +32 -0
- dcs_core/core/logger/default_logger.py +94 -0
- dcs_core/core/metric/__init__.py +13 -0
- dcs_core/core/metric/base.py +220 -0
- dcs_core/core/metric/combined_metric.py +98 -0
- dcs_core/core/metric/custom_metric.py +34 -0
- dcs_core/core/metric/manager.py +137 -0
- dcs_core/core/metric/numeric_metric.py +403 -0
- dcs_core/core/metric/reliability_metric.py +90 -0
- dcs_core/core/profiling/__init__.py +13 -0
- dcs_core/core/profiling/datasource_profiling.py +136 -0
- dcs_core/core/profiling/numeric_field_profiling.py +72 -0
- dcs_core/core/profiling/text_field_profiling.py +67 -0
- dcs_core/core/repository/__init__.py +13 -0
- dcs_core/core/repository/metric_repository.py +77 -0
- dcs_core/core/utils/__init__.py +13 -0
- dcs_core/core/utils/log.py +29 -0
- dcs_core/core/utils/tracking.py +105 -0
- dcs_core/core/utils/utils.py +44 -0
- dcs_core/core/validation/__init__.py +13 -0
- dcs_core/core/validation/base.py +230 -0
- dcs_core/core/validation/completeness_validation.py +153 -0
- dcs_core/core/validation/custom_query_validation.py +24 -0
- dcs_core/core/validation/manager.py +282 -0
- dcs_core/core/validation/numeric_validation.py +276 -0
- dcs_core/core/validation/reliability_validation.py +91 -0
- dcs_core/core/validation/uniqueness_validation.py +61 -0
- dcs_core/core/validation/validity_validation.py +738 -0
- dcs_core/integrations/__init__.py +13 -0
- dcs_core/integrations/databases/__init__.py +13 -0
- dcs_core/integrations/databases/bigquery.py +187 -0
- dcs_core/integrations/databases/databricks.py +51 -0
- dcs_core/integrations/databases/db2.py +652 -0
- dcs_core/integrations/databases/elasticsearch.py +61 -0
- dcs_core/integrations/databases/mssql.py +979 -0
- dcs_core/integrations/databases/mysql.py +409 -0
- dcs_core/integrations/databases/opensearch.py +64 -0
- dcs_core/integrations/databases/oracle.py +719 -0
- dcs_core/integrations/databases/postgres.py +570 -0
- dcs_core/integrations/databases/redshift.py +53 -0
- dcs_core/integrations/databases/snowflake.py +48 -0
- dcs_core/integrations/databases/spark_df.py +111 -0
- dcs_core/integrations/databases/sybase.py +1069 -0
- dcs_core/integrations/storage/__init__.py +13 -0
- dcs_core/integrations/storage/local_file.py +149 -0
- dcs_core/integrations/utils/__init__.py +13 -0
- dcs_core/integrations/utils/utils.py +36 -0
- dcs_core/report/__init__.py +13 -0
- dcs_core/report/dashboard.py +211 -0
- dcs_core/report/models.py +88 -0
- dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
- dcs_core/report/static/assets/images/docs.svg +6 -0
- dcs_core/report/static/assets/images/github.svg +4 -0
- dcs_core/report/static/assets/images/logo.svg +7 -0
- dcs_core/report/static/assets/images/slack.svg +13 -0
- dcs_core/report/static/index.js +2 -0
- dcs_core/report/static/index.js.LICENSE.txt +3971 -0
- dcs_sdk/__version__.py +1 -1
- dcs_sdk/cli/cli.py +3 -0
- {dcs_sdk-1.6.4.dist-info → dcs_sdk-1.6.6.dist-info}/METADATA +24 -2
- dcs_sdk-1.6.6.dist-info/RECORD +159 -0
- {dcs_sdk-1.6.4.dist-info → dcs_sdk-1.6.6.dist-info}/entry_points.txt +1 -0
- dcs_sdk-1.6.4.dist-info/RECORD +0 -72
- {dcs_sdk-1.6.4.dist-info → dcs_sdk-1.6.6.dist-info}/WHEEL +0 -0
dcs_core/integrations/databases/mssql.py
@@ -0,0 +1,979 @@
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime
import math
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple, Union
from uuid import UUID

import pyodbc
from loguru import logger

from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
from dcs_core.core.common.models.data_source_resource import RawColumnInfo
from dcs_core.core.datasource.sql_datasource import SQLDataSource


class MssqlDataSource(SQLDataSource):
    def __init__(self, data_source_name: str, data_connection: Dict):
        super().__init__(data_source_name, data_connection)
        self.regex_patterns = {
            "uuid": r"[0-9a-fA-F]%-%[0-9a-fA-F]%-%[0-9a-fA-F]%-%[0-9a-fA-F]%-%[0-9a-fA-F]%",
            "usa_phone": r"^(\+1[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}$",
            "email": r"%[a-zA-Z0-9._%+-]@[a-zA-Z0-9.-]%.[a-zA-Z]%",
            "usa_zip_code": r"^[0-9]{5}(?:-[0-9]{4})?$",
            "ssn": r"^(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}$",
            "sedol": r"[B-DF-HJ-NP-TV-XZ0-9][B-DF-HJ-NP-TV-XZ0-9][B-DF-HJ-NP-TV-XZ0-9][B-DF-HJ-NP-TV-XZ0-9][B-DF-HJ-NP-TV-XZ0-9][B-DF-HJ-NP-TV-XZ0-9][0-9]",
            "lei": r"[A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][0-9][0-9]",
            "cusip": r"[0-9A-Z][0-9A-Z][0-9A-Z][0-9A-Z][0-9A-Z][0-9A-Z][0-9A-Z][0-9A-Z][0-9A-Z]",
            "figi": r"BBG[A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9]",
            "isin": r"[A-Z][A-Z][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][0-9]",
            "perm_id": r"^\d{4}([- ]?)\d{4}\1\d{4}\1\d{4}([- ]?)\d{3}$",
        }

    def connect(self) -> Any:
        """
        Connect to the data source.
        """
        driver = self.data_connection.get("driver") or "ODBC Driver 18 for SQL Server"
        host = self.data_connection.get("host")
        port = self.data_connection.get("port")
        database = self.data_connection.get("database")
        username = self.data_connection.get("username")
        password = self.data_connection.get("password")
        server = self.data_connection.get("server")

        connection_params = self._build_connection_params(
            driver=driver, database=database, username=username, password=password
        )

        return self._establish_connection(connection_params, host, server, port)

    def _prepare_driver_string(self, driver: str) -> str:
        """Ensure the driver string is wrapped in braces, as pyodbc expects."""
        return f"{{{driver}}}" if not driver.startswith("{") else driver

    def _build_connection_params(self, driver: str, database: str, username: str, password: str) -> dict:
        return {
            "DRIVER": self._prepare_driver_string(driver),
            "DATABASE": database,
            "UID": username,
            "PWD": password,
            "TrustServerCertificate": "yes",
        }

    def _establish_connection(self, conn_dict: dict, host: str, server: str, port: str) -> Any:
        connection_attempts = [
            (host, True),  # host with port
            (host, False),  # host without port
            (server, True),  # server with port
            (server, False),  # server without port
        ]

        for server_value, use_port in connection_attempts:
            if not server_value:
                continue

            try:
                conn_dict["SERVER"] = f"{server_value},{port}" if use_port and port else server_value
                self.connection = pyodbc.connect(**conn_dict)
                logger.info(f"Connected to MSSQL database using {conn_dict['SERVER']}")
                return self.connection
            except Exception:
                continue

        raise DataChecksDataSourcesConnectionError(
            message="Failed to connect to Mssql data source: [All connection attempts failed]"
        )
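
    # A minimal sketch of the fallback order above, assuming a hypothetical
    # data_connection; with port 1433 set, the SERVER values attempted are, in
    # order: "db-host,1433", "db-host", "db-server,1433", "db-server".
    #
    #     ds = MssqlDataSource("mssql_example", {
    #         "driver": "ODBC Driver 18 for SQL Server",
    #         "host": "db-host",
    #         "server": "db-server",
    #         "port": 1433,
    #         "database": "analytics",
    #         "username": "dcs_user",
    #         "password": "***",
    #     })
    #     ds.connect()  # raises DataChecksDataSourcesConnectionError if all four attempts fail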

    def fetchall(self, query):
        return self.connection.cursor().execute(query).fetchall()

    def fetchone(self, query):
        return self.connection.cursor().execute(query).fetchone()

    def qualified_table_name(self, table_name: str) -> str:
        """
        Get the qualified table name.
        :param table_name: name of the table
        :return: qualified table name
        """
        if self.schema_name:
            return f"[{self.schema_name}].[{table_name}]"
        return f"[{table_name}]"

    def quote_column(self, column: str) -> str:
        """
        Quote the column name.
        :param column: name of the column
        :return: quoted column name
        """
        return f"[{column}]"

    def query_get_table_names(self, schema: str | None = None, with_view: bool = False) -> dict:
        """
        Get the list of tables in the database.
        :param schema: optional schema name
        :param with_view: whether to include views
        :return: dictionary with table names and, optionally, view names
        """
        schema = schema or self.schema_name

        if with_view:
            object_types = "IN ('U', 'V')"
        else:
            object_types = "= 'U'"

        query = (
            "SELECT o.name AS table_name, o.type "
            "FROM sys.objects o JOIN sys.schemas s ON o.schema_id = s.schema_id "
            f"WHERE o.type {object_types} AND s.name = '{schema}' ORDER BY o.name"
        )

        rows = self.fetchall(query)

        if with_view:
            result = {"table": [], "view": []}
            if rows:
                for row in rows:
                    object_name = row[0]
                    object_type = row[1].strip() if row[1] else row[1]

                    if object_type == "U":
                        result["table"].append(object_name)
                    elif object_type == "V":
                        result["view"].append(object_name)
        else:
            result = {"table": []}
            if rows:
                result["table"] = [row[0] for row in rows]

        return result
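
    # Shape of the result above, assuming a hypothetical "dbo" schema holding
    # tables "orders" and "users" plus a view "v_daily_orders":
    #
    #     ds.query_get_table_names("dbo")
    #     # -> {"table": ["orders", "users"]}
    #     ds.query_get_table_names("dbo", with_view=True)
    #     # -> {"table": ["orders", "users"], "view": ["v_daily_orders"]}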

    def query_get_table_indexes(self, table: str, schema: str | None = None) -> dict[str, dict]:
        """
        Get index information for a table in MSSQL.
        :param table: table name
        :param schema: optional schema name
        :return: dictionary with index details
        """
        schema = schema or self.schema_name
        table = table.upper()
        schema = schema.upper()

        query = f"""
            SELECT
                i.name AS index_name,
                i.type_desc AS index_type,
                c.name AS column_name,
                ic.key_ordinal AS column_order
            FROM
                sys.indexes i
            JOIN
                sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id
            JOIN
                sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id
            JOIN
                sys.tables t ON t.object_id = i.object_id
            JOIN
                sys.schemas s ON t.schema_id = s.schema_id
            WHERE
                t.name = '{table}'
                AND s.name = '{schema}'
                AND i.is_hypothetical = 0
            ORDER BY
                i.name, ic.key_ordinal
        """

        rows = self.fetchall(query)

        if not rows:
            raise RuntimeError(f"No index information found for table '{table}' in schema '{schema}'.")

        pk_query = f"""
            SELECT c.name AS column_name
            FROM
                sys.key_constraints kc
            JOIN
                sys.index_columns ic ON kc.parent_object_id = ic.object_id AND kc.unique_index_id = ic.index_id
            JOIN
                sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id
            JOIN
                sys.tables t ON t.object_id = kc.parent_object_id
            JOIN
                sys.schemas s ON t.schema_id = s.schema_id
            WHERE
                kc.type = 'PK'
                AND t.name = '{table}'
                AND s.name = '{schema}'
            ORDER BY ic.key_ordinal
        """
        pk_rows = self.fetchall(pk_query)
        pk_columns = [row[0].strip() for row in pk_rows] if pk_rows else []
        pk_columns_set = set(pk_columns)

        indexes = {}
        for row in rows:
            index_name = row[0]
            index_type = row[1]
            column_info = {
                "column_name": self.safe_get(row, 2),
                "column_order": self.safe_get(row, 3),
            }
            if index_name not in indexes:
                indexes[index_name] = {"columns": [], "index_type": index_type}
            indexes[index_name]["columns"].append(column_info)

        for index_name, idx in indexes.items():
            index_columns = [col["column_name"].strip() for col in idx["columns"]]
            index_columns_set = set(index_columns)
            idx["is_primary_key"] = pk_columns_set == index_columns_set and len(index_columns) == len(pk_columns)
        return indexes

    def query_get_table_columns(self, table: str, schema: str | None = None) -> Dict[str, RawColumnInfo]:
        """
        Get the schema of a table.
        :param table: table name
        :param schema: optional schema name
        :return: dict of RawColumnInfo objects keyed by column name
        """
        schema = schema or self.schema_name
        database = self.quote_database(self.database)
        query = (
            "SELECT column_name, data_type, ISNULL(datetime_precision, 0) AS datetime_precision, "
            "ISNULL(numeric_precision, 0) AS numeric_precision, ISNULL(numeric_scale, 0) AS numeric_scale, "
            "collation_name, ISNULL(character_maximum_length, 0) AS character_maximum_length "
            f"FROM {database}.information_schema.columns "
            f"WHERE table_name = '{table}' AND table_schema = '{schema}'"
        )
        rows = self.fetchall(query)
        if not rows:
            raise RuntimeError(f"Table '{table}' in schema '{schema}' does not exist or has no columns.")

        column_info = {
            r[0]: RawColumnInfo(
                column_name=self.safe_get(r, 0),
                data_type=self.safe_get(r, 1),
                datetime_precision=self.safe_get(r, 2),
                numeric_precision=self.safe_get(r, 3),
                numeric_scale=self.safe_get(r, 4),
                collation_name=self.safe_get(r, 5),
                character_maximum_length=self.safe_get(r, 6),
            )
            for r in rows
        }
        return column_info

    def fetch_rows(
        self,
        query: str,
        limit: int = 1,
        with_column_names: bool = False,
        complete_query: Optional[str] = None,
    ) -> Tuple[List, Optional[List[str]]]:
        """
        Fetch rows from the database using pyodbc.

        :param query: SQL query to execute.
        :param limit: number of rows to fetch.
        :param with_column_names: whether to include column names in the result.
        :param complete_query: optional pre-built query to run as-is, bypassing the pagination wrapper.
        :return: tuple of (rows, column_names or None)
        """
        query = (
            complete_query
            or f"SELECT * FROM ({query}) AS subquery ORDER BY 1 OFFSET 0 ROWS FETCH NEXT {limit} ROWS ONLY"
        )
        cursor = self.connection.cursor()
        cursor.execute(query)
        rows = cursor.fetchmany(limit)

        if with_column_names:
            column_names = [column[0] for column in cursor.description]
            return rows, column_names
        else:
            return rows, None
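
    # For illustration: with query="SELECT id, email FROM users" and limit=5, the
    # wrapper above (used when complete_query is not supplied) produces:
    #
    #     SELECT * FROM (SELECT id, email FROM users) AS subquery
    #     ORDER BY 1 OFFSET 0 ROWS FETCH NEXT 5 ROWS ONLY
    #
    # The ORDER BY is required because OFFSET ... FETCH is only valid after an
    # ORDER BY clause in T-SQL.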

    def regex_to_sql_condition(self, regex_pattern: str, field: str) -> str:
        """
        Convert a regex pattern into an approximate SQL Server condition:
        an IN list for simple alternations, otherwise a LIKE pattern.
        """
        if (regex_pattern.startswith("^") and regex_pattern.endswith("$")) or "|" in regex_pattern:
            pattern = regex_pattern.strip("^$")
            if pattern.startswith("(") and pattern.endswith(")"):
                pattern = pattern[1:-1]

            if "|" in pattern:
                values = [f"'{val.strip()}'" for val in pattern.split("|")]
                return f"IIF({field} IN ({', '.join(values)}), 1, 0)"

        pattern = regex_pattern
        if pattern.startswith("^"):
            pattern = pattern[1:]
        if pattern.endswith("$"):
            pattern = pattern[:-1]

        pattern = pattern.replace(".*", "%").replace(".+", "%").replace(".", "_")

        return f"IIF({field} LIKE '{pattern}', 1, 0)"

    def query_get_variance(self, table: str, field: str, filters: str = None) -> int:
        """
        Get the variance value.
        :param table: table name
        :param field: column name
        :param filters: filter condition
        :return: variance, rounded to 2 decimal places
        """
        qualified_table_name = self.qualified_table_name(table)
        field = self.quote_column(field)
        query = "SELECT VAR({}) FROM {}".format(field, qualified_table_name)
        if filters:
            query += " WHERE {}".format(filters)

        return round(self.fetchone(query)[0], 2)

    def query_get_stddev(self, table: str, field: str, filters: str = None) -> int:
        """
        Get the standard deviation value.
        :param table: table name
        :param field: column name
        :param filters: filter condition
        :return: standard deviation, rounded to 2 decimal places
        """
        qualified_table_name = self.qualified_table_name(table)
        field = self.quote_column(field)
        query = "SELECT STDEV({}) FROM {}".format(field, qualified_table_name)
        if filters:
            query += " WHERE {}".format(filters)

        return round(self.fetchone(query)[0], 2)

    def query_get_percentile(self, table: str, field: str, percentile: float, filters: str = None) -> float:
        """
        Get the specified percentile value of a numeric column in a table.
        :param table: table name
        :param field: column name
        :param percentile: percentile to calculate (e.g., 0.2 for the 20th percentile)
        :param filters: filter condition
        :return: the value at the specified percentile
        """
        qualified_table_name = self.qualified_table_name(table)
        field = self.quote_column(field)
        query = f"""
            SELECT PERCENTILE_CONT({percentile}) WITHIN GROUP (ORDER BY {field})
            OVER () AS percentile_value
            FROM {qualified_table_name}
        """
        if filters:
            query += f" WHERE {filters}"

        result = self.fetchone(query)
        return round(result[0], 2) if result and result[0] is not None else None

    def query_get_null_keyword_count(
        self, table: str, field: str, operation: str, filters: str = None
    ) -> Union[int, float]:
        """
        Get the count of NULL-like values (specific keywords) in the specified column for MSSQL.
        :param table: table name
        :param field: column name
        :param operation: type of operation ('count' or 'percent')
        :param filters: filter condition
        :return: count (int) or percentage (float) of NULL-like keyword values
        """
        qualified_table_name = self.qualified_table_name(table)
        field = self.quote_column(field)

        query = f"""
            SELECT
                SUM(CASE
                        WHEN {field} IS NULL
                             OR LTRIM(RTRIM(LOWER(ISNULL({field}, '')))) IN ('nothing', 'nil', 'null', 'none', 'n/a', '')
                        THEN 1
                        ELSE 0
                    END) AS null_count,
                COUNT(*) AS total_count
            FROM {qualified_table_name}
        """

        if filters:
            query += f" WHERE {filters}"

        result = self.fetchone(query)

        if not result or not result[1]:
            return 0

        null_count = int(result[0] if result[0] is not None else 0)
        total_count = int(result[1])

        if operation == "percent":
            return round((null_count / total_count) * 100, 2) if total_count > 0 else 0.0

        return null_count

    def query_get_string_length_metric(
        self, table: str, field: str, metric: str, filters: str = None
    ) -> Union[int, float]:
        """
        Get a string length metric (max, min, avg) for a column of a table.

        :param table: table name
        :param field: column name
        :param metric: the metric to calculate ('max', 'min', 'avg')
        :param filters: filter condition
        :return: the calculated metric, as int for 'max'/'min' and float for 'avg'
        """
        qualified_table_name = self.qualified_table_name(table)
        field = self.quote_column(field)

        metric = metric.lower()
        if metric not in ("max", "min", "avg"):
            raise ValueError(f"Invalid metric '{metric}'. Choose from 'max', 'min', or 'avg'.")

        if metric == "avg":
            query = f"SELECT AVG(CAST(LEN({field}) AS FLOAT)) FROM {qualified_table_name}"
        else:
            sql_function = "MAX" if metric == "max" else "MIN"
            query = f"SELECT {sql_function}(LEN({field})) FROM {qualified_table_name}"

        if filters:
            query += f" WHERE {filters}"

        result = self.fetchone(query)[0]
        return round(result, 2) if metric == "avg" else result
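
    # For illustration, with schema "dbo", table "users" and field "email", the
    # branches above emit:
    #
    #     avg: SELECT AVG(CAST(LEN([email]) AS FLOAT)) FROM [dbo].[users]
    #     max: SELECT MAX(LEN([email])) FROM [dbo].[users]
    #
    # Note that T-SQL's LEN() excludes trailing spaces; DATALENGTH() would count
    # bytes instead of characters.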

    def query_string_pattern_validity(
        self,
        table: str,
        field: str,
        regex_pattern: str = None,
        predefined_regex_pattern: str = None,
        filters: str = None,
    ) -> Tuple[int, int]:
        """
        Get the count of valid values based on the regex pattern.
        :param table: table name
        :param field: column name
        :param regex_pattern: custom regex pattern
        :param predefined_regex_pattern: predefined regex pattern
        :param filters: filter condition
        :return: count of valid values, total row count
        """
        filters = f"WHERE {filters}" if filters else ""
        qualified_table_name = self.qualified_table_name(table)
        field = self.quote_column(field)
        if not regex_pattern and not predefined_regex_pattern:
            raise ValueError("Either regex_pattern or predefined_regex_pattern should be provided")
        if regex_pattern:
            regex = regex_pattern
        else:
            regex = self.regex_patterns[predefined_regex_pattern]

        regex = self.regex_to_sql_condition(regex, field)

        query = f"""
            SELECT SUM(CAST({regex} AS BIGINT)) AS valid_count,
                   COUNT(*) AS total_count
            FROM {qualified_table_name}
            {filters}
        """
        if predefined_regex_pattern == "perm_id":
            query = f"""
                SELECT
                    SUM(CASE
                            WHEN {field} LIKE '[0-9][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]-[0-9][0-9][0-9]'
                                 OR {field} LIKE '[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]'
                            THEN 1
                            ELSE 0
                        END) AS valid_count,
                    COUNT(*) AS total_count
                FROM {qualified_table_name};
            """
        elif predefined_regex_pattern == "ssn":
            query = f"""
                SELECT
                    SUM(CASE
                            WHEN {field} LIKE '[0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9][0-9][0-9]'
                                 AND LEFT({field}, 3) NOT IN ('000', '666')
                                 AND LEFT({field}, 1) != '9'
                                 AND SUBSTRING({field}, 5, 2) != '00'
                                 AND RIGHT({field}, 4) != '0000'
                            THEN 1
                            ELSE 0
                        END) AS valid_count,
                    COUNT(*) AS total_count
                FROM {qualified_table_name}
            """
        elif predefined_regex_pattern == "usa_phone":
            query = f"""
                SELECT
                    SUM(CASE
                            WHEN ({field} LIKE '+1 [0-9][0-9][0-9] [0-9][0-9][0-9] [0-9][0-9][0-9][0-9]'
                                  OR {field} LIKE '+1-[0-9][0-9][0-9]-[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
                                  OR {field} LIKE '+1.[0-9][0-9][0-9].[0-9][0-9][0-9].[0-9][0-9][0-9][0-9]'
                                  OR {field} LIKE '+1[0-9][0-9][0-9]-[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
                                  OR {field} LIKE '([0-9][0-9][0-9]) [0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
                                  OR {field} LIKE '[0-9][0-9][0-9] [0-9][0-9][0-9] [0-9][0-9][0-9][0-9]'
                                  OR {field} LIKE '[0-9][0-9][0-9].[0-9][0-9][0-9].[0-9][0-9][0-9][0-9]'
                                  OR {field} LIKE '[0-9][0-9][0-9]-[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
                                  OR {field} LIKE '+1 ([0-9][0-9][0-9]) [0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
                                  OR {field} LIKE '[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]'
                                  OR {field} LIKE '+1[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]'
                                  OR {field} LIKE '([0-9][0-9][0-9])[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
                                  OR {field} LIKE '+1 ([0-9][0-9][0-9])[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
                                  OR {field} LIKE '+1 ([0-9][0-9][0-9]).[0-9][0-9][0-9].[0-9][0-9][0-9][0-9]'
                                  OR {field} LIKE '([0-9][0-9][0-9]).[0-9][0-9][0-9].[0-9][0-9][0-9][0-9]'
                                  OR {field} LIKE '([0-9][0-9][0-9])-[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
                                  OR {field} LIKE '[0-9][0-9][0-9] [0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
                                  OR {field} LIKE '[0-9][0-9][0-9].[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]')
                            THEN 1
                            ELSE 0
                        END) AS valid_count,
                    COUNT(*) AS total_count
                FROM {qualified_table_name};
            """
        elif predefined_regex_pattern == "usa_zip_code":
            query = f"""
                SELECT
                    SUM(CASE
                            WHEN PATINDEX('%[0-9][0-9][0-9][0-9][0-9]%[-][0-9][0-9][0-9][0-9]%', CAST({field} AS VARCHAR)) > 0
                                 OR PATINDEX('%[0-9][0-9][0-9][0-9][0-9]%', CAST({field} AS VARCHAR)) > 0
                            THEN 1 ELSE 0 END) AS valid_count,
                    COUNT(*) AS total_count
                FROM {qualified_table_name};
            """
        result = self.fetchone(query)
        return result[0], result[1]
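
    # A minimal usage sketch, assuming a hypothetical "contacts" table:
    #
    #     valid, total = ds.query_string_pattern_validity(
    #         table="contacts", field="ssn", predefined_regex_pattern="ssn"
    #     )
    #     invalid = total - valid
    #
    # Note that the perm_id, usa_phone and usa_zip_code branches above build
    # their own statements and do not apply the optional filters clause.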

    def query_valid_invalid_values_validity(
        self,
        table: str,
        field: str,
        regex_pattern: str = None,
        filters: str = None,
        values: List[str] = None,
    ) -> Tuple[int, int]:
        """
        Get the count of valid and invalid values for a specified column.
        :param table: table name
        :param field: column name
        :param values: list of valid values
        :param regex_pattern: regex pattern (converted to a SQL Server pattern)
        :param filters: filter condition
        :return: count of valid values and total count of rows
        """
        filters = f"WHERE {filters}" if filters else ""
        qualified_table_name = self.qualified_table_name(table)
        field = self.quote_column(field)

        if values:
            values_str = ", ".join([f"'{value}'" for value in values])
            validity_condition = f"IIF({field} IN ({values_str}), 1, 0)"
        elif regex_pattern:
            validity_condition = self.regex_to_sql_condition(regex_pattern, field)
        else:
            raise ValueError("Either 'values' or 'regex_pattern' must be provided.")

        query = f"""
            SELECT SUM(CAST({validity_condition} AS BIGINT)) AS valid_count,
                   COUNT(*) AS total_count
            FROM {qualified_table_name}
            {filters}
        """

        result = self.fetchone(query)
        return result[0], result[1]

    def query_get_usa_state_code_validity(self, table: str, field: str, filters: str = None) -> Tuple[int, int]:
        """
        Get the count of valid USA state codes.
        :param table: table name
        :param field: column name
        :param filters: filter condition
        :return: count of valid state codes, total row count
        """
        valid_state_codes_str = ", ".join(f"'{code}'" for code in self.valid_state_codes)

        filters = f"WHERE {filters}" if filters else ""
        qualified_table_name = self.qualified_table_name(table)
        field = self.quote_column(field)

        validity_case = f"""
            CASE
                WHEN {field} IS NULL THEN 0
                WHEN {field} IN ({valid_state_codes_str})
                THEN 1
                ELSE 0
            END"""

        query = f"""
            SELECT
                SUM(CAST({validity_case} AS BIGINT)) AS valid_count,
                COUNT(*) AS total_count
            FROM {qualified_table_name}
            {filters}
        """
        result = self.fetchone(query)
        return result[0], result[1]

    def query_timestamp_metric(self):
        raise NotImplementedError("Method not implemented for MssqlDataSource")

    def query_timestamp_not_in_future_metric(self):
        raise NotImplementedError("Method not implemented for MssqlDataSource")

    def query_timestamp_date_not_in_future_metric(self):
        raise NotImplementedError("Method not implemented for MssqlDataSource")

    def query_get_time_diff(self, table: str, field: str) -> int:
        """
        Get the time elapsed since the most recent value in the given column.
        :param table: table name
        :param field: name of the updated-time column
        :return: time difference in seconds
        """
        qualified_table_name = self.qualified_table_name(table)
        field = self.quote_column(field)
        query = f"""
            SELECT TOP 1 {field} FROM {qualified_table_name} ORDER BY {field} DESC;
        """
        result = self.fetchone(query)
        if result:
            updated_time = result[0]
            if isinstance(updated_time, str):
                updated_time = datetime.datetime.strptime(updated_time, "%Y-%m-%d %H:%M:%S.%f")
            return int((datetime.datetime.utcnow() - updated_time).total_seconds())
        return 0

    def build_table_metrics_query(
        self,
        table_name: str,
        column_info: list[dict],
        additional_queries: Optional[List[str]] = None,
    ) -> list[dict]:
        numeric_types = (
            "int",
            "integer",
            "bigint",
            "smallint",
            "tinyint",
            "decimal",
            "numeric",
            "float",
            "real",
            "money",
            "smallmoney",
        )

        query_parts = []
        if not column_info:
            return []

        for col in column_info:
            name = col["column_name"]
            dtype = col["data_type"].lower()

            quoted_name = self.quote_column(name)

            query_parts.append(f"COUNT(DISTINCT {quoted_name}) AS [{name}_distinct]")
            query_parts.append(f"COUNT({quoted_name}) - COUNT(DISTINCT {quoted_name}) AS [{name}_duplicate]")
            query_parts.append(f"SUM(CASE WHEN {quoted_name} IS NULL THEN 1 ELSE 0 END) AS [{name}_is_null]")

            if dtype in numeric_types:
                query_parts.append(f"MIN({quoted_name}) AS [{name}_min]")
                query_parts.append(f"MAX({quoted_name}) AS [{name}_max]")
                query_parts.append(f"AVG(CAST({quoted_name} AS FLOAT)) AS [{name}_average]")

            elif dtype in ("varchar", "nvarchar", "char", "nchar", "text", "ntext"):
                query_parts.append(f"MAX(LEN({quoted_name})) AS [{name}_max_character_length]")

        if additional_queries:
            query_parts.extend(additional_queries)

        qualified_table = self.qualified_table_name(table_name)
        query_body = ",\n    ".join(query_parts)
        query = f"SELECT\n    {query_body}\nFROM {qualified_table};"

        cursor = self.connection.cursor()
        try:
            cursor.execute(query)
            if cursor.description:
                columns = [column[0] for column in cursor.description]
                result_row = cursor.fetchone()
                row = dict(zip(columns, result_row)) if result_row else {}
            else:
                row = {}
        finally:
            cursor.close()

        def _normalize_metrics(value):
            """Safely normalize DB metric values for JSON serialization."""
            if value is None:
                return None
            if isinstance(value, Decimal):
                return float(value)
            if isinstance(value, (int, float, bool)):
                return value
            if isinstance(value, (datetime.datetime, datetime.date)):
                return value.isoformat()
            if isinstance(value, UUID):
                return str(value)
            if isinstance(value, list):
                return [_normalize_metrics(v) for v in value]
            if isinstance(value, dict):
                return {k: _normalize_metrics(v) for k, v in value.items()}
            return str(value)

        column_wise = []
        for col in column_info:
            name = col["column_name"]
            col_metrics = {}

            for key, value in row.items():
                clean_key = key.replace("[", "").replace("]", "")
                if clean_key.startswith(f"{name}_"):
                    metric_name = clean_key[len(name) + 1 :]
                    col_metrics[metric_name] = _normalize_metrics(value)

            column_wise.append({"column_name": name, "metrics": col_metrics})

        for col_data in column_wise:
            metrics = col_data["metrics"]
            distinct_count = metrics.get("distinct")
            col_name = col_data["column_name"]

            dtype = next(c["data_type"].lower() for c in column_info if c["column_name"] == col_name)

            quoted = self.quote_column(col_name)

            is_dtype_numeric = dtype in numeric_types

            if is_dtype_numeric:
                col_min = metrics.get("min")
                col_max = metrics.get("max")

                if col_min is not None and col_max is not None and col_min != col_max:
                    bucket_count = 20
                    bucket_size = (float(col_max) - float(col_min)) / bucket_count

                    bucket_queries = []
                    for i in range(bucket_count):
                        start = float(col_min) + i * bucket_size
                        end = float(col_min) + (i + 1) * bucket_size

                        bucket_queries.append(
                            f"SUM(CASE WHEN {quoted} >= {start} AND {quoted} < {end} THEN 1 ELSE 0 END) AS bucket_{i}"
                        )

                    bucket_sql = f"SELECT {', '.join(bucket_queries)} FROM {qualified_table}"

                    try:
                        bucket_result = self.fetchone(bucket_sql)
                        distribution = []

                        for i in range(bucket_count):
                            start_raw = float(col_min) + i * bucket_size
                            end_raw = float(col_min) + (i + 1) * bucket_size

                            if dtype in ("int", "integer", "bigint", "smallint", "tinyint"):
                                start = math.floor(start_raw)
                                end = math.ceil(end_raw)
                            else:
                                start = round(start_raw, 2)
                                end = round(end_raw, 2)

                            count = bucket_result[i] if bucket_result and bucket_result[i] is not None else 0

                            distribution.append(
                                {
                                    "col_val": f"{start} - {end}",
                                    "count": count,
                                }
                            )

                        metrics["distribution_graph"] = distribution

                    except Exception as e:
                        logger.error(f"Failed to generate numeric distribution for {col_name}: {e}")

                    continue

            if isinstance(distinct_count, (int, float)) and distinct_count <= 20:
                if dtype in ("text", "ntext", "xml"):
                    group_expr = f"CAST({quoted} AS NVARCHAR(MAX))"
                else:
                    group_expr = quoted

                dist_query = (
                    f"SELECT {group_expr}, COUNT(*) "
                    f"FROM {qualified_table} GROUP BY {group_expr} ORDER BY COUNT(*) DESC"
                )

                try:
                    dist_cursor = self.connection.cursor()
                    dist_cursor.execute(dist_query)
                    dist_result = dist_cursor.fetchall()
                    dist_cursor.close()

                    distribution = []

                    for r in dist_result:
                        val = _normalize_metrics(r[0])
                        distribution.append(
                            {
                                "col_val": val,
                                "count": r[1],
                            }
                        )

                    metrics["distribution_graph"] = distribution

                except Exception as e:
                    logger.error(f"Failed to generate distribution graph for column {col_name}: {e}")

        for col_data in column_wise:
            metrics = col_data["metrics"]
            col_name = col_data["column_name"]
            dtype = next(c["data_type"].lower() for c in column_info if c["column_name"] == col_name)

            is_dtype_numeric = dtype in numeric_types

            formatted_metrics_data = {
                "general_data": {key: value for key, value in metrics.items() if key != "distribution_graph"},
                "is_dtype_numeric": is_dtype_numeric,
                "distribution_data": metrics.get("distribution_graph", []),
            }
            col_data["metrics"] = formatted_metrics_data

        return column_wise
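
    # Sketch of the per-column structure returned above (values illustrative):
    #
    #     [
    #         {
    #             "column_name": "age",
    #             "metrics": {
    #                 "general_data": {"distinct": 42, "duplicate": 958,
    #                                  "is_null": 0, "min": 18, "max": 90,
    #                                  "average": 44.7},
    #                 "is_dtype_numeric": True,
    #                 "distribution_data": [{"col_val": "18 - 22", "count": 113}, ...],
    #             },
    #         },
    #     ]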

    def fetch_sample_values_from_database(
        self,
        table_name: str,
        column_names: list[str],
        limit: int = 5,
    ) -> Tuple[List[Tuple], List[str]]:
        """
        Fetch sample rows for specific columns from the given table (MSSQL version).

        :param table_name: the name of the table
        :param column_names: list of column names to fetch, or ["*"] for all columns
        :param limit: number of rows to fetch
        :return: tuple of (list of row tuples, list of column names)
        """
        qualified_table_name = self.qualified_table_name(table_name)

        if not column_names:
            raise ValueError("At least one column name must be provided")

        if len(column_names) == 1 and column_names[0] == "*":
            query = f"SELECT TOP {limit} * FROM {qualified_table_name}"
        else:
            columns = ", ".join([self.quote_column(col) for col in column_names])
            query = f"SELECT TOP {limit} {columns} FROM {qualified_table_name}"

        cursor = self.connection.cursor()
        try:
            cursor.execute(query)
            column_names = [desc[0] for desc in cursor.description]
            rows = cursor.fetchall()
        finally:
            cursor.close()
        return rows, column_names

    def get_table_foreign_key_info(self, table_name: str, schema: str | None = None):
        schema = schema or self.schema_name

        query = f"""
            SELECT
                fk.name AS constraint_name,
                t.name AS table_name,
                c.name AS fk_column,
                rt.name AS referenced_table,
                rc.name AS referenced_column
            FROM sys.foreign_keys fk
            INNER JOIN sys.foreign_key_columns fkc
                ON fk.object_id = fkc.constraint_object_id
            INNER JOIN sys.tables t
                ON fk.parent_object_id = t.object_id
            INNER JOIN sys.schemas s
                ON t.schema_id = s.schema_id
            INNER JOIN sys.columns c
                ON fkc.parent_object_id = c.object_id
                AND fkc.parent_column_id = c.column_id
            INNER JOIN sys.tables rt
                ON fk.referenced_object_id = rt.object_id
            INNER JOIN sys.schemas rs
                ON rt.schema_id = rs.schema_id
            INNER JOIN sys.columns rc
                ON fkc.referenced_object_id = rc.object_id
                AND fkc.referenced_column_id = rc.column_id
            WHERE t.name = '{table_name}'
              AND s.name = '{schema}';
        """
        try:
            cursor = self.connection.cursor()
            cursor.execute(query)
            rows = cursor.fetchall()
        except Exception as e:
            logger.error(f"Failed to fetch foreign key info for dataset {table_name}: {e}")
            return []

        data = [
            {
                "constraint_name": row[0],
                "table_name": row[1],
                "fk_column": row[2],
                "referenced_table": row[3],
                "referenced_column": row[4],
            }
            for row in rows
        ]
        return data