dcs-sdk 1.6.4__py3-none-any.whl → 1.6.6__py3-none-any.whl
This diff shows the changes between publicly available package versions as published to a supported public registry. It is provided for informational purposes only.
- dcs_core/__init__.py +13 -0
- dcs_core/__main__.py +17 -0
- dcs_core/__version__.py +15 -0
- dcs_core/cli/__init__.py +13 -0
- dcs_core/cli/cli.py +165 -0
- dcs_core/core/__init__.py +19 -0
- dcs_core/core/common/__init__.py +13 -0
- dcs_core/core/common/errors.py +50 -0
- dcs_core/core/common/models/__init__.py +13 -0
- dcs_core/core/common/models/configuration.py +284 -0
- dcs_core/core/common/models/dashboard.py +24 -0
- dcs_core/core/common/models/data_source_resource.py +75 -0
- dcs_core/core/common/models/metric.py +160 -0
- dcs_core/core/common/models/profile.py +75 -0
- dcs_core/core/common/models/validation.py +216 -0
- dcs_core/core/common/models/widget.py +44 -0
- dcs_core/core/configuration/__init__.py +13 -0
- dcs_core/core/configuration/config_loader.py +139 -0
- dcs_core/core/configuration/configuration_parser.py +262 -0
- dcs_core/core/configuration/configuration_parser_arc.py +328 -0
- dcs_core/core/datasource/__init__.py +13 -0
- dcs_core/core/datasource/base.py +62 -0
- dcs_core/core/datasource/manager.py +112 -0
- dcs_core/core/datasource/search_datasource.py +421 -0
- dcs_core/core/datasource/sql_datasource.py +1094 -0
- dcs_core/core/inspect.py +163 -0
- dcs_core/core/logger/__init__.py +13 -0
- dcs_core/core/logger/base.py +32 -0
- dcs_core/core/logger/default_logger.py +94 -0
- dcs_core/core/metric/__init__.py +13 -0
- dcs_core/core/metric/base.py +220 -0
- dcs_core/core/metric/combined_metric.py +98 -0
- dcs_core/core/metric/custom_metric.py +34 -0
- dcs_core/core/metric/manager.py +137 -0
- dcs_core/core/metric/numeric_metric.py +403 -0
- dcs_core/core/metric/reliability_metric.py +90 -0
- dcs_core/core/profiling/__init__.py +13 -0
- dcs_core/core/profiling/datasource_profiling.py +136 -0
- dcs_core/core/profiling/numeric_field_profiling.py +72 -0
- dcs_core/core/profiling/text_field_profiling.py +67 -0
- dcs_core/core/repository/__init__.py +13 -0
- dcs_core/core/repository/metric_repository.py +77 -0
- dcs_core/core/utils/__init__.py +13 -0
- dcs_core/core/utils/log.py +29 -0
- dcs_core/core/utils/tracking.py +105 -0
- dcs_core/core/utils/utils.py +44 -0
- dcs_core/core/validation/__init__.py +13 -0
- dcs_core/core/validation/base.py +230 -0
- dcs_core/core/validation/completeness_validation.py +153 -0
- dcs_core/core/validation/custom_query_validation.py +24 -0
- dcs_core/core/validation/manager.py +282 -0
- dcs_core/core/validation/numeric_validation.py +276 -0
- dcs_core/core/validation/reliability_validation.py +91 -0
- dcs_core/core/validation/uniqueness_validation.py +61 -0
- dcs_core/core/validation/validity_validation.py +738 -0
- dcs_core/integrations/__init__.py +13 -0
- dcs_core/integrations/databases/__init__.py +13 -0
- dcs_core/integrations/databases/bigquery.py +187 -0
- dcs_core/integrations/databases/databricks.py +51 -0
- dcs_core/integrations/databases/db2.py +652 -0
- dcs_core/integrations/databases/elasticsearch.py +61 -0
- dcs_core/integrations/databases/mssql.py +979 -0
- dcs_core/integrations/databases/mysql.py +409 -0
- dcs_core/integrations/databases/opensearch.py +64 -0
- dcs_core/integrations/databases/oracle.py +719 -0
- dcs_core/integrations/databases/postgres.py +570 -0
- dcs_core/integrations/databases/redshift.py +53 -0
- dcs_core/integrations/databases/snowflake.py +48 -0
- dcs_core/integrations/databases/spark_df.py +111 -0
- dcs_core/integrations/databases/sybase.py +1069 -0
- dcs_core/integrations/storage/__init__.py +13 -0
- dcs_core/integrations/storage/local_file.py +149 -0
- dcs_core/integrations/utils/__init__.py +13 -0
- dcs_core/integrations/utils/utils.py +36 -0
- dcs_core/report/__init__.py +13 -0
- dcs_core/report/dashboard.py +211 -0
- dcs_core/report/models.py +88 -0
- dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
- dcs_core/report/static/assets/images/docs.svg +6 -0
- dcs_core/report/static/assets/images/github.svg +4 -0
- dcs_core/report/static/assets/images/logo.svg +7 -0
- dcs_core/report/static/assets/images/slack.svg +13 -0
- dcs_core/report/static/index.js +2 -0
- dcs_core/report/static/index.js.LICENSE.txt +3971 -0
- dcs_sdk/__version__.py +1 -1
- dcs_sdk/cli/cli.py +3 -0
- {dcs_sdk-1.6.4.dist-info → dcs_sdk-1.6.6.dist-info}/METADATA +24 -2
- dcs_sdk-1.6.6.dist-info/RECORD +159 -0
- {dcs_sdk-1.6.4.dist-info → dcs_sdk-1.6.6.dist-info}/entry_points.txt +1 -0
- dcs_sdk-1.6.4.dist-info/RECORD +0 -72
- {dcs_sdk-1.6.4.dist-info → dcs_sdk-1.6.6.dist-info}/WHEEL +0 -0
dcs_core/integrations/databases/postgres.py (new file)
@@ -0,0 +1,570 @@
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime
import math
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple
from uuid import UUID

from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
from dcs_core.core.common.models.data_source_resource import RawColumnInfo
from dcs_core.core.datasource.sql_datasource import SQLDataSource


class PostgresDataSource(SQLDataSource):
    def __init__(self, data_source_name: str, data_connection: Dict):
        super().__init__(data_source_name, data_connection)
        self.DEFAULT_NUMERIC_PRECISION = 16383

    def connect(self) -> Any:
        """
        Connect to the data source
        """
        try:
            url = URL.create(
                drivername="postgresql",
                username=self.data_connection.get("username"),
                password=self.data_connection.get("password"),
                host=self.data_connection.get("host"),
                port=self.data_connection.get("port"),
                database=self.data_connection.get("database"),
            )
            schema = self.data_connection.get("schema") or "public"
            engine = create_engine(
                url,
                connect_args={"options": f"-csearch_path={schema}"},
                isolation_level="AUTOCOMMIT",
            )
            self.connection = engine.connect()
            return self.connection
        except Exception as e:
            raise DataChecksDataSourcesConnectionError(
                message=f"Failed to connect to PostgresSQL data source: [{str(e)}]"
            )

    def qualified_table_name(self, table_name: str) -> str:
        """
        Get the qualified table name
        :param table_name: name of the table
        :return: qualified table name
        """
        if self.schema_name:
            return f'"{self.schema_name}"."{table_name}"'
        return f'"{table_name}"'

    def quote_column(self, column: str) -> str:
        """
        Quote the column name
        :param column: name of the column
        :return: quoted column name
        """
        return f'"{column}"'

    def query_get_database_version(self, database_version_query: Optional[str] = None) -> str:
        """
        Get the database version
        :return: version string
        """
        query = database_version_query or "SELECT version()"
        result = self.fetchone(query)[0]
        return result if result else None

    def query_get_table_names(
        self,
        schema: str | None = None,
        with_view: bool = False,
    ) -> dict:
        """
        Get the list of tables in the database.
        :param schema: optional schema name
        :param with_view: whether to include views
        :return: dictionary with table names and optionally view names
        """

        schema = schema or self.schema_name
        database = self.quote_database(self.database)

        if with_view:
            table_type_condition = "table_type IN ('BASE TABLE', 'VIEW')"
        else:
            table_type_condition = "table_type = 'BASE TABLE'"

        query = (
            f"SELECT table_name, table_type FROM {database}.information_schema.tables "
            f"WHERE table_schema = '{schema}' AND {table_type_condition}"
        )
        rows = self.fetchall(query)

        if with_view:
            result = {"table": [], "view": []}
            if rows:
                for row in rows:
                    table_name = row[0]
                    table_type = row[1].strip() if row[1] else row[1]

                    if table_type == "BASE TABLE":
                        result["table"].append(table_name)
                    elif table_type == "VIEW":
                        result["view"].append(table_name)
        else:
            result = {"table": []}
            if rows:
                result["table"] = [row[0] for row in rows]

        return result

    def query_get_table_indexes(self, table: str, schema: str | None = None) -> dict[str, dict]:
        """
        Get index information for a table in PostgreSQL DB.
        :param table: Table name
        :param schema: Optional schema name
        :return: Dictionary with index details
        """
        schema = schema or self.schema_name
        table = table.lower()
        schema = schema.lower()

        query = f"""
            SELECT
                i.relname AS index_name,
                am.amname AS index_type,
                a.attname AS column_name,
                x.n AS column_order
            FROM
                pg_class t
            JOIN
                pg_namespace ns ON ns.oid = t.relnamespace
            JOIN
                pg_index ix ON t.oid = ix.indrelid
            JOIN
                pg_class i ON i.oid = ix.indexrelid
            JOIN
                pg_am am ON i.relam = am.oid
            JOIN
                LATERAL unnest(ix.indkey) WITH ORDINALITY AS x(attnum, n)
                ON TRUE
            JOIN
                pg_attribute a ON a.attnum = x.attnum AND a.attrelid = t.oid
            WHERE
                t.relkind = 'r'
                AND t.relname = '{table}'
                AND ns.nspname = '{schema}'
            ORDER BY
                i.relname, x.n
        """
        rows = self.fetchall(query)

        if not rows:
            raise RuntimeError(f"No index information found for table '{table}' in schema '{schema}'.")

        pk_query = f"""
            SELECT kcu.column_name
            FROM information_schema.table_constraints tc
            JOIN information_schema.key_column_usage kcu
                ON tc.constraint_name = kcu.constraint_name
                AND tc.constraint_schema = kcu.constraint_schema
                AND tc.table_name = kcu.table_name
            WHERE tc.constraint_type = 'PRIMARY KEY'
                AND tc.table_name = '{table}'
                AND tc.table_schema = '{schema}'
            ORDER BY kcu.ordinal_position
        """
        pk_rows = self.fetchall(pk_query)
        pk_columns = [row[0].strip() for row in pk_rows] if pk_rows else []
        pk_columns_set = set(pk_columns)

        indexes = {}
        for row in rows:
            index_name = row[0]
            index_type = row[1]
            column_info = {
                "column_name": self.safe_get(row, 2),
                "column_order": self.safe_get(row, 3),
            }
            if index_name not in indexes:
                indexes[index_name] = {"columns": [], "index_type": index_type}
            indexes[index_name]["columns"].append(column_info)

        for index_name, idx in indexes.items():
            index_columns = [col["column_name"].strip() for col in idx["columns"]]
            index_columns_set = set(index_columns)
            idx["is_primary_key"] = pk_columns_set == index_columns_set and len(index_columns) == len(pk_columns)
        return indexes

    def query_get_table_columns(
        self,
        table: str,
        schema: str | None = None,
    ) -> RawColumnInfo:
        """
        Get the schema of a table.
        :param table: table name
        :return: RawColumnInfo object containing column information
        """
        schema = schema or self.schema_name
        info_schema_path = ["information_schema", "columns"]
        if self.database:
            database = self.quote_database(self.database)
            info_schema_path.insert(0, database)
        query = (
            f"SELECT column_name, data_type, datetime_precision, "
            f"CASE WHEN data_type = 'numeric' "
            f"THEN coalesce(numeric_precision, 131072 + {self.DEFAULT_NUMERIC_PRECISION}) "
            f"ELSE numeric_precision END AS numeric_precision, "
            f"CASE WHEN data_type = 'numeric' "
            f"THEN coalesce(numeric_scale, {self.DEFAULT_NUMERIC_PRECISION}) "
            f"ELSE numeric_scale END AS numeric_scale, "
            f"COALESCE(collation_name, NULL) AS collation_name, "
            f"CASE WHEN data_type = 'character varying' "
            f"THEN character_maximum_length END AS character_maximum_length "
            f"FROM {'.'.join(info_schema_path)} "
            f"WHERE table_name = '{table}' AND table_schema = '{schema}'"
        )
        rows = self.fetchall(query)
        if not rows:
            raise RuntimeError(f"{table}: Table, {schema}: Schema, does not exist, or has no columns")

        column_info = {
            r[0]: RawColumnInfo(
                column_name=self.safe_get(r, 0),
                data_type=self.safe_get(r, 1),
                datetime_precision=self.safe_get(r, 2),
                numeric_precision=self.safe_get(r, 3),
                numeric_scale=self.safe_get(r, 4),
                collation_name=self.safe_get(r, 5),
                character_maximum_length=self.safe_get(r, 6),
            )
            for r in rows
        }
        return column_info

    def fetch_rows(
        self,
        query: str,
        limit: int = 1,
        with_column_names: bool = False,
        complete_query: Optional[str] = None,
    ) -> Tuple[List, Optional[List[str]]]:
        """
        Fetch rows from the database.

        :param query: SQL query to execute.
        :param limit: Number of rows to fetch.
        :param with_column_names: Whether to include column names in the result.
        :return: Tuple of (rows, column_names or None)
        """
        query = complete_query or f"SELECT * FROM ({query}) AS subquery LIMIT {limit}"

        result = self.connection.execute(text(query))
        rows = result.fetchmany(limit)

        if with_column_names:
            column_names = result.keys()
            return rows, list(column_names)
        else:
            return rows, None

    def fetch_sample_values_from_database(
        self,
        table_name: str,
        column_names: list[str],
        limit: int = 5,
    ) -> List[Tuple]:
        """
        Fetch sample rows for specific columns from the given table.

        :param table_name: The name of the table.
        :param column_names: List of column names to fetch.
        :param limit: Number of rows to fetch.
        :return: List of row tuples.
        """
        table_name = self.qualified_table_name(table_name)

        if not column_names:
            raise ValueError("At least one column name must be provided")

        if len(column_names) == 1 and column_names[0] == "*":
            query = f"SELECT * FROM {table_name} LIMIT {limit}"
        else:
            columns = ", ".join([self.quote_column(col) for col in column_names])
            query = f"SELECT {columns} FROM {table_name} LIMIT {limit}"

        result = self.connection.execute(text(query))
        column_names = list(result.keys())
        rows = result.fetchall()
        return rows, column_names

    def build_table_metrics_query(
        self,
        table_name: str,
        column_info: list[dict],
        additional_queries: Optional[List[str]] = None,
    ) -> list[dict]:
        query_parts = []
        if not column_info:
            return []

        for col in column_info:
            name = col["column_name"]
            dtype = col["data_type"].lower()
            quoted = self.quote_column(name)

            if dtype in ("json", "jsonb"):
                distinct_expr = f"{quoted}::text"
            else:
                distinct_expr = f"{quoted}"

            query_parts.append(f'COUNT(DISTINCT {distinct_expr}) AS "{name}_distinct"')
            query_parts.append(f'COUNT(*) - COUNT(DISTINCT {distinct_expr}) AS "{name}_duplicate"')
            query_parts.append(
                f'SUM(CASE WHEN {self.quote_column(name)} IS NULL THEN 1 ELSE 0 END) AS "{name}_is_null"'
            )

            if dtype in (
                "int",
                "integer",
                "bigint",
                "smallint",
                "decimal",
                "numeric",
                "float",
                "double",
            ):
                query_parts.append(f'MIN({self.quote_column(name)}) AS "{name}_min"')
                query_parts.append(f'MAX({self.quote_column(name)}) AS "{name}_max"')
                query_parts.append(f'AVG({self.quote_column(name)}) AS "{name}_average"')

            elif dtype in ("varchar", "text", "char", "string", "character varying"):
                query_parts.append(f'MAX(CHAR_LENGTH({self.quote_column(name)})) AS "{name}_max_character_length"')

        if additional_queries:
            for queries in additional_queries:
                query_parts.append(queries)

        qualified_table = self.qualified_table_name(table_name)
        joined_parts = ",\n ".join(query_parts)
        query = f"SELECT\n {joined_parts}\nFROM {qualified_table};"

        result = self.connection.execute(text(query))
        row = dict(list(result)[0]._mapping)

        def _normalize_metrics(value):
            """
            Safely normalizes DB metric values into JSON-serializable Python types.
            Handles:
            - Decimal → float
            - datetime/date → ISO 8601 string
            - UUID → string
            - Nested dict/list recursion
            - None passthrough
            """
            if value is None:
                return None

            if isinstance(value, Decimal):
                return float(value)
            if isinstance(value, (int, float, bool)):
                return value

            if isinstance(value, (datetime.datetime, datetime.date)):
                return value.isoformat()

            if isinstance(value, UUID):
                return str(value)

            if isinstance(value, list):
                return [_normalize_metrics(v) for v in value]
            if isinstance(value, dict):
                return {k: _normalize_metrics(v) for k, v in value.items()}

            return str(value)

        column_wise = []
        for col in column_info:
            name = col["column_name"]
            col_metrics = {}

            for key, value in row.items():
                if key.startswith(f"{name}_"):
                    metric_name = key[len(name) + 1 :]
                    col_metrics[metric_name] = _normalize_metrics(value)

            column_wise.append({"column_name": name, "metrics": col_metrics})

        for col_data in column_wise:
            metrics = col_data["metrics"]
            distinct_count = metrics.get("distinct")
            col_name = col_data["column_name"]
            dtype = next(c["data_type"].lower() for c in column_info if c["column_name"] == col_name)

            quoted = self.quote_column(col_name)

            is_dtype_numeric = (
                True
                if dtype
                in (
                    "int",
                    "integer",
                    "bigint",
                    "smallint",
                    "decimal",
                    "numeric",
                    "float",
                    "double",
                )
                else False
            )

            if is_dtype_numeric:
                col_min = metrics.get("min")
                col_max = metrics.get("max")

                if col_min is not None and col_max is not None and col_min != col_max:
                    bucket_count = 20
                    bucket_size = (col_max - col_min) / bucket_count

                    bucket_queries = []
                    for i in range(bucket_count):
                        start = col_min + i * bucket_size
                        end = col_min + (i + 1) * bucket_size

                        bucket_queries.append(
                            f"SUM(CASE WHEN {quoted} >= {start} AND {quoted} < {end} THEN 1 ELSE 0 END) AS bucket_{i}"
                        )

                    bucket_sql = f"SELECT {', '.join(bucket_queries)} FROM {qualified_table}"

                    try:
                        bucket_result = self.connection.execute(text(bucket_sql)).fetchone()
                        distribution = []

                        for i in range(bucket_count):
                            start_raw = col_min + i * bucket_size
                            end_raw = col_min + (i + 1) * bucket_size
                            if dtype in ("int", "integer", "bigint", "smallint"):
                                start = math.floor(start_raw)
                                end = math.ceil(end_raw)
                            else:
                                start = round(start_raw, 2)
                                end = round(end_raw, 2)
                            count = bucket_result[i]

                            distribution.append(
                                {
                                    "col_val": f"{start} - {end}",
                                    "count": count,
                                }
                            )

                        metrics["distribution_graph"] = distribution

                    except Exception as e:
                        print(f"Failed to generate numeric distribution for {col_name}: {e}")

                continue

            if isinstance(distinct_count, (int, float)) and distinct_count <= 20:
                if dtype in ("json", "jsonb"):
                    group_expr = f"{quoted}::text"
                else:
                    group_expr = quoted

                dist_query = (
                    f"SELECT {group_expr}, COUNT(*) "
                    f"FROM {qualified_table} GROUP BY {group_expr} ORDER BY COUNT(*) DESC"
                )

                try:
                    dist_result = self.connection.execute(text(dist_query)).fetchall()

                    distribution = []
                    for r in dist_result:
                        val = _normalize_metrics(r[0])
                        distribution.append(
                            {
                                "col_val": val,
                                "count": r[1],
                            }
                        )

                    metrics["distribution_graph"] = distribution

                except Exception as e:
                    print(f"Failed to generate distribution graph for column {col_name}: {e}")

        for col_data in column_wise:
            metrics = col_data["metrics"]
            distinct_count = metrics.get("distinct")
            col_name = col_data["column_name"]
            dtype = next(c["data_type"].lower() for c in column_info if c["column_name"] == col_name)

            quoted = self.quote_column(col_name)

            is_dtype_numeric = (
                True
                if dtype
                in (
                    "int",
                    "integer",
                    "bigint",
                    "smallint",
                    "decimal",
                    "numeric",
                    "float",
                    "double",
                )
                else False
            )

            formatted_metrics_data = {
                "general_data": {key: value for key, value in metrics.items() if key != "distribution_graph"},
                "is_dtype_numeric": is_dtype_numeric,
                "distribution_data": metrics.get("distribution_graph", []),
            }
            col_data["metrics"] = formatted_metrics_data

        return column_wise

    def get_table_foreign_key_info(self, table_name: str, schema: str | None = None):
        schema = schema or self.schema_name

        query = f"""
            SELECT
                con.conname AS constraint_name,
                rel_t.relname AS table_name,
                att_t.attname AS fk_column,
                rel_p.relname AS referenced_table,
                att_p.attname AS referenced_column
            FROM pg_constraint con
            JOIN pg_class rel_t ON rel_t.oid = con.conrelid
            JOIN pg_namespace nsp_t ON nsp_t.oid = rel_t.relnamespace
            JOIN pg_class rel_p ON rel_p.oid = con.confrelid
            JOIN pg_namespace nsp_p ON nsp_p.oid = rel_p.relnamespace
            JOIN pg_attribute att_t ON att_t.attrelid = rel_t.oid AND att_t.attnum = ANY(con.conkey)
            JOIN pg_attribute att_p ON att_p.attrelid = rel_p.oid AND att_p.attnum = ANY(con.confkey)
            WHERE con.contype = 'f'
                AND rel_t.relname = '{table_name}'
                AND nsp_t.nspname = '{schema}';
        """
        try:
            result = self.connection.execute(text(query))
        except Exception as e:
            print(f"Failed to fetch fk info for dataset {table_name}")
            return []
        all_results = [dict(row._mapping) for row in result]
        return all_results
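For reference, a minimal usage sketch of the new `PostgresDataSource` added in this release. The connection keys (`username`, `password`, `host`, `port`, `database`, `schema`) are the ones read by `connect()` above; the data source name and credential values are hypothetical, and helpers such as `schema_name`, `database`, and `fetchall` are assumed to be wired up by the `SQLDataSource` base class shipped in the same wheel.

```python
from dcs_core.integrations.databases.postgres import PostgresDataSource

# Hypothetical connection details; the keys mirror the data_connection.get(...) calls in connect()
connection = {
    "username": "analytics",
    "password": "secret",
    "host": "localhost",
    "port": 5432,
    "database": "warehouse",
    "schema": "public",  # optional; connect() falls back to "public"
}

ds = PostgresDataSource("pgsql_dw", connection)
ds.connect()  # raises DataChecksDataSourcesConnectionError on failure

# List base tables and views in the configured schema
tables = ds.query_get_table_names(with_view=True)
print(tables)  # e.g. {"table": [...], "view": [...]}
```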
dcs_core/integrations/databases/redshift.py (new file)
@@ -0,0 +1,53 @@
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Dict

from sqlalchemy import create_engine
from sqlalchemy.engine import URL

from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
from dcs_core.core.datasource.sql_datasource import SQLDataSource


class RedShiftDataSource(SQLDataSource):
    def __init__(self, data_source_name: str, data_connection: Dict):
        super().__init__(data_source_name, data_connection)

    def connect(self) -> Any:
        """
        Connect to the data source
        """
        try:
            url = URL.create(
                "redshift+psycopg2",
                username=self.data_connection.get("username"),
                password=self.data_connection.get("password"),
                host=self.data_connection.get("host"),
                port=self.data_connection.get("port"),
                database=self.data_connection.get("database"),
            )
            schema = self.data_connection.get("schema")
            engine = create_engine(
                url,
                connect_args={"options": f"-csearch_path={schema}"} if schema else None,
                isolation_level="AUTOCOMMIT",
            )

            self.connection = engine.connect()
            return self.connection
        except Exception as e:
            raise DataChecksDataSourcesConnectionError(
                message=f"Failed to connect to AWS RedShift data source: [{str(e)}]"
            )
dcs_core/integrations/databases/snowflake.py (new file)
@@ -0,0 +1,48 @@
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import urllib.parse
from typing import Any, Dict

from snowflake.sqlalchemy import URL
from sqlalchemy import create_engine

from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
from dcs_core.core.datasource.sql_datasource import SQLDataSource


class SnowFlakeDataSource(SQLDataSource):
    def __init__(self, data_source_name: str, data_connection: Dict):
        super().__init__(data_source_name, data_connection)

    def connect(self) -> Any:
        """
        Connect to the data source
        """
        try:
            url = URL(
                account=self.data_connection.get("account"),
                user=self.data_connection.get("username"),
                password=urllib.parse.quote(self.data_connection.get("password")),
                database=self.data_connection.get("database"),
                schema=self.data_connection.get("schema"),
                warehouse=self.data_connection.get("warehouse"),
                role=self.data_connection.get("role"),
            )
            engine = create_engine(url)
            self.connection = engine.connect()
            return self.connection
        except Exception as e:
            raise DataChecksDataSourcesConnectionError(
                message=f"Failed to connect to Snowflake data source: [{str(e)}]"
            )
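Similarly, a hedged sketch of the connection dictionary consumed by `SnowFlakeDataSource.connect()`. The keys match the `data_connection.get(...)` lookups above (`account`, `username`, `password`, `database`, `schema`, `warehouse`, `role`); all example values are placeholders.

```python
from dcs_core.integrations.databases.snowflake import SnowFlakeDataSource

# Placeholder values; note that connect() URL-quotes the password before building the URL
connection = {
    "account": "xy12345.us-east-1",
    "username": "dcs_user",
    "password": "p@ss/word",
    "database": "ANALYTICS",
    "schema": "PUBLIC",
    "warehouse": "COMPUTE_WH",
    "role": "REPORTING",
}

ds = SnowFlakeDataSource("snowflake_dw", connection)
conn = ds.connect()  # returns a SQLAlchemy connection, or raises DataChecksDataSourcesConnectionError
```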