dcs-sdk 1.6.4__py3-none-any.whl → 1.6.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dcs_core/__init__.py +13 -0
- dcs_core/__main__.py +17 -0
- dcs_core/__version__.py +15 -0
- dcs_core/cli/__init__.py +13 -0
- dcs_core/cli/cli.py +165 -0
- dcs_core/core/__init__.py +19 -0
- dcs_core/core/common/__init__.py +13 -0
- dcs_core/core/common/errors.py +50 -0
- dcs_core/core/common/models/__init__.py +13 -0
- dcs_core/core/common/models/configuration.py +284 -0
- dcs_core/core/common/models/dashboard.py +24 -0
- dcs_core/core/common/models/data_source_resource.py +75 -0
- dcs_core/core/common/models/metric.py +160 -0
- dcs_core/core/common/models/profile.py +75 -0
- dcs_core/core/common/models/validation.py +216 -0
- dcs_core/core/common/models/widget.py +44 -0
- dcs_core/core/configuration/__init__.py +13 -0
- dcs_core/core/configuration/config_loader.py +139 -0
- dcs_core/core/configuration/configuration_parser.py +262 -0
- dcs_core/core/configuration/configuration_parser_arc.py +328 -0
- dcs_core/core/datasource/__init__.py +13 -0
- dcs_core/core/datasource/base.py +62 -0
- dcs_core/core/datasource/manager.py +112 -0
- dcs_core/core/datasource/search_datasource.py +421 -0
- dcs_core/core/datasource/sql_datasource.py +1094 -0
- dcs_core/core/inspect.py +163 -0
- dcs_core/core/logger/__init__.py +13 -0
- dcs_core/core/logger/base.py +32 -0
- dcs_core/core/logger/default_logger.py +94 -0
- dcs_core/core/metric/__init__.py +13 -0
- dcs_core/core/metric/base.py +220 -0
- dcs_core/core/metric/combined_metric.py +98 -0
- dcs_core/core/metric/custom_metric.py +34 -0
- dcs_core/core/metric/manager.py +137 -0
- dcs_core/core/metric/numeric_metric.py +403 -0
- dcs_core/core/metric/reliability_metric.py +90 -0
- dcs_core/core/profiling/__init__.py +13 -0
- dcs_core/core/profiling/datasource_profiling.py +136 -0
- dcs_core/core/profiling/numeric_field_profiling.py +72 -0
- dcs_core/core/profiling/text_field_profiling.py +67 -0
- dcs_core/core/repository/__init__.py +13 -0
- dcs_core/core/repository/metric_repository.py +77 -0
- dcs_core/core/utils/__init__.py +13 -0
- dcs_core/core/utils/log.py +29 -0
- dcs_core/core/utils/tracking.py +105 -0
- dcs_core/core/utils/utils.py +44 -0
- dcs_core/core/validation/__init__.py +13 -0
- dcs_core/core/validation/base.py +230 -0
- dcs_core/core/validation/completeness_validation.py +153 -0
- dcs_core/core/validation/custom_query_validation.py +24 -0
- dcs_core/core/validation/manager.py +282 -0
- dcs_core/core/validation/numeric_validation.py +276 -0
- dcs_core/core/validation/reliability_validation.py +91 -0
- dcs_core/core/validation/uniqueness_validation.py +61 -0
- dcs_core/core/validation/validity_validation.py +738 -0
- dcs_core/integrations/__init__.py +13 -0
- dcs_core/integrations/databases/__init__.py +13 -0
- dcs_core/integrations/databases/bigquery.py +187 -0
- dcs_core/integrations/databases/databricks.py +51 -0
- dcs_core/integrations/databases/db2.py +652 -0
- dcs_core/integrations/databases/elasticsearch.py +61 -0
- dcs_core/integrations/databases/mssql.py +979 -0
- dcs_core/integrations/databases/mysql.py +409 -0
- dcs_core/integrations/databases/opensearch.py +64 -0
- dcs_core/integrations/databases/oracle.py +719 -0
- dcs_core/integrations/databases/postgres.py +570 -0
- dcs_core/integrations/databases/redshift.py +53 -0
- dcs_core/integrations/databases/snowflake.py +48 -0
- dcs_core/integrations/databases/spark_df.py +111 -0
- dcs_core/integrations/databases/sybase.py +1069 -0
- dcs_core/integrations/storage/__init__.py +13 -0
- dcs_core/integrations/storage/local_file.py +149 -0
- dcs_core/integrations/utils/__init__.py +13 -0
- dcs_core/integrations/utils/utils.py +36 -0
- dcs_core/report/__init__.py +13 -0
- dcs_core/report/dashboard.py +211 -0
- dcs_core/report/models.py +88 -0
- dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
- dcs_core/report/static/assets/images/docs.svg +6 -0
- dcs_core/report/static/assets/images/github.svg +4 -0
- dcs_core/report/static/assets/images/logo.svg +7 -0
- dcs_core/report/static/assets/images/slack.svg +13 -0
- dcs_core/report/static/index.js +2 -0
- dcs_core/report/static/index.js.LICENSE.txt +3971 -0
- dcs_sdk/__version__.py +1 -1
- dcs_sdk/cli/cli.py +3 -0
- {dcs_sdk-1.6.4.dist-info → dcs_sdk-1.6.6.dist-info}/METADATA +24 -2
- dcs_sdk-1.6.6.dist-info/RECORD +159 -0
- {dcs_sdk-1.6.4.dist-info → dcs_sdk-1.6.6.dist-info}/entry_points.txt +1 -0
- dcs_sdk-1.6.4.dist-info/RECORD +0 -72
- {dcs_sdk-1.6.4.dist-info → dcs_sdk-1.6.6.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,409 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from datetime import datetime
|
|
16
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
17
|
+
|
|
18
|
+
from loguru import logger
|
|
19
|
+
from sqlalchemy import create_engine, text
|
|
20
|
+
from sqlalchemy.engine import URL
|
|
21
|
+
|
|
22
|
+
from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
|
|
23
|
+
from dcs_core.core.common.models.data_source_resource import RawColumnInfo
|
|
24
|
+
from dcs_core.integrations.databases.db2 import DB2DataSource
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class MysqlDataSource(DB2DataSource):
    """MySQL data source.

    Reuses the generic SQL plumbing inherited from ``DB2DataSource`` and
    overrides the MySQL-specific parts: connection URL building, backtick
    identifier quoting, ``information_schema`` metadata lookups and
    REGEXP-based validity checks.

    NOTE(review): table names, column names and filter fragments are
    interpolated directly into SQL strings throughout this class — callers
    must only pass trusted identifiers/filters.
    """

    def __init__(self, data_source_name: str, data_connection: Dict):
        super().__init__(data_source_name, data_connection)
        # These patterns are embedded into SQL string literals ('{regex}'),
        # so the backslashes are doubled on purpose: MySQL's string parser
        # reduces "\\d" to "\d" before the regex engine sees it.
        self.regex_patterns = {
            "uuid": r"^[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$",
            "usa_phone": r"^\\+?1?[-.[:space:]]?\\(?[0-9]{3}\\)?[-.[:space:]]?[0-9]{3}[-.[:space:]]?[0-9]{4}$",
            "email": r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$",
            # Fixed: the previous value duplicated the SEDOL-style pattern
            # and could never match a ZIP code. A USA ZIP code is five
            # digits with an optional ZIP+4 extension.
            "usa_zip_code": r"^[0-9]{5}(-[0-9]{4})?$",
            "ssn": r"^(?!666|000|9\\d{2})\\d{3}-(?!00)\\d{2}-(?!0{4})\\d{4}$",
            "sedol": r"^[B-DF-HJ-NP-TV-XZ0-9]{6}[0-9]$",
            "lei": r"^[A-Z0-9]{18}[0-9]{2}$",
            "cusip": r"^[0-9A-Z]{9}$",
            "figi": r"^BBG[A-Z0-9]{9}$",
            "isin": r"^[A-Z]{2}[A-Z0-9]{9}[0-9]$",
            "perm_id": r"^[0-9]{4}[- ]?[0-9]{4}[- ]?[0-9]{4}[- ]?[0-9]{4}[- ]?[0-9]{2,3}$",
        }

    def connect(self) -> Any:
        """
        Connect to the data source.

        :return: an open SQLAlchemy connection (also stored on ``self.connection``)
        :raises DataChecksDataSourcesConnectionError: if the connection fails
        """
        try:
            ssl = self.data_connection.get("security", False) in ["ssl", "SSL"]
            # MySQL convention: fall back to the username as the schema when
            # no explicit schema is configured.
            self.schema_name = self.data_connection.get("schema") or self.data_connection.get("username")
            url = URL.create(
                drivername="mysql+pymysql",
                username=self.data_connection.get("username"),
                password=self.data_connection.get("password"),
                host=self.data_connection.get("host"),
                port=self.data_connection.get("port"),
                database=self.data_connection.get("database"),
            )
            engine = create_engine(
                url,
                isolation_level="AUTOCOMMIT",
                connect_args={"ssl": {"ssl": ssl} if ssl else None},
            )
            self.connection = engine.connect()
            return self.connection
        except Exception as e:
            raise DataChecksDataSourcesConnectionError(message=f"Failed to connect to Mysql data source: [{str(e)}]")

    def qualified_table_name(self, table_name: str) -> str:
        """
        Get the qualified table name
        :param table_name: name of the table
        :return: qualified table name (backtick-quoted, schema-prefixed when known)
        """
        if self.schema_name:
            return f"`{self.schema_name}`.`{table_name}`"
        return f"`{table_name}`"

    def quote_column(self, column: str) -> str:
        """
        Quote the column name
        :param column: name of the column
        :return: quoted column name (MySQL backticks)
        """
        return f"`{column}`"

    def query_get_table_names(
        self,
        schema: str | None = None,
        with_view: bool = False,
    ) -> dict:
        """
        Get the list of tables in the database.

        In MySQL the schema namespace is the database, so the ``schema``
        argument is ignored and ``self.database`` is used instead.

        :param schema: ignored (MySQL uses the current database)
        :param with_view: whether to include views
        :return: {"table": [...]} or {"table": [...], "view": [...]} when with_view
        """
        database = self.database
        if with_view:
            table_type_condition = "TABLES.TABLE_TYPE IN ('BASE TABLE', 'VIEW')"
        else:
            table_type_condition = "TABLES.TABLE_TYPE = 'BASE TABLE'"

        query = f"SELECT TABLES.TABLE_NAME, TABLES.TABLE_TYPE FROM information_schema.tables WHERE TABLES.TABLE_SCHEMA = '{database}' and {table_type_condition}"
        rows = self.fetchall(query)

        if with_view:
            result = {"table": [], "view": []}
            if rows:
                for row in rows:
                    table_name = row[0]
                    table_type = row[1].strip() if row[1] else row[1]

                    if table_type == "BASE TABLE":
                        result["table"].append(table_name)
                    elif table_type == "VIEW":
                        result["view"].append(table_name)
        else:
            result = {"table": []}
            if rows:
                result["table"] = [row[0] for row in rows]

        return result

    def query_get_table_columns(self, table: str, schema: str | None = None) -> RawColumnInfo:
        """
        Get the schema of a table.

        :param table: table name
        :param schema: ignored — MySQL uses ``self.database`` as the schema
        :return: dict mapping column name -> RawColumnInfo
        :raises RuntimeError: if the table does not exist or has no columns
        """
        schema = self.database
        query = (
            "SELECT column_name, data_type, datetime_precision, numeric_precision, numeric_scale, NULL as collation_name, character_maximum_length "
            "FROM information_schema.columns "
            f"WHERE table_name = '{table}' AND table_schema = '{schema}'"
        )
        rows = self.fetchall(query)
        if not rows:
            raise RuntimeError(f"{table}: Table, {schema}: Schema, does not exist, or has no columns")

        column_info = {
            r[0]: RawColumnInfo(
                column_name=self.safe_get(r, 0),
                data_type=self.safe_get(r, 1),
                datetime_precision=self.safe_get(r, 2),
                numeric_precision=self.safe_get(r, 3),
                numeric_scale=self.safe_get(r, 4),
                collation_name=self.safe_get(r, 5),
                character_maximum_length=self.safe_get(r, 6),
            )
            for r in rows
        }
        return column_info

    def fetch_rows(
        self,
        query: str,
        limit: int = 1,
        with_column_names: bool = False,
        complete_query: Optional[str] = None,
    ) -> Tuple[List, Optional[List[str]]]:
        """
        Fetch rows from the database.

        :param query: SQL query to execute (wrapped in a LIMITed subquery).
        :param limit: Number of rows to fetch.
        :param with_column_names: Whether to include column names in the result.
        :param complete_query: ready-to-run query used verbatim when given.
        :return: Tuple of (rows, column_names or None)
        """
        query = complete_query or f"SELECT * FROM ({query}) AS subquery LIMIT {limit}"

        result = self.connection.execute(text(query))
        rows = result.fetchmany(limit)

        if with_column_names:
            column_names = result.keys()
            return rows, list(column_names)
        else:
            return rows, None

    def query_get_distinct_count(self, table: str, field: str, filters: str = None) -> int:
        """
        Get the distinct count value
        :param table: table name
        :param field: column name
        :param filters: filter condition
        :return: number of distinct non-NULL values in the column
        """
        qualified_table_name = self.qualified_table_name(table)
        field = self.quote_column(field)
        query = "SELECT COUNT(DISTINCT {}) FROM {}".format(field, qualified_table_name)
        if filters:
            query += " WHERE {}".format(filters)

        return self.fetchone(query)[0]

    def query_get_percentile(self, table: str, field: str, percentile: float, filters: str = None) -> float:
        """
        Get the specified percentile value of a numeric column in a table.

        Uses NTILE(100) bucketing; NOTE(review): on tables with fewer than
        100 rows the requested bucket may be empty, in which case this
        returns None — confirm callers handle that.

        :param table: table name
        :param field: column name
        :param percentile: percentile to calculate (e.g., 0.2 for 20th percentile)
        :param filters: filter condition
        :return: the value at the specified percentile, or None when absent
        """
        qualified_table_name = self.qualified_table_name(table)
        field = self.quote_column(field)
        rank = int(percentile * 100)

        query = f"""
            SELECT {field} FROM (
                SELECT {field}, NTILE(100) OVER (ORDER BY {field}) AS percentile_rank
                FROM {qualified_table_name}
                {f'WHERE {filters}' if filters else ''}
            ) AS ranked
            WHERE percentile_rank = {rank}
            ORDER BY {field}
            LIMIT 1
        """

        result = self.fetchone(query)
        return round(result[0], 2) if result and result[0] is not None else None

    def query_negative_metric(self, table: str, field: str, operation: str, filters: str = None) -> Union[int, float]:
        """
        Count (or percentage of) negative values in a numeric column.

        :param table: table name
        :param field: column name
        :param operation: "percent" for a percentage, anything else for a count
        :param filters: filter condition
        :return: count of negative values, or percentage rounded to 2 places
        """
        qualified_table_name = self.qualified_table_name(table)
        field = self.quote_column(field)

        negative_query = f"SELECT COUNT(*) FROM {qualified_table_name} WHERE {field} < 0"

        if filters:
            negative_query += f" AND {filters}"

        total_count_query = f"SELECT COUNT(*) FROM {qualified_table_name}"

        if filters:
            total_count_query += f" WHERE {filters}"

        if operation == "percent":
            query = f"SELECT (CAST(({negative_query}) AS float) / CAST(({total_count_query}) AS float)) * 100"
        else:
            query = negative_query

        result = self.fetchone(query)[0]
        if operation == "percent":
            # An empty table makes the SQL division yield NULL; report 0.0
            # instead of crashing inside round(None, 2).
            return round(result, 2) if result is not None else 0.0
        return result

    def query_get_string_length_metric(
        self, table: str, field: str, metric: str, filters: str = None
    ) -> Union[int, float]:
        """
        Get the string length metric (max, min, avg) in a column of a table.

        :param table: table name
        :param field: column name
        :param metric: the metric to calculate ('max', 'min', 'avg')
        :param filters: filter condition
        :return: the calculated metric as int for 'max' and 'min', float for 'avg'
        :raises ValueError: if metric is not one of 'max', 'min', 'avg'
        """
        qualified_table_name = self.qualified_table_name(table)
        field = self.quote_column(field)

        # Map the metric name to its SQL aggregate once, instead of building
        # an unbalanced "MAX(LENGTH" string fragment.
        sql_function = {"max": "MAX", "min": "MIN", "avg": "AVG"}.get(metric.lower())
        if sql_function is None:
            raise ValueError(f"Invalid metric '{metric}'. Choose from 'max', 'min', or 'avg'.")

        query = f"SELECT {sql_function}(LENGTH({field})) FROM {qualified_table_name}"

        if filters:
            query += f" WHERE {filters}"

        result = self.fetchone(query)[0]
        if metric.lower() == "avg":
            # AVG over zero rows is NULL; report 0.0 instead of crashing
            # inside round(None, 2).
            return round(result, 2) if result is not None else 0.0
        return result

    def query_string_pattern_validity(
        self,
        table: str,
        field: str,
        regex_pattern: str = None,
        predefined_regex_pattern: str = None,
        filters: str = None,
    ) -> Tuple[int, int]:
        """
        Get the count of valid values based on the regex pattern.
        :param table: table name
        :param field: column name
        :param regex_pattern: custom regex pattern
        :param predefined_regex_pattern: key into ``self.regex_patterns``
        :param filters: filter condition
        :return: count of valid values, count of total row count
        :raises ValueError: if neither pattern argument is provided
        :raises KeyError: if predefined_regex_pattern is not a known key
        """
        filters = f"WHERE {filters}" if filters else ""
        qualified_table_name = self.qualified_table_name(table)
        field = self.quote_column(field)

        if not regex_pattern and not predefined_regex_pattern:
            raise ValueError("Either regex_pattern or predefined_regex_pattern should be provided")

        if predefined_regex_pattern:
            regex = self.regex_patterns[predefined_regex_pattern]
        else:
            regex = regex_pattern

        regex_query = f"CASE WHEN {field} REGEXP '{regex}' THEN 1 ELSE 0 END"
        query = f"""
            SELECT SUM({regex_query}) AS valid_count, COUNT(*) AS total_count
            FROM {qualified_table_name} {filters}
        """
        result = self.fetchone(query)
        return result[0], result[1]

    def query_get_usa_state_code_validity(self, table: str, field: str, filters: str = None) -> Tuple[int, int]:
        """
        Get the count of valid USA state codes
        :param table: table name
        :param field: column name
        :param filters: filter condition
        :return: count of valid state codes, count of total row count
        """
        # self.valid_state_codes is provided by the base data source.
        valid_state_codes_str = ", ".join(f"'{code}'" for code in self.valid_state_codes)

        filters = f"WHERE {filters}" if filters else ""

        qualified_table_name = self.qualified_table_name(table)
        field = self.quote_column(field)

        regex_query = f"""
            CASE WHEN REGEXP_LIKE({field}, '^[A-Z]{{2}}$') AND UPPER({field}) IN ({valid_state_codes_str}) THEN 1 ELSE 0 END
        """

        query = f"""
            SELECT SUM({regex_query}) AS valid_count, COUNT(*) AS total_count
            FROM {qualified_table_name} {filters}
        """
        result = self.fetchone(query)
        return result[0], result[1]

    def query_timestamp_metric(self):
        """Not supported for MySQL."""
        raise NotImplementedError("Method not implemented for MySQLDataSource")

    def query_timestamp_not_in_future_metric(self):
        """Not supported for MySQL."""
        raise NotImplementedError("Method not implemented for MySQLDataSource")

    def query_timestamp_date_not_in_future_metric(self):
        """Not supported for MySQL."""
        raise NotImplementedError("Method not implemented for MySQLDataSource")

    def query_get_time_diff(self, table: str, field: str) -> int:
        """
        Get the time difference
        :param table: name of the index
        :param field: field name of updated time column
        :return: time difference in seconds (0 when the table is empty)
        """
        qualified_table_name = self.qualified_table_name(table)
        field = self.quote_column(field)
        query = f"""
            SELECT {field}
            FROM {qualified_table_name}
            ORDER BY {field} DESC
            LIMIT 1;
        """
        result = self.fetchone(query)
        if result:
            updated_time = result[0]
            if isinstance(updated_time, str):
                # NOTE(review): assumes a fractional-seconds timestamp string;
                # a value without ".%f" would raise here — confirm upstream format.
                updated_time = datetime.strptime(updated_time, "%Y-%m-%d %H:%M:%S.%f")
            return int((datetime.utcnow() - updated_time).total_seconds())
        return 0

    def get_table_foreign_key_info(self, table_name: str, schema: str | None = None):
        """
        List foreign-key constraints declared on a table.

        :param table_name: table to inspect
        :param schema: schema name; defaults to ``self.schema_name``
        :return: list of dicts (constraint_name, table_name, fk_column,
                 referenced_table, referenced_column); empty list on failure
        """
        schema = schema or self.schema_name

        query = f"""
            SELECT
                kcu.CONSTRAINT_NAME AS constraint_name,
                kcu.TABLE_NAME AS table_name,
                kcu.COLUMN_NAME AS fk_column,
                kcu.REFERENCED_TABLE_NAME AS referenced_table,
                kcu.REFERENCED_COLUMN_NAME AS referenced_column
            FROM information_schema.TABLE_CONSTRAINTS tc
            JOIN information_schema.KEY_COLUMN_USAGE kcu
                ON tc.CONSTRAINT_NAME = kcu.CONSTRAINT_NAME
                AND tc.TABLE_SCHEMA = kcu.TABLE_SCHEMA
            WHERE tc.CONSTRAINT_TYPE = 'FOREIGN KEY'
                AND tc.TABLE_NAME = '{table_name}'
                AND tc.TABLE_SCHEMA = '{schema}';
        """

        try:
            rows = self.fetchall(query)
        except Exception as e:
            # Best-effort: log and return an empty list rather than failing
            # the whole inspection run.
            logger.error(f"Failed to fetch fk info for dataset: {table_name} ({e})")
            return []

        data = [
            {
                "constraint_name": row[0],
                "table_name": row[1],
                "fk_column": row[2],
                "referenced_table": row[3],
                "referenced_column": row[4],
            }
            for row in rows
        ]
        return data
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
from typing import Dict
|
|
15
|
+
|
|
16
|
+
from opensearchpy import OpenSearch
|
|
17
|
+
|
|
18
|
+
from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
|
|
19
|
+
from dcs_core.core.datasource.search_datasource import SearchIndexDataSource
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class OpenSearchDataSource(SearchIndexDataSource):
    """
    OpenSearch data source
    """

    def __init__(self, data_source_name: str, data_connection: Dict):
        super().__init__(data_source_name, data_connection)

    def connect(self) -> OpenSearch:
        """
        Connect to the data source
        """
        try:
            conn = self.data_connection
            credentials = (conn.get("username"), conn.get("password"))
            endpoint = {"host": conn.get("host"), "port": int(conn.get("port"))}
            # NOTE(review): SSL is enabled but certificate verification is
            # turned off (verify_certs=False, ca_certs=False) — confirm this
            # is intentional for the target deployments.
            self.client = OpenSearch(
                hosts=[endpoint],
                http_auth=credentials,
                use_ssl=True,
                verify_certs=False,
                ca_certs=False,
            )
            # A ping confirms the cluster is actually reachable, not just
            # that the client object was built.
            if not self.client.ping():
                raise Exception("Failed to connect to OpenSearch data source")
            return self.client
        except Exception as e:
            raise DataChecksDataSourcesConnectionError(f"Failed to connect to OpenSearch data source: [{str(e)}]")

    def close(self):
        """
        Close the connection
        """
        self.client.close()

    def is_connected(self) -> bool:
        """
        Check if the data source is connected
        """
        return self.client.ping()
|