dcs-sdk 1.6.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_diff/__init__.py +221 -0
- data_diff/__main__.py +517 -0
- data_diff/abcs/__init__.py +13 -0
- data_diff/abcs/compiler.py +27 -0
- data_diff/abcs/database_types.py +402 -0
- data_diff/config.py +141 -0
- data_diff/databases/__init__.py +38 -0
- data_diff/databases/_connect.py +323 -0
- data_diff/databases/base.py +1417 -0
- data_diff/databases/bigquery.py +376 -0
- data_diff/databases/clickhouse.py +217 -0
- data_diff/databases/databricks.py +262 -0
- data_diff/databases/duckdb.py +207 -0
- data_diff/databases/mssql.py +343 -0
- data_diff/databases/mysql.py +189 -0
- data_diff/databases/oracle.py +238 -0
- data_diff/databases/postgresql.py +293 -0
- data_diff/databases/presto.py +222 -0
- data_diff/databases/redis.py +93 -0
- data_diff/databases/redshift.py +233 -0
- data_diff/databases/snowflake.py +222 -0
- data_diff/databases/sybase.py +720 -0
- data_diff/databases/trino.py +73 -0
- data_diff/databases/vertica.py +174 -0
- data_diff/diff_tables.py +489 -0
- data_diff/errors.py +17 -0
- data_diff/format.py +369 -0
- data_diff/hashdiff_tables.py +1026 -0
- data_diff/info_tree.py +76 -0
- data_diff/joindiff_tables.py +434 -0
- data_diff/lexicographic_space.py +253 -0
- data_diff/parse_time.py +88 -0
- data_diff/py.typed +0 -0
- data_diff/queries/__init__.py +13 -0
- data_diff/queries/api.py +213 -0
- data_diff/queries/ast_classes.py +811 -0
- data_diff/queries/base.py +38 -0
- data_diff/queries/extras.py +43 -0
- data_diff/query_utils.py +70 -0
- data_diff/schema.py +67 -0
- data_diff/table_segment.py +583 -0
- data_diff/thread_utils.py +112 -0
- data_diff/utils.py +1022 -0
- data_diff/version.py +15 -0
- dcs_core/__init__.py +13 -0
- dcs_core/__main__.py +17 -0
- dcs_core/__version__.py +15 -0
- dcs_core/cli/__init__.py +13 -0
- dcs_core/cli/cli.py +165 -0
- dcs_core/core/__init__.py +19 -0
- dcs_core/core/common/__init__.py +13 -0
- dcs_core/core/common/errors.py +50 -0
- dcs_core/core/common/models/__init__.py +13 -0
- dcs_core/core/common/models/configuration.py +284 -0
- dcs_core/core/common/models/dashboard.py +24 -0
- dcs_core/core/common/models/data_source_resource.py +75 -0
- dcs_core/core/common/models/metric.py +160 -0
- dcs_core/core/common/models/profile.py +75 -0
- dcs_core/core/common/models/validation.py +216 -0
- dcs_core/core/common/models/widget.py +44 -0
- dcs_core/core/configuration/__init__.py +13 -0
- dcs_core/core/configuration/config_loader.py +139 -0
- dcs_core/core/configuration/configuration_parser.py +262 -0
- dcs_core/core/configuration/configuration_parser_arc.py +328 -0
- dcs_core/core/datasource/__init__.py +13 -0
- dcs_core/core/datasource/base.py +62 -0
- dcs_core/core/datasource/manager.py +112 -0
- dcs_core/core/datasource/search_datasource.py +421 -0
- dcs_core/core/datasource/sql_datasource.py +1094 -0
- dcs_core/core/inspect.py +163 -0
- dcs_core/core/logger/__init__.py +13 -0
- dcs_core/core/logger/base.py +32 -0
- dcs_core/core/logger/default_logger.py +94 -0
- dcs_core/core/metric/__init__.py +13 -0
- dcs_core/core/metric/base.py +220 -0
- dcs_core/core/metric/combined_metric.py +98 -0
- dcs_core/core/metric/custom_metric.py +34 -0
- dcs_core/core/metric/manager.py +137 -0
- dcs_core/core/metric/numeric_metric.py +403 -0
- dcs_core/core/metric/reliability_metric.py +90 -0
- dcs_core/core/profiling/__init__.py +13 -0
- dcs_core/core/profiling/datasource_profiling.py +136 -0
- dcs_core/core/profiling/numeric_field_profiling.py +72 -0
- dcs_core/core/profiling/text_field_profiling.py +67 -0
- dcs_core/core/repository/__init__.py +13 -0
- dcs_core/core/repository/metric_repository.py +77 -0
- dcs_core/core/utils/__init__.py +13 -0
- dcs_core/core/utils/log.py +29 -0
- dcs_core/core/utils/tracking.py +105 -0
- dcs_core/core/utils/utils.py +44 -0
- dcs_core/core/validation/__init__.py +13 -0
- dcs_core/core/validation/base.py +230 -0
- dcs_core/core/validation/completeness_validation.py +153 -0
- dcs_core/core/validation/custom_query_validation.py +24 -0
- dcs_core/core/validation/manager.py +282 -0
- dcs_core/core/validation/numeric_validation.py +276 -0
- dcs_core/core/validation/reliability_validation.py +91 -0
- dcs_core/core/validation/uniqueness_validation.py +61 -0
- dcs_core/core/validation/validity_validation.py +738 -0
- dcs_core/integrations/__init__.py +13 -0
- dcs_core/integrations/databases/__init__.py +13 -0
- dcs_core/integrations/databases/bigquery.py +187 -0
- dcs_core/integrations/databases/databricks.py +51 -0
- dcs_core/integrations/databases/db2.py +652 -0
- dcs_core/integrations/databases/elasticsearch.py +61 -0
- dcs_core/integrations/databases/mssql.py +829 -0
- dcs_core/integrations/databases/mysql.py +409 -0
- dcs_core/integrations/databases/opensearch.py +64 -0
- dcs_core/integrations/databases/oracle.py +719 -0
- dcs_core/integrations/databases/postgres.py +482 -0
- dcs_core/integrations/databases/redshift.py +53 -0
- dcs_core/integrations/databases/snowflake.py +48 -0
- dcs_core/integrations/databases/spark_df.py +111 -0
- dcs_core/integrations/databases/sybase.py +1069 -0
- dcs_core/integrations/storage/__init__.py +13 -0
- dcs_core/integrations/storage/local_file.py +149 -0
- dcs_core/integrations/utils/__init__.py +13 -0
- dcs_core/integrations/utils/utils.py +36 -0
- dcs_core/report/__init__.py +13 -0
- dcs_core/report/dashboard.py +211 -0
- dcs_core/report/models.py +88 -0
- dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
- dcs_core/report/static/assets/images/docs.svg +6 -0
- dcs_core/report/static/assets/images/github.svg +4 -0
- dcs_core/report/static/assets/images/logo.svg +7 -0
- dcs_core/report/static/assets/images/slack.svg +13 -0
- dcs_core/report/static/index.js +2 -0
- dcs_core/report/static/index.js.LICENSE.txt +3971 -0
- dcs_sdk/__init__.py +13 -0
- dcs_sdk/__main__.py +18 -0
- dcs_sdk/__version__.py +15 -0
- dcs_sdk/cli/__init__.py +13 -0
- dcs_sdk/cli/cli.py +163 -0
- dcs_sdk/sdk/__init__.py +58 -0
- dcs_sdk/sdk/config/__init__.py +13 -0
- dcs_sdk/sdk/config/config_loader.py +491 -0
- dcs_sdk/sdk/data_diff/__init__.py +13 -0
- dcs_sdk/sdk/data_diff/data_differ.py +821 -0
- dcs_sdk/sdk/rules/__init__.py +15 -0
- dcs_sdk/sdk/rules/rules_mappping.py +31 -0
- dcs_sdk/sdk/rules/rules_repository.py +214 -0
- dcs_sdk/sdk/rules/schema_rules.py +65 -0
- dcs_sdk/sdk/utils/__init__.py +13 -0
- dcs_sdk/sdk/utils/serializer.py +25 -0
- dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
- dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
- dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
- dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
- dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
- dcs_sdk/sdk/utils/table.py +475 -0
- dcs_sdk/sdk/utils/themes.py +40 -0
- dcs_sdk/sdk/utils/utils.py +349 -0
- dcs_sdk-1.6.5.dist-info/METADATA +150 -0
- dcs_sdk-1.6.5.dist-info/RECORD +159 -0
- dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
- dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,652 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from datetime import datetime
|
|
16
|
+
from typing import Any, Dict, List, Tuple, Union
|
|
17
|
+
|
|
18
|
+
from loguru import logger
|
|
19
|
+
from sqlalchemy import create_engine
|
|
20
|
+
from sqlalchemy.exc import SQLAlchemyError
|
|
21
|
+
|
|
22
|
+
from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
|
|
23
|
+
from dcs_core.core.datasource.sql_datasource import SQLDataSource
|
|
24
|
+
from dcs_core.integrations.utils.utils import ibm_db2_dll_files_loader
|
|
25
|
+
|
|
26
|
+
ibm_db2_dll_files_loader()
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class DB2DataSource(SQLDataSource):
    """IBM DB2 data source built on SQLAlchemy's ``db2+ibm_db`` dialect.

    Implements the DB2-specific SQL for the generic metric/validation
    queries declared by :class:`SQLDataSource`.
    """

    def __init__(self, data_source_name: str, data_connection: Dict):
        # No connection is opened here; callers must invoke connect().
        super().__init__(data_source_name, data_connection)
|
|
32
|
+
|
|
33
|
+
def connect(self) -> Any:
    """
    Connect to the DB2 data source using SQLAlchemy.

    :return: the live SQLAlchemy connection (also stored on ``self.connection``)
    :raises DataChecksDataSourcesConnectionError: if the engine cannot connect
    """
    try:
        url = self._build_connection_url()
        engine = create_engine(url, echo=False)
        self.connection = engine.connect()
        return self.connection
    except SQLAlchemyError as e:
        # Chain the driver error so the root cause survives in the traceback.
        raise DataChecksDataSourcesConnectionError(f"Failed to connect to DB2 data source: {str(e)}") from e
|
|
44
|
+
|
|
45
|
+
def _build_connection_url(self) -> str:
    """Assemble the ``db2+ibm_db`` SQLAlchemy URL from the raw connection settings."""
    conn = self.data_connection
    base = "db2+ibm_db://{0}:{1}@{2}:{3}/{4}".format(
        conn.get("username"),
        conn.get("password"),
        conn.get("host"),
        conn.get("port"),
        conn.get("database"),
    )

    # Optional driver keywords are appended as a query string.
    extras = []
    for setting, keyword in (
        ("security", "SECURITY"),
        ("protocol", "PROTOCOL"),
        ("schema", "CURRENTSCHEMA"),
    ):
        value = conn.get(setting)
        if value:
            extras.append(f"{keyword}={value}")

    if extras:
        return base + "?" + "&".join(extras)
    return base
|
|
68
|
+
|
|
69
|
+
def qualified_table_name(self, table_name: str) -> str:
    """
    Return the double-quoted table identifier, schema-qualified when a
    schema is configured on this data source.

    :param table_name: name of the table
    :return: qualified table name
    """
    schema = self.schema_name
    if not schema:
        return f'"{table_name}"'
    return f'"{schema}"."{table_name}"'
|
|
78
|
+
|
|
79
|
+
def quote_column(self, column: str) -> str:
    """
    Wrap a column name in double quotes (DB2 delimited identifier).

    :param column: name of the column
    :return: quoted column name
    """
    return '"{}"'.format(column)
|
|
86
|
+
|
|
87
|
+
def query_get_distinct_count(self, table: str, field: str, filters: str = None) -> int:
    """
    Count the distinct values of a column.

    :param table: table name
    :param field: column name
    :param filters: optional filter condition (WHERE clause body)
    :return: distinct count as an integer (0 when no row comes back)
    """
    target = self.qualified_table_name(table)
    column = self.quote_column(field)
    # Cast to VARCHAR so DISTINCT behaves uniformly across column types.
    sql = f"SELECT COUNT(DISTINCT CAST({column} AS VARCHAR(255))) FROM {target}"
    if filters:
        sql = f"{sql} WHERE {filters}"
    row = self.fetchone(sql)
    return row[0] if row else 0
|
|
104
|
+
|
|
105
|
+
def query_negative_metric(self, table: str, field: str, operation: str, filters: str = None) -> Union[int, float]:
|
|
106
|
+
"""
|
|
107
|
+
Calculate a negative metric for a specified field in a Db2 table.
|
|
108
|
+
:param table: table name
|
|
109
|
+
:param field: column name
|
|
110
|
+
:param operation: type of operation, "percent" or "count"
|
|
111
|
+
:param filters: optional filter conditions
|
|
112
|
+
:return: percentage of negative values if operation is "percent", otherwise count of negatives
|
|
113
|
+
"""
|
|
114
|
+
qualified_table_name = self.qualified_table_name(table)
|
|
115
|
+
field = self.quote_column(field)
|
|
116
|
+
|
|
117
|
+
negative_query = f"SELECT COUNT(*) FROM {qualified_table_name} WHERE {field} < 0"
|
|
118
|
+
if filters:
|
|
119
|
+
negative_query += f" AND {filters}"
|
|
120
|
+
|
|
121
|
+
total_count_query = f"SELECT COUNT(*) FROM {qualified_table_name}"
|
|
122
|
+
if filters:
|
|
123
|
+
total_count_query += f" WHERE {filters}"
|
|
124
|
+
|
|
125
|
+
if operation == "percent":
|
|
126
|
+
query = f"""
|
|
127
|
+
SELECT (CAST(({negative_query}) AS FLOAT) / NULLIF(CAST(({total_count_query}) AS FLOAT), 0)) * 100
|
|
128
|
+
FROM SYSIBM.SYSDUMMY1
|
|
129
|
+
"""
|
|
130
|
+
else:
|
|
131
|
+
query = negative_query
|
|
132
|
+
|
|
133
|
+
result = self.fetchone(query)[0]
|
|
134
|
+
return round(result, 2) if operation == "percent" else result
|
|
135
|
+
|
|
136
|
+
def query_get_null_keyword_count(
    self, table: str, field: str, operation: str, filters: str = None
) -> Union[int, float]:
    """
    Get the count of NULL-like values (specific keywords) in the specified column for IBM DB2.

    A value counts as NULL-like when it is SQL NULL or its trimmed,
    upper-cased text is one of: 'NOTHING', 'NIL', 'NULL', 'NONE', 'N/A'.

    :param table: table name
    :param field: column name
    :param operation: type of operation ('count' or 'percent')
    :param filters: filter condition
    :return: count or percentage of NULL-like keyword values
    """
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)

    query = f"""
    SELECT
        SUM(CASE
                WHEN {field} IS NULL
                OR TRIM(UPPER({field})) IN ('NOTHING', 'NIL', 'NULL', 'NONE', 'N/A')
                THEN 1
                ELSE 0
            END) AS null_count,
        COUNT(*) AS total_count
    FROM {qualified_table_name}
    """

    if filters:
        query += f" WHERE {filters}"

    result = self.fetchone(query)

    # No row or an empty table (total_count == 0): nothing to report.
    if not result or result[1] == 0:
        return 0

    if operation == "percent":
        # `result[0] or 0` guards the SUM(...) being NULL when zero rows match.
        return round((result[0] or 0) / result[1] * 100, 2)

    return result[0] or 0
|
|
174
|
+
|
|
175
|
+
def query_get_string_length_metric(
|
|
176
|
+
self, table: str, field: str, metric: str, filters: str = None
|
|
177
|
+
) -> Union[int, float]:
|
|
178
|
+
"""
|
|
179
|
+
Get the string length metric (max, min, avg) in a column of a table.
|
|
180
|
+
|
|
181
|
+
:param table: table name
|
|
182
|
+
:param field: column name
|
|
183
|
+
:param metric: the metric to calculate ('max', 'min', 'avg')
|
|
184
|
+
:param filters: filter condition
|
|
185
|
+
:return: the calculated metric as int for 'max' and 'min', float for 'avg'
|
|
186
|
+
"""
|
|
187
|
+
qualified_table_name = self.qualified_table_name(table)
|
|
188
|
+
field = self.quote_column(field)
|
|
189
|
+
|
|
190
|
+
if metric.lower() == "max":
|
|
191
|
+
sql_function = "MAX(LENGTH"
|
|
192
|
+
elif metric.lower() == "min":
|
|
193
|
+
sql_function = "MIN(LENGTH"
|
|
194
|
+
elif metric.lower() == "avg":
|
|
195
|
+
sql_function = "AVG(CAST(LENGTH"
|
|
196
|
+
else:
|
|
197
|
+
raise ValueError(f"Invalid metric '{metric}'. Choose from 'max', 'min', or 'avg'.")
|
|
198
|
+
|
|
199
|
+
if metric.lower() == "avg":
|
|
200
|
+
query = f'SELECT {sql_function}("{field}") AS FLOAT)) FROM {qualified_table_name}'
|
|
201
|
+
else:
|
|
202
|
+
query = f'SELECT {sql_function}("{field}")) FROM {qualified_table_name}'
|
|
203
|
+
|
|
204
|
+
if filters:
|
|
205
|
+
query += f" WHERE {filters}"
|
|
206
|
+
|
|
207
|
+
result = self.fetchone(query)[0]
|
|
208
|
+
return round(result, 2) if metric.lower() == "avg" else result
|
|
209
|
+
|
|
210
|
+
def query_string_pattern_validity(
|
|
211
|
+
self,
|
|
212
|
+
table: str,
|
|
213
|
+
field: str,
|
|
214
|
+
regex_pattern: str = None,
|
|
215
|
+
predefined_regex_pattern: str = None,
|
|
216
|
+
filters: str = None,
|
|
217
|
+
) -> Tuple[int, int]:
|
|
218
|
+
"""
|
|
219
|
+
Get the count of valid values based on the regex pattern.
|
|
220
|
+
:param table: table name
|
|
221
|
+
:param field: column name
|
|
222
|
+
:param regex_pattern: custom regex pattern
|
|
223
|
+
:param predefined_regex_pattern: predefined regex pattern
|
|
224
|
+
:param filters: filter condition
|
|
225
|
+
:return: count of valid values, count of total row count
|
|
226
|
+
"""
|
|
227
|
+
filters = f"WHERE {filters}" if filters else ""
|
|
228
|
+
qualified_table_name = self.qualified_table_name(table)
|
|
229
|
+
field = self.quote_column(field)
|
|
230
|
+
|
|
231
|
+
if not regex_pattern and not predefined_regex_pattern:
|
|
232
|
+
raise ValueError("Either regex_pattern or predefined_regex_pattern should be provided")
|
|
233
|
+
|
|
234
|
+
if predefined_regex_pattern:
|
|
235
|
+
regex = self.regex_patterns[predefined_regex_pattern]
|
|
236
|
+
else:
|
|
237
|
+
regex = regex_pattern
|
|
238
|
+
|
|
239
|
+
regex_query = f"""
|
|
240
|
+
CASE WHEN REGEXP_LIKE("{field}", '{regex}') THEN 1 ELSE 0 END"""
|
|
241
|
+
|
|
242
|
+
query = f"""
|
|
243
|
+
SELECT SUM({regex_query}) AS valid_count, COUNT(*) AS total_count
|
|
244
|
+
FROM {qualified_table_name} {filters}
|
|
245
|
+
"""
|
|
246
|
+
result = self.fetchone(query)
|
|
247
|
+
return result[0], result[1]
|
|
248
|
+
|
|
249
|
+
def query_valid_invalid_values_validity(
|
|
250
|
+
self,
|
|
251
|
+
table: str,
|
|
252
|
+
field: str,
|
|
253
|
+
regex_pattern: str = None,
|
|
254
|
+
filters: str = None,
|
|
255
|
+
values: List[str] = None,
|
|
256
|
+
) -> Tuple[int, int]:
|
|
257
|
+
"""
|
|
258
|
+
Get the count of valid and invalid values for a specified column.
|
|
259
|
+
:param table: table name
|
|
260
|
+
:param field: column name
|
|
261
|
+
:param values: list of valid values
|
|
262
|
+
:param regex_pattern: regex pattern
|
|
263
|
+
:param filters: filter condition
|
|
264
|
+
:return: count of valid values and total count of rows.
|
|
265
|
+
"""
|
|
266
|
+
filters = f"WHERE {filters}" if filters else ""
|
|
267
|
+
qualified_table_name = self.qualified_table_name(table)
|
|
268
|
+
field = self.quote_column(field)
|
|
269
|
+
if values:
|
|
270
|
+
values_str = ", ".join([f"'{value}'" for value in values])
|
|
271
|
+
validity_condition = f"CASE WHEN {field} IN ({values_str}) THEN 1 ELSE 0 END"
|
|
272
|
+
else:
|
|
273
|
+
validity_condition = f"CASE WHEN REGEXP_LIKE({field}, '{regex_pattern}') THEN 1 ELSE 0 END"
|
|
274
|
+
|
|
275
|
+
query = f"""
|
|
276
|
+
SELECT SUM({validity_condition}) AS valid_count, COUNT(*) AS total_count
|
|
277
|
+
FROM {qualified_table_name}
|
|
278
|
+
{filters}
|
|
279
|
+
"""
|
|
280
|
+
|
|
281
|
+
result = self.fetchone(query)
|
|
282
|
+
return result[0], result[1]
|
|
283
|
+
|
|
284
|
+
def query_get_usa_state_code_validity(self, table: str, field: str, filters: str = None) -> Tuple[int, int]:
|
|
285
|
+
"""
|
|
286
|
+
Get the count of valid USA state codes
|
|
287
|
+
:param table: table name
|
|
288
|
+
:param field: column name
|
|
289
|
+
:param filters: filter condition
|
|
290
|
+
:return: count of valid state codes, count of total row count
|
|
291
|
+
"""
|
|
292
|
+
|
|
293
|
+
valid_state_codes_str = ", ".join(f"'{code}'" for code in self.valid_state_codes)
|
|
294
|
+
|
|
295
|
+
filters = f"WHERE {filters}" if filters else ""
|
|
296
|
+
|
|
297
|
+
qualified_table_name = self.qualified_table_name(table)
|
|
298
|
+
field = self.quote_column(field)
|
|
299
|
+
regex_query = f"""
|
|
300
|
+
CASE WHEN REGEXP_LIKE("{field}", '^[A-Z]{{2}}$') AND UPPER("{field}") IN ({valid_state_codes_str}) THEN 1 ELSE 0 END
|
|
301
|
+
"""
|
|
302
|
+
|
|
303
|
+
query = f"""
|
|
304
|
+
SELECT SUM({regex_query}) AS valid_count, COUNT(*) AS total_count
|
|
305
|
+
FROM {qualified_table_name} {filters}
|
|
306
|
+
"""
|
|
307
|
+
result = self.fetchone(query)
|
|
308
|
+
return result[0], result[1]
|
|
309
|
+
|
|
310
|
+
def query_timestamp_metric(
    self,
    table: str,
    field: str,
    operation: str,
    predefined_regex: str,
    filters: str = None,
) -> Union[float, int]:
    """
    Count (or compute the percentage of) rows whose value is a valid
    ISO-8601 timestamp, validated component-by-component in SQL.

    :param table: Table name
    :param field: Column name
    :param operation: Metric operation ("count" or "percent")
    :param predefined_regex: regex pattern (only "timestamp_iso" is accepted)
    :param filters: filter condition
    :return: Tuple containing valid count and total count (or percentage)
    """
    # NOTE(review): a second, parameterless `query_timestamp_metric` defined
    # later in this class shadows this implementation (the later def wins),
    # so this code is currently unreachable — confirm which is intended.

    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)

    # Full ISO-8601 shape: YYYY-MM-DDTHH:MM:SS[.fff][Z|±HH:MM]
    timestamp_iso_regex = r"^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])T([01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9](?:\.\d{1,3})?(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])?$"

    if predefined_regex == "timestamp_iso":
        # NOTE(review): regex_condition is built but never referenced in the
        # query below; validation relies on the extracted components instead.
        regex_condition = f"REGEXP_LIKE({field}, '{timestamp_iso_regex}')"
    else:
        raise ValueError(f"Unknown predefined regex pattern: {predefined_regex}")

    filters_clause = f"WHERE {filters}" if filters else ""

    query = f"""
    WITH extracted_timestamps AS (
        SELECT
            {field},
            SUBSTR({field}, 1, 4) AS year,    -- Extract year
            SUBSTR({field}, 6, 2) AS month,   -- Extract month
            SUBSTR({field}, 9, 2) AS day,     -- Extract day
            SUBSTR({field}, 12, 2) AS hour,   -- Extract hour
            SUBSTR({field}, 15, 2) AS minute, -- Extract minute
            SUBSTR({field}, 18, 2) AS second, -- Extract second
            SUBSTR({field}, 20) AS timezone   -- Extract timezone
        FROM {qualified_table_name}
        {filters_clause}
    ),
    validated_timestamps AS (
        SELECT
            {field},
            CASE
                WHEN
                    -- Validate each component with its specific rules
                    REGEXP_LIKE(year, '^\d{{4}}$') AND
                    REGEXP_LIKE(month, '^(0[1-9]|1[0-2])$') AND
                    REGEXP_LIKE(day, '^((0[1-9]|[12][0-9])|(30|31))$') AND
                    REGEXP_LIKE(hour, '^([01][0-9]|2[0-3])$') AND
                    REGEXP_LIKE(minute, '^[0-5][0-9]$') AND
                    REGEXP_LIKE(second, '^[0-5][0-9]$') AND
                    (timezone IS NULL OR REGEXP_LIKE(timezone, '^(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])$')) AND
                    -- Additional check for days in months (e.g., February)
                    (
                        (month IN ('01', '03', '05', '07', '08', '10', '12') AND day BETWEEN '01' AND '31') OR
                        (month IN ('04', '06', '09', '11') AND day BETWEEN '01' AND '30') OR
                        (month = '02' AND day BETWEEN '01' AND
                            CASE
                                -- Handle leap years
                                WHEN (CAST(year AS INT) % 400 = 0 OR (CAST(year AS INT) % 100 != 0 AND CAST(year AS INT) % 4 = 0)) THEN '29'
                                ELSE '28'
                            END
                        )
                    )
                THEN 1
                ELSE 0
            END AS is_valid
        FROM extracted_timestamps
    )
    SELECT COUNT(*) AS valid_count
    FROM validated_timestamps
    WHERE is_valid = 1;
    """

    try:
        valid_count = self.fetchone(query)[0]
        total_count_query = f"SELECT COUNT(*) FROM {qualified_table_name} {filters_clause}"
        total_count = self.fetchone(total_count_query)[0]

        if operation == "count":
            # NOTE(review): returns a (valid, total) tuple here although the
            # annotation says Union[float, int] — confirm caller expectations.
            return valid_count, total_count
        elif operation == "percent":
            return (valid_count / total_count) * 100 if total_count > 0 else 0.0
        else:
            # NOTE(review): this ValueError is swallowed by the broad except
            # below and reported as (0, 0) — likely unintended.
            raise ValueError(f"Unknown operation: {operation}")

    except Exception as e:
        # Best-effort behaviour: query failures are logged and zeroed out.
        logger.error(f"Failed to execute query: {str(e)}")
        return 0, 0
|
|
403
|
+
|
|
404
|
+
def query_timestamp_not_in_future_metric(
    self,
    table: str,
    field: str,
    operation: str,
    predefined_regex: str,
    filters: str = None,
) -> Union[float, int]:
    """
    Count (or compute the percentage of) rows whose value is a valid
    ISO-8601 timestamp that is not later than the current timestamp.

    :param table: Table name
    :param field: Column name
    :param operation: Metric operation ("count" or "percent")
    :param predefined_regex: regex pattern (only "timestamp_iso" is accepted)
    :param filters: filter condition
    :return: Tuple containing count of valid timestamps not in the future and total count
    """
    # NOTE(review): a parameterless stub of the same name later in this class
    # shadows this implementation — confirm which definition is intended.

    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)

    # Full ISO-8601 shape: YYYY-MM-DDTHH:MM:SS[.fff][Z|±HH:MM]
    timestamp_iso_regex = r"^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])T([01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9](?:\.\d{1,3})?(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])?$"

    if predefined_regex == "timestamp_iso":
        regex_condition = f"REGEXP_LIKE({field}, '{timestamp_iso_regex}')"
    else:
        raise ValueError(f"Unknown predefined regex pattern: {predefined_regex}")

    filters_clause = f"WHERE {filters}" if filters else ""

    query = f"""
    WITH extracted_timestamps AS (
        SELECT
            {field},
            SUBSTRING({field}, 1, 4) AS year,    -- Extract year
            SUBSTRING({field}, 6, 2) AS month,   -- Extract month
            SUBSTRING({field}, 9, 2) AS day,     -- Extract day
            SUBSTRING({field}, 12, 2) AS hour,   -- Extract hour
            SUBSTRING({field}, 15, 2) AS minute, -- Extract minute
            SUBSTRING({field}, 18, 2) AS second, -- Extract second
            SUBSTRING({field}, 20) AS timezone   -- Extract timezone
        FROM {qualified_table_name}
        {filters_clause}
    ),
    validated_timestamps AS (
        SELECT
            {field},
            CASE
                WHEN
                    REGEXP_LIKE(year, '^\d{{4}}$') AND
                    REGEXP_LIKE(month, '^(0[1-9]|1[0-2])$') AND
                    REGEXP_LIKE(day, '^((0[1-9]|[12][0-9])|(30|31))$') AND
                    REGEXP_LIKE(hour, '^([01][0-9]|2[0-3])$') AND
                    REGEXP_LIKE(minute, '^[0-5][0-9]$') AND
                    REGEXP_LIKE(second, '^[0-5][0-9]$') AND
                    (timezone IS NULL OR REGEXP_LIKE(timezone, '^(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])$')) AND
                    (
                        (month IN ('01', '03', '05', '07', '08', '10', '12') AND day BETWEEN '01' AND '31') OR
                        (month IN ('04', '06', '09', '11') AND day BETWEEN '01' AND '30') OR
                        (month = '02' AND day BETWEEN '01' AND
                            CASE
                                WHEN (CAST(year AS INTEGER) % 400 = 0 OR
                                      (CAST(year AS INTEGER) % 100 != 0 AND
                                       CAST(year AS INTEGER) % 4 = 0)) THEN '29'
                                ELSE '28'
                            END
                        )
                    )
                THEN 1
                ELSE 0
            END AS is_valid
        FROM extracted_timestamps
    ),
    timestamps_not_in_future AS (
        SELECT *
        FROM validated_timestamps
        WHERE is_valid = 1 AND {regex_condition}
          AND TO_TIMESTAMP({field}, 'YYYY-MM-DD"T"HH24:MI:SS') <= CURRENT TIMESTAMP
    )
    SELECT COUNT(*) AS valid_count, (SELECT COUNT(*) FROM {qualified_table_name} {filters_clause}) AS total_count
    FROM timestamps_not_in_future;
    """
    try:
        valid_count = self.fetchone(query)[0]
        total_count_query = f"SELECT COUNT(*) FROM {qualified_table_name} {filters_clause}"
        total_count = self.fetchone(total_count_query)[0]

        if operation == "count":
            # NOTE(review): returns a (valid, total) tuple despite the
            # Union[float, int] annotation — confirm caller expectations.
            return valid_count, total_count
        elif operation == "percent":
            return (valid_count / total_count) * 100 if total_count > 0 else 0
        else:
            # NOTE(review): swallowed by the broad except below → (0, 0).
            raise ValueError(f"Unknown operation: {operation}")

    except Exception as e:
        # Best-effort behaviour: query failures are logged and zeroed out.
        logger.error(f"Failed to execute query: {str(e)}")
        return 0, 0
|
|
500
|
+
|
|
501
|
+
def query_timestamp_date_not_in_future_metric(
    self,
    table: str,
    field: str,
    operation: str,
    predefined_regex: str,
    filters: str = None,
) -> Union[float, int]:
    """
    Count (or compute the percentage of) rows whose ISO-8601 timestamp has a
    valid date part that is not later than the current date.

    :param table: Table name
    :param field: Column name
    :param operation: Metric operation ("count" or "percent")
    :param predefined_regex: The regex pattern to use (e.g., "timestamp_iso")
    :param filters: Optional filter condition
    :return: Tuple containing count of valid dates not in the future and total count
    """
    # NOTE(review): a parameterless stub of the same name later in this class
    # shadows this implementation — confirm which definition is intended.

    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)

    # Full ISO-8601 shape: YYYY-MM-DDTHH:MM:SS[.fff][Z|±HH:MM]
    timestamp_iso_regex = r"^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])T([01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9](?:\.\d{1,3})?(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])?$"

    if predefined_regex == "timestamp_iso":
        regex_condition = f"REGEXP_LIKE({field}, '{timestamp_iso_regex}')"
    else:
        raise ValueError(f"Unknown predefined regex pattern: {predefined_regex}")

    filters_clause = f"WHERE {filters}" if filters else ""

    query = f"""
    WITH extracted_timestamps AS (
        SELECT
            {field},
            SUBSTRING({field}, 1, 4) AS year,  -- Extract year
            SUBSTRING({field}, 6, 2) AS month, -- Extract month
            SUBSTRING({field}, 9, 2) AS day    -- Extract day
        FROM {qualified_table_name}
        {filters_clause}
    ),
    validated_dates AS (
        SELECT
            {field},
            CASE
                WHEN
                    REGEXP_LIKE(year, '^\d{{4}}$') AND
                    REGEXP_LIKE(month, '^(0[1-9]|1[0-2])$') AND
                    REGEXP_LIKE(day, '^((0[1-9]|[12][0-9])|(30|31))$') AND
                    (
                        (month IN ('01', '03', '05', '07', '08', '10', '12') AND day BETWEEN '01' AND '31') OR
                        (month IN ('04', '06', '09', '11') AND day BETWEEN '01' AND '30') OR
                        (month = '02' AND day BETWEEN '01' AND
                            CASE
                                WHEN (CAST(year AS INTEGER) % 400 = 0 OR
                                      (CAST(year AS INTEGER) % 100 != 0 AND
                                       CAST(year AS INTEGER) % 4 = 0)) THEN '29'
                                ELSE '28'
                            END
                        )
                    )
                THEN 1
                ELSE 0
            END AS is_valid
        FROM extracted_timestamps
    ),
    dates_not_in_future AS (
        SELECT *
        FROM validated_dates
        WHERE is_valid = 1
          AND {regex_condition}
          AND DATE(TO_TIMESTAMP({field}, 'YYYY-MM-DD"T"HH24:MI:SS')) <= CURRENT DATE -- Compare only the date part
    )
    SELECT COUNT(*) AS valid_count, (SELECT COUNT(*) FROM {qualified_table_name} {filters_clause}) AS total_count
    FROM dates_not_in_future;
    """

    try:
        valid_count = self.fetchone(query)[0]
        total_count_query = f"SELECT COUNT(*) FROM {qualified_table_name} {filters_clause}"
        total_count = self.fetchone(total_count_query)[0]

        if operation == "count":
            # NOTE(review): returns a (valid, total) tuple despite the
            # Union[float, int] annotation — confirm caller expectations.
            return valid_count, total_count
        elif operation == "percent":
            return (valid_count / total_count) * 100 if total_count > 0 else 0
        else:
            # NOTE(review): swallowed by the broad except below → (0, 0).
            raise ValueError(f"Unknown operation: {operation}")

    except Exception as e:
        # Best-effort behaviour: query failures are logged and zeroed out.
        logger.error(f"Failed to execute query: {str(e)}")
        return 0, 0
|
|
591
|
+
|
|
592
|
+
def query_geolocation_metric(
|
|
593
|
+
self, table: str, field: str, operation: str, filters: str = None
|
|
594
|
+
) -> Union[int, float]:
|
|
595
|
+
qualified_table_name = self.qualified_table_name(table)
|
|
596
|
+
field = self.quote_column(field)
|
|
597
|
+
|
|
598
|
+
valid_query = f'SELECT COUNT("{field}") FROM {qualified_table_name} WHERE "{field}" IS NOT NULL AND "{field}"'
|
|
599
|
+
|
|
600
|
+
if field.lower().startswith("lat"):
|
|
601
|
+
valid_query += "BETWEEN -90 AND 90"
|
|
602
|
+
elif field.lower().startswith("lon"):
|
|
603
|
+
valid_query += "BETWEEN -180 AND 180"
|
|
604
|
+
|
|
605
|
+
if filters:
|
|
606
|
+
valid_query += f" AND {filters}"
|
|
607
|
+
|
|
608
|
+
valid_count = self.fetchone(valid_query)[0]
|
|
609
|
+
|
|
610
|
+
if operation == "percent":
|
|
611
|
+
total_query = f"SELECT COUNT(*) FROM {qualified_table_name}"
|
|
612
|
+
if filters:
|
|
613
|
+
total_query += f" WHERE {filters}"
|
|
614
|
+
|
|
615
|
+
total_count = self.fetchone(total_query)[0]
|
|
616
|
+
|
|
617
|
+
result = (valid_count / total_count) * 100 if total_count > 0 else 0
|
|
618
|
+
return round(result, 2)
|
|
619
|
+
|
|
620
|
+
return valid_count
|
|
621
|
+
|
|
622
|
+
# NOTE(review): this parameterless re-definition SHADOWS the full
# `query_timestamp_metric` implementation defined earlier in this class
# (in Python, the later def wins), so the feature always raises.
# Likely unintentional — confirm and remove one of the two definitions.
def query_timestamp_metric(self):
    raise NotImplementedError("Method not implemented for DB2DataSource")
|
|
624
|
+
|
|
625
|
+
# NOTE(review): this parameterless re-definition SHADOWS the full
# `query_timestamp_not_in_future_metric` implementation defined earlier in
# this class, so the feature always raises. Confirm and remove one of the two.
def query_timestamp_not_in_future_metric(self):
    raise NotImplementedError("Method not implemented for DB2DataSource")
|
|
627
|
+
|
|
628
|
+
# NOTE(review): this parameterless re-definition SHADOWS the full
# `query_timestamp_date_not_in_future_metric` implementation defined earlier
# in this class, so the feature always raises. Confirm and remove one of the two.
def query_timestamp_date_not_in_future_metric(self):
    raise NotImplementedError("Method not implemented for DB2DataSource")
|
|
630
|
+
|
|
631
|
+
def query_get_time_diff(self, table: str, field: str) -> int:
    """
    Get the time difference between now (UTC) and the newest value in the
    given timestamp column.

    :param table: name of the index
    :param field: field name of updated time column
    :return: time difference in seconds (0 when the table returns no row)
    """
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)
    # Latest value only; FETCH FIRST is DB2's LIMIT equivalent.
    # NOTE(review): the trailing ';' inside the statement may be rejected by
    # some DB2 drivers — confirm against the driver used by fetchone().
    query = f"""
        SELECT {field}
        FROM {qualified_table_name}
        ORDER BY {field} DESC
        FETCH FIRST 1 ROWS ONLY;
    """
    result = self.fetchone(query)
    if result:
        updated_time = result[0]
        if isinstance(updated_time, str):
            # Assumes the textual form 'YYYY-MM-DD HH:MM:SS.ffffff' — a value
            # without fractional seconds would raise here; TODO confirm.
            updated_time = datetime.strptime(updated_time, "%Y-%m-%d %H:%M:%S.%f")
        # Naive UTC arithmetic; assumes the column stores UTC — TODO confirm.
        return int((datetime.utcnow() - updated_time).total_seconds())
    return 0