dcs-sdk 1.6.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_diff/__init__.py +221 -0
- data_diff/__main__.py +517 -0
- data_diff/abcs/__init__.py +13 -0
- data_diff/abcs/compiler.py +27 -0
- data_diff/abcs/database_types.py +402 -0
- data_diff/config.py +141 -0
- data_diff/databases/__init__.py +38 -0
- data_diff/databases/_connect.py +323 -0
- data_diff/databases/base.py +1417 -0
- data_diff/databases/bigquery.py +376 -0
- data_diff/databases/clickhouse.py +217 -0
- data_diff/databases/databricks.py +262 -0
- data_diff/databases/duckdb.py +207 -0
- data_diff/databases/mssql.py +343 -0
- data_diff/databases/mysql.py +189 -0
- data_diff/databases/oracle.py +238 -0
- data_diff/databases/postgresql.py +293 -0
- data_diff/databases/presto.py +222 -0
- data_diff/databases/redis.py +93 -0
- data_diff/databases/redshift.py +233 -0
- data_diff/databases/snowflake.py +222 -0
- data_diff/databases/sybase.py +720 -0
- data_diff/databases/trino.py +73 -0
- data_diff/databases/vertica.py +174 -0
- data_diff/diff_tables.py +489 -0
- data_diff/errors.py +17 -0
- data_diff/format.py +369 -0
- data_diff/hashdiff_tables.py +1026 -0
- data_diff/info_tree.py +76 -0
- data_diff/joindiff_tables.py +434 -0
- data_diff/lexicographic_space.py +253 -0
- data_diff/parse_time.py +88 -0
- data_diff/py.typed +0 -0
- data_diff/queries/__init__.py +13 -0
- data_diff/queries/api.py +213 -0
- data_diff/queries/ast_classes.py +811 -0
- data_diff/queries/base.py +38 -0
- data_diff/queries/extras.py +43 -0
- data_diff/query_utils.py +70 -0
- data_diff/schema.py +67 -0
- data_diff/table_segment.py +583 -0
- data_diff/thread_utils.py +112 -0
- data_diff/utils.py +1022 -0
- data_diff/version.py +15 -0
- dcs_core/__init__.py +13 -0
- dcs_core/__main__.py +17 -0
- dcs_core/__version__.py +15 -0
- dcs_core/cli/__init__.py +13 -0
- dcs_core/cli/cli.py +165 -0
- dcs_core/core/__init__.py +19 -0
- dcs_core/core/common/__init__.py +13 -0
- dcs_core/core/common/errors.py +50 -0
- dcs_core/core/common/models/__init__.py +13 -0
- dcs_core/core/common/models/configuration.py +284 -0
- dcs_core/core/common/models/dashboard.py +24 -0
- dcs_core/core/common/models/data_source_resource.py +75 -0
- dcs_core/core/common/models/metric.py +160 -0
- dcs_core/core/common/models/profile.py +75 -0
- dcs_core/core/common/models/validation.py +216 -0
- dcs_core/core/common/models/widget.py +44 -0
- dcs_core/core/configuration/__init__.py +13 -0
- dcs_core/core/configuration/config_loader.py +139 -0
- dcs_core/core/configuration/configuration_parser.py +262 -0
- dcs_core/core/configuration/configuration_parser_arc.py +328 -0
- dcs_core/core/datasource/__init__.py +13 -0
- dcs_core/core/datasource/base.py +62 -0
- dcs_core/core/datasource/manager.py +112 -0
- dcs_core/core/datasource/search_datasource.py +421 -0
- dcs_core/core/datasource/sql_datasource.py +1094 -0
- dcs_core/core/inspect.py +163 -0
- dcs_core/core/logger/__init__.py +13 -0
- dcs_core/core/logger/base.py +32 -0
- dcs_core/core/logger/default_logger.py +94 -0
- dcs_core/core/metric/__init__.py +13 -0
- dcs_core/core/metric/base.py +220 -0
- dcs_core/core/metric/combined_metric.py +98 -0
- dcs_core/core/metric/custom_metric.py +34 -0
- dcs_core/core/metric/manager.py +137 -0
- dcs_core/core/metric/numeric_metric.py +403 -0
- dcs_core/core/metric/reliability_metric.py +90 -0
- dcs_core/core/profiling/__init__.py +13 -0
- dcs_core/core/profiling/datasource_profiling.py +136 -0
- dcs_core/core/profiling/numeric_field_profiling.py +72 -0
- dcs_core/core/profiling/text_field_profiling.py +67 -0
- dcs_core/core/repository/__init__.py +13 -0
- dcs_core/core/repository/metric_repository.py +77 -0
- dcs_core/core/utils/__init__.py +13 -0
- dcs_core/core/utils/log.py +29 -0
- dcs_core/core/utils/tracking.py +105 -0
- dcs_core/core/utils/utils.py +44 -0
- dcs_core/core/validation/__init__.py +13 -0
- dcs_core/core/validation/base.py +230 -0
- dcs_core/core/validation/completeness_validation.py +153 -0
- dcs_core/core/validation/custom_query_validation.py +24 -0
- dcs_core/core/validation/manager.py +282 -0
- dcs_core/core/validation/numeric_validation.py +276 -0
- dcs_core/core/validation/reliability_validation.py +91 -0
- dcs_core/core/validation/uniqueness_validation.py +61 -0
- dcs_core/core/validation/validity_validation.py +738 -0
- dcs_core/integrations/__init__.py +13 -0
- dcs_core/integrations/databases/__init__.py +13 -0
- dcs_core/integrations/databases/bigquery.py +187 -0
- dcs_core/integrations/databases/databricks.py +51 -0
- dcs_core/integrations/databases/db2.py +652 -0
- dcs_core/integrations/databases/elasticsearch.py +61 -0
- dcs_core/integrations/databases/mssql.py +829 -0
- dcs_core/integrations/databases/mysql.py +409 -0
- dcs_core/integrations/databases/opensearch.py +64 -0
- dcs_core/integrations/databases/oracle.py +719 -0
- dcs_core/integrations/databases/postgres.py +482 -0
- dcs_core/integrations/databases/redshift.py +53 -0
- dcs_core/integrations/databases/snowflake.py +48 -0
- dcs_core/integrations/databases/spark_df.py +111 -0
- dcs_core/integrations/databases/sybase.py +1069 -0
- dcs_core/integrations/storage/__init__.py +13 -0
- dcs_core/integrations/storage/local_file.py +149 -0
- dcs_core/integrations/utils/__init__.py +13 -0
- dcs_core/integrations/utils/utils.py +36 -0
- dcs_core/report/__init__.py +13 -0
- dcs_core/report/dashboard.py +211 -0
- dcs_core/report/models.py +88 -0
- dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
- dcs_core/report/static/assets/images/docs.svg +6 -0
- dcs_core/report/static/assets/images/github.svg +4 -0
- dcs_core/report/static/assets/images/logo.svg +7 -0
- dcs_core/report/static/assets/images/slack.svg +13 -0
- dcs_core/report/static/index.js +2 -0
- dcs_core/report/static/index.js.LICENSE.txt +3971 -0
- dcs_sdk/__init__.py +13 -0
- dcs_sdk/__main__.py +18 -0
- dcs_sdk/__version__.py +15 -0
- dcs_sdk/cli/__init__.py +13 -0
- dcs_sdk/cli/cli.py +163 -0
- dcs_sdk/sdk/__init__.py +58 -0
- dcs_sdk/sdk/config/__init__.py +13 -0
- dcs_sdk/sdk/config/config_loader.py +491 -0
- dcs_sdk/sdk/data_diff/__init__.py +13 -0
- dcs_sdk/sdk/data_diff/data_differ.py +821 -0
- dcs_sdk/sdk/rules/__init__.py +15 -0
- dcs_sdk/sdk/rules/rules_mappping.py +31 -0
- dcs_sdk/sdk/rules/rules_repository.py +214 -0
- dcs_sdk/sdk/rules/schema_rules.py +65 -0
- dcs_sdk/sdk/utils/__init__.py +13 -0
- dcs_sdk/sdk/utils/serializer.py +25 -0
- dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
- dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
- dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
- dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
- dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
- dcs_sdk/sdk/utils/table.py +475 -0
- dcs_sdk/sdk/utils/themes.py +40 -0
- dcs_sdk/sdk/utils/utils.py +349 -0
- dcs_sdk-1.6.5.dist-info/METADATA +150 -0
- dcs_sdk-1.6.5.dist-info/RECORD +159 -0
- dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
- dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,1094 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import secrets
|
|
16
|
+
import string
|
|
17
|
+
import time
|
|
18
|
+
from datetime import datetime
|
|
19
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
20
|
+
|
|
21
|
+
from loguru import logger
|
|
22
|
+
from sqlalchemy import inspect, text
|
|
23
|
+
from sqlalchemy.engine import Connection, Engine
|
|
24
|
+
|
|
25
|
+
from dcs_core.core.datasource.base import DataSource
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class SQLDataSource(DataSource):
|
|
29
|
+
"""
|
|
30
|
+
Abstract class for SQL data sources
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
def __init__(self, data_source_name: str, data_connection: Dict):
    """
    Initialize the SQL data source.

    :param data_source_name: logical name of this data source
    :param data_connection: connection settings; this base class reads the
        "database" key and the optional "schema" key from it
    """
    super().__init__(data_source_name, data_connection)

    # Live DB connection; expected to be populated later by a subclass's
    # connect step (None until then — see is_connected()).
    self.connection: Union[Connection, None] = None
    self.database: str = data_connection.get("database")
    # When True, raw query strings are wrapped in sqlalchemy.text() before
    # execution (see fetchall()/fetchone()); subclasses may disable this.
    self.use_sa_text_query = True
    self.schema_name = data_connection.get("schema", None)
    # Named regex patterns used by query_string_pattern_validity() when a
    # predefined_regex_pattern key is supplied.
    self.regex_patterns = {
        "uuid": r"^[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$",
        "usa_phone": r"^(\+1[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}$",
        "email": r"^(?!.*\.\.)(?!.*@.*@)[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$",
        "usa_zip_code": r"^[0-9]{5}(?:-[0-9]{4})?$",
        "ssn": r"^(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}$",
        "sedol": r"^[B-DF-HJ-NP-TV-XZ0-9]{6}[0-9]$",
        "lei": r"^[A-Z0-9]{18}[0-9]{2}$",
        "cusip": r"^[0-9A-Z]{9}$",
        "figi": r"^BBG[A-Z0-9]{9}$",
        "isin": r"^[A-Z]{2}[A-Z0-9]{9}[0-9]$",
        "perm_id": r"^\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{3}$",
    }

    # Two-letter USPS codes for the 50 US states (DC and territories are
    # not included); used by query_get_usa_state_code_validity().
    self.valid_state_codes = [
        "AL",
        "AK",
        "AZ",
        "AR",
        "CA",
        "CO",
        "CT",
        "DE",
        "FL",
        "GA",
        "HI",
        "ID",
        "IL",
        "IN",
        "IA",
        "KS",
        "KY",
        "LA",
        "ME",
        "MD",
        "MA",
        "MI",
        "MN",
        "MS",
        "MO",
        "MT",
        "NE",
        "NV",
        "NH",
        "NJ",
        "NM",
        "NY",
        "NC",
        "ND",
        "OH",
        "OK",
        "OR",
        "PA",
        "RI",
        "SC",
        "SD",
        "TN",
        "TX",
        "UT",
        "VT",
        "VA",
        "WA",
        "WV",
        "WI",
        "WY",
    ]
|
|
106
|
+
|
|
107
|
+
def is_connected(self) -> bool:
    """
    Report whether a live connection has been established.

    :return: True once a connect step has populated self.connection
    """
    has_connection = self.connection is not None
    return has_connection
|
|
112
|
+
|
|
113
|
+
def close(self):
    """
    Tear down the connection and dispose of the underlying engine pool.

    Best-effort: failures are logged rather than raised, so calling close()
    during cleanup can never mask the caller's own error. Previously
    ``connection.close()`` sat outside the try block, so a failure there
    (or ``connection`` still being None) raised and skipped the engine
    disposal entirely.
    """
    try:
        self.connection.close()
        # Dispose the engine's connection pool as well, not just this handle.
        self.connection.engine.dispose()
    except Exception as e:
        logger.error(f"Failed to close the connection: {str(e)}")
|
|
119
|
+
|
|
120
|
+
def fetchall(self, query):
    """
    Execute *query* and return every result row.

    The statement is wrapped in sqlalchemy.text() when use_sa_text_query
    is enabled; otherwise it is passed through unchanged.
    """
    statement = text(query) if self.use_sa_text_query else query
    return self.connection.execute(statement).fetchall()
|
|
124
|
+
|
|
125
|
+
def fetchone(self, query):
    """
    Execute *query* and return only its first result row (or None).

    The statement is wrapped in sqlalchemy.text() when use_sa_text_query
    is enabled; otherwise it is passed through unchanged.
    """
    statement = text(query) if self.use_sa_text_query else query
    return self.connection.execute(statement).fetchone()
|
|
129
|
+
|
|
130
|
+
def safe_get(self, lst, idx, default=None):
    """Return ``lst[idx]`` when idx is a valid non-negative index, else *default*."""
    if 0 <= idx < len(lst):
        return lst[idx]
    return default
|
|
132
|
+
|
|
133
|
+
def qualified_table_name(self, table_name: str) -> str:
    """
    Build the bracket-quoted, optionally schema-qualified table name.

    :param table_name: bare table name
    :return: ``[schema].[table]`` when a schema is configured, else ``[table]``
    """
    if not self.schema_name:
        return f"[{table_name}]"
    return f"[{self.schema_name}].[{table_name}]"
|
|
142
|
+
|
|
143
|
+
def quote_database(self, database: str) -> str:
    """
    Wrap a database name in double quotes.

    :param database: bare database name
    :return: the double-quoted name
    """
    return '"' + database + '"'
|
|
150
|
+
|
|
151
|
+
def quote_column(self, column: str) -> str:
    """
    Wrap a column name in square brackets.

    :param column: bare column name
    :return: the bracket-quoted name
    """
    return "[" + column + "]"
|
|
158
|
+
|
|
159
|
+
def query_get_database_version(self, database_version_query: Optional[str] = None) -> str:
    """
    Fetch the server's version string.

    :param database_version_query: optional override for the default
        ``SELECT @@version`` statement
    :return: the version string, or None when the server returns a falsy value
    """
    version = self.fetchone(database_version_query or "SELECT @@version")[0]
    if version:
        return version
    return None
|
|
167
|
+
|
|
168
|
+
def query_get_column_metadata(self, table_name: str) -> Dict[str, str]:
    """
    Map every column of *table_name* to the name of its Python type.

    :param table_name: name of the table to inspect
    :return: ``{column name: python type name}``
    """
    inspector = inspect(self.connection.engine)
    return {
        col["name"]: col["type"].python_type.__name__
        for col in inspector.get_columns(table_name)
    }
|
|
181
|
+
|
|
182
|
+
def query_get_table_metadata(self) -> List[str]:
    """
    List the table names visible on the current connection.

    :return: table names as reported by the SQLAlchemy inspector
    """
    inspector = inspect(self.connection.engine)
    return inspector.get_table_names()
|
|
188
|
+
|
|
189
|
+
def query_get_row_count(self, table: str, filters: str = None) -> int:
    """
    Count the rows of a table, optionally restricted by a filter.

    :param table: table name
    :param filters: optional SQL boolean expression appended as WHERE
    :return: the row count
    """
    query = f"SELECT COUNT(*) FROM {self.qualified_table_name(table)}"
    if filters:
        query = f"{query} WHERE {filters}"
    return self.fetchone(query)[0]
|
|
200
|
+
|
|
201
|
+
def query_get_custom_sql(self, query: str) -> Union[int, float, None]:
    """
    Run an arbitrary query and return the first column of its first row.

    :param query: custom SQL to execute
    :return: first value of the first row, or None when no row is returned
    """
    row = self.fetchone(query)
    return row[0] if row is not None else None
|
|
211
|
+
|
|
212
|
+
def query_get_max(self, table: str, field: str, filters: str = None) -> int:
    """
    Fetch ``MAX(field)`` for a table.

    :param table: table name
    :param field: column name
    :param filters: optional filter condition
    :return: the maximum value
    """
    target = self.qualified_table_name(table)
    column = self.quote_column(field)
    query = f"SELECT MAX({column}) FROM {target}"
    if filters:
        query = f"{query} WHERE {filters}"
    return self.fetchone(query)[0]
|
|
229
|
+
|
|
230
|
+
def query_get_min(self, table: str, field: str, filters: str = None) -> int:
    """
    Fetch ``MIN(field)`` for a table.

    :param table: table name
    :param field: column name
    :param filters: optional filter condition
    :return: the minimum value
    """
    target = self.qualified_table_name(table)
    column = self.quote_column(field)
    query = f"SELECT MIN({column}) FROM {target}"
    if filters:
        query = f"{query} WHERE {filters}"
    return self.fetchone(query)[0]
|
|
245
|
+
|
|
246
|
+
def query_get_avg(self, table: str, field: str, filters: str = None) -> int:
    """
    Fetch ``AVG(field)`` for a table, rounded to 2 decimals.

    :param table: table name
    :param field: column name
    :param filters: optional filter condition
    :return: rounded average, or None when the (filtered) table has no rows —
        SQL AVG yields NULL then, which previously crashed ``round(None, 2)``
    """
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)
    query = "SELECT AVG({}) FROM {}".format(field, qualified_table_name)
    if filters:
        query += " WHERE {}".format(filters)

    value = self.fetchone(query)[0]
    return round(value, 2) if value is not None else None
|
|
261
|
+
|
|
262
|
+
def query_get_sum(self, table: str, field: str, filters: str = None) -> int:
    """
    Fetch ``SUM(field)`` for a table, rounded to 2 decimals.

    :param table: table name
    :param field: column name
    :param filters: optional filter condition
    :return: rounded sum, or None when the (filtered) table has no rows —
        SQL SUM yields NULL then, which previously crashed ``round(None, 2)``
    """
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)
    query = "SELECT SUM({}) FROM {}".format(field, qualified_table_name)
    if filters:
        query += " WHERE {}".format(filters)

    value = self.fetchone(query)[0]
    return round(value, 2) if value is not None else None
|
|
277
|
+
|
|
278
|
+
def query_get_variance(self, table: str, field: str, filters: str = None) -> int:
    """
    Fetch the sample variance (``VAR_SAMP``) of a column, rounded to 2 decimals.

    :param table: table name
    :param field: column name
    :param filters: optional filter condition
    :return: rounded variance, or None when the aggregate yields NULL
        (empty table, or a single row) — previously crashed ``round(None, 2)``
    """
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)
    query = "SELECT VAR_SAMP({}) FROM {}".format(field, qualified_table_name)
    if filters:
        query += " WHERE {}".format(filters)

    value = self.fetchone(query)[0]
    return round(value, 2) if value is not None else None
|
|
293
|
+
|
|
294
|
+
def query_get_stddev(self, table: str, field: str, filters: str = None) -> int:
    """
    Fetch the sample standard deviation (``STDDEV_SAMP``) of a column,
    rounded to 2 decimals.

    :param table: table name
    :param field: column name
    :param filters: optional filter condition
    :return: rounded stddev, or None when the aggregate yields NULL
        (empty table, or a single row) — previously crashed ``round(None, 2)``
    """
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)
    query = "SELECT STDDEV_SAMP({}) FROM {}".format(field, qualified_table_name)
    if filters:
        query += " WHERE {}".format(filters)

    value = self.fetchone(query)[0]
    return round(value, 2) if value is not None else None
|
|
309
|
+
|
|
310
|
+
def query_get_null_count(self, table: str, field: str, filters: str = None) -> int:
    """
    Count rows where the column is NULL.

    :param table: table name
    :param field: column name
    :param filters: optional extra filter, ANDed onto the NULL test
    :return: the NULL count
    """
    column = self.quote_column(field)
    query = f"SELECT COUNT(*) FROM {self.qualified_table_name(table)} WHERE {column} IS NULL"
    if filters:
        query = f"{query} AND {filters}"
    return self.fetchone(query)[0]
|
|
324
|
+
|
|
325
|
+
def query_get_empty_string_count(self, table: str, field: str, filters: str = None) -> int:
    """
    Count rows whose column value is the empty string.

    :param table: table name
    :param field: column name
    :param filters: optional extra filter, ANDed onto the empty-string test
    :return: count of empty strings (0 when the query yields no row)
    """
    column = self.quote_column(field)
    query = f"SELECT COUNT(*) FROM {self.qualified_table_name(table)} WHERE {column} = ''"
    if filters:
        query = f"{query} AND {filters}"
    row = self.fetchone(query)
    return row[0] if row else 0
|
|
340
|
+
|
|
341
|
+
def query_get_empty_string_percentage(self, table: str, field: str, filters: str = None) -> float:
    """
    Percentage of rows whose column value is the empty string.

    :param table: table name
    :param field: column name
    :param filters: optional filter condition
    :return: percentage rounded to 2 decimals; 0.0 when no rows match
    """
    column = self.quote_column(field)
    query = (
        f"SELECT SUM(CASE WHEN {column} = '' THEN 1 ELSE 0 END) AS empty_string_count, "
        f"COUNT(*) AS total_count FROM {self.qualified_table_name(table)}"
    )

    if filters:
        query += f" WHERE {filters}"

    row = self.fetchone(query)
    # Guard against an empty result set to avoid dividing by zero.
    if not row or row[1] <= 0:
        return 0.0
    return round((row[0] / row[1]) * 100, 2)
|
|
362
|
+
|
|
363
|
+
def query_get_distinct_count(self, table: str, field: str, filters: str = None) -> int:
    """
    Count the distinct values of a column.

    :param table: table name
    :param field: column name
    :param filters: optional filter condition
    :return: distinct value count
    """
    column = self.quote_column(field)
    query = f"SELECT COUNT(DISTINCT {column}) FROM {self.qualified_table_name(table)}"
    if filters:
        query = f"{query} WHERE {filters}"
    return self.fetchone(query)[0]
|
|
378
|
+
|
|
379
|
+
def query_get_null_percentage(self, table: str, field: str, filters: str = None) -> int:
    """
    Percentage of rows whose column value is NULL.

    :param table: table name
    :param field: column name
    :param filters: optional filter condition
    :return: percentage rounded to 2 decimals; 0 when the (filtered) table
        has no rows
    """
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)
    query = (
        "SELECT SUM(CASE WHEN {} IS NULL THEN 1 ELSE 0 END) AS null_count, COUNT(*) AS total_count FROM {}".format(
            field, qualified_table_name
        )
    )

    if filters:
        query += " WHERE {}".format(filters)

    result = self.fetchone(query)
    # Require total_count > 0: on an empty result SUM is NULL and COUNT is 0,
    # which previously raised TypeError/ZeroDivisionError. This matches the
    # guard already used by query_get_empty_string_percentage().
    if result and result[1] > 0:
        return round((result[0] / result[1]) * 100, 2)
    return 0
|
|
402
|
+
|
|
403
|
+
def query_get_time_diff(self, table: str, field: str) -> int:
    """
    Seconds elapsed since the most recent value in a timestamp column.

    :param table: table name
    :param field: name of the updated-time column
    :return: age of the newest value in seconds, or 0 when no row is returned
    """
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)
    query = f"""
        SELECT {field} from {qualified_table_name} ORDER BY {field} DESC LIMIT 1;
    """
    result = self.fetchone(query)
    if not result:
        return 0
    # NOTE(review): subtracts the DB value from a naive utcnow() — assumes the
    # column holds naive UTC timestamps; confirm per backend.
    elapsed = datetime.utcnow() - result[0]
    return int(elapsed.total_seconds())
|
|
419
|
+
|
|
420
|
+
def profiling_sql_aggregates_numeric(self, table_name: str, column_name: str) -> Dict:
    """
    Collect basic profiling aggregates for a numeric column in one query.

    :param table_name: table to profile
    :param column_name: numeric column to profile
    :return: dict with avg, min, max, sum, stddev, variance,
        distinct_count and missing_count
    """
    column_name = f'"{column_name}"'
    qualified_table_name = self.qualified_table_name(table_name)
    query = f"""
        SELECT
            avg({column_name}) as avg,
            min({column_name}) as min,
            max({column_name}) as max,
            sum({column_name}) as sum,
            stddev_samp({column_name}) as stddev,
            var_samp({column_name}) as variance,
            count(distinct({column_name})) as distinct_count,
            sum(case when {column_name} is null then 1 else 0 end) as missing_count
        FROM {qualified_table_name}
    """

    # Result columns arrive in the same order as the SELECT list above.
    keys = ("avg", "min", "max", "sum", "stddev", "variance", "distinct_count", "missing_count")
    return dict(zip(keys, self.fetchone(query)))
|
|
447
|
+
|
|
448
|
+
def profiling_sql_aggregates_string(self, table_name: str, column_name: str) -> Dict:
    """
    Collect basic profiling aggregates for a string column in one query.

    :param table_name: table to profile
    :param column_name: text column to profile
    :return: dict with distinct_count, missing_count, max_length,
        min_length and avg_length
    """
    column_name = f'"{column_name}"'
    qualified_table_name = self.qualified_table_name(table_name)
    query = f"""
        SELECT
            count(distinct({column_name})) as distinct_count,
            sum(case when {column_name} is null then 1 else 0 end) as missing_count,
            max(length({column_name})) as max_length,
            min(length({column_name})) as min_length,
            avg(length({column_name})) as avg_length
        FROM {qualified_table_name}
    """

    # Result columns arrive in the same order as the SELECT list above.
    keys = ("distinct_count", "missing_count", "max_length", "min_length", "avg_length")
    return dict(zip(keys, self.fetchone(query)))
|
|
469
|
+
|
|
470
|
+
def query_get_duplicate_count(self, table: str, field: str, filters: str = None) -> int:
    """
    Count the distinct values of a column that occur more than once.

    :param table: table name
    :param field: column name
    :param filters: optional filter condition
    :return: number of duplicated values (not the number of duplicate rows)
    """
    where_clause = f"WHERE {filters}" if filters else ""
    qualified_table_name = self.qualified_table_name(table)
    column = self.quote_column(field)
    query = f"""
        SELECT
            count(*) as duplicate_count
        FROM {qualified_table_name}
        {where_clause}
        GROUP BY {column}
        HAVING COUNT(*) > 1
    """

    # One result row per duplicated value, so the row count is the answer.
    rows = self.fetchall(query)
    return len(rows) if rows else 0
|
|
485
|
+
|
|
486
|
+
def query_string_pattern_validity(
    self,
    table: str,
    field: str,
    regex_pattern: str = None,
    predefined_regex_pattern: str = None,
    filters: str = None,
) -> Tuple[int, int]:
    """
    Count values matching a regex (via the SQL ``~`` operator) plus total rows.

    :param table: table name
    :param field: column name
    :param regex_pattern: explicit regex to test against
    :param predefined_regex_pattern: key into ``self.regex_patterns``
    :param filters: optional filter condition
    :return: (matching count, total row count)
    :raises ValueError: when neither pattern argument is given
    """
    if not regex_pattern and not predefined_regex_pattern:
        raise ValueError("Either regex_pattern or predefined_regex_pattern should be provided")

    where_clause = f"WHERE {filters}" if filters else ""
    qualified_table_name = self.qualified_table_name(table)
    column = self.quote_column(field)

    # The predefined pattern wins when both are supplied.
    pattern = (
        self.regex_patterns[predefined_regex_pattern]
        if predefined_regex_pattern
        else regex_pattern
    )
    regex_query = f"case when {column} ~ '{pattern}' then 1 else 0 end"

    query = f"""
        select sum({regex_query}) as valid_count, count(*) as total_count
        from {qualified_table_name} {where_clause}
    """
    row = self.fetchone(query)
    return row[0], row[1]
|
|
521
|
+
|
|
522
|
+
def query_valid_invalid_values_validity(
    self,
    table: str,
    field: str,
    regex_pattern: str = None,
    filters: str = None,
    values: List[str] = None,
) -> Tuple[int, int]:
    """
    Count values that are either in an allow-list or match a regex.

    :param table: table name
    :param field: column name
    :param regex_pattern: regex used when no value list is given
    :param filters: optional filter condition
    :param values: explicit list of valid values (takes precedence over regex)
    :return: (valid count, total row count)
    """
    where_clause = f"WHERE {filters}" if filters else ""
    qualified_table_name = self.qualified_table_name(table)
    column = self.quote_column(field)

    if values:
        in_list = ", ".join(f"'{value}'" for value in values)
        case_expr = f"CASE WHEN {column} IN ({in_list}) THEN 1 ELSE 0 END"
    else:
        case_expr = f"CASE WHEN {column} ~ '{regex_pattern}' THEN 1 ELSE 0 END"

    query = f"""
        SELECT SUM({case_expr}) AS valid_count, COUNT(*) as total_count
        FROM {qualified_table_name}
        {where_clause}
    """
    row = self.fetchone(query)
    return row[0], row[1]
|
|
554
|
+
|
|
555
|
+
def query_get_string_length_metric(
    self, table: str, field: str, metric: str, filters: str = None
) -> Union[int, float]:
    """
    Compute a string-length statistic (max, min or avg) for a column.

    :param table: table name
    :param field: column name
    :param metric: one of 'max', 'min', 'avg' (case-insensitive)
    :param filters: optional filter condition
    :return: the statistic; averages are rounded to 2 decimals
    :raises ValueError: for an unknown metric name
    """
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)

    # Dispatch table instead of an if/elif chain; each entry is the opening
    # of the aggregate expression, completed below with the extra ")".
    aggregates = {"max": "MAX(LENGTH", "min": "MIN(LENGTH", "avg": "AVG(LENGTH"}
    metric_key = metric.lower()
    if metric_key not in aggregates:
        raise ValueError(f"Invalid metric '{metric}'. Choose from 'max', 'min', or 'avg'.")

    query = f"SELECT {aggregates[metric_key]}({field})) FROM {qualified_table_name}"

    if filters:
        query += f" WHERE {filters}"

    value = self.fetchone(query)[0]
    return round(value, 2) if metric_key == "avg" else value
|
|
586
|
+
|
|
587
|
+
def query_get_usa_state_code_validity(self, table: str, field: str, filters: str = None) -> Tuple[int, int]:
    """
    Count values that are valid two-letter USA state codes.

    :param table: table name
    :param field: column name
    :param filters: optional filter condition
    :return: (valid state-code count, total row count)
    """
    state_code_list = ", ".join(f"'{code}'" for code in self.valid_state_codes)

    where_clause = f"WHERE {filters}" if filters else ""

    qualified_table_name = self.qualified_table_name(table)
    column = self.quote_column(field)

    # Shape check (exactly two capitals) plus membership in the code list.
    case_expr = f"CASE WHEN {column} ~ '^[A-Z]{{2}}$' AND {column} IN ({state_code_list}) THEN 1 ELSE 0 END"

    query = f"""
        SELECT SUM({case_expr}) AS valid_count, COUNT(*) AS total_count
        FROM {qualified_table_name} {where_clause}
    """

    row = self.fetchone(query)
    return row[0], row[1]
|
|
612
|
+
|
|
613
|
+
def query_geolocation_metric(
    self, table: str, field: str, operation: str, filters: str = None
) -> Union[int, float]:
    """
    Count (or compute the percentage of) rows with an in-range coordinate.

    The coordinate kind is inferred from the column name: names starting with
    "lat" are validated against [-90, 90]; names starting with "lon" against
    [-180, 180].

    :param table: table name
    :param field: column name (must start with "lat" or "lon")
    :param operation: "percent" for a percentage of all rows, anything else
        for the raw count of valid values
    :param filters: optional SQL filter condition (without the WHERE keyword)
    :return: valid count as int, or percentage rounded to 2 decimals
    :raises ValueError: if the coordinate kind cannot be inferred from field
    """
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)

    valid_query = f"SELECT COUNT({field}) FROM {qualified_table_name} WHERE {field} IS NOT NULL AND {field} "

    if field.lower().startswith("lat"):
        valid_query += "BETWEEN -90 AND 90"
    elif field.lower().startswith("lon"):
        valid_query += "BETWEEN -180 AND 180"
    else:
        # Without this guard the query was left with a dangling
        # "AND {field} " and failed with a SQL syntax error at run time.
        raise ValueError(
            f"Cannot infer coordinate type from column '{field}'; "
            "expected a name starting with 'lat' or 'lon'."
        )

    if filters:
        valid_query += f" AND {filters}"

    valid_count = self.fetchone(valid_query)[0]

    if operation == "percent":
        total_query = f"SELECT COUNT(*) FROM {qualified_table_name}"
        if filters:
            total_query += f" WHERE {filters}"

        total_count = self.fetchone(total_query)[0]

        # Guard against an empty (or fully filtered) table.
        result = (valid_count / total_count) * 100 if total_count > 0 else 0
        return round(result, 2)

    return valid_count
|
|
642
|
+
|
|
643
|
+
def query_get_percentile(self, table: str, field: str, percentile: float, filters: str = None) -> float:
    """
    Get the specified percentile value of a numeric column in a table.

    :param table: table name
    :param field: column name
    :param percentile: percentile to calculate (e.g., 0.2 for 20th percentile)
    :param filters: filter condition
    :return: the value at the specified percentile, rounded to 2 decimals
    """
    target = self.qualified_table_name(table)
    column = self.quote_column(field)

    # PERCENTILE_DISC is PostgreSQL's discrete ordered-set aggregate: it
    # returns an actual value from the column, not an interpolated one.
    where_clause = f" WHERE {filters}" if filters else ""
    sql = (
        f"SELECT PERCENTILE_DISC({percentile}) WITHIN GROUP "
        f"(ORDER BY {column}) FROM {target}{where_clause}"
    )

    value = self.fetchone(sql)[0]
    return round(value, 2)
|
|
658
|
+
|
|
659
|
+
def query_zero_metric(self, table: str, field: str, operation: str, filters: str = None) -> Union[int, float]:
    """
    Count rows where the column equals zero, or the percentage of such rows.

    :param table: table name
    :param field: column name
    :param operation: "percent" for a percentage of all rows, anything else
        for the raw count
    :param filters: optional SQL filter condition (without the WHERE keyword)
    :return: count as int, or percentage rounded to 2 decimals
    """
    target = self.qualified_table_name(table)
    column = self.quote_column(field)

    count_sql = f"SELECT COUNT(*) FROM {target} WHERE {column} = 0"
    if filters:
        count_sql += f" AND {filters}"

    # Raw-count mode: a single query is all that's needed.
    if operation != "percent":
        return self.fetchone(count_sql)[0]

    total_sql = f"SELECT COUNT(*) FROM {target}"
    if filters:
        total_sql += f" WHERE {filters}"

    zero_count = self.fetchone(count_sql)[0]
    total_count = self.fetchone(total_sql)[0]

    # Empty (or fully filtered) table: avoid division by zero.
    if total_count == 0:
        return 0.0

    return round((zero_count / total_count) * 100, 2)
|
|
684
|
+
|
|
685
|
+
def query_negative_metric(self, table: str, field: str, operation: str, filters: str = None) -> Union[int, float]:
    """
    Count rows where the column is negative, or the percentage of such rows.

    :param table: table name
    :param field: column name
    :param operation: "percent" for a percentage of all rows, anything else
        for the raw count
    :param filters: optional SQL filter condition (without the WHERE keyword)
    :return: count as int, or percentage rounded to 2 decimals
    """
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)

    negative_query = f"SELECT COUNT(*) FROM {qualified_table_name} WHERE {field} < 0"
    if filters:
        negative_query += f" AND {filters}"

    negative_count = self.fetchone(negative_query)[0]

    if operation != "percent":
        return negative_count

    total_count_query = f"SELECT COUNT(*) FROM {qualified_table_name}"
    if filters:
        total_count_query += f" WHERE {filters}"

    total_count = self.fetchone(total_count_query)[0]

    # Compute the ratio in Python rather than via the previous
    # "SELECT (sub)/(sub) * 100 FROM {table}" form, which emitted one row
    # per table row, crashed on an empty table, and raised a SQL
    # division-by-zero error when total_count was 0.
    if total_count == 0:
        return 0.0

    return round((negative_count / total_count) * 100, 2)
|
|
706
|
+
|
|
707
|
+
def query_get_all_space_count(
    self, table: str, field: str, operation: str, filters: str = None
) -> Union[int, float]:
    """
    Get the count of rows where the specified column contains only spaces
    (i.e. the value trims down to an empty string), or the percentage of
    such rows.

    :param table: table name
    :param field: column name
    :param operation: "percent" for a percentage of all rows, anything else
        for the raw count
    :param filters: optional SQL filter condition (without the WHERE keyword)
    :return: count of rows with only spaces, or percentage rounded to
        2 decimals
    """
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)

    query = f"""SELECT COUNT(CASE WHEN TRIM({field}) = '' THEN 1 END) AS space_count,COUNT(*) AS total_count FROM {qualified_table_name}
    """

    if filters:
        # The base query has no WHERE clause, so the filter must be
        # introduced with WHERE — appending " AND ..." produced invalid SQL.
        query += f" WHERE {filters}"

    result = self.fetchone(query)

    if operation == "percent":
        # round(..., 2) for consistency with the other percentage metrics
        # in this class (the previous version rounded to an integer).
        return round((result[0] / result[1]) * 100, 2) if result[1] > 0 else 0

    return result[0] if result else 0
|
|
732
|
+
|
|
733
|
+
def query_get_null_keyword_count(
    self, table: str, field: str, operation: str, filters: str = None
) -> Union[int, float]:
    """
    Get the count (or percentage) of NULL-like keyword values — 'nothing',
    'nil', 'null', 'none', 'n/a' (case-insensitive) — in the specified
    column. Actual SQL NULLs are not counted: LOWER(NULL) is NULL, which
    never matches the IN list.

    :param table: table name
    :param field: column name
    :param operation: "percent" for a percentage of all rows, anything else
        for the raw count
    :param filters: optional SQL filter condition (without the WHERE keyword)
    :return: count of NULL-like keyword values, or percentage rounded to
        2 decimals
    """
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)

    # COALESCE guards against SUM(...) returning NULL when no rows survive
    # the filter (which previously made the count path return None). The
    # bare `null` literal formerly inside IN (...) could never match and
    # has been removed — a no-op in SQL's three-valued logic.
    query = f""" SELECT COALESCE(SUM(CASE WHEN LOWER({field}) IN ('nothing', 'nil', 'null', 'none', 'n/a') THEN 1 ELSE 0 END), 0) AS null_count,COUNT(*) AS total_count
    FROM {qualified_table_name}"""

    if filters:
        query += f" WHERE {filters}"

    result = self.fetchone(query)

    if operation == "percent":
        return round((result[0] / result[1]) * 100, 2) if result[1] > 0 else 0

    return result[0] if result else 0
|
|
758
|
+
|
|
759
|
+
def query_timestamp_metric(
    self,
    table: str,
    field: str,
    predefined_regex: str,
    filters: str = None,
) -> Tuple[int, int]:
    """
    Count textual values that are structurally valid ISO-8601 timestamps.

    Validation is done in SQL: each timestamp component (year, month, day,
    hour, minute, second, timezone) is extracted with SUBSTRING and checked
    against component-specific rules, including per-month day limits and
    leap years.

    :param table: Table name
    :param field: Column name
    :param predefined_regex: pattern name; only "timestamp_iso" is supported
    :param filters: filter condition
    :return: Tuple containing valid count and total count; (0, 0) if the
        query fails
    :raises ValueError: if predefined_regex is not a recognized pattern name
    """

    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)

    # Full ISO-8601 shape: date, 'T', time, optional fractional seconds,
    # optional 'Z' or +hh:mm / -hh:mm zone offset.
    timestamp_iso_regex = r"^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])T([01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9](?:\.\d{1,3})?(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])?$"

    if predefined_regex == "timestamp_iso":
        # NOTE(review): regex_condition is built but never referenced below;
        # validation happens via the per-component checks in the CTE.
        regex_condition = f"{field} ~ '{timestamp_iso_regex}'"
    else:
        raise ValueError(f"Unknown predefined regex pattern: {predefined_regex}")

    filters_clause = f"WHERE {filters}" if filters else ""

    # `~` is PostgreSQL's POSIX-regex operator; the doubled braces ({{ }})
    # are f-string escapes that become literal regex quantifiers in SQL.
    query = f"""
    WITH extracted_timestamps AS (
        SELECT
            {field},
            SUBSTRING({field} FROM '^(\d{{4}})') AS year, -- Extract year
            SUBSTRING({field} FROM '^\d{{4}}-(\d{{2}})') AS month, -- Extract month
            SUBSTRING({field} FROM '^\d{{4}}-\d{{2}}-(\d{{2}})') AS day, -- Extract day
            SUBSTRING({field} FROM 'T(\d{{2}})') AS hour, -- Extract hour
            SUBSTRING({field} FROM 'T\d{{2}}:(\d{{2}})') AS minute, -- Extract minute
            SUBSTRING({field} FROM 'T\d{{2}}:\d{{2}}:(\d{{2}})') AS second, -- Extract second
            SUBSTRING({field} FROM '([+-]\d{{2}}:\d{{2}}|Z)$') AS timezone -- Extract timezone
        FROM {qualified_table_name}
        {filters_clause}
    ),
    validated_timestamps AS (
        SELECT
            {field},
            CASE
                WHEN
                    -- Validate each component with its specific rules
                    year ~ '^\d{{4}}$' AND
                    month ~ '^(0[1-9]|1[0-2])$' AND
                    day ~ '^((0[1-9]|[12][0-9])|(30|31))$' AND
                    hour ~ '^([01][0-9]|2[0-3])$' AND
                    minute ~ '^[0-5][0-9]$' AND
                    second ~ '^[0-5][0-9]$' AND
                    (timezone IS NULL OR timezone ~ '^(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])$') AND
                    -- Additional check for days in months (e.g., February)
                    (
                        (month IN ('01', '03', '05', '07', '08', '10', '12') AND day BETWEEN '01' AND '31') OR
                        (month IN ('04', '06', '09', '11') AND day BETWEEN '01' AND '30') OR
                        (month = '02' AND day BETWEEN '01' AND
                            CASE
                                -- Handle leap years
                                WHEN (year::int % 400 = 0 OR (year::int % 100 != 0 AND year::int % 4 = 0)) THEN '29'
                                ELSE '28'
                            END
                        )
                    )
                THEN 1
                ELSE 0
            END AS is_valid
        FROM extracted_timestamps
    )
    SELECT COUNT(*) AS valid_count, COUNT(*) AS total_count
    FROM validated_timestamps
    WHERE is_valid = 1;
    """

    try:
        valid_count = self.fetchone(query)[0]
        # The total is fetched separately: the main query's total_count
        # column counts only valid rows (same value as valid_count) and is
        # not used.
        total_count_query = f"SELECT COUNT(*) FROM {qualified_table_name} {filters_clause}"
        total_count = self.fetchone(total_count_query)[0]

        return valid_count, total_count

    except Exception as e:
        logger.error(f"Error occurred: {e}")
        return 0, 0
|
|
845
|
+
|
|
846
|
+
def query_timestamp_not_in_future_metric(
    self,
    table: str,
    field: str,
    predefined_regex: str,
    filters: str = None,
) -> Tuple[int, int]:
    """
    Count textual ISO-8601 timestamps that are valid AND not in the future.

    Validation mirrors :meth:`query_timestamp_metric` (per-component SQL
    checks, month lengths, leap years); an extra CTE then keeps only values
    that match the full ISO regex and whose ``::timestamp`` cast is at or
    before CURRENT_TIMESTAMP.

    :param table: Table name
    :param field: Column name
    :param predefined_regex: pattern name; only "timestamp_iso" is supported
    :param filters: filter condition
    :return: Tuple containing count of valid timestamps not in the future
        and total count; (0, 0) if the query fails
    :raises ValueError: if predefined_regex is not a recognized pattern name
    """

    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)

    # Full ISO-8601 shape: date, 'T', time, optional fractional seconds,
    # optional 'Z' or +hh:mm / -hh:mm zone offset.
    timestamp_iso_regex = r"^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])T([01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9](?:\.\d{1,3})?(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])?$"

    if predefined_regex == "timestamp_iso":
        # NOTE(review): regex_condition is built but never referenced below;
        # the regex test is inlined in the timestamps_not_in_future CTE.
        regex_condition = f"{field} ~ '{timestamp_iso_regex}'"
    else:
        raise ValueError(f"Unknown predefined regex pattern: {predefined_regex}")

    filters_clause = f"WHERE {filters}" if filters else ""

    # `~` is PostgreSQL's POSIX-regex operator; doubled braces ({{ }}) are
    # f-string escapes that become literal regex quantifiers in SQL.
    query = f"""
    WITH extracted_timestamps AS (
        SELECT
            {field},
            SUBSTRING({field} FROM '^(\d{{4}})') AS year, -- Extract year
            SUBSTRING({field} FROM '^\d{{4}}-(\d{{2}})') AS month, -- Extract month
            SUBSTRING({field} FROM '^\d{{4}}-\d{{2}}-(\d{{2}})') AS day, -- Extract day
            SUBSTRING({field} FROM 'T(\d{{2}})') AS hour, -- Extract hour
            SUBSTRING({field} FROM 'T\d{{2}}:(\d{{2}})') AS minute, -- Extract minute
            SUBSTRING({field} FROM 'T\d{{2}}:\d{{2}}:(\d{{2}})') AS second, -- Extract second
            SUBSTRING({field} FROM '([+-]\d{{2}}:\d{{2}}|Z)$') AS timezone -- Extract timezone
        FROM {qualified_table_name}
        {filters_clause}
    ),
    validated_timestamps AS (
        SELECT
            {field},
            CASE
                WHEN
                    year ~ '^\d{{4}}$' AND
                    month ~ '^(0[1-9]|1[0-2])$' AND
                    day ~ '^((0[1-9]|[12][0-9])|(30|31))$' AND
                    hour ~ '^([01][0-9]|2[0-3])$' AND
                    minute ~ '^[0-5][0-9]$' AND
                    second ~ '^[0-5][0-9]$' AND
                    (timezone IS NULL OR timezone ~ '^(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])$') AND
                    (
                        (month IN ('01', '03', '05', '07', '08', '10', '12') AND day BETWEEN '01' AND '31') OR
                        (month IN ('04', '06', '09', '11') AND day BETWEEN '01' AND '30') OR
                        (month = '02' AND day BETWEEN '01' AND
                            CASE
                                WHEN (year::int % 400 = 0 OR (year::int % 100 != 0 AND year::int % 4 = 0)) THEN '29'
                                ELSE '28'
                            END
                        )
                    )
                THEN 1
                ELSE 0
            END AS is_valid
        FROM extracted_timestamps
    ),
    timestamps_not_in_future AS (
        SELECT *
        FROM validated_timestamps
        WHERE is_valid = 1 AND ({field} ~ '{timestamp_iso_regex}') AND {field}::timestamp <= CURRENT_TIMESTAMP
    )
    SELECT COUNT(*) AS valid_count, (SELECT COUNT(*) FROM {qualified_table_name} {filters_clause}) AS total_count
    FROM timestamps_not_in_future;
    """
    try:
        valid_count = self.fetchone(query)[0]
        # The total is fetched separately; the subselect total_count column
        # in the main query is not read.
        total_count_query = f"SELECT COUNT(*) FROM {qualified_table_name} {filters_clause}"
        total_count = self.fetchone(total_count_query)[0]

        return valid_count, total_count

    except Exception as e:
        logger.error(f"Error occurred: {e}")
        return 0, 0
|
|
932
|
+
|
|
933
|
+
def query_timestamp_date_not_in_future_metric(
    self,
    table: str,
    field: str,
    predefined_regex: str,
    filters: str = None,
) -> Tuple[int, int]:
    """
    Count textual ISO-8601 timestamps whose DATE part is valid and not in
    the future.

    Only the year/month/day components are validated here (month lengths and
    leap years included); the final CTE keeps values matching the full ISO
    regex whose ``::date`` cast is at or before CURRENT_DATE, ignoring the
    time-of-day part.

    :param table: Table name
    :param field: Column name
    :param predefined_regex: The regex pattern to use (e.g., "timestamp_iso")
    :param filters: Optional filter condition
    :return: Tuple containing count of valid dates not in the future and
        total count; (0, 0) if the query fails
    :raises ValueError: if predefined_regex is not a recognized pattern name
    """

    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)

    # Full ISO-8601 shape: date, 'T', time, optional fractional seconds,
    # optional 'Z' or +hh:mm / -hh:mm zone offset.
    timestamp_iso_regex = r"^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])T([01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9](?:\.\d{1,3})?(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])?$"

    if predefined_regex == "timestamp_iso":
        # NOTE(review): regex_condition is built but never referenced below;
        # the regex test is inlined in the dates_not_in_future CTE.
        regex_condition = f"{field} ~ '{timestamp_iso_regex}'"
    else:
        raise ValueError(f"Unknown predefined regex pattern: {predefined_regex}")

    filters_clause = f"WHERE {filters}" if filters else ""

    # `~` is PostgreSQL's POSIX-regex operator; doubled braces ({{ }}) are
    # f-string escapes that become literal regex quantifiers in SQL.
    query = f"""
    WITH extracted_timestamps AS (
        SELECT
            {field},
            SUBSTRING({field} FROM '^(\d{{4}})') AS year, -- Extract year
            SUBSTRING({field} FROM '^\d{{4}}-(\d{{2}})') AS month, -- Extract month
            SUBSTRING({field} FROM '^\d{{4}}-\d{{2}}-(\d{{2}})') AS day -- Extract day
        FROM {qualified_table_name}
        {filters_clause}
    ),
    validated_dates AS (
        SELECT
            {field},
            CASE
                WHEN
                    year ~ '^\d{{4}}$' AND
                    month ~ '^(0[1-9]|1[0-2])$' AND
                    day ~ '^((0[1-9]|[12][0-9])|(30|31))$' AND
                    (
                        (month IN ('01', '03', '05', '07', '08', '10', '12') AND day BETWEEN '01' AND '31') OR
                        (month IN ('04', '06', '09', '11') AND day BETWEEN '01' AND '30') OR
                        (month = '02' AND day BETWEEN '01' AND
                            CASE
                                WHEN (year::int % 400 = 0 OR (year::int % 100 != 0 AND year::int % 4 = 0)) THEN '29'
                                ELSE '28'
                            END
                        )
                    )
                THEN 1
                ELSE 0
            END AS is_valid
        FROM extracted_timestamps
    ),
    dates_not_in_future AS (
        SELECT *
        FROM validated_dates
        WHERE is_valid = 1
        AND ({field} ~ '{timestamp_iso_regex}')
        AND ({field})::date <= CURRENT_DATE -- Compare only the date part against the current date
    )
    SELECT COUNT(*) AS valid_count, (SELECT COUNT(*) FROM {qualified_table_name} {filters_clause}) AS total_count
    FROM dates_not_in_future;
    """

    try:
        valid_count = self.fetchone(query)[0]
        # The total is fetched separately; the subselect total_count column
        # in the main query is not read.
        total_count_query = f"SELECT COUNT(*) FROM {qualified_table_name} {filters_clause}"
        total_count = self.fetchone(total_count_query)[0]

        return valid_count, total_count
    except Exception as e:
        logger.error(f"Error occurred: {e}")
        return 0, 0
|
|
1013
|
+
|
|
1014
|
+
def generate_view_name(self, view_name: str | None = None) -> str:
|
|
1015
|
+
if view_name is not None:
|
|
1016
|
+
return view_name
|
|
1017
|
+
random_string = "".join(secrets.choice(string.ascii_letters + string.digits) for _ in range(8))
|
|
1018
|
+
timestamp = int(time.time())
|
|
1019
|
+
return f"dcs_view_{timestamp}_{random_string.lower()}"
|
|
1020
|
+
|
|
1021
|
+
def create_view(
    self,
    query: str | None = None,
    schema: str | None = None,
    view_name: str | None = None,
) -> str | None:
    """
    Create a database view and return its (possibly schema-qualified) name.

    :param query: SELECT statement the view wraps; when None an empty
        placeholder view (``SELECT 1 AS dummy WHERE 1 = 0``) is created
    :param schema: optional schema to qualify the view name with
    :param view_name: optional explicit view name; a unique one is generated
        when omitted (see ``generate_view_name``)
    :return: the full view name on success, or None when creation failed
    """
    view_name = self.generate_view_name(view_name=view_name)
    schema_prefix = f"{schema}." if schema else ""
    view_name_full = f"{schema_prefix}{view_name}"

    if query is None:
        # Placeholder view: one column, zero rows.
        sql = f"CREATE VIEW {view_name_full} AS SELECT 1 AS dummy WHERE 1 = 0"
    else:
        sql = f"CREATE VIEW {view_name_full} AS {query}"

    try:
        if isinstance(self.connection, (Connection, Engine)):
            # SQLAlchemy objects require the statement wrapped in text().
            if isinstance(self.connection, Engine):
                # Engine: open a short-lived connection and commit it.
                with self.connection.connect() as conn:
                    conn.execute(text(sql))
                    conn.commit()
            else:
                self.connection.execute(text(sql))
                # Best-effort commit: the connection may be in autocommit
                # mode or not support explicit commits.
                try:
                    self.connection.commit()
                except Exception:
                    pass
        else:
            # Plain DB-API style connection.
            plain_sql = str(sql)
            if hasattr(self.connection, "cursor"):
                cur = self.connection.cursor()
                cur.execute(plain_sql)
                # Best-effort commit, as above.
                try:
                    self.connection.commit()
                except Exception:
                    pass
            else:
                # Fallback: object exposes execute() directly.
                self.connection.execute(plain_sql)

        return view_name_full
    except Exception as e:
        logger.error(f"Error creating view {view_name_full}: {e}")
        return None
|
|
1064
|
+
|
|
1065
|
+
def drop_view(self, view_name: str, schema: str | None) -> bool:
    """
    Drop a database view.

    :param view_name: name of the view to drop
    :param schema: optional schema qualifying the view name
    :return: True when the DROP succeeded, False otherwise
    """
    full_view_name = f"{schema}.{view_name}" if schema else view_name
    drop_query = f"DROP VIEW {full_view_name}"
    try:
        connection = self.connection
        if isinstance(connection, Engine):
            # SQLAlchemy engine: open a short-lived connection and commit.
            with connection.connect() as conn:
                conn.execute(text(drop_query))
                conn.commit()
        elif isinstance(connection, Connection):
            # SQLAlchemy connection: execute directly; commit is
            # best-effort since the connection may be in autocommit mode.
            connection.execute(text(drop_query))
            try:
                connection.commit()
            except Exception:
                pass
        elif hasattr(connection, "cursor"):
            # Plain DB-API connection.
            connection.cursor().execute(drop_query)
            try:
                connection.commit()
            except Exception:
                pass
        else:
            # Fallback: object exposes execute() directly.
            connection.execute(str(drop_query))
        return True
    except Exception as e:
        logger.error(f"Error dropping view {full_view_name}: {e}")
        return False
|