dcs-sdk 1.6.4__py3-none-any.whl → 1.6.6__py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (94)
  1. dcs_core/__init__.py +13 -0
  2. dcs_core/__main__.py +17 -0
  3. dcs_core/__version__.py +15 -0
  4. dcs_core/cli/__init__.py +13 -0
  5. dcs_core/cli/cli.py +165 -0
  6. dcs_core/core/__init__.py +19 -0
  7. dcs_core/core/common/__init__.py +13 -0
  8. dcs_core/core/common/errors.py +50 -0
  9. dcs_core/core/common/models/__init__.py +13 -0
  10. dcs_core/core/common/models/configuration.py +284 -0
  11. dcs_core/core/common/models/dashboard.py +24 -0
  12. dcs_core/core/common/models/data_source_resource.py +75 -0
  13. dcs_core/core/common/models/metric.py +160 -0
  14. dcs_core/core/common/models/profile.py +75 -0
  15. dcs_core/core/common/models/validation.py +216 -0
  16. dcs_core/core/common/models/widget.py +44 -0
  17. dcs_core/core/configuration/__init__.py +13 -0
  18. dcs_core/core/configuration/config_loader.py +139 -0
  19. dcs_core/core/configuration/configuration_parser.py +262 -0
  20. dcs_core/core/configuration/configuration_parser_arc.py +328 -0
  21. dcs_core/core/datasource/__init__.py +13 -0
  22. dcs_core/core/datasource/base.py +62 -0
  23. dcs_core/core/datasource/manager.py +112 -0
  24. dcs_core/core/datasource/search_datasource.py +421 -0
  25. dcs_core/core/datasource/sql_datasource.py +1094 -0
  26. dcs_core/core/inspect.py +163 -0
  27. dcs_core/core/logger/__init__.py +13 -0
  28. dcs_core/core/logger/base.py +32 -0
  29. dcs_core/core/logger/default_logger.py +94 -0
  30. dcs_core/core/metric/__init__.py +13 -0
  31. dcs_core/core/metric/base.py +220 -0
  32. dcs_core/core/metric/combined_metric.py +98 -0
  33. dcs_core/core/metric/custom_metric.py +34 -0
  34. dcs_core/core/metric/manager.py +137 -0
  35. dcs_core/core/metric/numeric_metric.py +403 -0
  36. dcs_core/core/metric/reliability_metric.py +90 -0
  37. dcs_core/core/profiling/__init__.py +13 -0
  38. dcs_core/core/profiling/datasource_profiling.py +136 -0
  39. dcs_core/core/profiling/numeric_field_profiling.py +72 -0
  40. dcs_core/core/profiling/text_field_profiling.py +67 -0
  41. dcs_core/core/repository/__init__.py +13 -0
  42. dcs_core/core/repository/metric_repository.py +77 -0
  43. dcs_core/core/utils/__init__.py +13 -0
  44. dcs_core/core/utils/log.py +29 -0
  45. dcs_core/core/utils/tracking.py +105 -0
  46. dcs_core/core/utils/utils.py +44 -0
  47. dcs_core/core/validation/__init__.py +13 -0
  48. dcs_core/core/validation/base.py +230 -0
  49. dcs_core/core/validation/completeness_validation.py +153 -0
  50. dcs_core/core/validation/custom_query_validation.py +24 -0
  51. dcs_core/core/validation/manager.py +282 -0
  52. dcs_core/core/validation/numeric_validation.py +276 -0
  53. dcs_core/core/validation/reliability_validation.py +91 -0
  54. dcs_core/core/validation/uniqueness_validation.py +61 -0
  55. dcs_core/core/validation/validity_validation.py +738 -0
  56. dcs_core/integrations/__init__.py +13 -0
  57. dcs_core/integrations/databases/__init__.py +13 -0
  58. dcs_core/integrations/databases/bigquery.py +187 -0
  59. dcs_core/integrations/databases/databricks.py +51 -0
  60. dcs_core/integrations/databases/db2.py +652 -0
  61. dcs_core/integrations/databases/elasticsearch.py +61 -0
  62. dcs_core/integrations/databases/mssql.py +979 -0
  63. dcs_core/integrations/databases/mysql.py +409 -0
  64. dcs_core/integrations/databases/opensearch.py +64 -0
  65. dcs_core/integrations/databases/oracle.py +719 -0
  66. dcs_core/integrations/databases/postgres.py +570 -0
  67. dcs_core/integrations/databases/redshift.py +53 -0
  68. dcs_core/integrations/databases/snowflake.py +48 -0
  69. dcs_core/integrations/databases/spark_df.py +111 -0
  70. dcs_core/integrations/databases/sybase.py +1069 -0
  71. dcs_core/integrations/storage/__init__.py +13 -0
  72. dcs_core/integrations/storage/local_file.py +149 -0
  73. dcs_core/integrations/utils/__init__.py +13 -0
  74. dcs_core/integrations/utils/utils.py +36 -0
  75. dcs_core/report/__init__.py +13 -0
  76. dcs_core/report/dashboard.py +211 -0
  77. dcs_core/report/models.py +88 -0
  78. dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  79. dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  80. dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  81. dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  82. dcs_core/report/static/assets/images/docs.svg +6 -0
  83. dcs_core/report/static/assets/images/github.svg +4 -0
  84. dcs_core/report/static/assets/images/logo.svg +7 -0
  85. dcs_core/report/static/assets/images/slack.svg +13 -0
  86. dcs_core/report/static/index.js +2 -0
  87. dcs_core/report/static/index.js.LICENSE.txt +3971 -0
  88. dcs_sdk/__version__.py +1 -1
  89. dcs_sdk/cli/cli.py +3 -0
  90. {dcs_sdk-1.6.4.dist-info → dcs_sdk-1.6.6.dist-info}/METADATA +24 -2
  91. dcs_sdk-1.6.6.dist-info/RECORD +159 -0
  92. {dcs_sdk-1.6.4.dist-info → dcs_sdk-1.6.6.dist-info}/entry_points.txt +1 -0
  93. dcs_sdk-1.6.4.dist-info/RECORD +0 -72
  94. {dcs_sdk-1.6.4.dist-info → dcs_sdk-1.6.6.dist-info}/WHEEL +0 -0
dcs_core/integrations/databases/db2.py
@@ -0,0 +1,652 @@
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from datetime import datetime
+ from typing import Any, Dict, List, Tuple, Union
+
+ from loguru import logger
+ from sqlalchemy import create_engine
+ from sqlalchemy.exc import SQLAlchemyError
+
+ from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
+ from dcs_core.core.datasource.sql_datasource import SQLDataSource
+ from dcs_core.integrations.utils.utils import ibm_db2_dll_files_loader
+
+ ibm_db2_dll_files_loader()
+
+
+ class DB2DataSource(SQLDataSource):
+     def __init__(self, data_source_name: str, data_connection: Dict):
+         super().__init__(data_source_name, data_connection)
+
+     def connect(self) -> Any:
+         """
+         Connect to the DB2 data source using SQLAlchemy
+         """
+         try:
+             url = self._build_connection_url()
+             engine = create_engine(url, echo=False)
+             self.connection = engine.connect()
+             return self.connection
+         except SQLAlchemyError as e:
+             raise DataChecksDataSourcesConnectionError(f"Failed to connect to DB2 data source: {str(e)}")
+
+     def _build_connection_url(self) -> str:
+         """
+         Build the SQLAlchemy connection URL for DB2
+         """
+         host = self.data_connection.get("host")
+         port = self.data_connection.get("port")
+         database = self.data_connection.get("database")
+         username = self.data_connection.get("username")
+         password = self.data_connection.get("password")
+
+         url = f"db2+ibm_db://{username}:{password}@{host}:{port}/{database}"
+
+         params = []
+         if self.data_connection.get("security"):
+             params.append(f"SECURITY={self.data_connection['security']}")
+         if self.data_connection.get("protocol"):
+             params.append(f"PROTOCOL={self.data_connection['protocol']}")
+         if self.data_connection.get("schema"):
+             params.append(f"CURRENTSCHEMA={self.data_connection.get('schema')}")
+         if params:
+             url += "?" + "&".join(params)
+
+         return url
+
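For orientation, a minimal connection sketch for the class added above. The host, credentials and schema below are hypothetical placeholders, and the sketch assumes a reachable DB2 instance plus the ibm_db SQLAlchemy dialect implied by the db2+ibm_db URL; the dictionary keys mirror the ones read by _build_connection_url().

    from dcs_core.integrations.databases.db2 import DB2DataSource

    connection = {
        "host": "localhost",    # placeholder values, not taken from the diff
        "port": 50000,
        "database": "SAMPLE",
        "username": "db2inst1",
        "password": "secret",
        "schema": "SALES",      # appended to the URL as CURRENTSCHEMA
    }
    ds = DB2DataSource("db2_example", connection)
    ds.connect()  # raises DataChecksDataSourcesConnectionError on failure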
+     def qualified_table_name(self, table_name: str) -> str:
+         """
+         Get the qualified table name
+         :param table_name: name of the table
+         :return: qualified table name
+         """
+         if self.schema_name:
+             return f'"{self.schema_name}"."{table_name}"'
+         return f'"{table_name}"'
+
+     def quote_column(self, column: str) -> str:
+         """
+         Quote the column name
+         :param column: name of the column
+         :return: quoted column name
+         """
+         return f'"{column}"'
+
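The quoting helpers above wrap identifiers in double quotes, which DB2 treats as case-sensitive delimited identifiers. A standalone sketch of the strings they produce, using a hypothetical SALES schema:

    schema_name = "SALES"   # hypothetical schema and identifiers
    table_name = "ORDERS"
    column = "ORDER_ID"

    qualified = f'"{schema_name}"."{table_name}"'   # -> "SALES"."ORDERS"
    quoted_column = f'"{column}"'                   # -> "ORDER_ID"
    print(qualified, quoted_column)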
+     def query_get_distinct_count(self, table: str, field: str, filters: str = None) -> int:
+         """
+         Get the distinct count value
+         :param table: table name
+         :param field: column name
+         :param filters: filter condition
+         :return: distinct count as an integer
+         """
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+         query = f"SELECT COUNT(DISTINCT CAST({field} AS VARCHAR(255))) FROM {qualified_table_name}"
+
+         if filters:
+             query += f" WHERE {filters}"
+
+         result = self.fetchone(query)
+         return result[0] if result else 0
+
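Assuming the ds instance from the connection sketch earlier, the hypothetical call below would issue a query of the form SELECT COUNT(DISTINCT CAST("STATUS" AS VARCHAR(255))) FROM "SALES"."ORDERS" WHERE ...; table, column and filter are illustrative only.

    distinct_statuses = ds.query_get_distinct_count(
        table="ORDERS",
        field="STATUS",
        filters="ORDER_DATE >= '2024-01-01'",  # optional WHERE condition
    )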
+     def query_negative_metric(self, table: str, field: str, operation: str, filters: str = None) -> Union[int, float]:
+         """
+         Calculate a negative metric for a specified field in a Db2 table.
+         :param table: table name
+         :param field: column name
+         :param operation: type of operation, "percent" or "count"
+         :param filters: optional filter conditions
+         :return: percentage of negative values if operation is "percent", otherwise count of negatives
+         """
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+
+         negative_query = f"SELECT COUNT(*) FROM {qualified_table_name} WHERE {field} < 0"
+         if filters:
+             negative_query += f" AND {filters}"
+
+         total_count_query = f"SELECT COUNT(*) FROM {qualified_table_name}"
+         if filters:
+             total_count_query += f" WHERE {filters}"
+
+         if operation == "percent":
+             query = f"""
+                 SELECT (CAST(({negative_query}) AS FLOAT) / NULLIF(CAST(({total_count_query}) AS FLOAT), 0)) * 100
+                 FROM SYSIBM.SYSDUMMY1
+             """
+         else:
+             query = negative_query
+
+         result = self.fetchone(query)[0]
+         return round(result, 2) if operation == "percent" else result
+
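A usage sketch for the negative-value metric above, again with the hypothetical ds: "percent" wraps the two counts in a SYSIBM.SYSDUMMY1 select and divides through NULLIF to avoid division by zero, while any other operation returns the raw count of negative rows.

    negative_pct = ds.query_negative_metric(table="ORDERS", field="AMOUNT", operation="percent")
    negative_rows = ds.query_negative_metric(table="ORDERS", field="AMOUNT", operation="count")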
+     def query_get_null_keyword_count(
+         self, table: str, field: str, operation: str, filters: str = None
+     ) -> Union[int, float]:
+         """
+         Get the count of NULL-like values (specific keywords) in the specified column for IBM DB2.
+         :param table: table name
+         :param field: column name
+         :param operation: type of operation ('count' or 'percent')
+         :param filters: filter condition
+         :return: count or percentage of NULL-like keyword values
+         """
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+
+         query = f"""
+             SELECT
+                 SUM(CASE
+                         WHEN {field} IS NULL
+                              OR TRIM(UPPER({field})) IN ('NOTHING', 'NIL', 'NULL', 'NONE', 'N/A')
+                         THEN 1
+                         ELSE 0
+                     END) AS null_count,
+                 COUNT(*) AS total_count
+             FROM {qualified_table_name}
+         """
+
+         if filters:
+             query += f" WHERE {filters}"
+
+         result = self.fetchone(query)
+
+         if not result or result[1] == 0:
+             return 0
+
+         if operation == "percent":
+             return round((result[0] or 0) / result[1] * 100, 2)
+
+         return result[0] or 0
+
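An illustrative call for the NULL-keyword check above: values that are NULL, or that equal 'NOTHING', 'NIL', 'NULL', 'NONE' or 'N/A' after TRIM and UPPER, count as missing. Table and column names are hypothetical.

    missing_pct = ds.query_get_null_keyword_count(table="CUSTOMERS", field="EMAIL", operation="percent")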
+     def query_get_string_length_metric(
+         self, table: str, field: str, metric: str, filters: str = None
+     ) -> Union[int, float]:
+         """
+         Get the string length metric (max, min, avg) in a column of a table.
+
+         :param table: table name
+         :param field: column name
+         :param metric: the metric to calculate ('max', 'min', 'avg')
+         :param filters: filter condition
+         :return: the calculated metric as int for 'max' and 'min', float for 'avg'
+         """
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+
+         if metric.lower() == "max":
+             sql_function = "MAX(LENGTH"
+         elif metric.lower() == "min":
+             sql_function = "MIN(LENGTH"
+         elif metric.lower() == "avg":
+             sql_function = "AVG(CAST(LENGTH"
+         else:
+             raise ValueError(f"Invalid metric '{metric}'. Choose from 'max', 'min', or 'avg'.")
+
+         if metric.lower() == "avg":
+             query = f"SELECT {sql_function}({field}) AS FLOAT)) FROM {qualified_table_name}"
+         else:
+             query = f"SELECT {sql_function}({field})) FROM {qualified_table_name}"
+
+         if filters:
+             query += f" WHERE {filters}"
+
+         result = self.fetchone(query)[0]
+         return round(result, 2) if metric.lower() == "avg" else result
+
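Sketch of calling the string-length metric above with the hypothetical ds; metric must be 'max', 'min' or 'avg', and the 'avg' variant is cast to FLOAT in SQL and rounded to two decimals on return.

    avg_len = ds.query_get_string_length_metric(table="CUSTOMERS", field="FIRST_NAME", metric="avg")
    max_len = ds.query_get_string_length_metric(table="CUSTOMERS", field="FIRST_NAME", metric="max")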
+     def query_string_pattern_validity(
+         self,
+         table: str,
+         field: str,
+         regex_pattern: str = None,
+         predefined_regex_pattern: str = None,
+         filters: str = None,
+     ) -> Tuple[int, int]:
+         """
+         Get the count of valid values based on the regex pattern.
+         :param table: table name
+         :param field: column name
+         :param regex_pattern: custom regex pattern
+         :param predefined_regex_pattern: predefined regex pattern
+         :param filters: filter condition
+         :return: count of valid values, count of total row count
+         """
+         filters = f"WHERE {filters}" if filters else ""
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+
+         if not regex_pattern and not predefined_regex_pattern:
+             raise ValueError("Either regex_pattern or predefined_regex_pattern should be provided")
+
+         if predefined_regex_pattern:
+             regex = self.regex_patterns[predefined_regex_pattern]
+         else:
+             regex = regex_pattern
+
+         regex_query = f"""
+             CASE WHEN REGEXP_LIKE({field}, '{regex}') THEN 1 ELSE 0 END"""
+
+         query = f"""
+             SELECT SUM({regex_query}) AS valid_count, COUNT(*) AS total_count
+             FROM {qualified_table_name} {filters}
+         """
+         result = self.fetchone(query)
+         return result[0], result[1]
+
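Usage sketch for the pattern check above. Either a custom regex_pattern or a key into self.regex_patterns (predefined_regex_pattern) must be supplied; the email regex below is a hypothetical custom pattern, not one of the package's predefined keys.

    valid, total = ds.query_string_pattern_validity(
        table="CUSTOMERS",
        field="EMAIL",
        regex_pattern=r"^[^@\s]+@[^@\s]+\.[^@\s]+$",
    )
    invalid = total - valid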
+     def query_valid_invalid_values_validity(
+         self,
+         table: str,
+         field: str,
+         regex_pattern: str = None,
+         filters: str = None,
+         values: List[str] = None,
+     ) -> Tuple[int, int]:
+         """
+         Get the count of valid and invalid values for a specified column.
+         :param table: table name
+         :param field: column name
+         :param values: list of valid values
+         :param regex_pattern: regex pattern
+         :param filters: filter condition
+         :return: count of valid values and total count of rows.
+         """
+         filters = f"WHERE {filters}" if filters else ""
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+         if values:
+             values_str = ", ".join([f"'{value}'" for value in values])
+             validity_condition = f"CASE WHEN {field} IN ({values_str}) THEN 1 ELSE 0 END"
+         else:
+             validity_condition = f"CASE WHEN REGEXP_LIKE({field}, '{regex_pattern}') THEN 1 ELSE 0 END"
+
+         query = f"""
+             SELECT SUM({validity_condition}) AS valid_count, COUNT(*) AS total_count
+             FROM {qualified_table_name}
+             {filters}
+         """
+
+         result = self.fetchone(query)
+         return result[0], result[1]
+
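Sketch of the allow-list variant above, assuming the hypothetical ds: passing values builds an IN (...) membership check, otherwise regex_pattern is used with REGEXP_LIKE.

    valid, total = ds.query_valid_invalid_values_validity(
        table="ORDERS",
        field="STATUS",
        values=["NEW", "SHIPPED", "CANCELLED"],  # hypothetical allow-list
    )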
+     def query_get_usa_state_code_validity(self, table: str, field: str, filters: str = None) -> Tuple[int, int]:
+         """
+         Get the count of valid USA state codes
+         :param table: table name
+         :param field: column name
+         :param filters: filter condition
+         :return: count of valid state codes, count of total row count
+         """
+
+         valid_state_codes_str = ", ".join(f"'{code}'" for code in self.valid_state_codes)
+
+         filters = f"WHERE {filters}" if filters else ""
+
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+         regex_query = f"""
+             CASE WHEN REGEXP_LIKE({field}, '^[A-Z]{{2}}$') AND UPPER({field}) IN ({valid_state_codes_str}) THEN 1 ELSE 0 END
+         """
+
+         query = f"""
+             SELECT SUM({regex_query}) AS valid_count, COUNT(*) AS total_count
+             FROM {qualified_table_name} {filters}
+         """
+         result = self.fetchone(query)
+         return result[0], result[1]
+
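Illustrative call for the state-code check above (hypothetical ds and column); it combines a two-letter regex with membership in self.valid_state_codes.

    valid, total = ds.query_get_usa_state_code_validity(table="CUSTOMERS", field="STATE")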
+     def query_timestamp_metric(
+         self,
+         table: str,
+         field: str,
+         operation: str,
+         predefined_regex: str,
+         filters: str = None,
+     ) -> Union[float, int]:
+         """
+         :param table: Table name
+         :param field: Column name
+         :param operation: Metric operation ("count" or "percent")
+         :param predefined_regex: regex pattern
+         :param filters: filter condition
+         :return: Tuple containing valid count and total count (or percentage)
+         """
+
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+
+         timestamp_iso_regex = r"^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])T([01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9](?:\.\d{1,3})?(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])?$"
+
+         if predefined_regex == "timestamp_iso":
+             regex_condition = f"REGEXP_LIKE({field}, '{timestamp_iso_regex}')"
+         else:
+             raise ValueError(f"Unknown predefined regex pattern: {predefined_regex}")
+
+         filters_clause = f"WHERE {filters}" if filters else ""
+
+         query = f"""
+             WITH extracted_timestamps AS (
+                 SELECT
+                     {field},
+                     SUBSTR({field}, 1, 4) AS year,     -- Extract year
+                     SUBSTR({field}, 6, 2) AS month,    -- Extract month
+                     SUBSTR({field}, 9, 2) AS day,      -- Extract day
+                     SUBSTR({field}, 12, 2) AS hour,    -- Extract hour
+                     SUBSTR({field}, 15, 2) AS minute,  -- Extract minute
+                     SUBSTR({field}, 18, 2) AS second,  -- Extract second
+                     SUBSTR({field}, 20) AS timezone    -- Extract timezone
+                 FROM {qualified_table_name}
+                 {filters_clause}
+             ),
+             validated_timestamps AS (
+                 SELECT
+                     {field},
+                     CASE
+                         WHEN
+                             -- Validate each component with its specific rules
+                             REGEXP_LIKE(year, '^\d{{4}}$') AND
+                             REGEXP_LIKE(month, '^(0[1-9]|1[0-2])$') AND
+                             REGEXP_LIKE(day, '^((0[1-9]|[12][0-9])|(30|31))$') AND
+                             REGEXP_LIKE(hour, '^([01][0-9]|2[0-3])$') AND
+                             REGEXP_LIKE(minute, '^[0-5][0-9]$') AND
+                             REGEXP_LIKE(second, '^[0-5][0-9]$') AND
+                             (timezone IS NULL OR REGEXP_LIKE(timezone, '^(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])$')) AND
+                             -- Additional check for days in months (e.g., February)
+                             (
+                                 (month IN ('01', '03', '05', '07', '08', '10', '12') AND day BETWEEN '01' AND '31') OR
+                                 (month IN ('04', '06', '09', '11') AND day BETWEEN '01' AND '30') OR
+                                 (month = '02' AND day BETWEEN '01' AND
+                                     CASE
+                                         -- Handle leap years
+                                         WHEN (CAST(year AS INT) % 400 = 0 OR (CAST(year AS INT) % 100 != 0 AND CAST(year AS INT) % 4 = 0)) THEN '29'
+                                         ELSE '28'
+                                     END
+                                 )
+                             )
+                         THEN 1
+                         ELSE 0
+                     END AS is_valid
+                 FROM extracted_timestamps
+             )
+             SELECT COUNT(*) AS valid_count
+             FROM validated_timestamps
+             WHERE is_valid = 1;
+         """
+
+         try:
+             valid_count = self.fetchone(query)[0]
+             total_count_query = f"SELECT COUNT(*) FROM {qualified_table_name} {filters_clause}"
+             total_count = self.fetchone(total_count_query)[0]
+
+             if operation == "count":
+                 return valid_count, total_count
+             elif operation == "percent":
+                 return (valid_count / total_count) * 100 if total_count > 0 else 0.0
+             else:
+                 raise ValueError(f"Unknown operation: {operation}")
+
+         except Exception as e:
+             logger.error(f"Failed to execute query: {str(e)}")
+             return 0, 0
+
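The validator above combines the ISO-8601 regex with per-component SQL checks (month and day ranges, leap years). A standalone Python sketch of just the regex part, useful for sanity-checking sample values; note that the regex alone accepts impossible dates such as February 30, which is exactly what the extra CASE logic in the query is there to catch.

    import re

    TIMESTAMP_ISO = (
        r"^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])"
        r"T([01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9]"
        r"(?:\.\d{1,3})?(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])?$"
    )

    print(bool(re.match(TIMESTAMP_ISO, "2024-02-29T13:45:00Z")))   # True: valid leap-day timestamp
    print(bool(re.match(TIMESTAMP_ISO, "2024-13-01T00:00:00")))    # False: month 13
    print(bool(re.match(TIMESTAMP_ISO, "2024-02-30T10:00:00")))    # True: the regex alone misses this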
+     def query_timestamp_not_in_future_metric(
+         self,
+         table: str,
+         field: str,
+         operation: str,
+         predefined_regex: str,
+         filters: str = None,
+     ) -> Union[float, int]:
+         """
+         :param table: Table name
+         :param field: Column name
+         :param operation: Metric operation ("count" or "percent")
+         :param predefined_regex: regex pattern
+         :param filters: filter condition
+         :return: Tuple containing count of valid timestamps not in the future and total count
+         """
+
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+
+         timestamp_iso_regex = r"^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])T([01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9](?:\.\d{1,3})?(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])?$"
+
+         if predefined_regex == "timestamp_iso":
+             regex_condition = f"REGEXP_LIKE({field}, '{timestamp_iso_regex}')"
+         else:
+             raise ValueError(f"Unknown predefined regex pattern: {predefined_regex}")
+
+         filters_clause = f"WHERE {filters}" if filters else ""
+
+         query = f"""
+             WITH extracted_timestamps AS (
+                 SELECT
+                     {field},
+                     SUBSTRING({field}, 1, 4) AS year,     -- Extract year
+                     SUBSTRING({field}, 6, 2) AS month,    -- Extract month
+                     SUBSTRING({field}, 9, 2) AS day,      -- Extract day
+                     SUBSTRING({field}, 12, 2) AS hour,    -- Extract hour
+                     SUBSTRING({field}, 15, 2) AS minute,  -- Extract minute
+                     SUBSTRING({field}, 18, 2) AS second,  -- Extract second
+                     SUBSTRING({field}, 20) AS timezone    -- Extract timezone
+                 FROM {qualified_table_name}
+                 {filters_clause}
+             ),
+             validated_timestamps AS (
+                 SELECT
+                     {field},
+                     CASE
+                         WHEN
+                             REGEXP_LIKE(year, '^\d{{4}}$') AND
+                             REGEXP_LIKE(month, '^(0[1-9]|1[0-2])$') AND
+                             REGEXP_LIKE(day, '^((0[1-9]|[12][0-9])|(30|31))$') AND
+                             REGEXP_LIKE(hour, '^([01][0-9]|2[0-3])$') AND
+                             REGEXP_LIKE(minute, '^[0-5][0-9]$') AND
+                             REGEXP_LIKE(second, '^[0-5][0-9]$') AND
+                             (timezone IS NULL OR REGEXP_LIKE(timezone, '^(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])$')) AND
+                             (
+                                 (month IN ('01', '03', '05', '07', '08', '10', '12') AND day BETWEEN '01' AND '31') OR
+                                 (month IN ('04', '06', '09', '11') AND day BETWEEN '01' AND '30') OR
+                                 (month = '02' AND day BETWEEN '01' AND
+                                     CASE
+                                         WHEN (CAST(year AS INTEGER) % 400 = 0 OR
+                                               (CAST(year AS INTEGER) % 100 != 0 AND
+                                                CAST(year AS INTEGER) % 4 = 0)) THEN '29'
+                                         ELSE '28'
+                                     END
+                                 )
+                             )
+                         THEN 1
+                         ELSE 0
+                     END AS is_valid
+                 FROM extracted_timestamps
+             ),
+             timestamps_not_in_future AS (
+                 SELECT *
+                 FROM validated_timestamps
+                 WHERE is_valid = 1 AND {regex_condition}
+                     AND TO_TIMESTAMP({field}, 'YYYY-MM-DD"T"HH24:MI:SS') <= CURRENT TIMESTAMP
+             )
+             SELECT COUNT(*) AS valid_count, (SELECT COUNT(*) FROM {qualified_table_name} {filters_clause}) AS total_count
+             FROM timestamps_not_in_future;
+         """
+         try:
+             valid_count = self.fetchone(query)[0]
+             total_count_query = f"SELECT COUNT(*) FROM {qualified_table_name} {filters_clause}"
+             total_count = self.fetchone(total_count_query)[0]
+
+             if operation == "count":
+                 return valid_count, total_count
+             elif operation == "percent":
+                 return (valid_count / total_count) * 100 if total_count > 0 else 0
+             else:
+                 raise ValueError(f"Unknown operation: {operation}")
+
+         except Exception as e:
+             logger.error(f"Failed to execute query: {str(e)}")
+             return 0, 0
+
+     def query_timestamp_date_not_in_future_metric(
+         self,
+         table: str,
+         field: str,
+         operation: str,
+         predefined_regex: str,
+         filters: str = None,
+     ) -> Union[float, int]:
+         """
+         :param table: Table name
+         :param field: Column name
+         :param operation: Metric operation ("count" or "percent")
+         :param predefined_regex: The regex pattern to use (e.g., "timestamp_iso")
+         :param filters: Optional filter condition
+         :return: Tuple containing count of valid dates not in the future and total count
+         """
+
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+
+         timestamp_iso_regex = r"^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])T([01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9](?:\.\d{1,3})?(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])?$"
+
+         if predefined_regex == "timestamp_iso":
+             regex_condition = f"REGEXP_LIKE({field}, '{timestamp_iso_regex}')"
+         else:
+             raise ValueError(f"Unknown predefined regex pattern: {predefined_regex}")
+
+         filters_clause = f"WHERE {filters}" if filters else ""
+
+         query = f"""
+             WITH extracted_timestamps AS (
+                 SELECT
+                     {field},
+                     SUBSTRING({field}, 1, 4) AS year,   -- Extract year
+                     SUBSTRING({field}, 6, 2) AS month,  -- Extract month
+                     SUBSTRING({field}, 9, 2) AS day     -- Extract day
+                 FROM {qualified_table_name}
+                 {filters_clause}
+             ),
+             validated_dates AS (
+                 SELECT
+                     {field},
+                     CASE
+                         WHEN
+                             REGEXP_LIKE(year, '^\d{{4}}$') AND
+                             REGEXP_LIKE(month, '^(0[1-9]|1[0-2])$') AND
+                             REGEXP_LIKE(day, '^((0[1-9]|[12][0-9])|(30|31))$') AND
+                             (
+                                 (month IN ('01', '03', '05', '07', '08', '10', '12') AND day BETWEEN '01' AND '31') OR
+                                 (month IN ('04', '06', '09', '11') AND day BETWEEN '01' AND '30') OR
+                                 (month = '02' AND day BETWEEN '01' AND
+                                     CASE
+                                         WHEN (CAST(year AS INTEGER) % 400 = 0 OR
+                                               (CAST(year AS INTEGER) % 100 != 0 AND
+                                                CAST(year AS INTEGER) % 4 = 0)) THEN '29'
+                                         ELSE '28'
+                                     END
+                                 )
+                             )
+                         THEN 1
+                         ELSE 0
+                     END AS is_valid
+                 FROM extracted_timestamps
+             ),
+             dates_not_in_future AS (
+                 SELECT *
+                 FROM validated_dates
+                 WHERE is_valid = 1
+                     AND {regex_condition}
+                     AND DATE(TO_TIMESTAMP({field}, 'YYYY-MM-DD"T"HH24:MI:SS')) <= CURRENT DATE -- Compare only the date part
+             )
+             SELECT COUNT(*) AS valid_count, (SELECT COUNT(*) FROM {qualified_table_name} {filters_clause}) AS total_count
+             FROM dates_not_in_future;
+         """
+
+         try:
+             valid_count = self.fetchone(query)[0]
+             total_count_query = f"SELECT COUNT(*) FROM {qualified_table_name} {filters_clause}"
+             total_count = self.fetchone(total_count_query)[0]
+
+             if operation == "count":
+                 return valid_count, total_count
+             elif operation == "percent":
+                 return (valid_count / total_count) * 100 if total_count > 0 else 0
+             else:
+                 raise ValueError(f"Unknown operation: {operation}")
+
+         except Exception as e:
+             logger.error(f"Failed to execute query: {str(e)}")
+             return 0, 0
+
+     def query_geolocation_metric(
+         self, table: str, field: str, operation: str, filters: str = None
+     ) -> Union[int, float]:
+         qualified_table_name = self.qualified_table_name(table)
+         raw_field = field  # unquoted name, used for the lat/lon prefix check below
+         field = self.quote_column(field)
+
+         valid_query = f"SELECT COUNT({field}) FROM {qualified_table_name} WHERE {field} IS NOT NULL AND {field}"
+
+         if raw_field.lower().startswith("lat"):
+             valid_query += " BETWEEN -90 AND 90"
+         elif raw_field.lower().startswith("lon"):
+             valid_query += " BETWEEN -180 AND 180"
+
+         if filters:
+             valid_query += f" AND {filters}"
+
+         valid_count = self.fetchone(valid_query)[0]
+
+         if operation == "percent":
+             total_query = f"SELECT COUNT(*) FROM {qualified_table_name}"
+             if filters:
+                 total_query += f" WHERE {filters}"
+
+             total_count = self.fetchone(total_query)[0]
+
+             result = (valid_count / total_count) * 100 if total_count > 0 else 0
+             return round(result, 2)
+
+         return valid_count
+
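Illustrative calls for the geolocation check above (hypothetical ds and columns): the column-name prefix selects the range, -90..90 for latitude-like names and -180..180 for longitude-like names.

    lat_valid_pct = ds.query_geolocation_metric(table="SITES", field="latitude", operation="percent")
    lon_valid_count = ds.query_geolocation_metric(table="SITES", field="longitude", operation="count")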
+     def query_timestamp_metric(self):
+         raise NotImplementedError("Method not implemented for DB2DataSource")
+
+     def query_timestamp_not_in_future_metric(self):
+         raise NotImplementedError("Method not implemented for DB2DataSource")
+
+     def query_timestamp_date_not_in_future_metric(self):
+         raise NotImplementedError("Method not implemented for DB2DataSource")
+
+     def query_get_time_diff(self, table: str, field: str) -> int:
+         """
+         Get the time difference
+         :param table: name of the table
+         :param field: field name of updated time column
+         :return: time difference in seconds
+         """
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+         query = f"""
+             SELECT {field}
+             FROM {qualified_table_name}
+             ORDER BY {field} DESC
+             FETCH FIRST 1 ROWS ONLY;
+         """
+         result = self.fetchone(query)
+         if result:
+             updated_time = result[0]
+             if isinstance(updated_time, str):
+                 updated_time = datetime.strptime(updated_time, "%Y-%m-%d %H:%M:%S.%f")
+             return int((datetime.utcnow() - updated_time).total_seconds())
+         return 0
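Finally, a sketch of the freshness helper above with the hypothetical ds: it reads the most recent value of the given timestamp column (ORDER BY ... DESC with FETCH FIRST 1 ROWS ONLY) and returns its age in seconds relative to the current UTC time.

    seconds_since_update = ds.query_get_time_diff(table="ORDERS", field="UPDATED_AT")
    print(f"last update {seconds_since_update} seconds ago")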