dcs-sdk 1.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159) hide show
  1. data_diff/__init__.py +221 -0
  2. data_diff/__main__.py +517 -0
  3. data_diff/abcs/__init__.py +13 -0
  4. data_diff/abcs/compiler.py +27 -0
  5. data_diff/abcs/database_types.py +402 -0
  6. data_diff/config.py +141 -0
  7. data_diff/databases/__init__.py +38 -0
  8. data_diff/databases/_connect.py +323 -0
  9. data_diff/databases/base.py +1417 -0
  10. data_diff/databases/bigquery.py +376 -0
  11. data_diff/databases/clickhouse.py +217 -0
  12. data_diff/databases/databricks.py +262 -0
  13. data_diff/databases/duckdb.py +207 -0
  14. data_diff/databases/mssql.py +343 -0
  15. data_diff/databases/mysql.py +189 -0
  16. data_diff/databases/oracle.py +238 -0
  17. data_diff/databases/postgresql.py +293 -0
  18. data_diff/databases/presto.py +222 -0
  19. data_diff/databases/redis.py +93 -0
  20. data_diff/databases/redshift.py +233 -0
  21. data_diff/databases/snowflake.py +222 -0
  22. data_diff/databases/sybase.py +720 -0
  23. data_diff/databases/trino.py +73 -0
  24. data_diff/databases/vertica.py +174 -0
  25. data_diff/diff_tables.py +489 -0
  26. data_diff/errors.py +17 -0
  27. data_diff/format.py +369 -0
  28. data_diff/hashdiff_tables.py +1026 -0
  29. data_diff/info_tree.py +76 -0
  30. data_diff/joindiff_tables.py +434 -0
  31. data_diff/lexicographic_space.py +253 -0
  32. data_diff/parse_time.py +88 -0
  33. data_diff/py.typed +0 -0
  34. data_diff/queries/__init__.py +13 -0
  35. data_diff/queries/api.py +213 -0
  36. data_diff/queries/ast_classes.py +811 -0
  37. data_diff/queries/base.py +38 -0
  38. data_diff/queries/extras.py +43 -0
  39. data_diff/query_utils.py +70 -0
  40. data_diff/schema.py +67 -0
  41. data_diff/table_segment.py +583 -0
  42. data_diff/thread_utils.py +112 -0
  43. data_diff/utils.py +1022 -0
  44. data_diff/version.py +15 -0
  45. dcs_core/__init__.py +13 -0
  46. dcs_core/__main__.py +17 -0
  47. dcs_core/__version__.py +15 -0
  48. dcs_core/cli/__init__.py +13 -0
  49. dcs_core/cli/cli.py +165 -0
  50. dcs_core/core/__init__.py +19 -0
  51. dcs_core/core/common/__init__.py +13 -0
  52. dcs_core/core/common/errors.py +50 -0
  53. dcs_core/core/common/models/__init__.py +13 -0
  54. dcs_core/core/common/models/configuration.py +284 -0
  55. dcs_core/core/common/models/dashboard.py +24 -0
  56. dcs_core/core/common/models/data_source_resource.py +75 -0
  57. dcs_core/core/common/models/metric.py +160 -0
  58. dcs_core/core/common/models/profile.py +75 -0
  59. dcs_core/core/common/models/validation.py +216 -0
  60. dcs_core/core/common/models/widget.py +44 -0
  61. dcs_core/core/configuration/__init__.py +13 -0
  62. dcs_core/core/configuration/config_loader.py +139 -0
  63. dcs_core/core/configuration/configuration_parser.py +262 -0
  64. dcs_core/core/configuration/configuration_parser_arc.py +328 -0
  65. dcs_core/core/datasource/__init__.py +13 -0
  66. dcs_core/core/datasource/base.py +62 -0
  67. dcs_core/core/datasource/manager.py +112 -0
  68. dcs_core/core/datasource/search_datasource.py +421 -0
  69. dcs_core/core/datasource/sql_datasource.py +1094 -0
  70. dcs_core/core/inspect.py +163 -0
  71. dcs_core/core/logger/__init__.py +13 -0
  72. dcs_core/core/logger/base.py +32 -0
  73. dcs_core/core/logger/default_logger.py +94 -0
  74. dcs_core/core/metric/__init__.py +13 -0
  75. dcs_core/core/metric/base.py +220 -0
  76. dcs_core/core/metric/combined_metric.py +98 -0
  77. dcs_core/core/metric/custom_metric.py +34 -0
  78. dcs_core/core/metric/manager.py +137 -0
  79. dcs_core/core/metric/numeric_metric.py +403 -0
  80. dcs_core/core/metric/reliability_metric.py +90 -0
  81. dcs_core/core/profiling/__init__.py +13 -0
  82. dcs_core/core/profiling/datasource_profiling.py +136 -0
  83. dcs_core/core/profiling/numeric_field_profiling.py +72 -0
  84. dcs_core/core/profiling/text_field_profiling.py +67 -0
  85. dcs_core/core/repository/__init__.py +13 -0
  86. dcs_core/core/repository/metric_repository.py +77 -0
  87. dcs_core/core/utils/__init__.py +13 -0
  88. dcs_core/core/utils/log.py +29 -0
  89. dcs_core/core/utils/tracking.py +105 -0
  90. dcs_core/core/utils/utils.py +44 -0
  91. dcs_core/core/validation/__init__.py +13 -0
  92. dcs_core/core/validation/base.py +230 -0
  93. dcs_core/core/validation/completeness_validation.py +153 -0
  94. dcs_core/core/validation/custom_query_validation.py +24 -0
  95. dcs_core/core/validation/manager.py +282 -0
  96. dcs_core/core/validation/numeric_validation.py +276 -0
  97. dcs_core/core/validation/reliability_validation.py +91 -0
  98. dcs_core/core/validation/uniqueness_validation.py +61 -0
  99. dcs_core/core/validation/validity_validation.py +738 -0
  100. dcs_core/integrations/__init__.py +13 -0
  101. dcs_core/integrations/databases/__init__.py +13 -0
  102. dcs_core/integrations/databases/bigquery.py +187 -0
  103. dcs_core/integrations/databases/databricks.py +51 -0
  104. dcs_core/integrations/databases/db2.py +652 -0
  105. dcs_core/integrations/databases/elasticsearch.py +61 -0
  106. dcs_core/integrations/databases/mssql.py +829 -0
  107. dcs_core/integrations/databases/mysql.py +409 -0
  108. dcs_core/integrations/databases/opensearch.py +64 -0
  109. dcs_core/integrations/databases/oracle.py +719 -0
  110. dcs_core/integrations/databases/postgres.py +482 -0
  111. dcs_core/integrations/databases/redshift.py +53 -0
  112. dcs_core/integrations/databases/snowflake.py +48 -0
  113. dcs_core/integrations/databases/spark_df.py +111 -0
  114. dcs_core/integrations/databases/sybase.py +1069 -0
  115. dcs_core/integrations/storage/__init__.py +13 -0
  116. dcs_core/integrations/storage/local_file.py +149 -0
  117. dcs_core/integrations/utils/__init__.py +13 -0
  118. dcs_core/integrations/utils/utils.py +36 -0
  119. dcs_core/report/__init__.py +13 -0
  120. dcs_core/report/dashboard.py +211 -0
  121. dcs_core/report/models.py +88 -0
  122. dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  123. dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  124. dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  125. dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  126. dcs_core/report/static/assets/images/docs.svg +6 -0
  127. dcs_core/report/static/assets/images/github.svg +4 -0
  128. dcs_core/report/static/assets/images/logo.svg +7 -0
  129. dcs_core/report/static/assets/images/slack.svg +13 -0
  130. dcs_core/report/static/index.js +2 -0
  131. dcs_core/report/static/index.js.LICENSE.txt +3971 -0
  132. dcs_sdk/__init__.py +13 -0
  133. dcs_sdk/__main__.py +18 -0
  134. dcs_sdk/__version__.py +15 -0
  135. dcs_sdk/cli/__init__.py +13 -0
  136. dcs_sdk/cli/cli.py +163 -0
  137. dcs_sdk/sdk/__init__.py +58 -0
  138. dcs_sdk/sdk/config/__init__.py +13 -0
  139. dcs_sdk/sdk/config/config_loader.py +491 -0
  140. dcs_sdk/sdk/data_diff/__init__.py +13 -0
  141. dcs_sdk/sdk/data_diff/data_differ.py +821 -0
  142. dcs_sdk/sdk/rules/__init__.py +15 -0
  143. dcs_sdk/sdk/rules/rules_mappping.py +31 -0
  144. dcs_sdk/sdk/rules/rules_repository.py +214 -0
  145. dcs_sdk/sdk/rules/schema_rules.py +65 -0
  146. dcs_sdk/sdk/utils/__init__.py +13 -0
  147. dcs_sdk/sdk/utils/serializer.py +25 -0
  148. dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
  149. dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
  150. dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
  151. dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
  152. dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
  153. dcs_sdk/sdk/utils/table.py +475 -0
  154. dcs_sdk/sdk/utils/themes.py +40 -0
  155. dcs_sdk/sdk/utils/utils.py +349 -0
  156. dcs_sdk-1.6.5.dist-info/METADATA +150 -0
  157. dcs_sdk-1.6.5.dist-info/RECORD +159 -0
  158. dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
  159. dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,1094 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import secrets
16
+ import string
17
+ import time
18
+ from datetime import datetime
19
+ from typing import Dict, List, Optional, Tuple, Union
20
+
21
+ from loguru import logger
22
+ from sqlalchemy import inspect, text
23
+ from sqlalchemy.engine import Connection, Engine
24
+
25
+ from dcs_core.core.datasource.base import DataSource
26
+
27
+
28
+ class SQLDataSource(DataSource):
29
+ """
30
+ Abstract class for SQL data sources
31
+ """
32
+
33
def __init__(self, data_source_name: str, data_connection: Dict):
    """
    Initialise connection state and the lookup tables used by validity checks
    :param data_source_name: name of the data source
    :param data_connection: connection settings; "database" and "schema" are read here
    """
    super().__init__(data_source_name, data_connection)

    # No connection object is held until one is assigned to this attribute.
    self.connection: Union[Connection, None] = None
    self.database: str = data_connection.get("database")
    self.schema_name = data_connection.get("schema", None)
    # When True, fetchall/fetchone wrap the query in sqlalchemy.text().
    self.use_sa_text_query = True

    # Predefined regex patterns, keyed by the pattern name.
    predefined_patterns = [
        ("uuid", r"^[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$"),
        ("usa_phone", r"^(\+1[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}$"),
        ("email", r"^(?!.*\.\.)(?!.*@.*@)[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"),
        ("usa_zip_code", r"^[0-9]{5}(?:-[0-9]{4})?$"),
        ("ssn", r"^(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}$"),
        ("sedol", r"^[B-DF-HJ-NP-TV-XZ0-9]{6}[0-9]$"),
        ("lei", r"^[A-Z0-9]{18}[0-9]{2}$"),
        ("cusip", r"^[0-9A-Z]{9}$"),
        ("figi", r"^BBG[A-Z0-9]{9}$"),
        ("isin", r"^[A-Z]{2}[A-Z0-9]{9}[0-9]$"),
        ("perm_id", r"^\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{3}$"),
    ]
    self.regex_patterns = dict(predefined_patterns)

    # Two-letter codes accepted by the USA state code validity check.
    self.valid_state_codes = (
        "AL AK AZ AR CA CO CT DE FL GA "
        "HI ID IL IN IA KS KY LA ME MD "
        "MA MI MN MS MO MT NE NV NH NJ "
        "NM NY NC ND OH OK OR PA RI SC "
        "SD TN TX UT VT VA WA WV WI WY"
    ).split()
106
+
107
def is_connected(self) -> bool:
    """
    Report whether a connection object is currently held
    :return: True when self.connection is not None
    """
    has_connection = self.connection is not None
    return has_connection
112
+
113
def close(self):
    """
    Close the connection and dispose of its engine
    Does nothing when no connection is held. A failure while disposing the
    engine is logged rather than raised. The connection attribute is reset
    to None afterwards so that is_connected() reports the closed state.
    """
    connection = self.connection
    if connection is None:
        # Nothing to close (never connected, or already closed).
        return

    try:
        connection.close()
        try:
            connection.engine.dispose()
        except Exception as e:
            logger.error(f"Failed to close the connection: {str(e)}")
    finally:
        # Drop the reference even if close() raised, so the object is not reused.
        self.connection = None
119
+
120
def fetchall(self, query):
    """
    Execute the query and return all rows
    :param query: query to execute; wrapped in text() when use_sa_text_query is set
    :return: all rows of the result
    """
    statement = text(query) if self.use_sa_text_query else query
    return self.connection.execute(statement).fetchall()
124
+
125
def fetchone(self, query):
    """
    Execute the query and return the first row
    :param query: query to execute; wrapped in text() when use_sa_text_query is set
    :return: first row of the result
    """
    statement = text(query) if self.use_sa_text_query else query
    return self.connection.execute(statement).fetchone()
129
+
130
def safe_get(self, lst, idx, default=None):
    """
    Return lst[idx], or the default when idx is out of range
    :param lst: sequence to read from
    :param idx: index to read; negative indexes are treated as out of range
    :param default: value returned when idx is out of range
    """
    if idx < 0 or idx >= len(lst):
        return default
    return lst[idx]
132
+
133
def qualified_table_name(self, table_name: str) -> str:
    """
    Build the bracket-quoted table name, prefixed by the schema when one is set
    :param table_name: name of the table
    :return: qualified table name
    """
    parts = [self.schema_name, table_name] if self.schema_name else [table_name]
    return ".".join(f"[{part}]" for part in parts)
142
+
143
def quote_database(self, database: str) -> str:
    """
    Wrap the database name in double quotes
    :param database: name of the database
    :return: quoted database name
    """
    return '"{}"'.format(database)
150
+
151
def quote_column(self, column: str) -> str:
    """
    Wrap the column name in square brackets
    :param column: name of the column
    :return: quoted column name
    """
    return "[{}]".format(column)
158
+
159
def query_get_database_version(self, database_version_query: Optional[str] = None) -> Optional[str]:
    """
    Get the database version
    :param database_version_query: query to run instead of the default "SELECT @@version"
    :return: version string, or None when the query yields no row or an empty value
    """
    query = database_version_query or "SELECT @@version"
    row = self.fetchone(query)
    if not row:
        # No row came back; indexing it would raise.
        return None
    return row[0] or None
167
+
168
def query_get_column_metadata(self, table_name: str) -> Dict[str, str]:
    """
    Get the column metadata through the SQLAlchemy inspector
    :param table_name: name of the table
    :return: mapping of column name to the name of the column type's Python type
    """
    inspector = inspect(self.connection.engine)
    return {
        column["name"]: column["type"].python_type.__name__
        for column in inspector.get_columns(table_name)
    }
181
+
182
def query_get_table_metadata(self) -> List[str]:
    """
    Get the table names through the SQLAlchemy inspector
    :return: list of table names
    """
    inspector = inspect(self.connection.engine)
    table_names = inspector.get_table_names()
    return table_names
188
+
189
def query_get_row_count(self, table: str, filters: str = None) -> int:
    """
    Count the rows of a table
    :param table: name of the table
    :param filters: optional filter condition, appended as a WHERE clause
    :return: row count
    """
    where_clause = f" WHERE {filters}" if filters else ""
    row = self.fetchone(f"SELECT COUNT(*) FROM {self.qualified_table_name(table)}{where_clause}")
    return row[0]
200
+
201
def query_get_custom_sql(self, query: str) -> Union[int, float, None]:
    """
    Run a custom sql query and return the first column of its first row
    :param query: custom sql query
    :return: first value of the first row, or None when there is no row
    """
    first_row = self.fetchone(query)
    return None if first_row is None else first_row[0]
211
+
212
def query_get_max(self, table: str, field: str, filters: str = None) -> int:
    """
    Get the max value of a column
    :param table: table name
    :param field: column name
    :param filters: filter condition
    :return: max value as returned by the database
    """
    target = self.qualified_table_name(table)
    column = self.quote_column(field)
    where_clause = f" WHERE {filters}" if filters else ""
    row = self.fetchone(f"SELECT MAX({column}) FROM {target}{where_clause}")
    return row[0]
229
+
230
def query_get_min(self, table: str, field: str, filters: str = None) -> int:
    """
    Get the min value of a column
    :param table: table name
    :param field: column name
    :param filters: filter condition
    :return: min value as returned by the database
    """
    target = self.qualified_table_name(table)
    column = self.quote_column(field)
    where_clause = f" WHERE {filters}" if filters else ""
    row = self.fetchone(f"SELECT MIN({column}) FROM {target}{where_clause}")
    return row[0]
245
+
246
def query_get_avg(self, table: str, field: str, filters: str = None) -> Optional[float]:
    """
    Get the average value of a column
    :param table: table name
    :param field: column name
    :param filters: filter condition
    :return: average rounded to 2 decimals, or None when the database returns no value
    """
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)
    query = "SELECT AVG({}) FROM {}".format(field, qualified_table_name)
    if filters:
        query += " WHERE {}".format(filters)

    row = self.fetchone(query)
    value = row[0] if row else None
    # The aggregate is NULL when no non-null rows match; round(None) would raise.
    return None if value is None else round(value, 2)
261
+
262
def query_get_sum(self, table: str, field: str, filters: str = None) -> Optional[float]:
    """
    Get the sum of a column
    :param table: table name
    :param field: column name
    :param filters: filter condition
    :return: sum rounded to 2 decimals, or None when the database returns no value
    """
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)
    query = "SELECT SUM({}) FROM {}".format(field, qualified_table_name)
    if filters:
        query += " WHERE {}".format(filters)

    row = self.fetchone(query)
    value = row[0] if row else None
    # The aggregate is NULL when no non-null rows match; round(None) would raise.
    return None if value is None else round(value, 2)
277
+
278
def query_get_variance(self, table: str, field: str, filters: str = None) -> Optional[float]:
    """
    Get the sample variance of a column
    :param table: table name
    :param field: column name
    :param filters: filter condition
    :return: variance rounded to 2 decimals, or None when the database returns no value
    """
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)
    query = "SELECT VAR_SAMP({}) FROM {}".format(field, qualified_table_name)
    if filters:
        query += " WHERE {}".format(filters)

    row = self.fetchone(query)
    value = row[0] if row else None
    # The aggregate can come back as NULL; round(None) would raise.
    return None if value is None else round(value, 2)
293
+
294
def query_get_stddev(self, table: str, field: str, filters: str = None) -> Optional[float]:
    """
    Get the sample standard deviation of a column
    :param table: table name
    :param field: column name
    :param filters: filter condition
    :return: standard deviation rounded to 2 decimals, or None when the database returns no value
    """
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)
    query = "SELECT STDDEV_SAMP({}) FROM {}".format(field, qualified_table_name)
    if filters:
        query += " WHERE {}".format(filters)

    row = self.fetchone(query)
    value = row[0] if row else None
    # The aggregate can come back as NULL; round(None) would raise.
    return None if value is None else round(value, 2)
309
+
310
def query_get_null_count(self, table: str, field: str, filters: str = None) -> int:
    """
    Count the rows whose column value is NULL
    :param table: table name
    :param field: column name
    :param filters: filter condition, combined with AND
    :return: null count
    """
    target = self.qualified_table_name(table)
    column = self.quote_column(field)
    extra_condition = f" AND {filters}" if filters else ""
    row = self.fetchone(f"SELECT COUNT(*) FROM {target} WHERE {column} IS NULL{extra_condition}")
    return row[0]
324
+
325
def query_get_empty_string_count(self, table: str, field: str, filters: str = None) -> int:
    """
    Count the rows whose column value is the empty string
    :param table: table name
    :param field: column name
    :param filters: filter condition, combined with AND
    :return: count of empty strings, 0 when the query yields no row
    """
    target = self.qualified_table_name(table)
    column = self.quote_column(field)
    extra_condition = f" AND {filters}" if filters else ""
    row = self.fetchone(f"SELECT COUNT(*) FROM {target} WHERE {column} = ''{extra_condition}")
    if not row:
        return 0
    return row[0]
340
+
341
def query_get_empty_string_percentage(self, table: str, field: str, filters: str = None) -> float:
    """
    Get the percentage of rows whose column value is the empty string
    :param table: table name
    :param field: column name
    :param filters: filter condition
    :return: empty string percentage rounded to 2 decimals, 0.0 when there are no rows
    """
    target = self.qualified_table_name(table)
    column = self.quote_column(field)
    where_clause = f" WHERE {filters}" if filters else ""
    query = (
        f"SELECT SUM(CASE WHEN {column} = '' THEN 1 ELSE 0 END) AS empty_string_count, "
        f"COUNT(*) AS total_count FROM {target}{where_clause}"
    )

    row = self.fetchone(query)
    if not row or not row[1] > 0:
        return 0.0
    return round((row[0] / row[1]) * 100, 2)
362
+
363
def query_get_distinct_count(self, table: str, field: str, filters: str = None) -> int:
    """
    Count the distinct values of a column
    :param table: table name
    :param field: column name
    :param filters: filter condition
    :return: distinct count
    """
    target = self.qualified_table_name(table)
    column = self.quote_column(field)
    where_clause = f" WHERE {filters}" if filters else ""
    row = self.fetchone(f"SELECT COUNT(DISTINCT {column}) FROM {target}{where_clause}")
    return row[0]
378
+
379
def query_get_null_percentage(self, table: str, field: str, filters: str = None) -> float:
    """
    Get the percentage of rows whose column value is NULL
    :param table: table name
    :param field: column name
    :param filters: filter condition
    :return: null percentage rounded to 2 decimals, 0 when there are no rows
    """
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)
    query = (
        "SELECT SUM(CASE WHEN {} IS NULL THEN 1 ELSE 0 END) AS null_count, COUNT(*) AS total_count FROM {}".format(
            field, qualified_table_name
        )
    )

    if filters:
        query += " WHERE {}".format(filters)

    result = self.fetchone(query)
    # With zero matching rows total_count is 0 and the SUM is NULL, so the
    # division must be skipped (same guard as the empty-string percentage).
    if result and result[1] and result[1] > 0:
        return round((result[0] / result[1]) * 100, 2)
    return 0
402
+
403
def query_get_time_diff(self, table: str, field: str) -> int:
    """
    Get the time difference between now (UTC) and the latest value of a column
    :param table: name of the table
    :param field: field name of updated time column
    :return: time difference in seconds, 0 when there is no row or the value is NULL
    """
    # Local import: the module-level import only brings in `datetime`.
    from datetime import timezone

    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)
    query = f"""
        SELECT {field} from {qualified_table_name} ORDER BY {field} DESC LIMIT 1;
    """
    result = self.fetchone(query)
    latest = result[0] if result else None
    if latest is None:
        # No rows, or the newest value is NULL: nothing to measure against.
        return 0

    now = datetime.now(timezone.utc)
    if getattr(latest, "tzinfo", None) is None:
        # Naive values are compared against naive UTC, as before; aware values
        # keep the aware `now` so the subtraction does not raise TypeError.
        now = now.replace(tzinfo=None)
    return int((now - latest).total_seconds())
419
+
420
def profiling_sql_aggregates_numeric(self, table_name: str, column_name: str) -> Dict:
    """
    Compute the profiling aggregates of a numeric column in a single query
    :param table_name: name of the table
    :param column_name: name of the column
    :return: dict with avg, min, max, sum, stddev, variance, distinct_count and missing_count
    """
    # Quote through quote_column, like every other query in this class,
    # instead of hard-coding double quotes around the column name.
    column_name = self.quote_column(column_name)
    qualified_table_name = self.qualified_table_name(table_name)
    query = f"""
        SELECT
            avg({column_name}) as avg,
            min({column_name}) as min,
            max({column_name}) as max,
            sum({column_name}) as sum,
            stddev_samp({column_name}) as stddev,
            var_samp({column_name}) as variance,
            count(distinct({column_name})) as distinct_count,
            sum(case when {column_name} is null then 1 else 0 end) as missing_count
        FROM {qualified_table_name}
    """

    result = self.fetchone(query)
    keys = ("avg", "min", "max", "sum", "stddev", "variance", "distinct_count", "missing_count")
    # The SELECT list above is in the same order as `keys`.
    return {key: result[position] for position, key in enumerate(keys)}
447
+
448
def profiling_sql_aggregates_string(self, table_name: str, column_name: str) -> Dict:
    """
    Compute the profiling aggregates of a string column in a single query
    :param table_name: name of the table
    :param column_name: name of the column
    :return: dict with distinct_count, missing_count, max_length, min_length and avg_length
    """
    # Quote through quote_column, like every other query in this class,
    # instead of hard-coding double quotes around the column name.
    column_name = self.quote_column(column_name)
    qualified_table_name = self.qualified_table_name(table_name)
    query = f"""
        SELECT
            count(distinct({column_name})) as distinct_count,
            sum(case when {column_name} is null then 1 else 0 end) as missing_count,
            max(length({column_name})) as max_length,
            min(length({column_name})) as min_length,
            avg(length({column_name})) as avg_length
        FROM {qualified_table_name}
    """

    result = self.fetchone(query)
    keys = ("distinct_count", "missing_count", "max_length", "min_length", "avg_length")
    # The SELECT list above is in the same order as `keys`.
    return {key: result[position] for position, key in enumerate(keys)}
469
+
470
def query_get_duplicate_count(self, table: str, field: str, filters: str = None) -> int:
    """
    Count the distinct column values that occur more than once
    :param table: table name
    :param field: column name
    :param filters: filter condition
    :return: number of groups returned by the GROUP BY ... HAVING COUNT(*) > 1 query
    """
    where_clause = f"WHERE {filters}" if filters else ""
    target = self.qualified_table_name(table)
    column = self.quote_column(field)
    query = f"""
        SELECT
        count(*) as duplicate_count
        FROM {target}
        {where_clause}
        GROUP BY {column}
        HAVING COUNT(*) > 1
    """

    duplicate_groups = self.fetchall(query)
    if not duplicate_groups:
        return 0
    return len(duplicate_groups)
485
+
486
def query_string_pattern_validity(
    self,
    table: str,
    field: str,
    regex_pattern: str = None,
    predefined_regex_pattern: str = None,
    filters: str = None,
) -> Tuple[int, int]:
    """
    Count the values of a column that match a regex pattern
    :param table: table name
    :param field: column name
    :param regex_pattern: regex pattern, used when no predefined pattern is named
    :param predefined_regex_pattern: key into self.regex_patterns; takes precedence
    :param filters: filter condition
    :return: count of valid values, count of total row count
    """
    where_clause = f"WHERE {filters}" if filters else ""
    target = self.qualified_table_name(table)
    column = self.quote_column(field)

    if not (regex_pattern or predefined_regex_pattern):
        raise ValueError("Either regex_pattern or predefined_regex_pattern should be provided")

    # A predefined pattern name wins over an explicit regex.
    pattern = self.regex_patterns[predefined_regex_pattern] if predefined_regex_pattern else regex_pattern
    match_flag = f"case when {column} ~ '{pattern}' then 1 else 0 end"

    query = f"""
        select sum({match_flag}) as valid_count, count(*) as total_count
        from {target} {where_clause}
    """
    row = self.fetchone(query)
    return row[0], row[1]
521
+
522
def query_valid_invalid_values_validity(
    self,
    table: str,
    field: str,
    regex_pattern: str = None,
    filters: str = None,
    values: List[str] = None,
) -> Tuple[int, int]:
    """
    Count the values of a column that are in a list of values or match a regex
    :param table: table name
    :param field: column name
    :param values: list of values to match; takes precedence over regex_pattern
    :param regex_pattern: regex pattern, used when no values are given
    :param filters: filter condition
    :return: count of matching values, count of total row count
    :raises ValueError: when neither values nor regex_pattern is provided
    """
    filters = f"WHERE {filters}" if filters else ""
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)

    if not values and not regex_pattern:
        # Without this the query would silently match against the regex 'None'.
        raise ValueError("Either values or regex_pattern should be provided")

    if values:
        # Double embedded single quotes so a value such as O'Brien stays one SQL literal.
        values_str = ", ".join("'{}'".format(str(value).replace("'", "''")) for value in values)
        regex_query = f"CASE WHEN {field} IN ({values_str}) THEN 1 ELSE 0 END"
    else:
        regex_query = f"CASE WHEN {field} ~ '{regex_pattern}' THEN 1 ELSE 0 END"

    query = f"""
        SELECT SUM({regex_query}) AS valid_count, COUNT(*) as total_count
        FROM {qualified_table_name}
        {filters}
    """
    result = self.fetchone(query)
    return result[0], result[1]
554
+
555
def query_get_string_length_metric(
    self, table: str, field: str, metric: str, filters: str = None
) -> Union[int, float]:
    """
    Get the string length metric (max, min, avg) in a column of a table.

    :param table: table name
    :param field: column name
    :param metric: the metric to calculate ('max', 'min', 'avg'), case-insensitive
    :param filters: filter condition
    :return: the calculated metric; 'avg' is rounded to 2 decimals, and None is
        returned when the database yields no value
    :raises ValueError: when metric is not one of 'max', 'min' or 'avg'
    """
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)

    metric_name = metric.lower()
    if metric_name not in ("max", "min", "avg"):
        raise ValueError(f"Invalid metric '{metric}'. Choose from 'max', 'min', or 'avg'.")

    query = f"SELECT {metric_name.upper()}(LENGTH({field})) FROM {qualified_table_name}"
    if filters:
        query += f" WHERE {filters}"

    row = self.fetchone(query)
    result = row[0] if row else None
    if metric_name == "avg" and result is not None:
        # Only the average is rounded; a NULL average would make round() raise.
        return round(result, 2)
    return result
586
+
587
def query_get_usa_state_code_validity(self, table: str, field: str, filters: str = None) -> Tuple[int, int]:
    """
    Count the values of a column that are valid USA state codes
    :param table: table name
    :param field: column name
    :param filters: filter condition
    :return: count of valid state codes, count of total row count
    """
    where_clause = f"WHERE {filters}" if filters else ""
    target = self.qualified_table_name(table)
    column = self.quote_column(field)

    # A value is valid when it is two upper-case letters and is in self.valid_state_codes.
    code_list = ", ".join([f"'{code}'" for code in self.valid_state_codes])
    is_valid = f"CASE WHEN {column} ~ '^[A-Z]{{2}}$' AND {column} IN ({code_list}) THEN 1 ELSE 0 END"

    query = f"""
        SELECT SUM({is_valid}) AS valid_count, COUNT(*) AS total_count
        FROM {target} {where_clause}
    """

    row = self.fetchone(query)
    return row[0], row[1]
612
+
613
def query_geolocation_metric(
    self, table: str, field: str, operation: str, filters: str = None
) -> Union[int, float]:
    """
    Count (or give the percentage of) rows whose latitude/longitude value is in range
    The range is chosen from the column name: names starting with "lat" use
    -90..90 and names starting with "lon" use -180..180 (case-insensitive).
    :param table: table name
    :param field: column name
    :param operation: "percent" for a percentage of all rows, anything else for the count
    :param filters: filter condition
    :return: valid count, or valid percentage rounded to 2 decimals
    :raises ValueError: when the column name starts with neither "lat" nor "lon"
    """
    # Decide the range from the raw column name: once quoted, the name starts
    # with the quote character and would never match "lat" / "lon".
    raw_name = field.lower()
    if raw_name.startswith("lat"):
        bounds = "BETWEEN -90 AND 90"
    elif raw_name.startswith("lon"):
        bounds = "BETWEEN -180 AND 180"
    else:
        raise ValueError(f"Cannot infer latitude/longitude range from column '{field}'")

    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)

    valid_query = (
        f"SELECT COUNT({field}) FROM {qualified_table_name} WHERE {field} IS NOT NULL AND {field} {bounds}"
    )
    if filters:
        valid_query += f" AND {filters}"

    valid_count = self.fetchone(valid_query)[0]

    if operation == "percent":
        total_query = f"SELECT COUNT(*) FROM {qualified_table_name}"
        if filters:
            total_query += f" WHERE {filters}"

        total_count = self.fetchone(total_query)[0]

        result = (valid_count / total_count) * 100 if total_count > 0 else 0
        return round(result, 2)

    return valid_count
642
+
643
def query_get_percentile(self, table: str, field: str, percentile: float, filters: str = None) -> float:
    """
    Get the specified percentile value of a numeric column in a table.
    :param table: table name
    :param field: column name
    :param percentile: percentile to calculate (e.g., 0.2 for 20th percentile)
    :param filters: filter condition
    :return: the value at the specified percentile rounded to 2 decimals,
        or None when the database yields no value
    """
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)
    query = f"SELECT PERCENTILE_DISC({percentile}) WITHIN GROUP (ORDER BY {field}) FROM {qualified_table_name}"
    if filters:
        query += f" WHERE {filters}"

    row = self.fetchone(query)
    value = row[0] if row else None
    # No matching rows gives a NULL percentile; round(None) would raise.
    return None if value is None else round(value, 2)
658
+
659
def query_zero_metric(self, table: str, field: str, operation: str, filters: str = None) -> Union[int, float]:
    """
    Count (or give the percentage of) rows whose column value equals 0
    :param table: table name
    :param field: column name
    :param operation: "percent" for a percentage of all rows, anything else for the count
    :param filters: filter condition
    :return: zero count, or zero percentage rounded to 2 decimals (0.0 when there are no rows)
    """
    target = self.qualified_table_name(table)
    column = self.quote_column(field)
    extra_condition = f" AND {filters}" if filters else ""

    zero_count = self.fetchone(f"SELECT COUNT(*) FROM {target} WHERE {column} = 0{extra_condition}")[0]
    if operation != "percent":
        return zero_count

    where_clause = f" WHERE {filters}" if filters else ""
    total_count = self.fetchone(f"SELECT COUNT(*) FROM {target}{where_clause}")[0]
    if total_count == 0:
        return 0.0
    return round((zero_count / total_count) * 100, 2)
684
+
685
def query_negative_metric(self, table: str, field: str, operation: str, filters: str = None) -> Union[int, float]:
    """
    Count (or give the percentage of) rows whose column value is below 0
    :param table: table name
    :param field: column name
    :param operation: "percent" for a percentage of all rows, anything else for the count
    :param filters: filter condition
    :return: negative count, or negative percentage rounded to 2 decimals (0.0 when there are no rows)
    """
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)

    negative_query = f"SELECT COUNT(*) FROM {qualified_table_name} WHERE {field} < 0"
    if filters:
        negative_query += f" AND {filters}"

    negative_count = self.fetchone(negative_query)[0]
    if operation != "percent":
        return negative_count

    # The percentage is computed here from two scalar counts, the same way
    # query_zero_metric does it. Selecting both subqueries FROM the table
    # returned no row for an empty table and divided by zero in the database
    # when the filter matched nothing.
    total_count_query = f"SELECT COUNT(*) FROM {qualified_table_name}"
    if filters:
        total_count_query += f" WHERE {filters}"

    total_count = self.fetchone(total_count_query)[0]
    if total_count == 0:
        return 0.0
    return round((negative_count / total_count) * 100, 2)
706
+
707
def query_get_all_space_count(
    self, table: str, field: str, operation: str, filters: str = None
) -> Union[int, float]:
    """
    Get the count (or percentage) of rows where the specified column contains only spaces.
    :param table: table name
    :param field: column name
    :param operation: "percent" for a percentage of all rows, anything else for the count
    :param filters: filter condition
    :return: count of rows with only spaces, or their percentage
    """
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)

    query = (
        f"SELECT COUNT(CASE WHEN TRIM({field}) = '' THEN 1 END) AS space_count,"
        f"COUNT(*) AS total_count FROM {qualified_table_name}"
    )

    if filters:
        # The base query has no WHERE clause, so the filter must start one;
        # appending it with AND produced invalid SQL.
        query += f" WHERE {filters}"

    result = self.fetchone(query)
    if not result:
        return 0

    if operation == "percent":
        # NOTE(review): rounds to a whole number, unlike the 2-decimal rounding
        # of the other percentage metrics in this class; kept as-is, confirm intent.
        return round((result[0] / result[1]) * 100) if result[1] > 0 else 0

    return result[0]
732
+
733
def query_get_null_keyword_count(
    self, table: str, field: str, operation: str, filters: str = None
) -> Union[int, float]:
    """
    Get the count of NULL-like values (specific keywords) in the specified column.
    :param table: table name
    :param field: column name
    :param operation: "percent" returns the share of such rows among all
        (filtered) rows; any other value returns the raw count
    :param filters: filter condition
    :return: count of NULL-like keyword values, or the percentage rounded to
        two decimals; 0 when no rows match
    """
    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)

    # NOTE(review): the bare `null` in the IN list never matches under standard
    # SQL three-valued logic, so real NULLs are not counted - confirm intended.
    query = f""" SELECT SUM(CASE WHEN LOWER({field}) IN ('nothing', 'nil', 'null', 'none', 'n/a', null) THEN 1 ELSE 0 END) AS null_count,COUNT(*) AS total_count
    FROM {qualified_table_name}"""

    if filters:
        query += f" WHERE {filters}"

    result = self.fetchone(query)
    if not result:
        return 0

    # SUM over zero rows is SQL NULL (None here); report it as a count of 0.
    null_count = result[0] or 0
    total_count = result[1]

    if operation == "percent":
        return round((null_count / total_count) * 100, 2) if total_count > 0 else 0

    return null_count
758
+
759
def query_timestamp_metric(
    self,
    table: str,
    field: str,
    predefined_regex: str,
    filters: str = None,
) -> tuple[int, int]:
    """
    Count the values of a text column that form a valid ISO-style timestamp.

    Each value is split into year/month/day/hour/minute/second/timezone with
    regex SUBSTRING, and every component is validated, including the number
    of days per month and leap years.

    :param table: Table name
    :param field: Column name
    :param predefined_regex: regex pattern name; only "timestamp_iso" is supported
    :param filters: filter condition
    :return: Tuple (valid_count, total_count); (0, 0) if the query fails
    :raises ValueError: if predefined_regex is not "timestamp_iso"
    """

    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)

    # NOTE(review): unlike the *_not_in_future variants, this query does not
    # also match the whole value against the full ISO pattern, so trailing
    # characters after the seconds are not rejected - confirm intended.
    if predefined_regex != "timestamp_iso":
        raise ValueError(f"Unknown predefined regex pattern: {predefined_regex}")

    filters_clause = f"WHERE {filters}" if filters else ""

    # Raw f-string: the SQL regexes contain backslashes (\d) that must reach
    # the database untouched and are not valid Python escape sequences.
    query = rf"""
        WITH extracted_timestamps AS (
            SELECT
                {field},
                SUBSTRING({field} FROM '^(\d{{4}})') AS year, -- Extract year
                SUBSTRING({field} FROM '^\d{{4}}-(\d{{2}})') AS month, -- Extract month
                SUBSTRING({field} FROM '^\d{{4}}-\d{{2}}-(\d{{2}})') AS day, -- Extract day
                SUBSTRING({field} FROM 'T(\d{{2}})') AS hour, -- Extract hour
                SUBSTRING({field} FROM 'T\d{{2}}:(\d{{2}})') AS minute, -- Extract minute
                SUBSTRING({field} FROM 'T\d{{2}}:\d{{2}}:(\d{{2}})') AS second, -- Extract second
                SUBSTRING({field} FROM '([+-]\d{{2}}:\d{{2}}|Z)$') AS timezone -- Extract timezone
            FROM {qualified_table_name}
            {filters_clause}
        ),
        validated_timestamps AS (
            SELECT
                {field},
                CASE
                    WHEN
                        -- Validate each component with its specific rules
                        year ~ '^\d{{4}}$' AND
                        month ~ '^(0[1-9]|1[0-2])$' AND
                        day ~ '^((0[1-9]|[12][0-9])|(30|31))$' AND
                        hour ~ '^([01][0-9]|2[0-3])$' AND
                        minute ~ '^[0-5][0-9]$' AND
                        second ~ '^[0-5][0-9]$' AND
                        (timezone IS NULL OR timezone ~ '^(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])$') AND
                        -- Additional check for days in months (e.g., February)
                        (
                            (month IN ('01', '03', '05', '07', '08', '10', '12') AND day BETWEEN '01' AND '31') OR
                            (month IN ('04', '06', '09', '11') AND day BETWEEN '01' AND '30') OR
                            (month = '02' AND day BETWEEN '01' AND
                                CASE
                                    -- Handle leap years
                                    WHEN (year::int % 400 = 0 OR (year::int % 100 != 0 AND year::int % 4 = 0)) THEN '29'
                                    ELSE '28'
                                END
                            )
                        )
                    THEN 1
                    ELSE 0
                END AS is_valid
            FROM extracted_timestamps
        )
        SELECT COUNT(*) AS valid_count, (SELECT COUNT(*) FROM {qualified_table_name} {filters_clause}) AS total_count
        FROM validated_timestamps
        WHERE is_valid = 1;
    """

    try:
        # Both counts come back in one row: the total is a scalar subquery
        # over the same (filtered) table.
        row = self.fetchone(query)
        return row[0], row[1]

    except Exception as e:
        # Best-effort metric: report the failure and fall back to zeros.
        logger.error(f"Error occurred: {e}")
        return 0, 0
845
+
846
def query_timestamp_not_in_future_metric(
    self,
    table: str,
    field: str,
    predefined_regex: str,
    filters: str = None,
) -> tuple[int, int]:
    """
    Count the values of a text column that are valid ISO-style timestamps and
    are not later than the database's CURRENT_TIMESTAMP.

    :param table: Table name
    :param field: Column name
    :param predefined_regex: regex pattern name; only "timestamp_iso" is supported
    :param filters: filter condition
    :return: Tuple (valid_count, total_count); (0, 0) if the query fails
    :raises ValueError: if predefined_regex is not "timestamp_iso"
    """

    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)

    timestamp_iso_regex = r"^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])T([01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9](?:\.\d{1,3})?(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])?$"

    if predefined_regex != "timestamp_iso":
        raise ValueError(f"Unknown predefined regex pattern: {predefined_regex}")

    filters_clause = f"WHERE {filters}" if filters else ""

    # Raw f-string: the SQL regexes contain backslashes (\d) that must reach
    # the database untouched and are not valid Python escape sequences.
    query = rf"""
        WITH extracted_timestamps AS (
            SELECT
                {field},
                SUBSTRING({field} FROM '^(\d{{4}})') AS year, -- Extract year
                SUBSTRING({field} FROM '^\d{{4}}-(\d{{2}})') AS month, -- Extract month
                SUBSTRING({field} FROM '^\d{{4}}-\d{{2}}-(\d{{2}})') AS day, -- Extract day
                SUBSTRING({field} FROM 'T(\d{{2}})') AS hour, -- Extract hour
                SUBSTRING({field} FROM 'T\d{{2}}:(\d{{2}})') AS minute, -- Extract minute
                SUBSTRING({field} FROM 'T\d{{2}}:\d{{2}}:(\d{{2}})') AS second, -- Extract second
                SUBSTRING({field} FROM '([+-]\d{{2}}:\d{{2}}|Z)$') AS timezone -- Extract timezone
            FROM {qualified_table_name}
            {filters_clause}
        ),
        validated_timestamps AS (
            SELECT
                {field},
                CASE
                    WHEN
                        year ~ '^\d{{4}}$' AND
                        month ~ '^(0[1-9]|1[0-2])$' AND
                        day ~ '^((0[1-9]|[12][0-9])|(30|31))$' AND
                        hour ~ '^([01][0-9]|2[0-3])$' AND
                        minute ~ '^[0-5][0-9]$' AND
                        second ~ '^[0-5][0-9]$' AND
                        (timezone IS NULL OR timezone ~ '^(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])$') AND
                        (
                            (month IN ('01', '03', '05', '07', '08', '10', '12') AND day BETWEEN '01' AND '31') OR
                            (month IN ('04', '06', '09', '11') AND day BETWEEN '01' AND '30') OR
                            (month = '02' AND day BETWEEN '01' AND
                                CASE
                                    WHEN (year::int % 400 = 0 OR (year::int % 100 != 0 AND year::int % 4 = 0)) THEN '29'
                                    ELSE '28'
                                END
                            )
                        )
                    THEN 1
                    ELSE 0
                END AS is_valid
            FROM extracted_timestamps
        ),
        timestamps_not_in_future AS (
            SELECT *
            FROM validated_timestamps
            WHERE is_valid = 1 AND ({field} ~ '{timestamp_iso_regex}') AND {field}::timestamp <= CURRENT_TIMESTAMP
        )
        SELECT COUNT(*) AS valid_count, (SELECT COUNT(*) FROM {qualified_table_name} {filters_clause}) AS total_count
        FROM timestamps_not_in_future;
    """
    try:
        # The statement already returns the total as its second column, so a
        # separate COUNT(*) round trip is unnecessary.
        row = self.fetchone(query)
        return row[0], row[1]

    except Exception as e:
        # Best-effort metric: report the failure and fall back to zeros.
        logger.error(f"Error occurred: {e}")
        return 0, 0
932
+
933
def query_timestamp_date_not_in_future_metric(
    self,
    table: str,
    field: str,
    predefined_regex: str,
    filters: str = None,
) -> tuple[int, int]:
    """
    Count the values of a text column that are valid ISO-style timestamps and
    whose date part is not later than the database's CURRENT_DATE.

    :param table: Table name
    :param field: Column name
    :param predefined_regex: The regex pattern to use; only "timestamp_iso" is supported
    :param filters: Optional filter condition
    :return: Tuple (valid_count, total_count); (0, 0) if the query fails
    :raises ValueError: if predefined_regex is not "timestamp_iso"
    """

    qualified_table_name = self.qualified_table_name(table)
    field = self.quote_column(field)

    timestamp_iso_regex = r"^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])T([01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9](?:\.\d{1,3})?(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])?$"

    if predefined_regex != "timestamp_iso":
        raise ValueError(f"Unknown predefined regex pattern: {predefined_regex}")

    filters_clause = f"WHERE {filters}" if filters else ""

    # Raw f-string: the SQL regexes contain backslashes (\d) that must reach
    # the database untouched and are not valid Python escape sequences.
    query = rf"""
        WITH extracted_timestamps AS (
            SELECT
                {field},
                SUBSTRING({field} FROM '^(\d{{4}})') AS year, -- Extract year
                SUBSTRING({field} FROM '^\d{{4}}-(\d{{2}})') AS month, -- Extract month
                SUBSTRING({field} FROM '^\d{{4}}-\d{{2}}-(\d{{2}})') AS day -- Extract day
            FROM {qualified_table_name}
            {filters_clause}
        ),
        validated_dates AS (
            SELECT
                {field},
                CASE
                    WHEN
                        year ~ '^\d{{4}}$' AND
                        month ~ '^(0[1-9]|1[0-2])$' AND
                        day ~ '^((0[1-9]|[12][0-9])|(30|31))$' AND
                        (
                            (month IN ('01', '03', '05', '07', '08', '10', '12') AND day BETWEEN '01' AND '31') OR
                            (month IN ('04', '06', '09', '11') AND day BETWEEN '01' AND '30') OR
                            (month = '02' AND day BETWEEN '01' AND
                                CASE
                                    WHEN (year::int % 400 = 0 OR (year::int % 100 != 0 AND year::int % 4 = 0)) THEN '29'
                                    ELSE '28'
                                END
                            )
                        )
                    THEN 1
                    ELSE 0
                END AS is_valid
            FROM extracted_timestamps
        ),
        dates_not_in_future AS (
            SELECT *
            FROM validated_dates
            WHERE is_valid = 1
            AND ({field} ~ '{timestamp_iso_regex}')
            AND ({field})::date <= CURRENT_DATE -- Compare only the date part against the current date
        )
        SELECT COUNT(*) AS valid_count, (SELECT COUNT(*) FROM {qualified_table_name} {filters_clause}) AS total_count
        FROM dates_not_in_future;
    """

    try:
        # The statement already returns the total as its second column, so a
        # separate COUNT(*) round trip is unnecessary.
        row = self.fetchone(query)
        return row[0], row[1]
    except Exception as e:
        # Best-effort metric: report the failure and fall back to zeros.
        logger.error(f"Error occurred: {e}")
        return 0, 0
1013
+
1014
+ def generate_view_name(self, view_name: str | None = None) -> str:
1015
+ if view_name is not None:
1016
+ return view_name
1017
+ random_string = "".join(secrets.choice(string.ascii_letters + string.digits) for _ in range(8))
1018
+ timestamp = int(time.time())
1019
+ return f"dcs_view_{timestamp}_{random_string.lower()}"
1020
+
1021
def create_view(
    self,
    query: str | None = None,
    schema: str | None = None,
    view_name: str | None = None,
) -> str | None:
    """
    Create a database view and return its (optionally schema-qualified) name.
    :param query: SELECT statement backing the view; when None an empty
        placeholder view (`SELECT 1 AS dummy WHERE 1 = 0`) is created
    :param schema: schema to create the view in; omitted from the name when falsy
    :param view_name: explicit view name; generated via generate_view_name when None
    :return: full view name on success, None if creation failed (the error is logged)
    """
    view_name = self.generate_view_name(view_name=view_name)
    schema_prefix = f"{schema}." if schema else ""
    view_name_full = f"{schema_prefix}{view_name}"

    if query is None:
        sql = f"CREATE VIEW {view_name_full} AS SELECT 1 AS dummy WHERE 1 = 0"
    else:
        sql = f"CREATE VIEW {view_name_full} AS {query}"

    try:
        if isinstance(self.connection, Engine):
            # Engine: borrow a connection just for this statement.
            with self.connection.connect() as conn:
                conn.execute(text(sql))
                conn.commit()
        elif isinstance(self.connection, Connection):
            self.connection.execute(text(sql))
            try:
                self.connection.commit()
            except Exception as commit_error:
                # Commit is best-effort here; record the failure instead of
                # hiding it completely.
                logger.debug(f"Commit after creating view {view_name_full} failed: {commit_error}")
        elif hasattr(self.connection, "cursor"):
            # Raw DB-API style connection: always release the cursor.
            cur = self.connection.cursor()
            try:
                cur.execute(sql)
            finally:
                cur.close()
            try:
                self.connection.commit()
            except Exception as commit_error:
                logger.debug(f"Commit after creating view {view_name_full} failed: {commit_error}")
        else:
            self.connection.execute(sql)

        return view_name_full
    except Exception as e:
        logger.error(f"Error creating view {view_name_full}: {e}")
        return None
1064
+
1065
def drop_view(self, view_name: str, schema: str | None) -> bool:
    """
    Drop a database view.
    :param view_name: name of the view to drop
    :param schema: schema containing the view; omitted from the name when falsy
    :return: True when the DROP statement was executed, False if it failed
        (the error is logged)
    """
    schema_prefix = f"{schema}." if schema else ""
    full_view_name = f"{schema_prefix}{view_name}"
    drop_query = f"DROP VIEW {full_view_name}"
    try:
        if isinstance(self.connection, Engine):
            # Engine: borrow a connection just for this statement.
            with self.connection.connect() as conn:
                conn.execute(text(drop_query))
                conn.commit()
        elif isinstance(self.connection, Connection):
            self.connection.execute(text(drop_query))
            try:
                self.connection.commit()
            except Exception as commit_error:
                # Commit is best-effort here; record the failure instead of
                # hiding it completely.
                logger.debug(f"Commit after dropping view {full_view_name} failed: {commit_error}")
        elif hasattr(self.connection, "cursor"):
            # Raw DB-API style connection: always release the cursor.
            cur = self.connection.cursor()
            try:
                cur.execute(drop_query)
            finally:
                cur.close()
            try:
                self.connection.commit()
            except Exception as commit_error:
                logger.debug(f"Commit after dropping view {full_view_name} failed: {commit_error}")
        else:
            self.connection.execute(drop_query)
        return True
    except Exception as e:
        logger.error(f"Error dropping view {full_view_name}: {e}")
        return False