dcs-sdk 1.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159) hide show
  1. data_diff/__init__.py +221 -0
  2. data_diff/__main__.py +517 -0
  3. data_diff/abcs/__init__.py +13 -0
  4. data_diff/abcs/compiler.py +27 -0
  5. data_diff/abcs/database_types.py +402 -0
  6. data_diff/config.py +141 -0
  7. data_diff/databases/__init__.py +38 -0
  8. data_diff/databases/_connect.py +323 -0
  9. data_diff/databases/base.py +1417 -0
  10. data_diff/databases/bigquery.py +376 -0
  11. data_diff/databases/clickhouse.py +217 -0
  12. data_diff/databases/databricks.py +262 -0
  13. data_diff/databases/duckdb.py +207 -0
  14. data_diff/databases/mssql.py +343 -0
  15. data_diff/databases/mysql.py +189 -0
  16. data_diff/databases/oracle.py +238 -0
  17. data_diff/databases/postgresql.py +293 -0
  18. data_diff/databases/presto.py +222 -0
  19. data_diff/databases/redis.py +93 -0
  20. data_diff/databases/redshift.py +233 -0
  21. data_diff/databases/snowflake.py +222 -0
  22. data_diff/databases/sybase.py +720 -0
  23. data_diff/databases/trino.py +73 -0
  24. data_diff/databases/vertica.py +174 -0
  25. data_diff/diff_tables.py +489 -0
  26. data_diff/errors.py +17 -0
  27. data_diff/format.py +369 -0
  28. data_diff/hashdiff_tables.py +1026 -0
  29. data_diff/info_tree.py +76 -0
  30. data_diff/joindiff_tables.py +434 -0
  31. data_diff/lexicographic_space.py +253 -0
  32. data_diff/parse_time.py +88 -0
  33. data_diff/py.typed +0 -0
  34. data_diff/queries/__init__.py +13 -0
  35. data_diff/queries/api.py +213 -0
  36. data_diff/queries/ast_classes.py +811 -0
  37. data_diff/queries/base.py +38 -0
  38. data_diff/queries/extras.py +43 -0
  39. data_diff/query_utils.py +70 -0
  40. data_diff/schema.py +67 -0
  41. data_diff/table_segment.py +583 -0
  42. data_diff/thread_utils.py +112 -0
  43. data_diff/utils.py +1022 -0
  44. data_diff/version.py +15 -0
  45. dcs_core/__init__.py +13 -0
  46. dcs_core/__main__.py +17 -0
  47. dcs_core/__version__.py +15 -0
  48. dcs_core/cli/__init__.py +13 -0
  49. dcs_core/cli/cli.py +165 -0
  50. dcs_core/core/__init__.py +19 -0
  51. dcs_core/core/common/__init__.py +13 -0
  52. dcs_core/core/common/errors.py +50 -0
  53. dcs_core/core/common/models/__init__.py +13 -0
  54. dcs_core/core/common/models/configuration.py +284 -0
  55. dcs_core/core/common/models/dashboard.py +24 -0
  56. dcs_core/core/common/models/data_source_resource.py +75 -0
  57. dcs_core/core/common/models/metric.py +160 -0
  58. dcs_core/core/common/models/profile.py +75 -0
  59. dcs_core/core/common/models/validation.py +216 -0
  60. dcs_core/core/common/models/widget.py +44 -0
  61. dcs_core/core/configuration/__init__.py +13 -0
  62. dcs_core/core/configuration/config_loader.py +139 -0
  63. dcs_core/core/configuration/configuration_parser.py +262 -0
  64. dcs_core/core/configuration/configuration_parser_arc.py +328 -0
  65. dcs_core/core/datasource/__init__.py +13 -0
  66. dcs_core/core/datasource/base.py +62 -0
  67. dcs_core/core/datasource/manager.py +112 -0
  68. dcs_core/core/datasource/search_datasource.py +421 -0
  69. dcs_core/core/datasource/sql_datasource.py +1094 -0
  70. dcs_core/core/inspect.py +163 -0
  71. dcs_core/core/logger/__init__.py +13 -0
  72. dcs_core/core/logger/base.py +32 -0
  73. dcs_core/core/logger/default_logger.py +94 -0
  74. dcs_core/core/metric/__init__.py +13 -0
  75. dcs_core/core/metric/base.py +220 -0
  76. dcs_core/core/metric/combined_metric.py +98 -0
  77. dcs_core/core/metric/custom_metric.py +34 -0
  78. dcs_core/core/metric/manager.py +137 -0
  79. dcs_core/core/metric/numeric_metric.py +403 -0
  80. dcs_core/core/metric/reliability_metric.py +90 -0
  81. dcs_core/core/profiling/__init__.py +13 -0
  82. dcs_core/core/profiling/datasource_profiling.py +136 -0
  83. dcs_core/core/profiling/numeric_field_profiling.py +72 -0
  84. dcs_core/core/profiling/text_field_profiling.py +67 -0
  85. dcs_core/core/repository/__init__.py +13 -0
  86. dcs_core/core/repository/metric_repository.py +77 -0
  87. dcs_core/core/utils/__init__.py +13 -0
  88. dcs_core/core/utils/log.py +29 -0
  89. dcs_core/core/utils/tracking.py +105 -0
  90. dcs_core/core/utils/utils.py +44 -0
  91. dcs_core/core/validation/__init__.py +13 -0
  92. dcs_core/core/validation/base.py +230 -0
  93. dcs_core/core/validation/completeness_validation.py +153 -0
  94. dcs_core/core/validation/custom_query_validation.py +24 -0
  95. dcs_core/core/validation/manager.py +282 -0
  96. dcs_core/core/validation/numeric_validation.py +276 -0
  97. dcs_core/core/validation/reliability_validation.py +91 -0
  98. dcs_core/core/validation/uniqueness_validation.py +61 -0
  99. dcs_core/core/validation/validity_validation.py +738 -0
  100. dcs_core/integrations/__init__.py +13 -0
  101. dcs_core/integrations/databases/__init__.py +13 -0
  102. dcs_core/integrations/databases/bigquery.py +187 -0
  103. dcs_core/integrations/databases/databricks.py +51 -0
  104. dcs_core/integrations/databases/db2.py +652 -0
  105. dcs_core/integrations/databases/elasticsearch.py +61 -0
  106. dcs_core/integrations/databases/mssql.py +829 -0
  107. dcs_core/integrations/databases/mysql.py +409 -0
  108. dcs_core/integrations/databases/opensearch.py +64 -0
  109. dcs_core/integrations/databases/oracle.py +719 -0
  110. dcs_core/integrations/databases/postgres.py +482 -0
  111. dcs_core/integrations/databases/redshift.py +53 -0
  112. dcs_core/integrations/databases/snowflake.py +48 -0
  113. dcs_core/integrations/databases/spark_df.py +111 -0
  114. dcs_core/integrations/databases/sybase.py +1069 -0
  115. dcs_core/integrations/storage/__init__.py +13 -0
  116. dcs_core/integrations/storage/local_file.py +149 -0
  117. dcs_core/integrations/utils/__init__.py +13 -0
  118. dcs_core/integrations/utils/utils.py +36 -0
  119. dcs_core/report/__init__.py +13 -0
  120. dcs_core/report/dashboard.py +211 -0
  121. dcs_core/report/models.py +88 -0
  122. dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  123. dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  124. dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  125. dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  126. dcs_core/report/static/assets/images/docs.svg +6 -0
  127. dcs_core/report/static/assets/images/github.svg +4 -0
  128. dcs_core/report/static/assets/images/logo.svg +7 -0
  129. dcs_core/report/static/assets/images/slack.svg +13 -0
  130. dcs_core/report/static/index.js +2 -0
  131. dcs_core/report/static/index.js.LICENSE.txt +3971 -0
  132. dcs_sdk/__init__.py +13 -0
  133. dcs_sdk/__main__.py +18 -0
  134. dcs_sdk/__version__.py +15 -0
  135. dcs_sdk/cli/__init__.py +13 -0
  136. dcs_sdk/cli/cli.py +163 -0
  137. dcs_sdk/sdk/__init__.py +58 -0
  138. dcs_sdk/sdk/config/__init__.py +13 -0
  139. dcs_sdk/sdk/config/config_loader.py +491 -0
  140. dcs_sdk/sdk/data_diff/__init__.py +13 -0
  141. dcs_sdk/sdk/data_diff/data_differ.py +821 -0
  142. dcs_sdk/sdk/rules/__init__.py +15 -0
  143. dcs_sdk/sdk/rules/rules_mappping.py +31 -0
  144. dcs_sdk/sdk/rules/rules_repository.py +214 -0
  145. dcs_sdk/sdk/rules/schema_rules.py +65 -0
  146. dcs_sdk/sdk/utils/__init__.py +13 -0
  147. dcs_sdk/sdk/utils/serializer.py +25 -0
  148. dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
  149. dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
  150. dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
  151. dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
  152. dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
  153. dcs_sdk/sdk/utils/table.py +475 -0
  154. dcs_sdk/sdk/utils/themes.py +40 -0
  155. dcs_sdk/sdk/utils/utils.py +349 -0
  156. dcs_sdk-1.6.5.dist-info/METADATA +150 -0
  157. dcs_sdk-1.6.5.dist-info/RECORD +159 -0
  158. dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
  159. dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,719 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import secrets
16
+ import string
17
+ import time
18
+ from datetime import datetime
19
+ from typing import Any, Dict, List, Optional, Tuple, Union
20
+
21
+ from loguru import logger
22
+ from sqlalchemy import create_engine, text
23
+
24
+ from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
25
+ from dcs_core.core.common.models.data_source_resource import RawColumnInfo
26
+ from dcs_core.core.datasource.sql_datasource import SQLDataSource
27
+
28
+
29
+ class OracleDataSource(SQLDataSource):
30
def __init__(self, data_source_name: str, data_connection: Dict):
    """
    Initialise the Oracle data source.

    :param data_source_name: name forwarded to the base class constructor
    :param data_connection: connection settings forwarded to the base class constructor
    """
    super().__init__(data_source_name, data_connection)

    # Predefined validity patterns, keyed by pattern name. They are embedded
    # into Oracle REGEXP_LIKE calls by the validity queries of this class.
    # Oracle's documented regex dialect does not include non-capturing groups
    # "(?:...)", so only plain groups are used (equivalent for REGEXP_LIKE).
    self.regex_patterns = {
        "uuid": r"^[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$",
        "usa_phone": r"^\(\d{3}\) \d{3}-\d{4}$",
        "email": r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$",
        "usa_zip_code": r"^[0-9]{5}(-[0-9]{4})?$",
        "ssn": r"^[0-6]\d{2}-(0[1-9]|[1-9]\d)-([1-9]\d{3}|\d{4})$",
        "sedol": r"^[A-Z0-9]{6}\d$",
        "lei": r"^[A-Z0-9]{18}[0-9]{2}$",
        "cusip": r"^[0-9A-Z]{8}[0-9]$",
        "figi": r"^BBG[A-Z0-9]{9}$",
        "isin": r"^[A-Z]{2}[A-Z0-9]{9}[0-9]$",
        "perm_id": r"^\d{4}([- ]?)\d{4}\1\d{4}\1\d{4}([- ]?)\d{3}$",
    }
46
+
47
def connect(self) -> Any:
    """
    Connect to the data source

    Creates a SQLAlchemy engine for the ``oracle+oracledb`` dialect, passing
    the credentials through ``connect_args``. The opened connection is stored
    on ``self.connection``; ``self.schema_name`` is taken from the "schema"
    setting, falling back to "username".

    :return: the opened connection
    :raises DataChecksDataSourcesConnectionError: if creating the engine or
        opening the connection fails
    """
    settings = self.data_connection
    try:
        engine = create_engine(
            "oracle+oracledb://:@",
            thick_mode=False,
            connect_args={
                "user": settings.get("username"),
                "password": settings.get("password"),
                "host": settings.get("host"),
                "port": settings.get("port"),
                "service_name": settings.get("service_name"),
            },
        )
        self.schema_name = settings.get("schema") or settings.get("username")
        self.connection = engine.connect()
        return self.connection
    except Exception as e:
        # Chain the original error so the driver traceback is preserved.
        raise DataChecksDataSourcesConnectionError(
            message=f"Failed to connect to Oracle data source: [{str(e)}]"
        ) from e
68
+
69
+ def qualified_table_name(self, table_name: str) -> str:
70
+ """
71
+ Get the qualified table name
72
+ :param table_name: name of the table
73
+ :return: qualified table name
74
+ """
75
+ if self.schema_name:
76
+ return f'"{self.schema_name}"."{table_name}"'
77
+ return f'"{table_name}"'
78
+
79
def quote_column(self, column: str) -> str:
    """
    Wrap a column name in double quotes.
    :param column: name of the column
    :return: the column name surrounded by double quotes
    """
    return '"{}"'.format(column)
86
+
87
+ def query_get_database_version(self, database_version_query: Optional[str] = None) -> str:
88
+ """
89
+ Get the database version
90
+ :return: version string
91
+ """
92
+ query = database_version_query or "SELECT BANNER FROM v$version"
93
+ result = self.fetchone(query)[0]
94
+ return result if result else None
95
+
96
+ def query_get_table_names(
97
+ self,
98
+ schema: str | None = None,
99
+ with_view: bool = False,
100
+ ) -> dict:
101
+ """
102
+ Get the list of tables in the database.
103
+ :param schema: optional schema name
104
+ :param with_view: whether to include views
105
+ :return: dictionary with table names and optionally view names
106
+ """
107
+ schema = schema or self.schema_name
108
+
109
+ if with_view:
110
+ query = (
111
+ f"SELECT TABLE_NAME, 'TABLE' AS OBJECT_TYPE FROM ALL_ALL_TABLES WHERE OWNER = '{schema}' "
112
+ f"UNION "
113
+ f"SELECT VIEW_NAME AS TABLE_NAME, 'VIEW' AS OBJECT_TYPE FROM ALL_VIEWS WHERE OWNER = '{schema}'"
114
+ )
115
+ else:
116
+ query = f"SELECT TABLE_NAME, 'TABLE' AS OBJECT_TYPE FROM ALL_ALL_TABLES WHERE OWNER = '{schema}'"
117
+
118
+ rows = self.fetchall(query)
119
+
120
+ if with_view:
121
+ result = {"table": [], "view": []}
122
+ if rows:
123
+ for row in rows:
124
+ object_name = row[0]
125
+ object_type = row[1].strip() if row[1] else row[1]
126
+
127
+ if object_type == "TABLE":
128
+ result["table"].append(object_name)
129
+ elif object_type == "VIEW":
130
+ result["view"].append(object_name)
131
+ else:
132
+ result = {"table": []}
133
+ if rows:
134
+ result["table"] = [row[0] for row in rows]
135
+
136
+ return result
137
+
138
+ def query_get_table_indexes(self, table: str, schema: str | None = None) -> dict[str, dict]:
139
+ """
140
+ Get index information for a table in Oracle DB.
141
+ :param table: Table name
142
+ :param schema: Optional schema name
143
+ :return: Dictionary with index details
144
+ """
145
+ schema = schema or self.schema_name
146
+ table = table.upper()
147
+ schema = schema.upper()
148
+
149
+ query = f"""
150
+ SELECT
151
+ ind.index_name,
152
+ ind.index_type,
153
+ col.column_name,
154
+ col.column_position AS column_order
155
+ FROM
156
+ ALL_INDEXES ind
157
+ JOIN
158
+ ALL_IND_COLUMNS col ON ind.index_name = col.index_name AND ind.table_name = col.table_name AND ind.owner = col.index_owner
159
+ WHERE
160
+ ind.table_name = '{table}'
161
+ AND ind.owner = '{schema}'
162
+ ORDER BY
163
+ ind.index_name, col.column_position
164
+ """
165
+ rows = self.fetchall(query)
166
+
167
+ if not rows:
168
+ raise RuntimeError(f"No index information found for table '{table}' in schema '{schema}'.")
169
+
170
+ pk_query = f"""
171
+ SELECT acc.column_name
172
+ FROM ALL_CONSTRAINTS ac
173
+ JOIN ALL_CONS_COLUMNS acc ON ac.constraint_name = acc.constraint_name AND ac.owner = acc.owner
174
+ WHERE ac.constraint_type = 'P'
175
+ AND ac.table_name = '{table}'
176
+ AND ac.owner = '{schema}'
177
+ ORDER BY acc.position
178
+ """
179
+ pk_rows = self.fetchall(pk_query)
180
+ pk_columns = [row[0].strip() for row in pk_rows] if pk_rows else []
181
+ pk_columns_set = set(pk_columns)
182
+
183
+ indexes = {}
184
+ for row in rows:
185
+ index_name = row[0]
186
+ index_type = row[1]
187
+ column_info = {
188
+ "column_name": self.safe_get(row, 2),
189
+ "column_order": self.safe_get(row, 3),
190
+ }
191
+ if index_name not in indexes:
192
+ indexes[index_name] = {"columns": [], "index_type": index_type}
193
+ indexes[index_name]["columns"].append(column_info)
194
+
195
+ for index_name, idx in indexes.items():
196
+ index_columns = [col["column_name"].strip() for col in idx["columns"]]
197
+ index_columns_set = set(index_columns)
198
+ idx["is_primary_key"] = pk_columns_set == index_columns_set and len(index_columns) == len(pk_columns)
199
+
200
+ return indexes
201
+
202
def query_get_table_columns(
    self,
    table: str,
    schema: str | None = None,
) -> dict[str, RawColumnInfo]:
    """
    Get the schema of a table.
    :param table: table name
    :param schema: optional schema name; ``self.schema_name`` is used when omitted
    :return: mapping of column name to its RawColumnInfo
    :raises RuntimeError: if the query returns no rows for the table/schema
    """
    schema = schema or self.schema_name
    # datetime_precision is selected as the constant 6 and collation_name as
    # NULL; the remaining fields come from ALL_TAB_COLUMNS.
    query = (
        f"SELECT column_name, data_type, 6 as datetime_precision, data_precision as numeric_precision, "
        f"data_scale as numeric_scale, NULL as collation_name, char_length as character_maximum_length "
        f"FROM ALL_TAB_COLUMNS WHERE table_name = '{table}' AND owner = '{schema}'"
    )
    rows = self.fetchall(query)
    if not rows:
        raise RuntimeError(f"{table}: Table, {schema}: Schema, does not exist, or has no columns")

    return {
        r[0]: RawColumnInfo(
            column_name=self.safe_get(r, 0),
            data_type=self.safe_get(r, 1),
            datetime_precision=self.safe_get(r, 2),
            numeric_precision=self.safe_get(r, 3),
            numeric_scale=self.safe_get(r, 4),
            collation_name=self.safe_get(r, 5),
            character_maximum_length=self.safe_get(r, 6),
        )
        for r in rows
    }
235
+
236
def fetch_rows(
    self,
    query: str,
    limit: int = 1,
    with_column_names: bool = False,
    complete_query: Optional[str] = None,
) -> Tuple[List, Optional[List[str]]]:
    """
    Run a query and return at most ``limit`` rows.

    :param query: SQL query; wrapped in a sub-select ordered by the first column
        and limited with FETCH NEXT unless ``complete_query`` is given.
    :param limit: Number of rows to fetch.
    :param with_column_names: Whether to include column names in the result.
    :param complete_query: Statement to execute as-is instead of wrapping ``query``.
    :return: Tuple of (rows, column_names or None)
    """
    statement = complete_query or f"SELECT * FROM ({query}) subquery ORDER BY 1 FETCH NEXT {limit} ROWS ONLY"

    cursor = self.connection.execute(text(statement))
    fetched = cursor.fetchmany(limit)

    if not with_column_names:
        return fetched, None
    return fetched, list(cursor.keys())
261
+
262
+ def query_valid_invalid_values_validity(
263
+ self,
264
+ table: str,
265
+ field: str,
266
+ regex_pattern: str = None,
267
+ filters: str = None,
268
+ values: List[str] = None,
269
+ ) -> Tuple[int, int]:
270
+ """
271
+ Get the count of valid and invalid values
272
+ :param table: table name
273
+ :param field: column name
274
+ :param values: list of valid values
275
+ :param regex_pattern: regex pattern
276
+ :param filters: filter condition
277
+ :return: count of valid/invalid values and total count of valid/invalid values
278
+ """
279
+ filters = f"WHERE {filters}" if filters else ""
280
+ qualified_table_name = self.qualified_table_name(table)
281
+ if values:
282
+ values_str = ", ".join([f"'{value}'" for value in values])
283
+ regex_query = f"CASE WHEN {field} IN ({values_str}) THEN 1 ELSE 0 END"
284
+ else:
285
+ regex_query = f"CASE WHEN REGEXP_LIKE({field}, '{regex_pattern}') THEN 1 ELSE 0 END"
286
+
287
+ query = f"""
288
+ SELECT SUM({regex_query}) AS valid_count, COUNT(*) AS total_count
289
+ FROM {qualified_table_name}
290
+ {filters}
291
+ """
292
+
293
+ result = self.fetchone(query)
294
+ return result[0], result[1]
295
+
296
+ def query_string_pattern_validity(
297
+ self,
298
+ table: str,
299
+ field: str,
300
+ regex_pattern: str = None,
301
+ predefined_regex_pattern: str = None,
302
+ filters: str = None,
303
+ ) -> Tuple[int, int]:
304
+ """
305
+ Get the count of valid values based on the regex pattern
306
+ :param table: table name
307
+ :param field: column name
308
+ :param regex_pattern: regex pattern
309
+ :param predefined_regex_pattern: predefined regex pattern
310
+ :param filters: filter condition
311
+ :return: count of valid values, count of total row count
312
+ """
313
+ filters = f"WHERE {filters}" if filters else ""
314
+ qualified_table_name = self.qualified_table_name(table)
315
+ field = self.quote_column(field)
316
+
317
+ if not regex_pattern and not predefined_regex_pattern:
318
+ raise ValueError("Either regex_pattern or predefined_regex_pattern should be provided")
319
+
320
+ if predefined_regex_pattern:
321
+ regex_condition = f"REGEXP_LIKE({field}, '{self.regex_patterns[predefined_regex_pattern]}')"
322
+ else:
323
+ regex_condition = f"REGEXP_LIKE({field}, '{regex_pattern}')"
324
+
325
+ regex_query = f"CASE WHEN {regex_condition} THEN 1 ELSE 0 END"
326
+
327
+ query = f"""
328
+ SELECT SUM({regex_query}) AS valid_count, COUNT(*) AS total_count
329
+ FROM {qualified_table_name} {filters}
330
+ """
331
+ result = self.fetchone(query)
332
+ return result[0], result[1]
333
+
334
+ def query_get_usa_state_code_validity(self, table: str, field: str, filters: str = None) -> Tuple[int, int]:
335
+ """
336
+ Get the count of valid USA state codes
337
+ :param table: table name
338
+ :param field: column name
339
+ :param filters: filter condition
340
+ :return: count of valid state codes, count of total row count
341
+ """
342
+
343
+ valid_state_codes_str = ", ".join(f"'{code}'" for code in self.valid_state_codes)
344
+
345
+ filters = f"WHERE {filters}" if filters else ""
346
+
347
+ qualified_table_name = self.qualified_table_name(table)
348
+ field = self.quote_column(field)
349
+
350
+ regex_query = (
351
+ f"CASE WHEN REGEXP_LIKE({field}, '^[A-Z]{{2}}$') "
352
+ f"AND {field} IN ({valid_state_codes_str}) THEN 1 ELSE 0 END"
353
+ )
354
+
355
+ query = f"""
356
+ SELECT SUM({regex_query}) AS valid_count, COUNT(*) AS total_count
357
+ FROM {qualified_table_name} {filters}
358
+ """
359
+
360
+ result = self.fetchone(query)
361
+ return result[0], result[1]
362
+
363
+ def query_timestamp_metric(
364
+ self,
365
+ table: str,
366
+ field: str,
367
+ predefined_regex: str,
368
+ filters: str = None,
369
+ ) -> Union[float, int]:
370
+ """
371
+ :param table: Table name
372
+ :param field: Column name
373
+ :param predefined_regex: regex pattern
374
+ :param filters: filter condition
375
+ :return: Tuple containing valid count and total count (or percentage)
376
+ """
377
+
378
+ qualified_table_name = self.qualified_table_name(table)
379
+ field = self.quote_column(field)
380
+
381
+ if predefined_regex == "timestamp_iso":
382
+ filters_clause = f"WHERE {filters}" if filters else ""
383
+
384
+ query = f"""
385
+ WITH extracted_timestamps AS (
386
+ SELECT
387
+ {field},
388
+ TO_CHAR({field}, 'YYYY') AS year,
389
+ TO_CHAR({field}, 'MM') AS month,
390
+ TO_CHAR({field}, 'DD') AS day,
391
+ TO_CHAR({field}, 'HH24') AS hour,
392
+ TO_CHAR({field}, 'MI') AS minute,
393
+ TO_CHAR({field}, 'SS') AS second
394
+ FROM {qualified_table_name}
395
+ {filters_clause}
396
+ ),
397
+ validated_timestamps AS (
398
+ SELECT
399
+ {field},
400
+ CASE
401
+ WHEN
402
+ REGEXP_LIKE(year, '^\\d{{4}}$') AND
403
+ REGEXP_LIKE(month, '^(0[1-9]|1[0-2])$') AND
404
+ REGEXP_LIKE(day, '^([0-2][0-9]|3[01])$') AND
405
+ (
406
+ (month IN ('01', '03', '05', '07', '08', '10', '12') AND day BETWEEN '01' AND '31') OR
407
+ (month IN ('04', '06', '09', '11') AND day BETWEEN '01' AND '30') OR
408
+ (month = '02' AND day BETWEEN '01' AND
409
+ CASE
410
+ WHEN MOD(TO_NUMBER(year), 400) = 0 OR
411
+ (MOD(TO_NUMBER(year), 4) = 0 AND MOD(TO_NUMBER(year), 100) != 0) THEN '29'
412
+ ELSE '28'
413
+ END
414
+ )
415
+ ) AND
416
+ REGEXP_LIKE(hour, '^(0[0-9]|1[0-9]|2[0-3])$') AND
417
+ REGEXP_LIKE(minute, '^[0-5][0-9]$') AND
418
+ REGEXP_LIKE(second, '^[0-5][0-9]$')
419
+ THEN 1
420
+ ELSE 0
421
+ END AS is_valid
422
+ FROM extracted_timestamps
423
+ )
424
+ SELECT SUM(is_valid) AS valid_count, COUNT(*) AS total_count
425
+ FROM validated_timestamps
426
+ """
427
+ try:
428
+ result = self.fetchone(query)
429
+ valid_count = result[0]
430
+ total_count = result[1]
431
+
432
+ return valid_count, total_count
433
+ except Exception as e:
434
+ logger.error(f"Error occurred: {e}")
435
+ return 0, 0
436
+ else:
437
+ raise ValueError(f"Unknown predefined regex pattern: {predefined_regex}")
438
+
439
+ def query_timestamp_not_in_future_metric(
440
+ self,
441
+ table: str,
442
+ field: str,
443
+ predefined_regex: str,
444
+ filters: str = None,
445
+ ) -> Union[float, int]:
446
+ """
447
+ :param table: Table name
448
+ :param field: Column name
449
+ :param predefined_regex: regex pattern
450
+ :param filters: filter condition
451
+ :return: Count of valid timestamps not in the future and total count or percentage
452
+ """
453
+ qualified_table_name = self.qualified_table_name(table)
454
+ field = self.quote_column(field)
455
+
456
+ timestamp_iso_regex = r"^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])T([01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9](?:\.\d{1,3})?(Z|[+-](0[0-9]|1[0-4]):[0-5][0-9])?$"
457
+
458
+ if predefined_regex == "timestamp_iso":
459
+ regex_condition = f"REGEXP_LIKE({field}, '{timestamp_iso_regex}')"
460
+ else:
461
+ raise ValueError(f"Unknown predefined regex pattern: {predefined_regex}")
462
+
463
+ filters_clause = f"WHERE {filters}" if filters else ""
464
+
465
+ query = f"""
466
+ WITH extracted_timestamps AS (
467
+ SELECT
468
+ TO_CHAR({field}, 'YYYY-MM-DD HH24:MI:SS') AS formatted_{field},
469
+ REGEXP_SUBSTR(TO_CHAR({field}, 'YYYY-MM-DD HH24:MI:SS'), '^\d{{4}}', 1, 1) AS year,
470
+ REGEXP_SUBSTR(TO_CHAR({field}, 'YYYY-MM-DD HH24:MI:SS'), '^\d{{4}}-(\d{{2}})', 1, 1, NULL, 1) AS month,
471
+ REGEXP_SUBSTR(TO_CHAR({field}, 'YYYY-MM-DD HH24:MI:SS'), '^\d{{4}}-\d{{2}}-(\d{{2}})', 1, 1, NULL, 1) AS day,
472
+ REGEXP_SUBSTR(TO_CHAR({field}, 'YYYY-MM-DD HH24:MI:SS'), ' (\d{{2}})', 1, 1, NULL, 1) AS hour,
473
+ REGEXP_SUBSTR(TO_CHAR({field}, 'YYYY-MM-DD HH24:MI:SS'), ':\d{{2}}:(\d{{2}})', 1, 1, NULL, 1) AS minute,
474
+ REGEXP_SUBSTR(TO_CHAR({field}, 'YYYY-MM-DD HH24:MI:SS'), ':(\d{{2}})$', 1, 1, NULL, 1) AS second
475
+ FROM {qualified_table_name}
476
+ {filters_clause}
477
+ ),
478
+ validated_timestamps AS (
479
+ SELECT
480
+ formatted_{field},
481
+ CASE
482
+ WHEN
483
+ REGEXP_LIKE(year, '^\d{{4}}$') AND
484
+ REGEXP_LIKE(month, '^(0[1-9]|1[0-2])$') AND
485
+ REGEXP_LIKE(day, '^([0-2][0-9]|3[01])$') AND
486
+ (
487
+ (month IN ('01', '03', '05', '07', '08', '10', '12') AND day BETWEEN '01' AND '31') OR
488
+ (month IN ('04', '06', '09', '11') AND day BETWEEN '01' AND '30') OR
489
+ (month = '02' AND day BETWEEN '01' AND
490
+ CASE
491
+ WHEN MOD(TO_NUMBER(year), 400) = 0 OR
492
+ (MOD(TO_NUMBER(year), 4) = 0 AND MOD(TO_NUMBER(year), 100) != 0) THEN '29'
493
+ ELSE '28'
494
+ END
495
+ )
496
+ ) AND
497
+ REGEXP_LIKE(hour, '^(0[0-9]|1[0-9]|2[0-3])$') AND
498
+ REGEXP_LIKE(minute, '^[0-5][0-9]$') AND
499
+ REGEXP_LIKE(second, '^[0-5][0-9]$')
500
+ THEN 1
501
+ ELSE 0
502
+ END AS is_valid
503
+ FROM extracted_timestamps
504
+ ),
505
+ timestamps_not_in_future AS (
506
+ SELECT *
507
+ FROM validated_timestamps
508
+ WHERE is_valid = 1 AND TO_TIMESTAMP(formatted_{field}, 'YYYY-MM-DD HH24:MI:SS') <= CURRENT_TIMESTAMP
509
+ )
510
+ SELECT
511
+ (SELECT COUNT(*) FROM timestamps_not_in_future) AS valid_count,
512
+ (SELECT COUNT(*) FROM {qualified_table_name}) AS total_count
513
+ FROM dual
514
+ """
515
+ try:
516
+ result = self.fetchone(query)
517
+ valid_count = result[0]
518
+ total_count = result[1]
519
+
520
+ return valid_count, total_count
521
+ except Exception as e:
522
+ logger.error(f"Error occurred: {e}")
523
+ return 0, 0
524
+
525
+ def query_timestamp_date_not_in_future_metric(
526
+ self,
527
+ table: str,
528
+ field: str,
529
+ predefined_regex: str,
530
+ filters: str = None,
531
+ ) -> Union[float, int]:
532
+ """
533
+ :param table: Table name
534
+ :param field: Column name
535
+ :param predefined_regex: The regex pattern to use (e.g., "timestamp_iso")
536
+ :param filters: Optional filter condition
537
+ :return: Tuple containing count of valid dates not in the future and total count
538
+ """
539
+
540
+ qualified_table_name = self.qualified_table_name(table)
541
+ field = self.quote_column(field)
542
+ filters_clause = f"WHERE {filters}" if filters else ""
543
+
544
+ query = f"""
545
+ WITH extracted_timestamps AS (
546
+ SELECT
547
+ TO_CHAR({field}, 'YYYY-MM-DD HH24:MI:SS') AS formatted_{field},
548
+ REGEXP_SUBSTR(TO_CHAR({field}, 'YYYY-MM-DD HH24:MI:SS'), '^\d{{4}}', 1, 1) AS year,
549
+ REGEXP_SUBSTR(TO_CHAR({field}, 'YYYY-MM-DD HH24:MI:SS'), '^\d{{4}}-(\d{{2}})', 1, 1, NULL, 1) AS month,
550
+ REGEXP_SUBSTR(TO_CHAR({field}, 'YYYY-MM-DD HH24:MI:SS'), '^\d{{4}}-\d{{2}}-(\d{{2}})', 1, 1, NULL, 1) AS day,
551
+ REGEXP_SUBSTR(TO_CHAR({field}, 'YYYY-MM-DD HH24:MI:SS'), ' (\d{{2}})', 1, 1, NULL, 1) AS hour,
552
+ REGEXP_SUBSTR(TO_CHAR({field}, 'YYYY-MM-DD HH24:MI:SS'), ':\d{{2}}:(\d{{2}})', 1, 1, NULL, 1) AS minute,
553
+ REGEXP_SUBSTR(TO_CHAR({field}, 'YYYY-MM-DD HH24:MI:SS'), ':(\d{{2}})$', 1, 1, NULL, 1) AS second
554
+ FROM {qualified_table_name}
555
+ {filters_clause}
556
+ ),
557
+ validated_timestamps AS (
558
+ SELECT
559
+ formatted_{field},
560
+ CASE
561
+ WHEN
562
+ REGEXP_LIKE(year, '^\d{{4}}$') AND
563
+ REGEXP_LIKE(month, '^(0[1-9]|1[0-2])$') AND
564
+ REGEXP_LIKE(day, '^([0-2][0-9]|3[01])$') AND
565
+ (
566
+ (month IN ('01', '03', '05', '07', '08', '10', '12') AND day BETWEEN '01' AND '31') OR
567
+ (month IN ('04', '06', '09', '11') AND day BETWEEN '01' AND '30') OR
568
+ (month = '02' AND day BETWEEN '01' AND
569
+ CASE
570
+ WHEN MOD(TO_NUMBER(year), 400) = 0 OR
571
+ (MOD(TO_NUMBER(year), 4) = 0 AND MOD(TO_NUMBER(year), 100) != 0) THEN '29'
572
+ ELSE '28'
573
+ END
574
+ )
575
+ ) AND
576
+ REGEXP_LIKE(hour, '^(0[0-9]|1[0-9]|2[0-3])$') AND
577
+ REGEXP_LIKE(minute, '^[0-5][0-9]$') AND
578
+ REGEXP_LIKE(second, '^[0-5][0-9]$')
579
+ THEN 1
580
+ ELSE 0
581
+ END AS is_valid
582
+ FROM extracted_timestamps
583
+ ),
584
+ validated_dates AS (
585
+ SELECT
586
+ formatted_{field},
587
+ is_valid
588
+ FROM validated_timestamps
589
+ WHERE is_valid = 1
590
+ ),
591
+ dates_not_in_future AS (
592
+ SELECT *
593
+ FROM validated_dates
594
+ WHERE is_valid = 1
595
+ AND REGEXP_LIKE(formatted_{field}, '^\d{{4}}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01]) (\d{{2}}):([0-5][0-9]):([0-5][0-9])(\.\d{{1,3}})?$')
596
+ AND TO_TIMESTAMP(formatted_{field}, 'YYYY-MM-DD HH24:MI:SS') <= CURRENT_TIMESTAMP
597
+ )
598
+ SELECT
599
+ (SELECT COUNT(*) FROM dates_not_in_future) AS valid_count,
600
+ (SELECT COUNT(*) FROM {qualified_table_name}) AS total_count
601
+ FROM dual
602
+ """
603
+
604
+ try:
605
+ valid_count = self.fetchone(query)[0]
606
+ total_count_query = f"SELECT COUNT(*) FROM {qualified_table_name} {filters_clause}"
607
+ total_count = self.fetchone(total_count_query)[0]
608
+
609
+ return valid_count, total_count
610
+ except Exception as e:
611
+ logger.error(f"Error occurred: {e}")
612
+ return 0, 0
613
+
614
+ def query_get_time_diff(self, table: str, field: str) -> int:
615
+ """
616
+ Get the time difference
617
+ :param table: name of the index
618
+ :param field: field name of updated time column
619
+ :return: time difference in seconds
620
+ """
621
+ qualified_table_name = self.qualified_table_name(table)
622
+ field = self.quote_column(field)
623
+ query = f"""
624
+ SELECT {field} from {qualified_table_name} ORDER BY {field} DESC LIMIT 1;
625
+ """
626
+ query = f"""
627
+ SELECT {field}
628
+ FROM (
629
+ SELECT {field}
630
+ FROM {qualified_table_name}
631
+ ORDER BY {field} DESC
632
+ )
633
+ WHERE ROWNUM = 1
634
+ """
635
+ result = self.fetchone(query)
636
+ if result:
637
+ return int(abs(datetime.utcnow() - result[0]).total_seconds())
638
+ return 0
639
+
640
+ def query_get_all_space_count(
641
+ self, table: str, field: str, operation: str, filters: str = None
642
+ ) -> Union[int, float]:
643
+ """
644
+ Get the count of rows where the specified column contains only spaces.
645
+ :param table: table name
646
+ :param field: column name
647
+ :param filters: filter condition
648
+ :return: count of rows with only spaces
649
+ """
650
+ qualified_table_name = self.qualified_table_name(table)
651
+ field = self.quote_column(field)
652
+
653
+ query = f"""
654
+ SELECT
655
+ COUNT(CASE WHEN TRIM({field}) IS NULL OR TRIM({field}) = '' THEN 1 END) AS space_count,
656
+ COUNT(*) AS total_count
657
+ FROM {qualified_table_name}
658
+ """
659
+
660
+ if filters:
661
+ query += f"WHERE {filters}"
662
+
663
+ result = self.fetchone(query)
664
+
665
+ if operation == "percent":
666
+ return round((result[0] / result[1]) * 100) if result[1] > 0 else 0
667
+
668
+ return result[0] if result else 0
669
+
670
+ def generate_view_name(self, view_name: str | None = None) -> str:
671
+ if view_name is not None:
672
+ return view_name.upper()
673
+ random_string = "".join(secrets.choice(string.ascii_letters + string.digits) for _ in range(8))
674
+ timestamp = int(time.time())
675
+ return f"dcs_view_{timestamp}_{random_string.lower()}".upper()
676
+
677
+ def get_table_foreign_key_info(self, table_name: str, schema: str | None = None):
678
+ schema = schema or self.schema_name
679
+
680
+ query = f"""
681
+ SELECT
682
+ ac.CONSTRAINT_NAME AS constraint_name,
683
+ ac.TABLE_NAME AS table_name,
684
+ acc.COLUMN_NAME AS fk_column,
685
+ r_ac.TABLE_NAME AS referenced_table,
686
+ r_acc.COLUMN_NAME AS referenced_column
687
+ FROM ALL_CONSTRAINTS ac
688
+ JOIN ALL_CONS_COLUMNS acc
689
+ ON ac.CONSTRAINT_NAME = acc.CONSTRAINT_NAME
690
+ AND ac.OWNER = acc.OWNER
691
+ JOIN ALL_CONSTRAINTS r_ac
692
+ ON ac.R_CONSTRAINT_NAME = r_ac.CONSTRAINT_NAME
693
+ AND ac.R_OWNER = r_ac.OWNER
694
+ JOIN ALL_CONS_COLUMNS r_acc
695
+ ON r_ac.CONSTRAINT_NAME = r_acc.CONSTRAINT_NAME
696
+ AND r_ac.OWNER = r_acc.OWNER
697
+ AND acc.POSITION = r_acc.POSITION
698
+ WHERE ac.CONSTRAINT_TYPE = 'R'
699
+ AND ac.TABLE_NAME = '{table_name.upper()}'
700
+ AND ac.OWNER = '{schema.upper()}';
701
+ """
702
+
703
+ try:
704
+ rows = self.fetchall(query)
705
+ except Exception as e:
706
+ logger.error(f"Failed to fetch fk info for dataset: {table_name} ({e})")
707
+ return []
708
+
709
+ data = [
710
+ {
711
+ "constraint_name": row[0],
712
+ "table_name": row[1],
713
+ "fk_column": row[2],
714
+ "referenced_table": row[3],
715
+ "referenced_column": row[4],
716
+ }
717
+ for row in rows
718
+ ]
719
+ return data