dcs-sdk 1.6.5 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. data_diff/__init__.py +221 -0
  2. data_diff/__main__.py +517 -0
  3. data_diff/abcs/__init__.py +13 -0
  4. data_diff/abcs/compiler.py +27 -0
  5. data_diff/abcs/database_types.py +402 -0
  6. data_diff/config.py +141 -0
  7. data_diff/databases/__init__.py +38 -0
  8. data_diff/databases/_connect.py +323 -0
  9. data_diff/databases/base.py +1417 -0
  10. data_diff/databases/bigquery.py +376 -0
  11. data_diff/databases/clickhouse.py +217 -0
  12. data_diff/databases/databricks.py +262 -0
  13. data_diff/databases/duckdb.py +207 -0
  14. data_diff/databases/mssql.py +343 -0
  15. data_diff/databases/mysql.py +189 -0
  16. data_diff/databases/oracle.py +238 -0
  17. data_diff/databases/postgresql.py +293 -0
  18. data_diff/databases/presto.py +222 -0
  19. data_diff/databases/redis.py +93 -0
  20. data_diff/databases/redshift.py +233 -0
  21. data_diff/databases/snowflake.py +222 -0
  22. data_diff/databases/sybase.py +720 -0
  23. data_diff/databases/trino.py +73 -0
  24. data_diff/databases/vertica.py +174 -0
  25. data_diff/diff_tables.py +489 -0
  26. data_diff/errors.py +17 -0
  27. data_diff/format.py +369 -0
  28. data_diff/hashdiff_tables.py +1026 -0
  29. data_diff/info_tree.py +76 -0
  30. data_diff/joindiff_tables.py +434 -0
  31. data_diff/lexicographic_space.py +253 -0
  32. data_diff/parse_time.py +88 -0
  33. data_diff/py.typed +0 -0
  34. data_diff/queries/__init__.py +13 -0
  35. data_diff/queries/api.py +213 -0
  36. data_diff/queries/ast_classes.py +811 -0
  37. data_diff/queries/base.py +38 -0
  38. data_diff/queries/extras.py +43 -0
  39. data_diff/query_utils.py +70 -0
  40. data_diff/schema.py +67 -0
  41. data_diff/table_segment.py +583 -0
  42. data_diff/thread_utils.py +112 -0
  43. data_diff/utils.py +1022 -0
  44. data_diff/version.py +15 -0
  45. dcs_core/__init__.py +13 -0
  46. dcs_core/__main__.py +17 -0
  47. dcs_core/__version__.py +15 -0
  48. dcs_core/cli/__init__.py +13 -0
  49. dcs_core/cli/cli.py +165 -0
  50. dcs_core/core/__init__.py +19 -0
  51. dcs_core/core/common/__init__.py +13 -0
  52. dcs_core/core/common/errors.py +50 -0
  53. dcs_core/core/common/models/__init__.py +13 -0
  54. dcs_core/core/common/models/configuration.py +284 -0
  55. dcs_core/core/common/models/dashboard.py +24 -0
  56. dcs_core/core/common/models/data_source_resource.py +75 -0
  57. dcs_core/core/common/models/metric.py +160 -0
  58. dcs_core/core/common/models/profile.py +75 -0
  59. dcs_core/core/common/models/validation.py +216 -0
  60. dcs_core/core/common/models/widget.py +44 -0
  61. dcs_core/core/configuration/__init__.py +13 -0
  62. dcs_core/core/configuration/config_loader.py +139 -0
  63. dcs_core/core/configuration/configuration_parser.py +262 -0
  64. dcs_core/core/configuration/configuration_parser_arc.py +328 -0
  65. dcs_core/core/datasource/__init__.py +13 -0
  66. dcs_core/core/datasource/base.py +62 -0
  67. dcs_core/core/datasource/manager.py +112 -0
  68. dcs_core/core/datasource/search_datasource.py +421 -0
  69. dcs_core/core/datasource/sql_datasource.py +1094 -0
  70. dcs_core/core/inspect.py +163 -0
  71. dcs_core/core/logger/__init__.py +13 -0
  72. dcs_core/core/logger/base.py +32 -0
  73. dcs_core/core/logger/default_logger.py +94 -0
  74. dcs_core/core/metric/__init__.py +13 -0
  75. dcs_core/core/metric/base.py +220 -0
  76. dcs_core/core/metric/combined_metric.py +98 -0
  77. dcs_core/core/metric/custom_metric.py +34 -0
  78. dcs_core/core/metric/manager.py +137 -0
  79. dcs_core/core/metric/numeric_metric.py +403 -0
  80. dcs_core/core/metric/reliability_metric.py +90 -0
  81. dcs_core/core/profiling/__init__.py +13 -0
  82. dcs_core/core/profiling/datasource_profiling.py +136 -0
  83. dcs_core/core/profiling/numeric_field_profiling.py +72 -0
  84. dcs_core/core/profiling/text_field_profiling.py +67 -0
  85. dcs_core/core/repository/__init__.py +13 -0
  86. dcs_core/core/repository/metric_repository.py +77 -0
  87. dcs_core/core/utils/__init__.py +13 -0
  88. dcs_core/core/utils/log.py +29 -0
  89. dcs_core/core/utils/tracking.py +105 -0
  90. dcs_core/core/utils/utils.py +44 -0
  91. dcs_core/core/validation/__init__.py +13 -0
  92. dcs_core/core/validation/base.py +230 -0
  93. dcs_core/core/validation/completeness_validation.py +153 -0
  94. dcs_core/core/validation/custom_query_validation.py +24 -0
  95. dcs_core/core/validation/manager.py +282 -0
  96. dcs_core/core/validation/numeric_validation.py +276 -0
  97. dcs_core/core/validation/reliability_validation.py +91 -0
  98. dcs_core/core/validation/uniqueness_validation.py +61 -0
  99. dcs_core/core/validation/validity_validation.py +738 -0
  100. dcs_core/integrations/__init__.py +13 -0
  101. dcs_core/integrations/databases/__init__.py +13 -0
  102. dcs_core/integrations/databases/bigquery.py +187 -0
  103. dcs_core/integrations/databases/databricks.py +51 -0
  104. dcs_core/integrations/databases/db2.py +652 -0
  105. dcs_core/integrations/databases/elasticsearch.py +61 -0
  106. dcs_core/integrations/databases/mssql.py +829 -0
  107. dcs_core/integrations/databases/mysql.py +409 -0
  108. dcs_core/integrations/databases/opensearch.py +64 -0
  109. dcs_core/integrations/databases/oracle.py +719 -0
  110. dcs_core/integrations/databases/postgres.py +482 -0
  111. dcs_core/integrations/databases/redshift.py +53 -0
  112. dcs_core/integrations/databases/snowflake.py +48 -0
  113. dcs_core/integrations/databases/spark_df.py +111 -0
  114. dcs_core/integrations/databases/sybase.py +1069 -0
  115. dcs_core/integrations/storage/__init__.py +13 -0
  116. dcs_core/integrations/storage/local_file.py +149 -0
  117. dcs_core/integrations/utils/__init__.py +13 -0
  118. dcs_core/integrations/utils/utils.py +36 -0
  119. dcs_core/report/__init__.py +13 -0
  120. dcs_core/report/dashboard.py +211 -0
  121. dcs_core/report/models.py +88 -0
  122. dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  123. dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  124. dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  125. dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  126. dcs_core/report/static/assets/images/docs.svg +6 -0
  127. dcs_core/report/static/assets/images/github.svg +4 -0
  128. dcs_core/report/static/assets/images/logo.svg +7 -0
  129. dcs_core/report/static/assets/images/slack.svg +13 -0
  130. dcs_core/report/static/index.js +2 -0
  131. dcs_core/report/static/index.js.LICENSE.txt +3971 -0
  132. dcs_sdk/__init__.py +13 -0
  133. dcs_sdk/__main__.py +18 -0
  134. dcs_sdk/__version__.py +15 -0
  135. dcs_sdk/cli/__init__.py +13 -0
  136. dcs_sdk/cli/cli.py +163 -0
  137. dcs_sdk/sdk/__init__.py +58 -0
  138. dcs_sdk/sdk/config/__init__.py +13 -0
  139. dcs_sdk/sdk/config/config_loader.py +491 -0
  140. dcs_sdk/sdk/data_diff/__init__.py +13 -0
  141. dcs_sdk/sdk/data_diff/data_differ.py +821 -0
  142. dcs_sdk/sdk/rules/__init__.py +15 -0
  143. dcs_sdk/sdk/rules/rules_mappping.py +31 -0
  144. dcs_sdk/sdk/rules/rules_repository.py +214 -0
  145. dcs_sdk/sdk/rules/schema_rules.py +65 -0
  146. dcs_sdk/sdk/utils/__init__.py +13 -0
  147. dcs_sdk/sdk/utils/serializer.py +25 -0
  148. dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
  149. dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
  150. dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
  151. dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
  152. dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
  153. dcs_sdk/sdk/utils/table.py +475 -0
  154. dcs_sdk/sdk/utils/themes.py +40 -0
  155. dcs_sdk/sdk/utils/utils.py +349 -0
  156. dcs_sdk-1.6.5.dist-info/METADATA +150 -0
  157. dcs_sdk-1.6.5.dist-info/RECORD +159 -0
  158. dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
  159. dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
dcs_core/integrations/databases/mysql.py
@@ -0,0 +1,409 @@
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from datetime import datetime
+ from typing import Any, Dict, List, Optional, Tuple, Union
+
+ from loguru import logger
+ from sqlalchemy import create_engine, text
+ from sqlalchemy.engine import URL
+
+ from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
+ from dcs_core.core.common.models.data_source_resource import RawColumnInfo
+ from dcs_core.integrations.databases.db2 import DB2DataSource
+
+
+ class MysqlDataSource(DB2DataSource):
+     def __init__(self, data_source_name: str, data_connection: Dict):
+         super().__init__(data_source_name, data_connection)
+         self.regex_patterns = {
+             "uuid": r"^[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$",
+             "usa_phone": r"^\\+?1?[-.[:space:]]?\\(?[0-9]{3}\\)?[-.[:space:]]?[0-9]{3}[-.[:space:]]?[0-9]{4}$",
+             "email": r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$",
+             "usa_zip_code": r"^[0-9B-DF-HJ-NP-TV-Z]{6}[0-9]$",
+             "ssn": r"^(?!666|000|9\\d{2})\\d{3}-(?!00)\\d{2}-(?!0{4})\\d{4}$",
+             "sedol": r"^[B-DF-HJ-NP-TV-XZ0-9]{6}[0-9]$",
+             "lei": r"^[A-Z0-9]{18}[0-9]{2}$",
+             "cusip": r"^[0-9A-Z]{9}$",
+             "figi": r"^BBG[A-Z0-9]{9}$",
+             "isin": r"^[A-Z]{2}[A-Z0-9]{9}[0-9]$",
+             "perm_id": r"^[0-9]{4}[- ]?[0-9]{4}[- ]?[0-9]{4}[- ]?[0-9]{4}[- ]?[0-9]{2,3}$",
+         }
+
+     def connect(self) -> Any:
+         """
+         Connect to the data source
+         """
+         try:
+             ssl = True if self.data_connection.get("security", False) in ["ssl", "SSL"] else False
+             self.schema_name = self.data_connection.get("schema") or self.data_connection.get("username")
+             url = URL.create(
+                 drivername="mysql+pymysql",
+                 username=self.data_connection.get("username"),
+                 password=self.data_connection.get("password"),
+                 host=self.data_connection.get("host"),
+                 port=self.data_connection.get("port"),
+                 database=self.data_connection.get("database"),
+             )
+             engine = create_engine(
+                 url,
+                 isolation_level="AUTOCOMMIT",
+                 connect_args={"ssl": {"ssl": ssl} if ssl else None},
+             )
+             self.connection = engine.connect()
+             return self.connection
+         except Exception as e:
+             raise DataChecksDataSourcesConnectionError(message=f"Failed to connect to Mysql data source: [{str(e)}]")
+
+     def qualified_table_name(self, table_name: str) -> str:
+         """
+         Get the qualified table name
+         :param table_name: name of the table
+         :return: qualified table name
+         """
+         if self.schema_name:
+             return f"`{self.schema_name}`.`{table_name}`"
+         return f"`{table_name}`"
+
+     def quote_column(self, column: str) -> str:
+         """
+         Quote the column name
+         :param column: name of the column
+         :return: quoted column name
+         """
+         return f"`{column}`"
+
+     def query_get_table_names(
+         self,
+         schema: str | None = None,
+         with_view: bool = False,
+     ) -> dict:
+         """
+         Get the list of tables in the database.
+         :param schema: optional schema name
+         :param with_view: whether to include views
+         :return: dictionary with table names and optionally view names
+         """
+         database = self.database
+         if with_view:
+             table_type_condition = "TABLES.TABLE_TYPE IN ('BASE TABLE', 'VIEW')"
+         else:
+             table_type_condition = "TABLES.TABLE_TYPE = 'BASE TABLE'"
+
+         query = f"SELECT TABLES.TABLE_NAME, TABLES.TABLE_TYPE FROM information_schema.tables WHERE TABLES.TABLE_SCHEMA = '{database}' and {table_type_condition}"
+         rows = self.fetchall(query)
+
+         if with_view:
+             result = {"table": [], "view": []}
+             if rows:
+                 for row in rows:
+                     table_name = row[0]
+                     table_type = row[1].strip() if row[1] else row[1]
+
+                     if table_type == "BASE TABLE":
+                         result["table"].append(table_name)
+                     elif table_type == "VIEW":
+                         result["view"].append(table_name)
+         else:
+             result = {"table": []}
+             if rows:
+                 result["table"] = [row[0] for row in rows]
+
+         return result
+
+     def query_get_table_columns(self, table: str, schema: str | None = None) -> RawColumnInfo:
+         """
+         Get the schema of a table.
+         :param table: table name
+         :return: RawColumnInfo object containing column information
+         """
+         schema = self.database
+         query = (
+             "SELECT column_name, data_type, datetime_precision, numeric_precision, numeric_scale, NULL as collation_name, character_maximum_length "
+             "FROM information_schema.columns "
+             f"WHERE table_name = '{table}' AND table_schema = '{schema}'"
+         )
+         rows = self.fetchall(query)
+         if not rows:
+             raise RuntimeError(f"{table}: Table, {schema}: Schema, does not exist, or has no columns")
+
+         column_info = {
+             r[0]: RawColumnInfo(
+                 column_name=self.safe_get(r, 0),
+                 data_type=self.safe_get(r, 1),
+                 datetime_precision=self.safe_get(r, 2),
+                 numeric_precision=self.safe_get(r, 3),
+                 numeric_scale=self.safe_get(r, 4),
+                 collation_name=self.safe_get(r, 5),
+                 character_maximum_length=self.safe_get(r, 6),
+             )
+             for r in rows
+         }
+         return column_info
+
+     def fetch_rows(
+         self,
+         query: str,
+         limit: int = 1,
+         with_column_names: bool = False,
+         complete_query: Optional[str] = None,
+     ) -> Tuple[List, Optional[List[str]]]:
+         """
+         Fetch rows from the database.
+
+         :param query: SQL query to execute.
+         :param limit: Number of rows to fetch.
+         :param with_column_names: Whether to include column names in the result.
+         :return: Tuple of (rows, column_names or None)
+         """
+         query = complete_query or f"SELECT * FROM ({query}) AS subquery LIMIT {limit}"
+
+         result = self.connection.execute(text(query))
+         rows = result.fetchmany(limit)
+
+         if with_column_names:
+             column_names = result.keys()
+             return rows, list(column_names)
+         else:
+             return rows, None
+
+     def query_get_distinct_count(self, table: str, field: str, filters: str = None) -> int:
+         """
+         Get the distinct count value
+         :param table: table name
+         :param field: column name
+         :param filters: filter condition
+         :return:
+         """
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+         query = "SELECT COUNT(DISTINCT {}) FROM {}".format(field, qualified_table_name)
+         if filters:
+             query += " WHERE {}".format(filters)
+
+         return self.fetchone(query)[0]
+
+     def query_get_percentile(self, table: str, field: str, percentile: float, filters: str = None) -> float:
+         """
+         Get the specified percentile value of a numeric column in a table.
+         :param table: table name
+         :param field: column name
+         :param percentile: percentile to calculate (e.g., 0.2 for 20th percentile)
+         :param filters: filter condition
+         :return: the value at the specified percentile
+         """
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+         rank = int(percentile * 100)
+
+         query = f"""
+             SELECT {field} FROM (
+                 SELECT {field}, NTILE(100) OVER (ORDER BY {field}) AS percentile_rank
+                 FROM {qualified_table_name}
+                 {f'WHERE {filters}' if filters else ''}
+             ) AS ranked
+             WHERE percentile_rank = {rank}
+             ORDER BY {field}
+             LIMIT 1
+         """
+
+         result = self.fetchone(query)
+         return round(result[0], 2) if result and result[0] is not None else None
+
+     def query_negative_metric(self, table: str, field: str, operation: str, filters: str = None) -> Union[int, float]:
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+
+         negative_query = f"SELECT COUNT(*) FROM {qualified_table_name} WHERE {field} < 0"
+
+         if filters:
+             negative_query += f" AND {filters}"
+
+         total_count_query = f"SELECT COUNT(*) FROM {qualified_table_name}"
+
+         if filters:
+             total_count_query += f" WHERE {filters}"
+
+         if operation == "percent":
+             query = f"SELECT (CAST(({negative_query}) AS float) / CAST(({total_count_query}) AS float)) * 100"
+         else:
+             query = negative_query
+
+         result = self.fetchone(query)[0]
+         return round(result, 2) if operation == "percent" else result
+
+     def query_get_string_length_metric(
+         self, table: str, field: str, metric: str, filters: str = None
+     ) -> Union[int, float]:
+         """
+         Get the string length metric (max, min, avg) in a column of a table.
+
+         :param table: table name
+         :param field: column name
+         :param metric: the metric to calculate ('max', 'min', 'avg')
+         :param filters: filter condition
+         :return: the calculated metric as int for 'max' and 'min', float for 'avg'
+         """
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+
+         if metric.lower() == "max":
+             sql_function = "MAX(LENGTH"
+         elif metric.lower() == "min":
+             sql_function = "MIN(LENGTH"
+         elif metric.lower() == "avg":
+             sql_function = "AVG(LENGTH"
+         else:
+             raise ValueError(f"Invalid metric '{metric}'. Choose from 'max', 'min', or 'avg'.")
+
+         query = f"SELECT {sql_function}({field})) FROM {qualified_table_name}"
+
+         if filters:
+             query += f" WHERE {filters}"
+
+         result = self.fetchone(query)[0]
+         return round(result, 2) if metric.lower() == "avg" else result
+
+     def query_string_pattern_validity(
+         self,
+         table: str,
+         field: str,
+         regex_pattern: str = None,
+         predefined_regex_pattern: str = None,
+         filters: str = None,
+     ) -> Tuple[int, int]:
+         """
+         Get the count of valid values based on the regex pattern.
+         :param table: table name
+         :param field: column name
+         :param regex_pattern: custom regex pattern
+         :param predefined_regex_pattern: predefined regex pattern
+         :param filters: filter condition
+         :return: count of valid values, count of total row count
+         """
+         filters = f"WHERE {filters}" if filters else ""
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+
+         if not regex_pattern and not predefined_regex_pattern:
+             raise ValueError("Either regex_pattern or predefined_regex_pattern should be provided")
+
+         if predefined_regex_pattern:
+             regex = self.regex_patterns[predefined_regex_pattern]
+         else:
+             regex = regex_pattern
+
+         regex_query = f"CASE WHEN {field} REGEXP '{regex}' THEN 1 ELSE 0 END"
+         query = f"""
+             SELECT SUM({regex_query}) AS valid_count, COUNT(*) AS total_count
+             FROM {qualified_table_name} {filters}
+         """
+         result = self.fetchone(query)
+         return result[0], result[1]
+
+     def query_get_usa_state_code_validity(self, table: str, field: str, filters: str = None) -> Tuple[int, int]:
+         """
+         Get the count of valid USA state codes
+         :param table: table name
+         :param field: column name
+         :param filters: filter condition
+         :return: count of valid state codes, count of total row count
+         """
+
+         valid_state_codes_str = ", ".join(f"'{code}'" for code in self.valid_state_codes)
+
+         filters = f"WHERE {filters}" if filters else ""
+
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+
+         regex_query = f"""
+             CASE WHEN REGEXP_LIKE({field}, '^[A-Z]{{2}}$') AND UPPER({field}) IN ({valid_state_codes_str}) THEN 1 ELSE 0 END
+         """
+
+         query = f"""
+             SELECT SUM({regex_query}) AS valid_count, COUNT(*) AS total_count
+             FROM {qualified_table_name} {filters}
+         """
+         result = self.fetchone(query)
+         return result[0], result[1]
+
+     def query_timestamp_metric(self):
+         raise NotImplementedError("Method not implemented for MySQLDataSource")
+
+     def query_timestamp_not_in_future_metric(self):
+         raise NotImplementedError("Method not implemented for MySQLDataSource")
+
+     def query_timestamp_date_not_in_future_metric(self):
+         raise NotImplementedError("Method not implemented for MySQLDataSource")
+
+     def query_get_time_diff(self, table: str, field: str) -> int:
+         """
+         Get the time difference
+         :param table: name of the table
+         :param field: field name of updated time column
+         :return: time difference in seconds
+         """
+         qualified_table_name = self.qualified_table_name(table)
+         field = self.quote_column(field)
+         query = f"""
+             SELECT {field}
+             FROM {qualified_table_name}
+             ORDER BY {field} DESC
+             LIMIT 1;
+         """
+         result = self.fetchone(query)
+         if result:
+             updated_time = result[0]
+             if isinstance(updated_time, str):
+                 updated_time = datetime.strptime(updated_time, "%Y-%m-%d %H:%M:%S.%f")
+             return int((datetime.utcnow() - updated_time).total_seconds())
+         return 0
+
+     def get_table_foreign_key_info(self, table_name: str, schema: str | None = None):
+         schema = schema or self.schema_name
+
+         query = f"""
+             SELECT
+                 kcu.CONSTRAINT_NAME AS constraint_name,
+                 kcu.TABLE_NAME AS table_name,
+                 kcu.COLUMN_NAME AS fk_column,
+                 kcu.REFERENCED_TABLE_NAME AS referenced_table,
+                 kcu.REFERENCED_COLUMN_NAME AS referenced_column
+             FROM information_schema.TABLE_CONSTRAINTS tc
+             JOIN information_schema.KEY_COLUMN_USAGE kcu
+                 ON tc.CONSTRAINT_NAME = kcu.CONSTRAINT_NAME
+                 AND tc.TABLE_SCHEMA = kcu.TABLE_SCHEMA
+             WHERE tc.CONSTRAINT_TYPE = 'FOREIGN KEY'
+                 AND tc.TABLE_NAME = '{table_name}'
+                 AND tc.TABLE_SCHEMA = '{schema}';
+         """
+
+         try:
+             rows = self.fetchall(query)
+         except Exception as e:
+             logger.error(f"Failed to fetch fk info for dataset: {table_name} ({e})")
+             return []
+
+         data = [
+             {
+                 "constraint_name": row[0],
+                 "table_name": row[1],
+                 "fk_column": row[2],
+                 "referenced_table": row[3],
+                 "referenced_column": row[4],
+             }
+             for row in rows
+         ]
+         return data
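For orientation, a minimal usage sketch of the MysqlDataSource added above. The connection values, data source name, and table layout are hypothetical placeholders; only the constructor, connect(), and query_get_table_names() come from the diff itself, and the "database"/"schema" keys and fetchall/fetchone helpers are assumed to be handled by the shared base class.

from dcs_core.integrations.databases.mysql import MysqlDataSource

# Hypothetical connection settings; a real deployment supplies its own values.
connection = {
    "host": "localhost",
    "port": 3306,
    "username": "dcs_user",
    "password": "dcs_password",
    "database": "sales_db",
    "schema": "sales_db",
}

datasource = MysqlDataSource("mysql_sales", connection)
datasource.connect()  # raises DataChecksDataSourcesConnectionError if the engine cannot connect

# Per query_get_table_names above, returns {"table": [...], "view": [...]} when with_view=True.
tables = datasource.query_get_table_names(with_view=True)
print(tables)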
dcs_core/integrations/databases/opensearch.py
@@ -0,0 +1,64 @@
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from typing import Dict
+
+ from opensearchpy import OpenSearch
+
+ from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
+ from dcs_core.core.datasource.search_datasource import SearchIndexDataSource
+
+
+ class OpenSearchDataSource(SearchIndexDataSource):
+     """
+     OpenSearch data source
+     """
+
+     def __init__(self, data_source_name: str, data_connection: Dict):
+         super().__init__(data_source_name, data_connection)
+
+     def connect(self) -> OpenSearch:
+         """
+         Connect to the data source
+         """
+         try:
+             auth = (
+                 self.data_connection.get("username"),
+                 self.data_connection.get("password"),
+             )
+             host = self.data_connection.get("host")
+             port = int(self.data_connection.get("port"))
+             self.client = OpenSearch(
+                 hosts=[{"host": host, "port": port}],
+                 http_auth=auth,
+                 use_ssl=True,
+                 verify_certs=False,
+                 ca_certs=False,
+             )
+             if not self.client.ping():
+                 raise Exception("Failed to connect to OpenSearch data source")
+             return self.client
+         except Exception as e:
+             raise DataChecksDataSourcesConnectionError(f"Failed to connect to OpenSearch data source: [{str(e)}]")
+
+     def close(self):
+         """
+         Close the connection
+         """
+         self.client.close()
+
+     def is_connected(self) -> bool:
+         """
+         Check if the data source is connected
+         """
+         return self.client.ping()
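Similarly, a minimal sketch of the OpenSearchDataSource flow shown above; host, port, and credentials are placeholders, while connect(), is_connected(), and close() behave as defined in the diff (connect() pings the cluster and raises DataChecksDataSourcesConnectionError on failure).

from dcs_core.integrations.databases.opensearch import OpenSearchDataSource

# Placeholder connection details for illustration only.
conn = {
    "host": "localhost",
    "port": 9200,
    "username": "admin",
    "password": "admin",
}

ds = OpenSearchDataSource("search_logs", conn)
client = ds.connect()      # returns the underlying opensearchpy.OpenSearch client
print(ds.is_connected())   # wraps client.ping()
ds.close()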