dcs_sdk-1.6.5-py3-none-any.whl

This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (159)
  1. data_diff/__init__.py +221 -0
  2. data_diff/__main__.py +517 -0
  3. data_diff/abcs/__init__.py +13 -0
  4. data_diff/abcs/compiler.py +27 -0
  5. data_diff/abcs/database_types.py +402 -0
  6. data_diff/config.py +141 -0
  7. data_diff/databases/__init__.py +38 -0
  8. data_diff/databases/_connect.py +323 -0
  9. data_diff/databases/base.py +1417 -0
  10. data_diff/databases/bigquery.py +376 -0
  11. data_diff/databases/clickhouse.py +217 -0
  12. data_diff/databases/databricks.py +262 -0
  13. data_diff/databases/duckdb.py +207 -0
  14. data_diff/databases/mssql.py +343 -0
  15. data_diff/databases/mysql.py +189 -0
  16. data_diff/databases/oracle.py +238 -0
  17. data_diff/databases/postgresql.py +293 -0
  18. data_diff/databases/presto.py +222 -0
  19. data_diff/databases/redis.py +93 -0
  20. data_diff/databases/redshift.py +233 -0
  21. data_diff/databases/snowflake.py +222 -0
  22. data_diff/databases/sybase.py +720 -0
  23. data_diff/databases/trino.py +73 -0
  24. data_diff/databases/vertica.py +174 -0
  25. data_diff/diff_tables.py +489 -0
  26. data_diff/errors.py +17 -0
  27. data_diff/format.py +369 -0
  28. data_diff/hashdiff_tables.py +1026 -0
  29. data_diff/info_tree.py +76 -0
  30. data_diff/joindiff_tables.py +434 -0
  31. data_diff/lexicographic_space.py +253 -0
  32. data_diff/parse_time.py +88 -0
  33. data_diff/py.typed +0 -0
  34. data_diff/queries/__init__.py +13 -0
  35. data_diff/queries/api.py +213 -0
  36. data_diff/queries/ast_classes.py +811 -0
  37. data_diff/queries/base.py +38 -0
  38. data_diff/queries/extras.py +43 -0
  39. data_diff/query_utils.py +70 -0
  40. data_diff/schema.py +67 -0
  41. data_diff/table_segment.py +583 -0
  42. data_diff/thread_utils.py +112 -0
  43. data_diff/utils.py +1022 -0
  44. data_diff/version.py +15 -0
  45. dcs_core/__init__.py +13 -0
  46. dcs_core/__main__.py +17 -0
  47. dcs_core/__version__.py +15 -0
  48. dcs_core/cli/__init__.py +13 -0
  49. dcs_core/cli/cli.py +165 -0
  50. dcs_core/core/__init__.py +19 -0
  51. dcs_core/core/common/__init__.py +13 -0
  52. dcs_core/core/common/errors.py +50 -0
  53. dcs_core/core/common/models/__init__.py +13 -0
  54. dcs_core/core/common/models/configuration.py +284 -0
  55. dcs_core/core/common/models/dashboard.py +24 -0
  56. dcs_core/core/common/models/data_source_resource.py +75 -0
  57. dcs_core/core/common/models/metric.py +160 -0
  58. dcs_core/core/common/models/profile.py +75 -0
  59. dcs_core/core/common/models/validation.py +216 -0
  60. dcs_core/core/common/models/widget.py +44 -0
  61. dcs_core/core/configuration/__init__.py +13 -0
  62. dcs_core/core/configuration/config_loader.py +139 -0
  63. dcs_core/core/configuration/configuration_parser.py +262 -0
  64. dcs_core/core/configuration/configuration_parser_arc.py +328 -0
  65. dcs_core/core/datasource/__init__.py +13 -0
  66. dcs_core/core/datasource/base.py +62 -0
  67. dcs_core/core/datasource/manager.py +112 -0
  68. dcs_core/core/datasource/search_datasource.py +421 -0
  69. dcs_core/core/datasource/sql_datasource.py +1094 -0
  70. dcs_core/core/inspect.py +163 -0
  71. dcs_core/core/logger/__init__.py +13 -0
  72. dcs_core/core/logger/base.py +32 -0
  73. dcs_core/core/logger/default_logger.py +94 -0
  74. dcs_core/core/metric/__init__.py +13 -0
  75. dcs_core/core/metric/base.py +220 -0
  76. dcs_core/core/metric/combined_metric.py +98 -0
  77. dcs_core/core/metric/custom_metric.py +34 -0
  78. dcs_core/core/metric/manager.py +137 -0
  79. dcs_core/core/metric/numeric_metric.py +403 -0
  80. dcs_core/core/metric/reliability_metric.py +90 -0
  81. dcs_core/core/profiling/__init__.py +13 -0
  82. dcs_core/core/profiling/datasource_profiling.py +136 -0
  83. dcs_core/core/profiling/numeric_field_profiling.py +72 -0
  84. dcs_core/core/profiling/text_field_profiling.py +67 -0
  85. dcs_core/core/repository/__init__.py +13 -0
  86. dcs_core/core/repository/metric_repository.py +77 -0
  87. dcs_core/core/utils/__init__.py +13 -0
  88. dcs_core/core/utils/log.py +29 -0
  89. dcs_core/core/utils/tracking.py +105 -0
  90. dcs_core/core/utils/utils.py +44 -0
  91. dcs_core/core/validation/__init__.py +13 -0
  92. dcs_core/core/validation/base.py +230 -0
  93. dcs_core/core/validation/completeness_validation.py +153 -0
  94. dcs_core/core/validation/custom_query_validation.py +24 -0
  95. dcs_core/core/validation/manager.py +282 -0
  96. dcs_core/core/validation/numeric_validation.py +276 -0
  97. dcs_core/core/validation/reliability_validation.py +91 -0
  98. dcs_core/core/validation/uniqueness_validation.py +61 -0
  99. dcs_core/core/validation/validity_validation.py +738 -0
  100. dcs_core/integrations/__init__.py +13 -0
  101. dcs_core/integrations/databases/__init__.py +13 -0
  102. dcs_core/integrations/databases/bigquery.py +187 -0
  103. dcs_core/integrations/databases/databricks.py +51 -0
  104. dcs_core/integrations/databases/db2.py +652 -0
  105. dcs_core/integrations/databases/elasticsearch.py +61 -0
  106. dcs_core/integrations/databases/mssql.py +829 -0
  107. dcs_core/integrations/databases/mysql.py +409 -0
  108. dcs_core/integrations/databases/opensearch.py +64 -0
  109. dcs_core/integrations/databases/oracle.py +719 -0
  110. dcs_core/integrations/databases/postgres.py +482 -0
  111. dcs_core/integrations/databases/redshift.py +53 -0
  112. dcs_core/integrations/databases/snowflake.py +48 -0
  113. dcs_core/integrations/databases/spark_df.py +111 -0
  114. dcs_core/integrations/databases/sybase.py +1069 -0
  115. dcs_core/integrations/storage/__init__.py +13 -0
  116. dcs_core/integrations/storage/local_file.py +149 -0
  117. dcs_core/integrations/utils/__init__.py +13 -0
  118. dcs_core/integrations/utils/utils.py +36 -0
  119. dcs_core/report/__init__.py +13 -0
  120. dcs_core/report/dashboard.py +211 -0
  121. dcs_core/report/models.py +88 -0
  122. dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  123. dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  124. dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  125. dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  126. dcs_core/report/static/assets/images/docs.svg +6 -0
  127. dcs_core/report/static/assets/images/github.svg +4 -0
  128. dcs_core/report/static/assets/images/logo.svg +7 -0
  129. dcs_core/report/static/assets/images/slack.svg +13 -0
  130. dcs_core/report/static/index.js +2 -0
  131. dcs_core/report/static/index.js.LICENSE.txt +3971 -0
  132. dcs_sdk/__init__.py +13 -0
  133. dcs_sdk/__main__.py +18 -0
  134. dcs_sdk/__version__.py +15 -0
  135. dcs_sdk/cli/__init__.py +13 -0
  136. dcs_sdk/cli/cli.py +163 -0
  137. dcs_sdk/sdk/__init__.py +58 -0
  138. dcs_sdk/sdk/config/__init__.py +13 -0
  139. dcs_sdk/sdk/config/config_loader.py +491 -0
  140. dcs_sdk/sdk/data_diff/__init__.py +13 -0
  141. dcs_sdk/sdk/data_diff/data_differ.py +821 -0
  142. dcs_sdk/sdk/rules/__init__.py +15 -0
  143. dcs_sdk/sdk/rules/rules_mappping.py +31 -0
  144. dcs_sdk/sdk/rules/rules_repository.py +214 -0
  145. dcs_sdk/sdk/rules/schema_rules.py +65 -0
  146. dcs_sdk/sdk/utils/__init__.py +13 -0
  147. dcs_sdk/sdk/utils/serializer.py +25 -0
  148. dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
  149. dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
  150. dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
  151. dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
  152. dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
  153. dcs_sdk/sdk/utils/table.py +475 -0
  154. dcs_sdk/sdk/utils/themes.py +40 -0
  155. dcs_sdk/sdk/utils/utils.py +349 -0
  156. dcs_sdk-1.6.5.dist-info/METADATA +150 -0
  157. dcs_sdk-1.6.5.dist-info/RECORD +159 -0
  158. dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
  159. dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,482 @@
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import datetime
+ from decimal import Decimal
+ from typing import Any, Dict, List, Optional, Tuple
+ from uuid import UUID
+
+ from sqlalchemy import create_engine, text
+ from sqlalchemy.engine import URL
+
+ from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
+ from dcs_core.core.common.models.data_source_resource import RawColumnInfo
+ from dcs_core.core.datasource.sql_datasource import SQLDataSource
+
+
+ class PostgresDataSource(SQLDataSource):
+     def __init__(self, data_source_name: str, data_connection: Dict):
+         super().__init__(data_source_name, data_connection)
+         self.DEFAULT_NUMERIC_PRECISION = 16383
+
+     def connect(self) -> Any:
+         """
+         Connect to the data source
+         """
+         try:
+             url = URL.create(
+                 drivername="postgresql",
+                 username=self.data_connection.get("username"),
+                 password=self.data_connection.get("password"),
+                 host=self.data_connection.get("host"),
+                 port=self.data_connection.get("port"),
+                 database=self.data_connection.get("database"),
+             )
+             schema = self.data_connection.get("schema") or "public"
+             engine = create_engine(
+                 url,
+                 connect_args={"options": f"-csearch_path={schema}"},
+                 isolation_level="AUTOCOMMIT",
+             )
+             self.connection = engine.connect()
+             return self.connection
+         except Exception as e:
+             raise DataChecksDataSourcesConnectionError(
+                 message=f"Failed to connect to PostgreSQL data source: [{str(e)}]"
+             )
+
+     def qualified_table_name(self, table_name: str) -> str:
+         """
+         Get the qualified table name
+         :param table_name: name of the table
+         :return: qualified table name
+         """
+         if self.schema_name:
+             return f'"{self.schema_name}"."{table_name}"'
+         return f'"{table_name}"'
+
+     def quote_column(self, column: str) -> str:
+         """
+         Quote the column name
+         :param column: name of the column
+         :return: quoted column name
+         """
+         return f'"{column}"'
+
+     def query_get_database_version(self, database_version_query: Optional[str] = None) -> Optional[str]:
+         """
+         Get the database version
+         :return: version string, or None if unavailable
+         """
+         query = database_version_query or "SELECT version()"
+         result = self.fetchone(query)[0]
+         return result if result else None
+
+     def query_get_table_names(
+         self,
+         schema: str | None = None,
+         with_view: bool = False,
+     ) -> dict:
+         """
+         Get the list of tables in the database.
+         :param schema: optional schema name
+         :param with_view: whether to include views
+         :return: dictionary with table names and optionally view names
+         """
+
+         schema = schema or self.schema_name
+         database = self.quote_database(self.database)
+
+         if with_view:
+             table_type_condition = "table_type IN ('BASE TABLE', 'VIEW')"
+         else:
+             table_type_condition = "table_type = 'BASE TABLE'"
+
+         query = (
+             f"SELECT table_name, table_type FROM {database}.information_schema.tables "
+             f"WHERE table_schema = '{schema}' AND {table_type_condition}"
+         )
+         rows = self.fetchall(query)
+
+         if with_view:
+             result = {"table": [], "view": []}
+             if rows:
+                 for row in rows:
+                     table_name = row[0]
+                     table_type = row[1].strip() if row[1] else row[1]
+
+                     if table_type == "BASE TABLE":
+                         result["table"].append(table_name)
+                     elif table_type == "VIEW":
+                         result["view"].append(table_name)
+         else:
+             result = {"table": []}
+             if rows:
+                 result["table"] = [row[0] for row in rows]
+
+         return result
+
+     def query_get_table_indexes(self, table: str, schema: str | None = None) -> dict[str, dict]:
+         """
+         Get index information for a table in PostgreSQL DB.
+         :param table: Table name
+         :param schema: Optional schema name
+         :return: Dictionary with index details
+         """
+         schema = schema or self.schema_name
+         table = table.lower()
+         schema = schema.lower()
+
+         query = f"""
+             SELECT
+                 i.relname AS index_name,
+                 am.amname AS index_type,
+                 a.attname AS column_name,
+                 x.n AS column_order
+             FROM
+                 pg_class t
+             JOIN
+                 pg_namespace ns ON ns.oid = t.relnamespace
+             JOIN
+                 pg_index ix ON t.oid = ix.indrelid
+             JOIN
+                 pg_class i ON i.oid = ix.indexrelid
+             JOIN
+                 pg_am am ON i.relam = am.oid
+             JOIN
+                 LATERAL unnest(ix.indkey) WITH ORDINALITY AS x(attnum, n)
+                 ON TRUE
+             JOIN
+                 pg_attribute a ON a.attnum = x.attnum AND a.attrelid = t.oid
+             WHERE
+                 t.relkind = 'r'
+                 AND t.relname = '{table}'
+                 AND ns.nspname = '{schema}'
+             ORDER BY
+                 i.relname, x.n
+         """
+         rows = self.fetchall(query)
+
+         if not rows:
+             raise RuntimeError(f"No index information found for table '{table}' in schema '{schema}'.")
+
+         pk_query = f"""
+             SELECT kcu.column_name
+             FROM information_schema.table_constraints tc
+             JOIN information_schema.key_column_usage kcu
+                 ON tc.constraint_name = kcu.constraint_name
+                 AND tc.constraint_schema = kcu.constraint_schema
+                 AND tc.table_name = kcu.table_name
+             WHERE tc.constraint_type = 'PRIMARY KEY'
+                 AND tc.table_name = '{table}'
+                 AND tc.table_schema = '{schema}'
+             ORDER BY kcu.ordinal_position
+         """
+         pk_rows = self.fetchall(pk_query)
+         pk_columns = [row[0].strip() for row in pk_rows] if pk_rows else []
+         pk_columns_set = set(pk_columns)
+
+         indexes = {}
+         for row in rows:
+             index_name = row[0]
+             index_type = row[1]
+             column_info = {
+                 "column_name": self.safe_get(row, 2),
+                 "column_order": self.safe_get(row, 3),
+             }
+             if index_name not in indexes:
+                 indexes[index_name] = {"columns": [], "index_type": index_type}
+             indexes[index_name]["columns"].append(column_info)
+
+         for index_name, idx in indexes.items():
+             index_columns = [col["column_name"].strip() for col in idx["columns"]]
+             index_columns_set = set(index_columns)
+             idx["is_primary_key"] = pk_columns_set == index_columns_set and len(index_columns) == len(pk_columns)
+         return indexes
+
+     def query_get_table_columns(
+         self,
+         table: str,
+         schema: str | None = None,
+     ) -> Dict[str, RawColumnInfo]:
+         """
+         Get the schema of a table.
+         :param table: table name
+         :return: mapping of column name to RawColumnInfo
+         """
+         schema = schema or self.schema_name
+         info_schema_path = ["information_schema", "columns"]
+         if self.database:
+             database = self.quote_database(self.database)
+             info_schema_path.insert(0, database)
+         query = (
+             f"SELECT column_name, data_type, datetime_precision, "
+             f"CASE WHEN data_type = 'numeric' "
+             f"THEN coalesce(numeric_precision, 131072 + {self.DEFAULT_NUMERIC_PRECISION}) "
+             f"ELSE numeric_precision END AS numeric_precision, "
+             f"CASE WHEN data_type = 'numeric' "
+             f"THEN coalesce(numeric_scale, {self.DEFAULT_NUMERIC_PRECISION}) "
+             f"ELSE numeric_scale END AS numeric_scale, "
+             f"COALESCE(collation_name, NULL) AS collation_name, "
+             f"CASE WHEN data_type = 'character varying' "
+             f"THEN character_maximum_length END AS character_maximum_length "
+             f"FROM {'.'.join(info_schema_path)} "
+             f"WHERE table_name = '{table}' AND table_schema = '{schema}'"
+         )
+         rows = self.fetchall(query)
+         if not rows:
+             raise RuntimeError(f"Table '{table}' in schema '{schema}' does not exist, or has no columns")
+
+         column_info = {
+             r[0]: RawColumnInfo(
+                 column_name=self.safe_get(r, 0),
+                 data_type=self.safe_get(r, 1),
+                 datetime_precision=self.safe_get(r, 2),
+                 numeric_precision=self.safe_get(r, 3),
+                 numeric_scale=self.safe_get(r, 4),
+                 collation_name=self.safe_get(r, 5),
+                 character_maximum_length=self.safe_get(r, 6),
+             )
+             for r in rows
+         }
+         return column_info
+
+     def fetch_rows(
+         self,
+         query: str,
+         limit: int = 1,
+         with_column_names: bool = False,
+         complete_query: Optional[str] = None,
+     ) -> Tuple[List, Optional[List[str]]]:
+         """
+         Fetch rows from the database.
+
+         :param query: SQL query to execute.
+         :param limit: Number of rows to fetch.
+         :param with_column_names: Whether to include column names in the result.
+         :return: Tuple of (rows, column_names or None)
+         """
+         query = complete_query or f"SELECT * FROM ({query}) AS subquery LIMIT {limit}"
+
+         result = self.connection.execute(text(query))
+         rows = result.fetchmany(limit)
+
+         if with_column_names:
+             column_names = result.keys()
+             return rows, list(column_names)
+         else:
+             return rows, None
+
+     def fetch_sample_values_from_database(
+         self,
+         table_name: str,
+         column_names: list[str],
+         limit: int = 5,
+     ) -> Tuple[List, List[str]]:
+         """
+         Fetch sample rows for specific columns from the given table.
+
+         :param table_name: The name of the table.
+         :param column_names: List of column names to fetch.
+         :param limit: Number of rows to fetch.
+         :return: Tuple of (row tuples, column names).
+         """
+         table_name = self.qualified_table_name(table_name)
+
+         if not column_names:
+             raise ValueError("At least one column name must be provided")
+
+         if len(column_names) == 1 and column_names[0] == "*":
+             query = f"SELECT * FROM {table_name} LIMIT {limit}"
+         else:
+             columns = ", ".join([self.quote_column(col) for col in column_names])
+             query = f"SELECT {columns} FROM {table_name} LIMIT {limit}"
+
+         result = self.connection.execute(text(query))
+         column_names = list(result.keys())
+         rows = result.fetchall()
+         return rows, column_names
+
+     def build_table_metrics_query(
+         self,
+         table_name: str,
+         column_info: list[dict],
+         additional_queries: Optional[List[str]] = None,
+     ) -> list[dict]:
+         query_parts = []
+         if not column_info:
+             return []
+
+         for col in column_info:
+             name = col["column_name"]
+             dtype = col["data_type"].lower()
+             quoted = self.quote_column(name)
+
+             if dtype in ("json", "jsonb"):
+                 distinct_expr = f"{quoted}::text"
+             else:
+                 distinct_expr = f"{quoted}"
+
+             query_parts.append(f'COUNT(DISTINCT {distinct_expr}) AS "{name}_distinct"')
+             query_parts.append(f'COUNT(*) - COUNT(DISTINCT {distinct_expr}) AS "{name}_duplicate"')
+             query_parts.append(
+                 f'SUM(CASE WHEN {self.quote_column(name)} IS NULL THEN 1 ELSE 0 END) AS "{name}_is_null"'
+             )
+
+             if dtype in (
+                 "int",
+                 "integer",
+                 "bigint",
+                 "smallint",
+                 "decimal",
+                 "numeric",
+                 "float",
+                 "double",
+             ):
+                 query_parts.append(f'MIN({self.quote_column(name)}) AS "{name}_min"')
+                 query_parts.append(f'MAX({self.quote_column(name)}) AS "{name}_max"')
+                 query_parts.append(f'AVG({self.quote_column(name)}) AS "{name}_average"')
+
+             elif dtype in ("varchar", "text", "char", "string", "character varying"):
+                 query_parts.append(f'MAX(CHAR_LENGTH({self.quote_column(name)})) AS "{name}_max_character_length"')
+
+         if additional_queries:
+             for queries in additional_queries:
+                 query_parts.append(queries)
+
+         qualified_table = self.qualified_table_name(table_name)
+         joined_parts = ",\n    ".join(query_parts)
+         query = f"SELECT\n    {joined_parts}\nFROM {qualified_table};"
+
+         result = self.connection.execute(text(query))
+         row = dict(list(result)[0]._mapping)
+
+         def _normalize_metrics(value):
+             """
+             Safely normalize DB metric values into JSON-serializable Python types.
+             Handles:
+               - Decimal → float
+               - datetime/date → ISO 8601 string
+               - UUID → string
+               - Nested dict/list recursion
+               - None passthrough
+             """
+             if value is None:
+                 return None
+
+             if isinstance(value, Decimal):
+                 return float(value)
+             if isinstance(value, (int, float, bool)):
+                 return value
+
+             if isinstance(value, (datetime.datetime, datetime.date)):
+                 return value.isoformat()
+
+             if isinstance(value, UUID):
+                 return str(value)
+
+             if isinstance(value, list):
+                 return [_normalize_metrics(v) for v in value]
+             if isinstance(value, dict):
+                 return {k: _normalize_metrics(v) for k, v in value.items()}
+
+             return str(value)
+
+         column_wise = []
+         for col in column_info:
+             name = col["column_name"]
+             col_metrics = {}
+
+             for key, value in row.items():
+                 if key.startswith(f"{name}_"):
+                     metric_name = key[len(name) + 1 :]
+                     col_metrics[metric_name] = _normalize_metrics(value)
+
+             column_wise.append({"column_name": name, "metrics": col_metrics})
+
+         for col_data in column_wise:
+             metrics = col_data["metrics"]
+             distinct_count = metrics.get("distinct")
+             col_name = col_data["column_name"]
+             dtype = next(c["data_type"].lower() for c in column_info if c["column_name"] == col_name)
+
+             if isinstance(distinct_count, (int, float)) and distinct_count < 20:
+                 quoted = self.quote_column(col_name)
+
+                 if dtype in ("json", "jsonb"):
+                     group_expr = f"{quoted}::text"
+                 else:
+                     group_expr = quoted
+
+                 dist_query = (
+                     f"SELECT {group_expr}, COUNT(*) "
+                     f"FROM {qualified_table} GROUP BY {group_expr} ORDER BY COUNT(*) DESC"
+                 )
+
+                 try:
+                     dist_result = self.connection.execute(text(dist_query)).fetchall()
+
+                     distribution = []
+                     for r in dist_result:
+                         val = _normalize_metrics(r[0])
+                         distribution.append(
+                             {
+                                 "col_val": val,
+                                 "count": r[1],
+                             }
+                         )
+
+                     metrics["distribution_graph"] = distribution
+
+                 except Exception as e:
+                     print(f"Failed to generate distribution graph for column {col_name}: {e}")
+
+         for col_data in column_wise:
+             metrics = col_data["metrics"]
+             formatted_metrics_data = {
+                 "general_data": {key: value for key, value in metrics.items() if key != "distribution_graph"},
+                 "distribution_data": metrics.get("distribution_graph", []),
+             }
+             col_data["metrics"] = formatted_metrics_data
+
+         return column_wise
+
+     def get_table_foreign_key_info(self, table_name: str, schema: str | None = None):
+         schema = schema or self.schema_name
+
+         query = f"""
+             SELECT
+                 con.conname AS constraint_name,
+                 rel_t.relname AS table_name,
+                 att_t.attname AS fk_column,
+                 rel_p.relname AS referenced_table,
+                 att_p.attname AS referenced_column
+             FROM pg_constraint con
+             JOIN pg_class rel_t ON rel_t.oid = con.conrelid
+             JOIN pg_namespace nsp_t ON nsp_t.oid = rel_t.relnamespace
+             JOIN pg_class rel_p ON rel_p.oid = con.confrelid
+             JOIN pg_namespace nsp_p ON nsp_p.oid = rel_p.relnamespace
+             JOIN pg_attribute att_t ON att_t.attrelid = rel_t.oid AND att_t.attnum = ANY(con.conkey)
+             JOIN pg_attribute att_p ON att_p.attrelid = rel_p.oid AND att_p.attnum = ANY(con.confkey)
+             WHERE con.contype = 'f'
+                 AND rel_t.relname = '{table_name}'
+                 AND nsp_t.nspname = '{schema}';
+         """
+         try:
+             result = self.connection.execute(text(query))
+         except Exception as e:
+             print(f"Failed to fetch fk info for dataset {table_name}: {e}")
+             return []
+         all_results = [dict(row._mapping) for row in result]
+         return all_results
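
A minimal usage sketch for the PostgresDataSource above, assuming a reachable PostgreSQL server; every credential value is a placeholder and the table name "orders" is hypothetical:

    # Sketch only: connection values below are placeholders, not SDK defaults.
    from dcs_core.integrations.databases.postgres import PostgresDataSource

    connection = {
        "username": "analytics",   # hypothetical user
        "password": "secret",      # hypothetical password
        "host": "localhost",
        "port": 5432,
        "database": "warehouse",   # hypothetical database
        "schema": "public",        # connect() falls back to "public" when omitted
    }

    ds = PostgresDataSource("pg_source", connection)
    ds.connect()  # raises DataChecksDataSourcesConnectionError on failure

    # Mapping of column name -> RawColumnInfo, per query_get_table_columns above.
    columns = ds.query_get_table_columns("orders")

    # (rows, column_names) for a 5-row sample of every column.
    rows, names = ds.fetch_sample_values_from_database("orders", ["*"], limit=5)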
@@ -0,0 +1,53 @@
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import Any, Dict
+
+ from sqlalchemy import create_engine
+ from sqlalchemy.engine import URL
+
+ from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
+ from dcs_core.core.datasource.sql_datasource import SQLDataSource
+
+
+ class RedShiftDataSource(SQLDataSource):
+     def __init__(self, data_source_name: str, data_connection: Dict):
+         super().__init__(data_source_name, data_connection)
+
+     def connect(self) -> Any:
+         """
+         Connect to the data source
+         """
+         try:
+             url = URL.create(
+                 "redshift+psycopg2",
+                 username=self.data_connection.get("username"),
+                 password=self.data_connection.get("password"),
+                 host=self.data_connection.get("host"),
+                 port=self.data_connection.get("port"),
+                 database=self.data_connection.get("database"),
+             )
+             schema = self.data_connection.get("schema")
+             engine = create_engine(
+                 url,
+                 # Pass an empty dict (not None) when no schema is set; SQLAlchemy
+                 # merges connect_args into the DBAPI params and rejects None.
+                 connect_args={"options": f"-csearch_path={schema}"} if schema else {},
+                 isolation_level="AUTOCOMMIT",
+             )
+
+             self.connection = engine.connect()
+             return self.connection
+         except Exception as e:
+             raise DataChecksDataSourcesConnectionError(
+                 message=f"Failed to connect to AWS RedShift data source: [{str(e)}]"
+             )
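
RedShiftDataSource differs from the PostgreSQL source mainly in its driver name and optional search_path handling. A sketch of the expected connection dict, with every value a placeholder:

    from dcs_core.integrations.databases.redshift import RedShiftDataSource

    connection = {
        "username": "awsuser",                             # placeholder
        "password": "secret",                              # placeholder
        "host": "example-cluster.redshift.amazonaws.com",  # hypothetical endpoint
        "port": 5439,                                      # Redshift's default port
        "database": "dev",
        "schema": "analytics",  # optional; sets search_path only when provided
    }

    ds = RedShiftDataSource("rs_source", connection)
    ds.connect()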
@@ -0,0 +1,48 @@
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import urllib.parse
+ from typing import Any, Dict
+
+ from snowflake.sqlalchemy import URL
+ from sqlalchemy import create_engine
+
+ from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
+ from dcs_core.core.datasource.sql_datasource import SQLDataSource
+
+
+ class SnowFlakeDataSource(SQLDataSource):
+     def __init__(self, data_source_name: str, data_connection: Dict):
+         super().__init__(data_source_name, data_connection)
+
+     def connect(self) -> Any:
+         """
+         Connect to the data source
+         """
+         try:
+             url = URL(
+                 account=self.data_connection.get("account"),
+                 user=self.data_connection.get("username"),
+                 password=urllib.parse.quote(self.data_connection.get("password")),
+                 database=self.data_connection.get("database"),
+                 schema=self.data_connection.get("schema"),
+                 warehouse=self.data_connection.get("warehouse"),
+                 role=self.data_connection.get("role"),
+             )
+             engine = create_engine(url)
+             self.connection = engine.connect()
+             return self.connection
+         except Exception as e:
+             raise DataChecksDataSourcesConnectionError(
+                 message=f"Failed to connect to Snowflake data source: [{str(e)}]"
+             )
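
For SnowFlakeDataSource the connection dict carries Snowflake-specific fields (account, warehouse, role) alongside the credentials. A sketch with all values as placeholders:

    from dcs_core.integrations.databases.snowflake import SnowFlakeDataSource

    connection = {
        "account": "xy12345.us-east-1",  # hypothetical account locator
        "username": "loader",            # placeholder
        "password": "secret",            # placeholder
        "database": "ANALYTICS",
        "schema": "PUBLIC",
        "warehouse": "COMPUTE_WH",
        "role": "SYSADMIN",
    }

    ds = SnowFlakeDataSource("sf_source", connection)
    ds.connect()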
@@ -0,0 +1,111 @@
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from typing import Any, List, Union
+
+ from pyspark.sql import DataFrame
+ from pyspark.sql.session import SparkSession
+ from pyspark.sql.types import Row
+
+ from dcs_core.core.datasource.sql_datasource import SQLDataSource
+
+
+ class SparkDfCursor:
+     def __init__(self, spark_session: SparkSession):
+         self.spark_session = spark_session
+         self.df: Union[DataFrame, None] = None
+         self.description: Union[tuple[tuple, ...], None] = None
+         self.rowcount: int = -1
+         self.cursor_index: int = -1
+
+     def execute(self, sql: str):
+         self.df = self.spark_session.sql(sqlQuery=sql)
+         self.description = self.convert_spark_df_schema_to_dbapi_description(self.df)
+         self.cursor_index = 0
+
+     def fetchall(self) -> tuple[List, ...]:
+         rows = []
+         spark_rows: list[Row] = self.df.collect()
+         self.rowcount = len(spark_rows)
+         for spark_row in spark_rows:
+             row = self.convert_spark_row_to_dbapi_row(spark_row)
+             rows.append(row)
+         return tuple(rows)
+
+     def fetchmany(self, size: int) -> tuple[List, ...]:
+         rows = []
+         self.rowcount = self.df.count()
+         # Apply offset before limit so repeated calls page through the full result set.
+         spark_rows: list[Row] = self.df.offset(self.cursor_index).limit(size).collect()
+         self.cursor_index += len(spark_rows)
+         for spark_row in spark_rows:
+             row = self.convert_spark_row_to_dbapi_row(spark_row)
+             rows.append(row)
+         return tuple(rows)
+
+     def fetchone(self) -> Union[tuple, None]:
+         spark_rows: list[Row] = self.df.collect()
+         self.rowcount = len(spark_rows)
+         if not spark_rows:
+             # Mirror DB-API cursors, which return None when no row is available.
+             return None
+         spark_row = spark_rows[0]
+         row = self.convert_spark_row_to_dbapi_row(spark_row)
+         return tuple(row)
+
+     @staticmethod
+     def convert_spark_row_to_dbapi_row(spark_row):
+         return [spark_row[field] for field in spark_row.__fields__]
+
+     def close(self):
+         pass
+
+     @staticmethod
+     def convert_spark_df_schema_to_dbapi_description(df) -> tuple[tuple[Any, Any], ...]:
+         return tuple((field.name, type(field.dataType).__name__) for field in df.schema.fields)
+
+
+ class SparkDfConnection:
+     def __init__(self, spark_session: SparkSession):
+         self.spark_session = spark_session
+
+     def cursor(self) -> SparkDfCursor:
+         return SparkDfCursor(self.spark_session)
+
+     def close(self):
+         pass
+
+     def commit(self):
+         pass
+
+     def rollback(self):
+         pass
+
+
+ class SparkDFDataSource(SQLDataSource):
+     def __init__(self, data_source_name: str, data_connection: dict):
+         super().__init__(data_source_name, data_connection)
+         self.spark_session = data_connection.get("spark_session")
+         self.use_sa_text_query = False
+
+     def connect(self):
+         self.connection = SparkDfConnection(self.spark_session)
+         return self.connection
+
+     def close(self):
+         pass
+
+     def fetchone(self, query):
+         cursor = self.connection.cursor()
+         cursor.execute(query)
+         return cursor.fetchone()
+
+     def fetchall(self, query):
+         cursor = self.connection.cursor()
+         cursor.execute(query)
+         return cursor.fetchall()
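
SparkDFDataSource wraps an existing SparkSession in a DB-API-shaped cursor so the SDK can run SQL against temp views. A self-contained sketch, assuming a local PySpark installation; the view name "demo" is illustrative:

    from pyspark.sql import SparkSession

    from dcs_core.integrations.databases.spark_df import SparkDFDataSource

    spark = SparkSession.builder.master("local[1]").appName("dcs-demo").getOrCreate()
    spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"]).createOrReplaceTempView("demo")

    ds = SparkDFDataSource("spark_source", {"spark_session": spark})
    ds.connect()
    print(ds.fetchall("SELECT COUNT(*) FROM demo"))  # ([2],) -- rows come back as lists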