dcs-sdk 1.7.4__py3-none-any.whl → 1.7.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dcs_core/integrations/databases/databricks.py +504 -2
- dcs_sdk/__version__.py +1 -1
- {dcs_sdk-1.7.4.dist-info → dcs_sdk-1.7.5.dist-info}/METADATA +1 -1
- {dcs_sdk-1.7.4.dist-info → dcs_sdk-1.7.5.dist-info}/RECORD +6 -6
- {dcs_sdk-1.7.4.dist-info → dcs_sdk-1.7.5.dist-info}/WHEEL +1 -1
- {dcs_sdk-1.7.4.dist-info → dcs_sdk-1.7.5.dist-info}/entry_points.txt +0 -0
|
@@ -12,12 +12,17 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
|
|
15
|
+
import datetime
|
|
16
|
+
import math
|
|
17
|
+
from decimal import Decimal
|
|
18
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
19
|
+
from uuid import UUID
|
|
16
20
|
|
|
17
|
-
from sqlalchemy import create_engine
|
|
21
|
+
from sqlalchemy import create_engine, text
|
|
18
22
|
from sqlalchemy.engine import URL
|
|
19
23
|
|
|
20
24
|
from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
|
|
25
|
+
from dcs_core.core.common.models.data_source_resource import RawColumnInfo
|
|
21
26
|
from dcs_core.core.datasource.sql_datasource import SQLDataSource
|
|
22
27
|
|
|
23
28
|
|
|
@@ -44,8 +49,505 @@ class DatabricksDataSource(SQLDataSource):
|
|
|
44
49
|
)
|
|
45
50
|
engine = create_engine(url, echo=True)
|
|
46
51
|
self.connection = engine.connect()
|
|
52
|
+
self.schema_name = self.data_connection.get("schema")
|
|
47
53
|
return self.connection
|
|
48
54
|
except Exception as e:
|
|
49
55
|
raise DataChecksDataSourcesConnectionError(
|
|
50
56
|
message=f"Failed to connect to Databricks data source: [{str(e)}]"
|
|
51
57
|
)
|
|
58
|
+
|
|
59
|
+
def quote_column(self, column: str) -> str:
    """
    Quote a column name as a backtick-delimited identifier.

    A backtick inside the name is doubled so that it cannot terminate the
    identifier early and corrupt (or inject into) the surrounding SQL.
    :param column: raw column name
    :return: the column name wrapped in backticks
    """
    escaped = column.replace("`", "``")
    return f"`{escaped}`"
|
|
61
|
+
|
|
62
|
+
def quote_database(self, database: str) -> str:
    """
    Quote a database (catalog) name as a backtick-delimited identifier.

    A backtick inside the name is doubled so that it cannot terminate the
    identifier early and corrupt (or inject into) the surrounding SQL.
    :param database: raw database/catalog name
    :return: the name wrapped in backticks
    """
    escaped = database.replace("`", "``")
    return f"`{escaped}`"
|
|
64
|
+
|
|
65
|
+
def qualified_table_name(self, table_name: str) -> str:
    """
    Build the backtick-quoted table reference used in generated SQL.

    The table is prefixed with ``self.schema_name`` when that attribute is
    truthy. Backticks inside either part are doubled so each part remains a
    single delimited identifier.
    :param table_name: unqualified table name
    :return: ```schema`.`table``` when a schema is configured, else ```table```
    """

    def _delimit(identifier: str) -> str:
        return "`" + identifier.replace("`", "``") + "`"

    if self.schema_name:
        return f"{_delimit(self.schema_name)}.{_delimit(table_name)}"
    return _delimit(table_name)
|
|
69
|
+
|
|
70
|
+
def query_get_database_version(self, database_version_query: Optional[str] = None) -> Optional[str]:
    """
    Get the database version.

    :param database_version_query: optional query to run instead of the default ``SELECT version()``
    :return: first column of the row returned by ``self.fetchone``, or None when no row comes back
    """
    query = database_version_query or "SELECT version()"
    result = self.fetchone(query)
    return result[0] if result else None
|
|
80
|
+
|
|
81
|
+
def query_get_table_indexes(self, table: str, schema: str | None = None) -> dict[str, dict]:
|
|
82
|
+
"""
|
|
83
|
+
Get index information for a table.
|
|
84
|
+
For Databricks, this primarily returns Primary Key information as traditional indexes are not exposed identically.
|
|
85
|
+
:param table: Table name
|
|
86
|
+
:param schema: Optional schema name
|
|
87
|
+
:return: Dictionary with index details
|
|
88
|
+
"""
|
|
89
|
+
schema = schema or self.schema_name
|
|
90
|
+
database = self.data_connection.get("catalog") or "hive_metastore"
|
|
91
|
+
quoted_database = self.quote_database(database)
|
|
92
|
+
|
|
93
|
+
# Databricks Unity Catalog stores constraints in information_schema
|
|
94
|
+
# We will fetch Primary Key info and structure it as an "index"
|
|
95
|
+
query = f"""
|
|
96
|
+
SELECT
|
|
97
|
+
tc.constraint_name,
|
|
98
|
+
kcu.column_name,
|
|
99
|
+
kcu.ordinal_position
|
|
100
|
+
FROM {quoted_database}.information_schema.table_constraints AS tc
|
|
101
|
+
JOIN {quoted_database}.information_schema.key_column_usage AS kcu
|
|
102
|
+
ON tc.constraint_name = kcu.constraint_name
|
|
103
|
+
AND tc.table_schema = kcu.table_schema
|
|
104
|
+
WHERE tc.table_schema = '{schema}'
|
|
105
|
+
AND tc.table_name = '{table}'
|
|
106
|
+
AND tc.constraint_type = 'PRIMARY KEY'
|
|
107
|
+
ORDER BY kcu.ordinal_position
|
|
108
|
+
"""
|
|
109
|
+
|
|
110
|
+
try:
|
|
111
|
+
rows = self.fetchall(query)
|
|
112
|
+
except Exception as e:
|
|
113
|
+
# Fallback or silent failure if table doesn't exist or info schema not accessible
|
|
114
|
+
return {}
|
|
115
|
+
|
|
116
|
+
indexes = {}
|
|
117
|
+
if rows:
|
|
118
|
+
# In Databricks, the PK constraint name acts as the index name for this purpose
|
|
119
|
+
constraint_name = rows[0][0]
|
|
120
|
+
indexes[constraint_name] = {"columns": [], "index_type": "PRIMARY KEY", "is_primary_key": True}
|
|
121
|
+
|
|
122
|
+
for row in rows:
|
|
123
|
+
col_name = row[1]
|
|
124
|
+
ordinal = row[2]
|
|
125
|
+
indexes[constraint_name]["columns"].append({"column_name": col_name, "column_order": ordinal})
|
|
126
|
+
|
|
127
|
+
return indexes
|
|
128
|
+
|
|
129
|
+
def get_table_foreign_key_info(self, table_name: str, schema: str | None = None) -> list[dict]:
|
|
130
|
+
"""
|
|
131
|
+
Get foreign key information for a table.
|
|
132
|
+
:param table_name: Table name
|
|
133
|
+
:param schema: Optional schema name
|
|
134
|
+
:return: List of dicts with FK details
|
|
135
|
+
"""
|
|
136
|
+
schema = schema or self.schema_name
|
|
137
|
+
database = self.data_connection.get("catalog") or "hive_metastore"
|
|
138
|
+
quoted_database = self.quote_database(database)
|
|
139
|
+
|
|
140
|
+
# Standard ISO SQL query for Foreign Keys using information_schema
|
|
141
|
+
# Works for Unity Catalog
|
|
142
|
+
query = f"""
|
|
143
|
+
SELECT
|
|
144
|
+
tc.constraint_name,
|
|
145
|
+
tc.table_name,
|
|
146
|
+
kcu.column_name AS fk_column,
|
|
147
|
+
rel_kcu.table_name AS referenced_table,
|
|
148
|
+
rel_kcu.column_name AS referenced_column
|
|
149
|
+
FROM {quoted_database}.information_schema.table_constraints tc
|
|
150
|
+
JOIN {quoted_database}.information_schema.key_column_usage kcu
|
|
151
|
+
ON tc.constraint_name = kcu.constraint_name
|
|
152
|
+
AND tc.table_schema = kcu.table_schema
|
|
153
|
+
JOIN {quoted_database}.information_schema.referential_constraints rc
|
|
154
|
+
ON tc.constraint_name = rc.constraint_name
|
|
155
|
+
AND tc.table_schema = rc.constraint_schema
|
|
156
|
+
JOIN {quoted_database}.information_schema.key_column_usage rel_kcu
|
|
157
|
+
ON rc.unique_constraint_name = rel_kcu.constraint_name
|
|
158
|
+
AND rc.unique_constraint_schema = rel_kcu.table_schema
|
|
159
|
+
AND kcu.ordinal_position = rel_kcu.ordinal_position
|
|
160
|
+
WHERE tc.constraint_type = 'FOREIGN KEY'
|
|
161
|
+
AND tc.table_name = '{table_name}'
|
|
162
|
+
AND tc.table_schema = '{schema}'
|
|
163
|
+
"""
|
|
164
|
+
|
|
165
|
+
try:
|
|
166
|
+
rows = self.fetchall(query)
|
|
167
|
+
except Exception:
|
|
168
|
+
return []
|
|
169
|
+
|
|
170
|
+
fk_info = []
|
|
171
|
+
for row in rows:
|
|
172
|
+
fk_info.append(
|
|
173
|
+
{
|
|
174
|
+
"constraint_name": row[0],
|
|
175
|
+
"table_name": row[1],
|
|
176
|
+
"fk_column": row[2],
|
|
177
|
+
"referenced_table": row[3],
|
|
178
|
+
"referenced_column": row[4],
|
|
179
|
+
}
|
|
180
|
+
)
|
|
181
|
+
|
|
182
|
+
return fk_info
|
|
183
|
+
|
|
184
|
+
def query_get_table_names(
|
|
185
|
+
self,
|
|
186
|
+
schema: str | None = None,
|
|
187
|
+
with_view: bool = False,
|
|
188
|
+
) -> dict:
|
|
189
|
+
"""
|
|
190
|
+
Get the list of tables in the database.
|
|
191
|
+
:param schema: optional schema name
|
|
192
|
+
:param with_view: whether to include views
|
|
193
|
+
:return: dictionary with table names and optionally view names
|
|
194
|
+
"""
|
|
195
|
+
schema = schema or self.schema_name
|
|
196
|
+
database = self.data_connection.get("catalog") or "hive_metastore"
|
|
197
|
+
quoted_database = self.quote_database(database)
|
|
198
|
+
|
|
199
|
+
if with_view:
|
|
200
|
+
table_type_condition = "table_type IN ('MANAGED', 'EXTERNAL', 'VIEW')"
|
|
201
|
+
else:
|
|
202
|
+
table_type_condition = "table_type IN ('MANAGED', 'EXTERNAL')"
|
|
203
|
+
|
|
204
|
+
query = (
|
|
205
|
+
f"SELECT table_name, table_type FROM {quoted_database}.information_schema.tables "
|
|
206
|
+
f"WHERE table_schema = '{schema}' "
|
|
207
|
+
)
|
|
208
|
+
if not with_view:
|
|
209
|
+
query += " AND table_type != 'VIEW'"
|
|
210
|
+
|
|
211
|
+
rows = self.fetchall(query)
|
|
212
|
+
|
|
213
|
+
if with_view:
|
|
214
|
+
result = {"table": [], "view": []}
|
|
215
|
+
if rows:
|
|
216
|
+
for row in rows:
|
|
217
|
+
table_name = row[0]
|
|
218
|
+
table_type = row[1]
|
|
219
|
+
if "VIEW" in table_type:
|
|
220
|
+
result["view"].append(table_name)
|
|
221
|
+
else:
|
|
222
|
+
result["table"].append(table_name)
|
|
223
|
+
else:
|
|
224
|
+
result = {"table": []}
|
|
225
|
+
if rows:
|
|
226
|
+
result["table"] = [row[0] for row in rows]
|
|
227
|
+
|
|
228
|
+
return result
|
|
229
|
+
|
|
230
|
+
def query_get_table_columns(
    self,
    table: str,
    schema: str | None = None,
) -> dict[str, RawColumnInfo]:
    """
    Get the schema of a table.

    :param table: table name
    :param schema: optional schema name; falls back to ``self.schema_name``
    :return: dict mapping each column name to a RawColumnInfo built from the
        catalog's information_schema.columns row for that column
    :raises RuntimeError: when the lookup returns no rows
    """
    schema = schema or self.schema_name
    database = self.data_connection.get("catalog") or "hive_metastore"
    quoted_database = self.quote_database(database)

    query = (
        f"SELECT column_name, data_type, datetime_precision, numeric_precision, numeric_scale, "
        f"character_maximum_length "
        f"FROM {quoted_database}.information_schema.columns "
        f"WHERE table_name = '{table}' AND table_schema = '{schema}'"
    )
    rows = self.fetchall(query)

    if not rows:
        raise RuntimeError(f"{table}: Table, {schema}: Schema, does not exist, or has no columns")

    # Positions follow the SELECT list above; values are read through self.safe_get.
    return {
        r[0]: RawColumnInfo(
            column_name=self.safe_get(r, 0),
            data_type=self.safe_get(r, 1),
            datetime_precision=self.safe_get(r, 2),
            numeric_precision=self.safe_get(r, 3),
            numeric_scale=self.safe_get(r, 4),
            character_maximum_length=self.safe_get(r, 5),
        )
        for r in rows
    }
|
|
267
|
+
|
|
268
|
+
def fetch_rows(
    self,
    query: str,
    limit: int = 1,
    with_column_names: bool = False,
    complete_query: Optional[str] = None,
) -> Tuple[List, Optional[List[str]]]:
    """
    Run a query and return at most ``limit`` of its rows.

    :param query: SQL query, wrapped in a ``SELECT * ... LIMIT`` subquery before execution.
    :param limit: Maximum number of rows to fetch.
    :param with_column_names: Whether to also return the result's column names.
    :param complete_query: Statement executed verbatim instead of the wrapped ``query`` when given.
    :return: Tuple of (rows, column_names or None)
    """
    if complete_query:
        statement = complete_query
    else:
        statement = f"SELECT * FROM ({query}) AS subquery LIMIT {limit}"

    cursor = self.connection.execute(text(statement))
    # fetchmany still caps the row count when complete_query carries no LIMIT of its own.
    fetched = cursor.fetchmany(limit)
    header = list(cursor.keys()) if with_column_names else None
    return fetched, header
|
|
293
|
+
|
|
294
|
+
def fetch_sample_values_from_database(
    self,
    table_name: str,
    column_names: list[str],
    limit: int = 5,
) -> Tuple[List[Tuple], List[str]]:
    """
    Fetch sample rows for specific columns from the given table.

    :param table_name: The name of the table; qualified through ``qualified_table_name``.
    :param column_names: Columns to fetch; the single entry ``"*"`` selects every column.
    :param limit: Number of rows to fetch.
    :return: Tuple of (rows, column names reported by the result).
    :raises ValueError: when ``column_names`` is empty.
    """
    # Validate first so bad input fails before any other work is done.
    if not column_names:
        raise ValueError("At least one column name must be provided")

    qualified_table = self.qualified_table_name(table_name)

    if len(column_names) == 1 and column_names[0] == "*":
        selected = "*"
    else:
        selected = ", ".join(self.quote_column(col) for col in column_names)
    query = f"SELECT {selected} FROM {qualified_table} LIMIT {limit}"

    result = self.connection.execute(text(query))
    # Kept separate from the column_names parameter, which the original overwrote.
    result_columns = list(result.keys())
    rows = result.fetchall()
    return rows, result_columns
|
|
323
|
+
|
|
324
|
+
def build_table_metrics_query(
    self,
    table_name: str,
    column_info: list[dict],
    additional_queries: Optional[List[str]] = None,
) -> list[dict]:
    """
    Compute per-column profiling metrics and value distributions for a table.

    One aggregate query collects distinct, duplicate and null counts for every
    column, plus min/max/average for numeric columns, the maximum character
    length for text columns and min/max for timestamp, date and boolean columns.
    Numeric columns whose min and max differ then get a 20-bucket histogram;
    non-numeric columns with at most 20 distinct values get a value-frequency
    distribution.

    :param table_name: unqualified table name; qualified through ``qualified_table_name``
    :param column_info: list of dicts carrying ``column_name`` and ``data_type`` keys
    :param additional_queries: optional extra select expressions appended to the aggregate query
    :return: list of ``{"column_name": ..., "metrics": ...}`` dicts, where ``metrics`` holds
        ``general_data``, ``is_dtype_numeric`` and ``distribution_data``; empty list when
        ``column_info`` is empty
    """
    if not column_info:
        return []

    integer_types = ("int", "integer", "bigint", "long", "smallint", "tinyint")
    numeric_types = integer_types + ("decimal", "numeric", "float", "double")
    text_types = ("string", "varchar", "char", "text")
    min_max_types = ("timestamp", "date", "boolean")
    bucket_count = 20

    def _normalize_metrics(value):
        """
        Safely normalizes DB metric values into JSON-serializable Python types.
        """
        if value is None:
            return None
        if isinstance(value, Decimal):
            return float(value)
        if isinstance(value, (int, float, bool)):
            return value
        if isinstance(value, (datetime.datetime, datetime.date)):
            return value.isoformat()
        if isinstance(value, UUID):
            return str(value)
        if isinstance(value, list):
            return [_normalize_metrics(v) for v in value]
        if isinstance(value, dict):
            return {k: _normalize_metrics(v) for k, v in value.items()}
        return str(value)

    query_parts = []
    # alias -> (column, metric) for every alias generated below. Used to hand each
    # result key to exactly one column, even when one column name prefixes another.
    alias_owner = {}
    # First declared data type per column name.
    dtype_by_column = {}

    def _add_metric(column, metric, expression):
        alias = f"{column}_{metric}"
        alias_owner.setdefault(alias, (column, metric))
        query_parts.append(f"{expression} AS `{alias}`")

    for col in column_info:
        name = col["column_name"]
        dtype = col["data_type"].lower()
        dtype_by_column.setdefault(name, dtype)
        quoted = self.quote_column(name)

        _add_metric(name, "distinct", f"COUNT(DISTINCT {quoted})")
        _add_metric(name, "duplicate", f"COUNT(*) - COUNT(DISTINCT {quoted})")
        _add_metric(name, "is_null", f"SUM(CASE WHEN {quoted} IS NULL THEN 1 ELSE 0 END)")

        if dtype in numeric_types:
            _add_metric(name, "min", f"MIN({quoted})")
            _add_metric(name, "max", f"MAX({quoted})")
            _add_metric(name, "average", f"AVG({quoted})")
        elif dtype in text_types:
            # Databricks uses length() or char_length()
            _add_metric(name, "max_character_length", f"MAX(length({quoted}))")
        elif dtype in min_max_types:
            _add_metric(name, "min", f"MIN({quoted})")
            _add_metric(name, "max", f"MAX({quoted})")

    if additional_queries:
        query_parts.extend(additional_queries)

    qualified_table = self.qualified_table_name(table_name)
    joined_parts = ",\n ".join(query_parts)
    query = f"SELECT\n {joined_parts}\nFROM {qualified_table};"

    result = self.connection.execute(text(query))
    # The aggregate query has no GROUP BY, so the first row carries every metric.
    row = dict(next(iter(result))._mapping)

    column_wise = []
    for col in column_info:
        name = col["column_name"]
        prefix = f"{name}_"
        col_metrics = {}

        for key, value in row.items():
            owner = alias_owner.get(key)
            if owner is not None:
                # Generated alias: only its own column may claim it.
                if owner[0] != name:
                    continue
                metric_name = owner[1]
            elif key.startswith(prefix):
                # Alias from additional_queries: attributed by column-name prefix.
                metric_name = key[len(prefix) :]
            else:
                continue
            col_metrics[metric_name] = _normalize_metrics(value)

        column_wise.append({"column_name": name, "metrics": col_metrics})

    for col_data in column_wise:
        metrics = col_data["metrics"]
        col_name = col_data["column_name"]
        dtype = dtype_by_column[col_name]
        quoted = self.quote_column(col_name)
        is_dtype_numeric = dtype in numeric_types

        if is_dtype_numeric:
            # Numeric columns only ever get the bucketed histogram.
            col_min = metrics.get("min")
            col_max = metrics.get("max")

            if col_min is not None and col_max is not None and col_min != col_max:
                bucket_size = (col_max - col_min) / bucket_count
                # The final edge is col_max itself so float drift cannot move it.
                edges = [col_min + i * bucket_size for i in range(bucket_count)] + [col_max]

                bucket_queries = []
                for i in range(bucket_count):
                    # The last bucket is closed on the right; otherwise rows equal to
                    # the column maximum would fall outside every bucket.
                    upper_op = "<=" if i == bucket_count - 1 else "<"
                    bucket_queries.append(
                        f"SUM(CASE WHEN {quoted} >= {edges[i]} AND {quoted} {upper_op} {edges[i + 1]} "
                        f"THEN 1 ELSE 0 END) AS `bucket_{i}`"
                    )

                bucket_sql = f"SELECT {', '.join(bucket_queries)} FROM {qualified_table}"

                try:
                    bucket_result = self.connection.execute(text(bucket_sql)).fetchone()
                    distribution = []

                    for i in range(bucket_count):
                        if dtype in integer_types:
                            start = math.floor(edges[i])
                            end = math.ceil(edges[i + 1])
                        else:
                            start = round(edges[i], 2)
                            end = round(edges[i + 1], 2)

                        distribution.append(
                            {
                                "col_val": f"{start} - {end}",
                                "count": bucket_result[i],
                            }
                        )

                    metrics["distribution_graph"] = distribution

                except Exception as e:
                    # Best effort: the column keeps its scalar metrics without a histogram.
                    print(f"Failed to generate numeric distribution for {col_name}: {e}")
        else:
            distinct_count = metrics.get("distinct")
            if isinstance(distinct_count, (int, float)) and distinct_count <= 20:
                dist_query = (
                    f"SELECT {quoted}, COUNT(*) "
                    f"FROM {qualified_table} GROUP BY {quoted} ORDER BY COUNT(*) DESC"
                )

                try:
                    dist_result = self.connection.execute(text(dist_query)).fetchall()
                    metrics["distribution_graph"] = [
                        {"col_val": _normalize_metrics(r[0]), "count": r[1]} for r in dist_result
                    ]
                except Exception as e:
                    # Best effort: the column keeps its scalar metrics without a distribution.
                    print(f"Failed to generate distribution graph for column {col_name}: {e}")

        # Formatting as per existing pattern
        col_data["metrics"] = {
            "general_data": {key: value for key, value in metrics.items() if key != "distribution_graph"},
            "is_dtype_numeric": is_dtype_numeric,
            "distribution_data": metrics.get("distribution_graph", []),
        }

    return column_wise
|
dcs_sdk/__version__.py
CHANGED
|
@@ -102,7 +102,7 @@ dcs_core/integrations/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4
|
|
|
102
102
|
dcs_core/integrations/databases/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
|
|
103
103
|
dcs_core/integrations/databases/azure_blob.py,sha256=XF-B790XA1sGyhgIWUPE-BRHR_-ctA31IjI66pNL6eM,4231
|
|
104
104
|
dcs_core/integrations/databases/bigquery.py,sha256=26RuypLMmiARZIWkV_mxtnNL2yCs94YWerSGH5Nr10Q,7337
|
|
105
|
-
dcs_core/integrations/databases/databricks.py,sha256=
|
|
105
|
+
dcs_core/integrations/databases/databricks.py,sha256=r27uY_XtlJdbgeQ57ABbcfV6tU66LnbWyQucStayAVo,20768
|
|
106
106
|
dcs_core/integrations/databases/db2.py,sha256=hNGivvYCitp88ouZlCxp7iRQ-vnPiK1kL8x85NyGotk,26492
|
|
107
107
|
dcs_core/integrations/databases/duck_db.py,sha256=X4FRSsobOFCIi329cYofQsMd_fkRI4KxC8BIrtiDz4g,5531
|
|
108
108
|
dcs_core/integrations/databases/elasticsearch.py,sha256=6CTGs1WGrfgdDRNVt9DpOB0_z_znT6YoVj10E1WY-wQ,2152
|
|
@@ -134,7 +134,7 @@ dcs_core/report/static/index.js,sha256=p4wvku-zlXi0y4gWeSzV1amY0s4mjtUq2QsezARLV
|
|
|
134
134
|
dcs_core/report/static/index.js.LICENSE.txt,sha256=bBDZBJVEDrqjCi7sfoF8CchjFn3hdcbNkP7ub7kbcXQ,201041
|
|
135
135
|
dcs_sdk/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
|
|
136
136
|
dcs_sdk/__main__.py,sha256=Qn8stIaQGrdLjHQ-H7xO0T-brtq5RWZoWU9QvqoarV8,683
|
|
137
|
-
dcs_sdk/__version__.py,sha256=
|
|
137
|
+
dcs_sdk/__version__.py,sha256=LyQoYvx5NK-5Cr6vd2YXpSX8fYqK7p67ywpDOwSpWEI,633
|
|
138
138
|
dcs_sdk/cli/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
|
|
139
139
|
dcs_sdk/cli/cli.py,sha256=jaO52UrMWLafcF_yhqllPkmYSTuO2sksFi30fYFdAB4,4406
|
|
140
140
|
dcs_sdk/sdk/__init__.py,sha256=skrZcgWWJBL6NXTUERywJ3qRJRemgpDXyW7lPg1FJk8,2107
|
|
@@ -156,7 +156,7 @@ dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py,sha256=puAWP
|
|
|
156
156
|
dcs_sdk/sdk/utils/table.py,sha256=X8HxdYTWyx_oVrBWPsXlmA-xJKXXDBW9RrhlWNqA1As,18224
|
|
157
157
|
dcs_sdk/sdk/utils/themes.py,sha256=Meo2Yldv4uyPpEqI7qdA28Aa6sxtwUU1dLKKm4QavjM,1403
|
|
158
158
|
dcs_sdk/sdk/utils/utils.py,sha256=F7BUnLgJFyb7mc_9cw1hyk4lutsg6ctJuBQRrfyBv0g,16874
|
|
159
|
-
dcs_sdk-1.7.
|
|
160
|
-
dcs_sdk-1.7.
|
|
161
|
-
dcs_sdk-1.7.
|
|
162
|
-
dcs_sdk-1.7.
|
|
159
|
+
dcs_sdk-1.7.5.dist-info/METADATA,sha256=VLsUM65db7o2L5J6UpwF5yU2_bz3Pmx13sYQOZknY7U,7652
|
|
160
|
+
dcs_sdk-1.7.5.dist-info/WHEEL,sha256=kJCRJT_g0adfAJzTx2GUMmS80rTJIVHRCfG0DQgLq3o,88
|
|
161
|
+
dcs_sdk-1.7.5.dist-info/entry_points.txt,sha256=XhODNz7UccgPOyklXgp7pIfTTXArd6-V0mImjhnhwto,80
|
|
162
|
+
dcs_sdk-1.7.5.dist-info/RECORD,,
|
|
File without changes
|