dcs-sdk 1.6.5__py3-none-any.whl → 1.6.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dcs_core/integrations/databases/mssql.py +156 -6
- dcs_core/integrations/databases/postgres.py +90 -2
- dcs_sdk/__version__.py +1 -1
- {dcs_sdk-1.6.5.dist-info → dcs_sdk-1.6.6.dist-info}/METADATA +2 -2
- {dcs_sdk-1.6.5.dist-info → dcs_sdk-1.6.6.dist-info}/RECORD +7 -7
- {dcs_sdk-1.6.5.dist-info → dcs_sdk-1.6.6.dist-info}/WHEEL +0 -0
- {dcs_sdk-1.6.5.dist-info → dcs_sdk-1.6.6.dist-info}/entry_points.txt +0 -0
|
@@ -13,6 +13,7 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
import datetime
|
|
16
|
+
import math
|
|
16
17
|
from decimal import Decimal
|
|
17
18
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
18
19
|
from uuid import UUID
|
|
@@ -706,13 +707,15 @@ class MssqlDataSource(SQLDataSource):
|
|
|
706
707
|
cursor = self.connection.cursor()
|
|
707
708
|
try:
|
|
708
709
|
cursor.execute(query)
|
|
709
|
-
|
|
710
|
-
|
|
710
|
+
if cursor.description:
|
|
711
|
+
columns = [column[0] for column in cursor.description]
|
|
712
|
+
result_row = cursor.fetchone()
|
|
713
|
+
row = dict(zip(columns, result_row)) if result_row else {}
|
|
714
|
+
else:
|
|
715
|
+
row = {}
|
|
711
716
|
finally:
|
|
712
717
|
cursor.close()
|
|
713
718
|
|
|
714
|
-
row = dict(zip(columns, result_row))
|
|
715
|
-
|
|
716
719
|
def _normalize_metrics(value):
|
|
717
720
|
"""Safely normalize DB metric values for JSON serialization."""
|
|
718
721
|
if value is None:
|
|
@@ -737,11 +740,158 @@ class MssqlDataSource(SQLDataSource):
|
|
|
737
740
|
col_metrics = {}
|
|
738
741
|
|
|
739
742
|
for key, value in row.items():
|
|
740
|
-
|
|
741
|
-
|
|
743
|
+
clean_key = key.replace("[", "").replace("]", "")
|
|
744
|
+
if clean_key.startswith(f"{name}_"):
|
|
745
|
+
metric_name = clean_key[len(name) + 1 :]
|
|
742
746
|
col_metrics[metric_name] = _normalize_metrics(value)
|
|
743
747
|
|
|
744
748
|
column_wise.append({"column_name": name, "metrics": col_metrics})
|
|
749
|
+
|
|
750
|
+
for col_data in column_wise:
|
|
751
|
+
metrics = col_data["metrics"]
|
|
752
|
+
distinct_count = metrics.get("distinct")
|
|
753
|
+
col_name = col_data["column_name"]
|
|
754
|
+
|
|
755
|
+
dtype = next(c["data_type"].lower() for c in column_info if c["column_name"] == col_name)
|
|
756
|
+
|
|
757
|
+
quoted = self.quote_column(col_name)
|
|
758
|
+
|
|
759
|
+
is_dtype_numeric = (
|
|
760
|
+
True
|
|
761
|
+
if dtype
|
|
762
|
+
in (
|
|
763
|
+
"int",
|
|
764
|
+
"integer",
|
|
765
|
+
"bigint",
|
|
766
|
+
"smallint",
|
|
767
|
+
"tinyint",
|
|
768
|
+
"decimal",
|
|
769
|
+
"numeric",
|
|
770
|
+
"float",
|
|
771
|
+
"real",
|
|
772
|
+
"money",
|
|
773
|
+
"smallmoney",
|
|
774
|
+
)
|
|
775
|
+
else False
|
|
776
|
+
)
|
|
777
|
+
|
|
778
|
+
if is_dtype_numeric:
|
|
779
|
+
col_min = metrics.get("min")
|
|
780
|
+
col_max = metrics.get("max")
|
|
781
|
+
|
|
782
|
+
if col_min is not None and col_max is not None and col_min != col_max:
|
|
783
|
+
bucket_count = 20
|
|
784
|
+
bucket_size = (float(col_max) - float(col_min)) / bucket_count
|
|
785
|
+
|
|
786
|
+
bucket_queries = []
|
|
787
|
+
for i in range(bucket_count):
|
|
788
|
+
start = float(col_min) + i * bucket_size
|
|
789
|
+
end = float(col_min) + (i + 1) * bucket_size
|
|
790
|
+
|
|
791
|
+
bucket_queries.append(
|
|
792
|
+
f"SUM(CASE WHEN {quoted} >= {start} AND {quoted} < {end} THEN 1 ELSE 0 END) AS bucket_{i}"
|
|
793
|
+
)
|
|
794
|
+
|
|
795
|
+
bucket_sql = f"SELECT {', '.join(bucket_queries)} FROM {qualified_table}"
|
|
796
|
+
|
|
797
|
+
try:
|
|
798
|
+
bucket_result = self.connection.execute(text(bucket_sql)).fetchone()
|
|
799
|
+
distribution = []
|
|
800
|
+
|
|
801
|
+
for i in range(bucket_count):
|
|
802
|
+
start_raw = float(col_min) + i * bucket_size
|
|
803
|
+
end_raw = float(col_min) + (i + 1) * bucket_size
|
|
804
|
+
|
|
805
|
+
if dtype in ("int", "integer", "bigint", "smallint", "tinyint"):
|
|
806
|
+
start = math.floor(start_raw)
|
|
807
|
+
end = math.ceil(end_raw)
|
|
808
|
+
else:
|
|
809
|
+
start = round(start_raw, 2)
|
|
810
|
+
end = round(end_raw, 2)
|
|
811
|
+
|
|
812
|
+
count = bucket_result[i] if bucket_result and bucket_result[i] is not None else 0
|
|
813
|
+
|
|
814
|
+
distribution.append(
|
|
815
|
+
{
|
|
816
|
+
"col_val": f"{start} - {end}",
|
|
817
|
+
"count": count,
|
|
818
|
+
}
|
|
819
|
+
)
|
|
820
|
+
|
|
821
|
+
metrics["distribution_graph"] = distribution
|
|
822
|
+
|
|
823
|
+
except Exception as e:
|
|
824
|
+
print(f"Failed to generate numeric distribution for {col_name}: {e}")
|
|
825
|
+
|
|
826
|
+
continue
|
|
827
|
+
|
|
828
|
+
if isinstance(distinct_count, (int, float)) and distinct_count <= 20:
|
|
829
|
+
if dtype in ("text", "ntext", "xml"):
|
|
830
|
+
group_expr = f"CAST({quoted} AS NVARCHAR(MAX))"
|
|
831
|
+
else:
|
|
832
|
+
group_expr = quoted
|
|
833
|
+
|
|
834
|
+
dist_query = (
|
|
835
|
+
f"SELECT {group_expr}, COUNT(*) "
|
|
836
|
+
f"FROM {qualified_table} GROUP BY {group_expr} ORDER BY COUNT(*) DESC"
|
|
837
|
+
)
|
|
838
|
+
|
|
839
|
+
try:
|
|
840
|
+
dist_cursor = self.connection.cursor()
|
|
841
|
+
dist_cursor.execute(dist_query)
|
|
842
|
+
dist_result = dist_cursor.fetchall()
|
|
843
|
+
dist_cursor.close()
|
|
844
|
+
|
|
845
|
+
distribution = []
|
|
846
|
+
|
|
847
|
+
for r in dist_result:
|
|
848
|
+
val = _normalize_metrics(r[0])
|
|
849
|
+
distribution.append(
|
|
850
|
+
{
|
|
851
|
+
"col_val": val,
|
|
852
|
+
"count": r[1],
|
|
853
|
+
}
|
|
854
|
+
)
|
|
855
|
+
|
|
856
|
+
metrics["distribution_graph"] = distribution
|
|
857
|
+
|
|
858
|
+
except Exception as e:
|
|
859
|
+
print(f"Failed to generate distribution graph for column {col_name}: {e}")
|
|
860
|
+
|
|
861
|
+
for col_data in column_wise:
|
|
862
|
+
metrics = col_data["metrics"]
|
|
863
|
+
distinct_count = metrics.get("distinct")
|
|
864
|
+
col_name = col_data["column_name"]
|
|
865
|
+
dtype = next(c["data_type"].lower() for c in column_info if c["column_name"] == col_name)
|
|
866
|
+
|
|
867
|
+
quoted = self.quote_column(col_name)
|
|
868
|
+
|
|
869
|
+
is_dtype_numeric = (
|
|
870
|
+
True
|
|
871
|
+
if dtype
|
|
872
|
+
in (
|
|
873
|
+
"int",
|
|
874
|
+
"integer",
|
|
875
|
+
"bigint",
|
|
876
|
+
"smallint",
|
|
877
|
+
"tinyint",
|
|
878
|
+
"decimal",
|
|
879
|
+
"numeric",
|
|
880
|
+
"float",
|
|
881
|
+
"real",
|
|
882
|
+
"money",
|
|
883
|
+
"smallmoney",
|
|
884
|
+
)
|
|
885
|
+
else False
|
|
886
|
+
)
|
|
887
|
+
|
|
888
|
+
formatted_metrics_data = {
|
|
889
|
+
"general_data": {key: value for key, value in metrics.items() if key != "distribution_graph"},
|
|
890
|
+
"is_dtype_numeric": is_dtype_numeric,
|
|
891
|
+
"distribution_data": metrics.get("distribution_graph", []),
|
|
892
|
+
}
|
|
893
|
+
col_data["metrics"] = formatted_metrics_data
|
|
894
|
+
|
|
745
895
|
return column_wise
|
|
746
896
|
|
|
747
897
|
def fetch_sample_values_from_database(
|
|
@@ -13,6 +13,7 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
import datetime
|
|
16
|
+
import math
|
|
16
17
|
from decimal import Decimal
|
|
17
18
|
from typing import Any, Dict, List, Optional, Tuple
|
|
18
19
|
from uuid import UUID
|
|
@@ -411,9 +412,73 @@ class PostgresDataSource(SQLDataSource):
|
|
|
411
412
|
col_name = col_data["column_name"]
|
|
412
413
|
dtype = next(c["data_type"].lower() for c in column_info if c["column_name"] == col_name)
|
|
413
414
|
|
|
414
|
-
|
|
415
|
-
|
|
415
|
+
quoted = self.quote_column(col_name)
|
|
416
|
+
|
|
417
|
+
is_dtype_numeric = (
|
|
418
|
+
True
|
|
419
|
+
if dtype
|
|
420
|
+
in (
|
|
421
|
+
"int",
|
|
422
|
+
"integer",
|
|
423
|
+
"bigint",
|
|
424
|
+
"smallint",
|
|
425
|
+
"decimal",
|
|
426
|
+
"numeric",
|
|
427
|
+
"float",
|
|
428
|
+
"double",
|
|
429
|
+
)
|
|
430
|
+
else False
|
|
431
|
+
)
|
|
432
|
+
|
|
433
|
+
if is_dtype_numeric:
|
|
434
|
+
col_min = metrics.get("min")
|
|
435
|
+
col_max = metrics.get("max")
|
|
436
|
+
|
|
437
|
+
if col_min is not None and col_max is not None and col_min != col_max:
|
|
438
|
+
bucket_count = 20
|
|
439
|
+
bucket_size = (col_max - col_min) / bucket_count
|
|
440
|
+
|
|
441
|
+
bucket_queries = []
|
|
442
|
+
for i in range(bucket_count):
|
|
443
|
+
start = col_min + i * bucket_size
|
|
444
|
+
end = col_min + (i + 1) * bucket_size
|
|
445
|
+
|
|
446
|
+
bucket_queries.append(
|
|
447
|
+
f"SUM(CASE WHEN {quoted} >= {start} AND {quoted} < {end} THEN 1 ELSE 0 END) AS bucket_{i}"
|
|
448
|
+
)
|
|
449
|
+
|
|
450
|
+
bucket_sql = f"SELECT {', '.join(bucket_queries)} FROM {qualified_table}"
|
|
451
|
+
|
|
452
|
+
try:
|
|
453
|
+
bucket_result = self.connection.execute(text(bucket_sql)).fetchone()
|
|
454
|
+
distribution = []
|
|
455
|
+
|
|
456
|
+
for i in range(bucket_count):
|
|
457
|
+
start_raw = col_min + i * bucket_size
|
|
458
|
+
end_raw = col_min + (i + 1) * bucket_size
|
|
459
|
+
if dtype in ("int", "integer", "bigint", "smallint"):
|
|
460
|
+
start = math.floor(start_raw)
|
|
461
|
+
end = math.ceil(end_raw)
|
|
462
|
+
else:
|
|
463
|
+
start = round(start_raw, 2)
|
|
464
|
+
end = round(end_raw, 2)
|
|
465
|
+
count = bucket_result[i]
|
|
466
|
+
|
|
467
|
+
distribution.append(
|
|
468
|
+
{
|
|
469
|
+
"col_val": f"{start} - {end}",
|
|
470
|
+
"count": count,
|
|
471
|
+
}
|
|
472
|
+
)
|
|
416
473
|
|
|
474
|
+
metrics["distribution_graph"] = distribution
|
|
475
|
+
|
|
476
|
+
except Exception as e:
|
|
477
|
+
print(f"Failed to generate numeric distribution for {col_name}: {e}")
|
|
478
|
+
|
|
479
|
+
continue
|
|
480
|
+
|
|
481
|
+
if isinstance(distinct_count, (int, float)) and distinct_count <= 20:
|
|
417
482
|
if dtype in ("json", "jsonb"):
|
|
418
483
|
group_expr = f"{quoted}::text"
|
|
419
484
|
else:
|
|
@@ -444,8 +509,31 @@ class PostgresDataSource(SQLDataSource):
|
|
|
444
509
|
|
|
445
510
|
for col_data in column_wise:
|
|
446
511
|
metrics = col_data["metrics"]
|
|
512
|
+
distinct_count = metrics.get("distinct")
|
|
513
|
+
col_name = col_data["column_name"]
|
|
514
|
+
dtype = next(c["data_type"].lower() for c in column_info if c["column_name"] == col_name)
|
|
515
|
+
|
|
516
|
+
quoted = self.quote_column(col_name)
|
|
517
|
+
|
|
518
|
+
is_dtype_numeric = (
|
|
519
|
+
True
|
|
520
|
+
if dtype
|
|
521
|
+
in (
|
|
522
|
+
"int",
|
|
523
|
+
"integer",
|
|
524
|
+
"bigint",
|
|
525
|
+
"smallint",
|
|
526
|
+
"decimal",
|
|
527
|
+
"numeric",
|
|
528
|
+
"float",
|
|
529
|
+
"double",
|
|
530
|
+
)
|
|
531
|
+
else False
|
|
532
|
+
)
|
|
533
|
+
|
|
447
534
|
formatted_metrics_data = {
|
|
448
535
|
"general_data": {key: value for key, value in metrics.items() if key != "distribution_graph"},
|
|
536
|
+
"is_dtype_numeric": is_dtype_numeric,
|
|
449
537
|
"distribution_data": metrics.get("distribution_graph", []),
|
|
450
538
|
}
|
|
451
539
|
col_data["metrics"] = formatted_metrics_data
|
dcs_sdk/__version__.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dcs-sdk
|
|
3
|
-
Version: 1.6.5
|
|
3
|
+
Version: 1.6.6
|
|
4
4
|
Summary: SDK for DataChecks
|
|
5
5
|
Author: Waterdip Labs
|
|
6
6
|
Author-email: hello@waterdip.ai
|
|
@@ -84,7 +84,7 @@ Requires-Dist: vertica-python (>=1.4.0) ; extra == "vertica" or extra == "all-db
|
|
|
84
84
|
Description-Content-Type: text/markdown
|
|
85
85
|
|
|
86
86
|
<h1 align="center">
|
|
87
|
-
DCS SDK v1.6.5
|
|
87
|
+
DCS SDK v1.6.6
|
|
88
88
|
</h1>
|
|
89
89
|
|
|
90
90
|
> SDK for DataChecks
|
|
@@ -103,11 +103,11 @@ dcs_core/integrations/databases/bigquery.py,sha256=26RuypLMmiARZIWkV_mxtnNL2yCs9
|
|
|
103
103
|
dcs_core/integrations/databases/databricks.py,sha256=n4fm5m_mtRCdtjLGDvbNW18u7Ev234vDBjq_lxuOxns,1978
|
|
104
104
|
dcs_core/integrations/databases/db2.py,sha256=hNGivvYCitp88ouZlCxp7iRQ-vnPiK1kL8x85NyGotk,26492
|
|
105
105
|
dcs_core/integrations/databases/elasticsearch.py,sha256=6CTGs1WGrfgdDRNVt9DpOB0_z_znT6YoVj10E1WY-wQ,2152
|
|
106
|
-
dcs_core/integrations/databases/mssql.py,sha256=
|
|
106
|
+
dcs_core/integrations/databases/mssql.py,sha256=g0MmoG8-xFphJ2oZl-q_OZ2oT6yz-lVY09JTIvIx4-0,38910
|
|
107
107
|
dcs_core/integrations/databases/mysql.py,sha256=mUFLIGdbF_ktIlA19P7kq7holp5ZkRezGgN6TL_uiJ4,15815
|
|
108
108
|
dcs_core/integrations/databases/opensearch.py,sha256=XeDaHRLLym3wFeA_N6RzQEHmQCI3DjD8A86Y9UKwFEM,2190
|
|
109
109
|
dcs_core/integrations/databases/oracle.py,sha256=7g8Vs958tDx1v2CWFulCvuje0cLxWgU5-PVJTc1IluE,29194
|
|
110
|
-
dcs_core/integrations/databases/postgres.py,sha256=
|
|
110
|
+
dcs_core/integrations/databases/postgres.py,sha256=clT1fEIVCx3fcrare16rvBe_3TYWXn6wWwPc0Y-k9Ag,21326
|
|
111
111
|
dcs_core/integrations/databases/redshift.py,sha256=R9eYxpD1Ve3ChZb-gyClJ6suSljG53O6Wez2GzUW0k0,2043
|
|
112
112
|
dcs_core/integrations/databases/snowflake.py,sha256=NI6sgL9iakyCbIxtj0DiqeOpF5F9ybuhtG_IwvT86Ws,1942
|
|
113
113
|
dcs_core/integrations/databases/spark_df.py,sha256=pO9hSENLdrRaPvPa66yCrKS2iv5JWJBsU9XB13BBasY,3659
|
|
@@ -131,7 +131,7 @@ dcs_core/report/static/index.js,sha256=p4wvku-zlXi0y4gWeSzV1amY0s4mjtUq2QsezARLV
|
|
|
131
131
|
dcs_core/report/static/index.js.LICENSE.txt,sha256=bBDZBJVEDrqjCi7sfoF8CchjFn3hdcbNkP7ub7kbcXQ,201041
|
|
132
132
|
dcs_sdk/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
|
|
133
133
|
dcs_sdk/__main__.py,sha256=Qn8stIaQGrdLjHQ-H7xO0T-brtq5RWZoWU9QvqoarV8,683
|
|
134
|
-
dcs_sdk/__version__.py,sha256=
|
|
134
|
+
dcs_sdk/__version__.py,sha256=EkZnnw07uITZYElrylA-zR66DDr4c30pQVEZfA90dLE,633
|
|
135
135
|
dcs_sdk/cli/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
|
|
136
136
|
dcs_sdk/cli/cli.py,sha256=jaO52UrMWLafcF_yhqllPkmYSTuO2sksFi30fYFdAB4,4406
|
|
137
137
|
dcs_sdk/sdk/__init__.py,sha256=skrZcgWWJBL6NXTUERywJ3qRJRemgpDXyW7lPg1FJk8,2107
|
|
@@ -153,7 +153,7 @@ dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py,sha256=puAWP
|
|
|
153
153
|
dcs_sdk/sdk/utils/table.py,sha256=X8HxdYTWyx_oVrBWPsXlmA-xJKXXDBW9RrhlWNqA1As,18224
|
|
154
154
|
dcs_sdk/sdk/utils/themes.py,sha256=Meo2Yldv4uyPpEqI7qdA28Aa6sxtwUU1dLKKm4QavjM,1403
|
|
155
155
|
dcs_sdk/sdk/utils/utils.py,sha256=vF2zAvgt__Y8limicWTEWRyn41SBVJN81ZCTBRy6hQg,11907
|
|
156
|
-
dcs_sdk-1.6.
|
|
157
|
-
dcs_sdk-1.6.5.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
|
158
|
-
dcs_sdk-1.6.5.dist-info/entry_points.txt,sha256=XhODNz7UccgPOyklXgp7pIfTTXArd6-V0mImjhnhwto,80
|
|
159
|
-
dcs_sdk-1.6.5.dist-info/RECORD,,
|
|
156
|
+
dcs_sdk-1.6.6.dist-info/METADATA,sha256=m3T3TS7-x2WZet7CGwIWNRFS5wbxc2RPbrrTfYWviZY,7568
|
|
157
|
+
dcs_sdk-1.6.6.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
|
158
|
+
dcs_sdk-1.6.6.dist-info/entry_points.txt,sha256=XhODNz7UccgPOyklXgp7pIfTTXArd6-V0mImjhnhwto,80
|
|
159
|
+
dcs_sdk-1.6.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|