dcs-sdk 1.6.5__py3-none-any.whl → 1.6.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,6 +13,7 @@
13
13
  # limitations under the License.
14
14
 
15
15
  import datetime
16
+ import math
16
17
  from decimal import Decimal
17
18
  from typing import Any, Dict, List, Optional, Tuple, Union
18
19
  from uuid import UUID
@@ -706,13 +707,15 @@ class MssqlDataSource(SQLDataSource):
706
707
  cursor = self.connection.cursor()
707
708
  try:
708
709
  cursor.execute(query)
709
- columns = [column[0] for column in cursor.description]
710
- result_row = cursor.fetchone()
710
+ if cursor.description:
711
+ columns = [column[0] for column in cursor.description]
712
+ result_row = cursor.fetchone()
713
+ row = dict(zip(columns, result_row)) if result_row else {}
714
+ else:
715
+ row = {}
711
716
  finally:
712
717
  cursor.close()
713
718
 
714
- row = dict(zip(columns, result_row))
715
-
716
719
  def _normalize_metrics(value):
717
720
  """Safely normalize DB metric values for JSON serialization."""
718
721
  if value is None:
@@ -737,11 +740,158 @@ class MssqlDataSource(SQLDataSource):
737
740
  col_metrics = {}
738
741
 
739
742
  for key, value in row.items():
740
- if key.startswith(f"{name}_"):
741
- metric_name = key[len(name) + 1 :]
743
+ clean_key = key.replace("[", "").replace("]", "")
744
+ if clean_key.startswith(f"{name}_"):
745
+ metric_name = clean_key[len(name) + 1 :]
742
746
  col_metrics[metric_name] = _normalize_metrics(value)
743
747
 
744
748
  column_wise.append({"column_name": name, "metrics": col_metrics})
749
+
750
+ for col_data in column_wise:
751
+ metrics = col_data["metrics"]
752
+ distinct_count = metrics.get("distinct")
753
+ col_name = col_data["column_name"]
754
+
755
+ dtype = next(c["data_type"].lower() for c in column_info if c["column_name"] == col_name)
756
+
757
+ quoted = self.quote_column(col_name)
758
+
759
+ is_dtype_numeric = (
760
+ True
761
+ if dtype
762
+ in (
763
+ "int",
764
+ "integer",
765
+ "bigint",
766
+ "smallint",
767
+ "tinyint",
768
+ "decimal",
769
+ "numeric",
770
+ "float",
771
+ "real",
772
+ "money",
773
+ "smallmoney",
774
+ )
775
+ else False
776
+ )
777
+
778
+ if is_dtype_numeric:
779
+ col_min = metrics.get("min")
780
+ col_max = metrics.get("max")
781
+
782
+ if col_min is not None and col_max is not None and col_min != col_max:
783
+ bucket_count = 20
784
+ bucket_size = (float(col_max) - float(col_min)) / bucket_count
785
+
786
+ bucket_queries = []
787
+ for i in range(bucket_count):
788
+ start = float(col_min) + i * bucket_size
789
+ end = float(col_min) + (i + 1) * bucket_size
790
+
791
+ bucket_queries.append(
792
+ f"SUM(CASE WHEN {quoted} >= {start} AND {quoted} < {end} THEN 1 ELSE 0 END) AS bucket_{i}"
793
+ )
794
+
795
+ bucket_sql = f"SELECT {', '.join(bucket_queries)} FROM {qualified_table}"
796
+
797
+ try:
798
+ bucket_result = self.connection.execute(text(bucket_sql)).fetchone()
799
+ distribution = []
800
+
801
+ for i in range(bucket_count):
802
+ start_raw = float(col_min) + i * bucket_size
803
+ end_raw = float(col_min) + (i + 1) * bucket_size
804
+
805
+ if dtype in ("int", "integer", "bigint", "smallint", "tinyint"):
806
+ start = math.floor(start_raw)
807
+ end = math.ceil(end_raw)
808
+ else:
809
+ start = round(start_raw, 2)
810
+ end = round(end_raw, 2)
811
+
812
+ count = bucket_result[i] if bucket_result and bucket_result[i] is not None else 0
813
+
814
+ distribution.append(
815
+ {
816
+ "col_val": f"{start} - {end}",
817
+ "count": count,
818
+ }
819
+ )
820
+
821
+ metrics["distribution_graph"] = distribution
822
+
823
+ except Exception as e:
824
+ print(f"Failed to generate numeric distribution for {col_name}: {e}")
825
+
826
+ continue
827
+
828
+ if isinstance(distinct_count, (int, float)) and distinct_count <= 20:
829
+ if dtype in ("text", "ntext", "xml"):
830
+ group_expr = f"CAST({quoted} AS NVARCHAR(MAX))"
831
+ else:
832
+ group_expr = quoted
833
+
834
+ dist_query = (
835
+ f"SELECT {group_expr}, COUNT(*) "
836
+ f"FROM {qualified_table} GROUP BY {group_expr} ORDER BY COUNT(*) DESC"
837
+ )
838
+
839
+ try:
840
+ dist_cursor = self.connection.cursor()
841
+ dist_cursor.execute(dist_query)
842
+ dist_result = dist_cursor.fetchall()
843
+ dist_cursor.close()
844
+
845
+ distribution = []
846
+
847
+ for r in dist_result:
848
+ val = _normalize_metrics(r[0])
849
+ distribution.append(
850
+ {
851
+ "col_val": val,
852
+ "count": r[1],
853
+ }
854
+ )
855
+
856
+ metrics["distribution_graph"] = distribution
857
+
858
+ except Exception as e:
859
+ print(f"Failed to generate distribution graph for column {col_name}: {e}")
860
+
861
+ for col_data in column_wise:
862
+ metrics = col_data["metrics"]
863
+ distinct_count = metrics.get("distinct")
864
+ col_name = col_data["column_name"]
865
+ dtype = next(c["data_type"].lower() for c in column_info if c["column_name"] == col_name)
866
+
867
+ quoted = self.quote_column(col_name)
868
+
869
+ is_dtype_numeric = (
870
+ True
871
+ if dtype
872
+ in (
873
+ "int",
874
+ "integer",
875
+ "bigint",
876
+ "smallint",
877
+ "tinyint",
878
+ "decimal",
879
+ "numeric",
880
+ "float",
881
+ "real",
882
+ "money",
883
+ "smallmoney",
884
+ )
885
+ else False
886
+ )
887
+
888
+ formatted_metrics_data = {
889
+ "general_data": {key: value for key, value in metrics.items() if key != "distribution_graph"},
890
+ "is_dtype_numeric": is_dtype_numeric,
891
+ "distribution_data": metrics.get("distribution_graph", []),
892
+ }
893
+ col_data["metrics"] = formatted_metrics_data
894
+
745
895
  return column_wise
746
896
 
747
897
  def fetch_sample_values_from_database(
@@ -13,6 +13,7 @@
13
13
  # limitations under the License.
14
14
 
15
15
  import datetime
16
+ import math
16
17
  from decimal import Decimal
17
18
  from typing import Any, Dict, List, Optional, Tuple
18
19
  from uuid import UUID
@@ -411,9 +412,73 @@ class PostgresDataSource(SQLDataSource):
411
412
  col_name = col_data["column_name"]
412
413
  dtype = next(c["data_type"].lower() for c in column_info if c["column_name"] == col_name)
413
414
 
414
- if isinstance(distinct_count, (int, float)) and distinct_count < 20:
415
- quoted = self.quote_column(col_name)
415
+ quoted = self.quote_column(col_name)
416
+
417
+ is_dtype_numeric = (
418
+ True
419
+ if dtype
420
+ in (
421
+ "int",
422
+ "integer",
423
+ "bigint",
424
+ "smallint",
425
+ "decimal",
426
+ "numeric",
427
+ "float",
428
+ "double",
429
+ )
430
+ else False
431
+ )
432
+
433
+ if is_dtype_numeric:
434
+ col_min = metrics.get("min")
435
+ col_max = metrics.get("max")
436
+
437
+ if col_min is not None and col_max is not None and col_min != col_max:
438
+ bucket_count = 20
439
+ bucket_size = (col_max - col_min) / bucket_count
440
+
441
+ bucket_queries = []
442
+ for i in range(bucket_count):
443
+ start = col_min + i * bucket_size
444
+ end = col_min + (i + 1) * bucket_size
445
+
446
+ bucket_queries.append(
447
+ f"SUM(CASE WHEN {quoted} >= {start} AND {quoted} < {end} THEN 1 ELSE 0 END) AS bucket_{i}"
448
+ )
449
+
450
+ bucket_sql = f"SELECT {', '.join(bucket_queries)} FROM {qualified_table}"
451
+
452
+ try:
453
+ bucket_result = self.connection.execute(text(bucket_sql)).fetchone()
454
+ distribution = []
455
+
456
+ for i in range(bucket_count):
457
+ start_raw = col_min + i * bucket_size
458
+ end_raw = col_min + (i + 1) * bucket_size
459
+ if dtype in ("int", "integer", "bigint", "smallint"):
460
+ start = math.floor(start_raw)
461
+ end = math.ceil(end_raw)
462
+ else:
463
+ start = round(start_raw, 2)
464
+ end = round(end_raw, 2)
465
+ count = bucket_result[i]
466
+
467
+ distribution.append(
468
+ {
469
+ "col_val": f"{start} - {end}",
470
+ "count": count,
471
+ }
472
+ )
416
473
 
474
+ metrics["distribution_graph"] = distribution
475
+
476
+ except Exception as e:
477
+ print(f"Failed to generate numeric distribution for {col_name}: {e}")
478
+
479
+ continue
480
+
481
+ if isinstance(distinct_count, (int, float)) and distinct_count <= 20:
417
482
  if dtype in ("json", "jsonb"):
418
483
  group_expr = f"{quoted}::text"
419
484
  else:
@@ -444,8 +509,31 @@ class PostgresDataSource(SQLDataSource):
444
509
 
445
510
  for col_data in column_wise:
446
511
  metrics = col_data["metrics"]
512
+ distinct_count = metrics.get("distinct")
513
+ col_name = col_data["column_name"]
514
+ dtype = next(c["data_type"].lower() for c in column_info if c["column_name"] == col_name)
515
+
516
+ quoted = self.quote_column(col_name)
517
+
518
+ is_dtype_numeric = (
519
+ True
520
+ if dtype
521
+ in (
522
+ "int",
523
+ "integer",
524
+ "bigint",
525
+ "smallint",
526
+ "decimal",
527
+ "numeric",
528
+ "float",
529
+ "double",
530
+ )
531
+ else False
532
+ )
533
+
447
534
  formatted_metrics_data = {
448
535
  "general_data": {key: value for key, value in metrics.items() if key != "distribution_graph"},
536
+ "is_dtype_numeric": is_dtype_numeric,
449
537
  "distribution_data": metrics.get("distribution_graph", []),
450
538
  }
451
539
  col_data["metrics"] = formatted_metrics_data
dcs_sdk/__version__.py CHANGED
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.6.4"
15
+ __version__ = "1.6.6"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dcs-sdk
3
- Version: 1.6.5
3
+ Version: 1.6.6
4
4
  Summary: SDK for DataChecks
5
5
  Author: Waterdip Labs
6
6
  Author-email: hello@waterdip.ai
@@ -84,7 +84,7 @@ Requires-Dist: vertica-python (>=1.4.0) ; extra == "vertica" or extra == "all-db
84
84
  Description-Content-Type: text/markdown
85
85
 
86
86
  <h1 align="center">
87
- DCS SDK v1.6.4
87
+ DCS SDK v1.6.6
88
88
  </h1>
89
89
 
90
90
  > SDK for DataChecks
@@ -103,11 +103,11 @@ dcs_core/integrations/databases/bigquery.py,sha256=26RuypLMmiARZIWkV_mxtnNL2yCs9
103
103
  dcs_core/integrations/databases/databricks.py,sha256=n4fm5m_mtRCdtjLGDvbNW18u7Ev234vDBjq_lxuOxns,1978
104
104
  dcs_core/integrations/databases/db2.py,sha256=hNGivvYCitp88ouZlCxp7iRQ-vnPiK1kL8x85NyGotk,26492
105
105
  dcs_core/integrations/databases/elasticsearch.py,sha256=6CTGs1WGrfgdDRNVt9DpOB0_z_znT6YoVj10E1WY-wQ,2152
106
- dcs_core/integrations/databases/mssql.py,sha256=3Gpy1UIclwYRF5_dbogbb5MgHlg35ZKcEczCNqlCh3o,33258
106
+ dcs_core/integrations/databases/mssql.py,sha256=g0MmoG8-xFphJ2oZl-q_OZ2oT6yz-lVY09JTIvIx4-0,38910
107
107
  dcs_core/integrations/databases/mysql.py,sha256=mUFLIGdbF_ktIlA19P7kq7holp5ZkRezGgN6TL_uiJ4,15815
108
108
  dcs_core/integrations/databases/opensearch.py,sha256=XeDaHRLLym3wFeA_N6RzQEHmQCI3DjD8A86Y9UKwFEM,2190
109
109
  dcs_core/integrations/databases/oracle.py,sha256=7g8Vs958tDx1v2CWFulCvuje0cLxWgU5-PVJTc1IluE,29194
110
- dcs_core/integrations/databases/postgres.py,sha256=gXWVPSMJQdWo2ZWpzrnc1bONRyqdiX0osdRtvJLWPSE,18133
110
+ dcs_core/integrations/databases/postgres.py,sha256=clT1fEIVCx3fcrare16rvBe_3TYWXn6wWwPc0Y-k9Ag,21326
111
111
  dcs_core/integrations/databases/redshift.py,sha256=R9eYxpD1Ve3ChZb-gyClJ6suSljG53O6Wez2GzUW0k0,2043
112
112
  dcs_core/integrations/databases/snowflake.py,sha256=NI6sgL9iakyCbIxtj0DiqeOpF5F9ybuhtG_IwvT86Ws,1942
113
113
  dcs_core/integrations/databases/spark_df.py,sha256=pO9hSENLdrRaPvPa66yCrKS2iv5JWJBsU9XB13BBasY,3659
@@ -131,7 +131,7 @@ dcs_core/report/static/index.js,sha256=p4wvku-zlXi0y4gWeSzV1amY0s4mjtUq2QsezARLV
131
131
  dcs_core/report/static/index.js.LICENSE.txt,sha256=bBDZBJVEDrqjCi7sfoF8CchjFn3hdcbNkP7ub7kbcXQ,201041
132
132
  dcs_sdk/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
133
133
  dcs_sdk/__main__.py,sha256=Qn8stIaQGrdLjHQ-H7xO0T-brtq5RWZoWU9QvqoarV8,683
134
- dcs_sdk/__version__.py,sha256=0MZwU2M7klH43EtQxpbFKior602GfMQYbBVWxSs857c,633
134
+ dcs_sdk/__version__.py,sha256=EkZnnw07uITZYElrylA-zR66DDr4c30pQVEZfA90dLE,633
135
135
  dcs_sdk/cli/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
136
136
  dcs_sdk/cli/cli.py,sha256=jaO52UrMWLafcF_yhqllPkmYSTuO2sksFi30fYFdAB4,4406
137
137
  dcs_sdk/sdk/__init__.py,sha256=skrZcgWWJBL6NXTUERywJ3qRJRemgpDXyW7lPg1FJk8,2107
@@ -153,7 +153,7 @@ dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py,sha256=puAWP
153
153
  dcs_sdk/sdk/utils/table.py,sha256=X8HxdYTWyx_oVrBWPsXlmA-xJKXXDBW9RrhlWNqA1As,18224
154
154
  dcs_sdk/sdk/utils/themes.py,sha256=Meo2Yldv4uyPpEqI7qdA28Aa6sxtwUU1dLKKm4QavjM,1403
155
155
  dcs_sdk/sdk/utils/utils.py,sha256=vF2zAvgt__Y8limicWTEWRyn41SBVJN81ZCTBRy6hQg,11907
156
- dcs_sdk-1.6.5.dist-info/METADATA,sha256=A_zRG4BkxZt8pO_JwxTTL-6Sw1jOSQ93yG8bigJCnTc,7568
157
- dcs_sdk-1.6.5.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
158
- dcs_sdk-1.6.5.dist-info/entry_points.txt,sha256=XhODNz7UccgPOyklXgp7pIfTTXArd6-V0mImjhnhwto,80
159
- dcs_sdk-1.6.5.dist-info/RECORD,,
156
+ dcs_sdk-1.6.6.dist-info/METADATA,sha256=m3T3TS7-x2WZet7CGwIWNRFS5wbxc2RPbrrTfYWviZY,7568
157
+ dcs_sdk-1.6.6.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
158
+ dcs_sdk-1.6.6.dist-info/entry_points.txt,sha256=XhODNz7UccgPOyklXgp7pIfTTXArd6-V0mImjhnhwto,80
159
+ dcs_sdk-1.6.6.dist-info/RECORD,,