duckguard 2.2.0__py3-none-any.whl → 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +1 -1
- duckguard/anomaly/__init__.py +28 -0
- duckguard/anomaly/baselines.py +294 -0
- duckguard/anomaly/methods.py +16 -2
- duckguard/anomaly/ml_methods.py +724 -0
- duckguard/checks/__init__.py +26 -0
- duckguard/checks/conditional.py +796 -0
- duckguard/checks/distributional.py +524 -0
- duckguard/checks/multicolumn.py +726 -0
- duckguard/checks/query_based.py +643 -0
- duckguard/cli/main.py +257 -2
- duckguard/connectors/factory.py +30 -2
- duckguard/connectors/files.py +7 -3
- duckguard/core/column.py +851 -1
- duckguard/core/dataset.py +1035 -0
- duckguard/core/result.py +236 -0
- duckguard/freshness/__init__.py +33 -0
- duckguard/freshness/monitor.py +429 -0
- duckguard/history/schema.py +119 -1
- duckguard/notifications/__init__.py +20 -2
- duckguard/notifications/email.py +508 -0
- duckguard/profiler/distribution_analyzer.py +384 -0
- duckguard/profiler/outlier_detector.py +497 -0
- duckguard/profiler/pattern_matcher.py +301 -0
- duckguard/profiler/quality_scorer.py +445 -0
- duckguard/reports/html_reporter.py +1 -2
- duckguard/rules/executor.py +642 -0
- duckguard/rules/generator.py +4 -1
- duckguard/rules/schema.py +54 -0
- duckguard/schema_history/__init__.py +40 -0
- duckguard/schema_history/analyzer.py +414 -0
- duckguard/schema_history/tracker.py +288 -0
- duckguard/semantic/detector.py +17 -1
- duckguard-3.0.0.dist-info/METADATA +1072 -0
- {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/RECORD +38 -21
- duckguard-2.2.0.dist-info/METADATA +0 -351
- {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/WHEEL +0 -0
- {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/entry_points.txt +0 -0
- {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/licenses/LICENSE +0 -0
duckguard/core/column.py
CHANGED
|
@@ -4,7 +4,7 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from typing import TYPE_CHECKING, Any
|
|
6
6
|
|
|
7
|
-
from duckguard.core.result import FailedRow, ValidationResult
|
|
7
|
+
from duckguard.core.result import DriftResult, FailedRow, ValidationResult
|
|
8
8
|
|
|
9
9
|
if TYPE_CHECKING:
|
|
10
10
|
from duckguard.core.dataset import Dataset
|
|
@@ -477,6 +477,358 @@ class Column:
|
|
|
477
477
|
message=f"Column '{self._name}' has {invalid_count} values with length outside [{min_len}, {max_len}]",
|
|
478
478
|
)
|
|
479
479
|
|
|
480
|
+
# =========================================================================
|
|
481
|
+
# Cross-Dataset Validation Methods (Reference/FK Checks)
|
|
482
|
+
# =========================================================================
|
|
483
|
+
|
|
484
|
+
def exists_in(
    self,
    reference_column: Column,
    capture_failures: bool = True,
) -> ValidationResult:
    """Validate that every non-null value here also appears in *reference_column*.

    This is the core foreign-key check, implemented as a single SQL
    anti-join. Null values on this side are skipped — a null never
    counts as an orphan.

    Args:
        reference_column: Column object from the reference dataset
        capture_failures: Whether to capture sample orphaned rows (default: True)

    Returns:
        ValidationResult carrying the orphan count and sample failed rows

    Example:
        orders = connect("orders.parquet")
        customers = connect("customers.parquet")
        result = orders["customer_id"].exists_in(customers["id"])
        if not result:
            print(f"Found {result.actual_value} orphan customer IDs")
    """
    engine = self._dataset.engine
    source_ref = engine.get_source_reference(self._dataset.source)
    ref_ref = reference_column._dataset.engine.get_source_reference(
        reference_column._dataset.source
    )
    source_col = f'"{self._name}"'
    ref_col = f'"{reference_column._name}"'

    # Anti-join: count rows whose value has no match on the reference side.
    query = f"""
        SELECT COUNT(*) as orphan_count
        FROM {source_ref} s
        WHERE s.{source_col} IS NOT NULL
          AND NOT EXISTS (
            SELECT 1 FROM {ref_ref} r
            WHERE r.{ref_col} = s.{source_col}
          )
    """

    orphan_count = engine.fetch_value(query) or 0
    passed = orphan_count == 0

    # Pull a small sample of offending rows for debugging, if requested.
    failed_rows = (
        self._get_failed_rows_exists_in(reference_column)
        if capture_failures and not passed
        else []
    )

    ref_dataset_name = reference_column._dataset.name or reference_column._dataset.source
    return ValidationResult(
        passed=passed,
        actual_value=orphan_count,
        expected_value=0,
        message=f"Column '{self._name}' has {orphan_count} values not found in {ref_dataset_name}.{reference_column._name}",
        details={
            "orphan_count": orphan_count,
            "reference_dataset": ref_dataset_name,
            "reference_column": reference_column._name,
        },
        failed_rows=failed_rows,
        total_failures=orphan_count,
    )
|
550
|
+
|
|
551
|
+
def _get_failed_rows_exists_in(
    self, reference_column: Column, limit: int = DEFAULT_SAMPLE_SIZE
) -> list[FailedRow]:
    """Sample up to *limit* rows whose value has no match in the reference column."""
    engine = self._dataset.engine
    source_ref = engine.get_source_reference(self._dataset.source)
    ref_ref = reference_column._dataset.engine.get_source_reference(
        reference_column._dataset.source
    )
    source_col = f'"{self._name}"'
    ref_col = f'"{reference_column._name}"'

    query = f"""
        SELECT row_number() OVER () as row_idx, s.{source_col} as val
        FROM {source_ref} s
        WHERE s.{source_col} IS NOT NULL
          AND NOT EXISTS (
            SELECT 1 FROM {ref_ref} r
            WHERE r.{ref_col} = s.{source_col}
          )
        LIMIT {limit}
    """

    ref_dataset_name = reference_column._dataset.name or reference_column._dataset.source
    samples: list[FailedRow] = []
    for row_idx, val in engine.fetch_all(query):
        samples.append(
            FailedRow(
                row_index=row_idx,
                column=self._name,
                value=val,
                expected=f"exists in {ref_dataset_name}.{reference_column._name}",
                reason=f"Value '{val}' not found in reference",
                context={"reference_dataset": ref_dataset_name},
            )
        )
    return samples
|
|
586
|
+
|
|
587
|
+
def references(
    self,
    reference_column: Column,
    allow_nulls: bool = True,
    capture_failures: bool = True,
) -> ValidationResult:
    """Check a foreign-key relationship with configurable null handling.

    A more configurable variant of exists_in(): by default nulls are
    accepted, but with allow_nulls=False every null counts as a failure
    alongside the orphans.

    Args:
        reference_column: Column in the reference dataset
        allow_nulls: If True (default), null values pass. If False, nulls fail.
        capture_failures: Whether to capture sample orphaned rows (default: True)

    Returns:
        ValidationResult

    Example:
        # Nulls are OK (default)
        result = orders["customer_id"].references(customers["id"])

        # Nulls should fail
        result = orders["customer_id"].references(
            customers["id"],
            allow_nulls=False,
        )
    """
    # Orphan detection is delegated to the basic FK check.
    orphan_result = self.exists_in(reference_column, capture_failures=capture_failures)

    # Guard clauses: nothing more to do unless nulls must also fail.
    if allow_nulls:
        return orphan_result

    null_count = self.null_count
    if null_count == 0:
        return orphan_result

    # Fold null failures into the orphan failures.
    total_failures = orphan_result.actual_value + null_count

    null_failed_rows: list[FailedRow] = []
    if capture_failures:
        null_failed_rows = self._get_null_rows_sample()

    ref_dataset_name = reference_column._dataset.name or reference_column._dataset.source
    return ValidationResult(
        passed=total_failures == 0,
        actual_value=total_failures,
        expected_value=0,
        message=f"Column '{self._name}' has {orphan_result.actual_value} orphans and {null_count} nulls (references {ref_dataset_name}.{reference_column._name})",
        details={
            "orphan_count": orphan_result.actual_value,
            "null_count": null_count,
            "reference_dataset": ref_dataset_name,
            "reference_column": reference_column._name,
            "allow_nulls": allow_nulls,
        },
        failed_rows=orphan_result.failed_rows + null_failed_rows,
        total_failures=total_failures,
    )
|
|
651
|
+
|
|
652
|
+
def _get_null_rows_sample(self, limit: int = DEFAULT_SAMPLE_SIZE) -> list[FailedRow]:
    """Sample up to *limit* row positions where this column is null."""
    table_ref = self._dataset.engine.get_source_reference(self._dataset.source)
    quoted = f'"{self._name}"'

    query = f"""
        SELECT row_number() OVER () as row_idx
        FROM {table_ref}
        WHERE {quoted} IS NULL
        LIMIT {limit}
    """

    samples: list[FailedRow] = []
    for (row_idx,) in self._dataset.engine.fetch_all(query):
        samples.append(
            FailedRow(
                row_index=row_idx,
                column=self._name,
                value=None,
                expected="not null (allow_nulls=False)",
                reason="Null value not allowed",
            )
        )
    return samples
|
|
675
|
+
|
|
676
|
+
def find_orphans(
    self,
    reference_column: Column,
    limit: int = 100,
) -> list[Any]:
    """Return distinct values that do not exist in the reference column.

    A convenience helper for quickly listing orphan values without
    running a full validation.

    Args:
        reference_column: Column in the reference dataset
        limit: Maximum number of orphan values to return (default: 100)

    Returns:
        List of orphan values

    Example:
        orphan_ids = orders["customer_id"].find_orphans(customers["id"])
        print(f"Invalid customer IDs: {orphan_ids}")
    """
    engine = self._dataset.engine
    source_ref = engine.get_source_reference(self._dataset.source)
    ref_ref = reference_column._dataset.engine.get_source_reference(
        reference_column._dataset.source
    )
    source_col = f'"{self._name}"'
    ref_col = f'"{reference_column._name}"'

    # Distinct anti-join: values present here but absent on the reference side.
    query = f"""
        SELECT DISTINCT s.{source_col}
        FROM {source_ref} s
        WHERE s.{source_col} IS NOT NULL
          AND NOT EXISTS (
            SELECT 1 FROM {ref_ref} r
            WHERE r.{ref_col} = s.{source_col}
          )
        LIMIT {limit}
    """

    return [value for (value,) in engine.fetch_all(query)]
|
|
718
|
+
|
|
719
|
+
def matches_values(
    self,
    other_column: Column,
    capture_failures: bool = True,
) -> ValidationResult:
    """Check that the distinct value sets of two columns are identical.

    Useful for comparing reference data or verifying synchronization.
    A value missing from the other column AND a value present only in
    the other column both count as failures.

    Args:
        other_column: Column to compare against
        capture_failures: Whether to capture sample mismatched values (default: True)

    Returns:
        ValidationResult indicating if value sets match

    Example:
        result = orders["status"].matches_values(status_lookup["code"])
    """
    engine = self._dataset.engine
    source_ref = engine.get_source_reference(self._dataset.source)
    other_ref = other_column._dataset.engine.get_source_reference(
        other_column._dataset.source
    )
    source_col = f'"{self._name}"'
    other_col = f'"{other_column._name}"'

    # Direction 1: distinct values present here but absent in the other column.
    missing_query = f"""
        SELECT COUNT(DISTINCT s.{source_col}) as missing_count
        FROM {source_ref} s
        WHERE s.{source_col} IS NOT NULL
          AND NOT EXISTS (
            SELECT 1 FROM {other_ref} o
            WHERE o.{other_col} = s.{source_col}
          )
    """

    # Direction 2: distinct values present in the other column but absent here.
    extra_query = f"""
        SELECT COUNT(DISTINCT o.{other_col}) as extra_count
        FROM {other_ref} o
        WHERE o.{other_col} IS NOT NULL
          AND NOT EXISTS (
            SELECT 1 FROM {source_ref} s
            WHERE s.{source_col} = o.{other_col}
          )
    """

    missing_count = engine.fetch_value(missing_query) or 0
    extra_count = engine.fetch_value(extra_query) or 0
    total_diff = missing_count + extra_count
    passed = total_diff == 0

    # Sample mismatched values for debugging, if requested.
    failed_rows = (
        self._get_failed_rows_matches_values(other_column)
        if capture_failures and not passed
        else []
    )

    other_dataset_name = other_column._dataset.name or other_column._dataset.source
    return ValidationResult(
        passed=passed,
        actual_value=total_diff,
        expected_value=0,
        message=f"Column '{self._name}' has {missing_count} values missing in {other_dataset_name}.{other_column._name}, {extra_count} extra",
        details={
            "missing_in_other": missing_count,
            "extra_in_other": extra_count,
            "other_dataset": other_dataset_name,
            "other_column": other_column._name,
        },
        failed_rows=failed_rows,
        total_failures=total_diff,
    )
|
|
794
|
+
|
|
795
|
+
def _get_failed_rows_matches_values(
    self, other_column: Column, limit: int = DEFAULT_SAMPLE_SIZE
) -> list[FailedRow]:
    """Sample distinct values present here but missing from the other column."""
    engine = self._dataset.engine
    source_ref = engine.get_source_reference(self._dataset.source)
    other_ref = other_column._dataset.engine.get_source_reference(
        other_column._dataset.source
    )
    source_col = f'"{self._name}"'
    other_col = f'"{other_column._name}"'

    # Only the "missing in other" direction is sampled here.
    query = f"""
        SELECT DISTINCT s.{source_col} as val, 'missing_in_other' as diff_type
        FROM {source_ref} s
        WHERE s.{source_col} IS NOT NULL
          AND NOT EXISTS (
            SELECT 1 FROM {other_ref} o
            WHERE o.{other_col} = s.{source_col}
          )
        LIMIT {limit}
    """

    other_dataset_name = other_column._dataset.name or other_column._dataset.source
    samples: list[FailedRow] = []
    # row_index is a 1-based position within the sample, not a table offset.
    for position, (val, diff_type) in enumerate(engine.fetch_all(query), start=1):
        samples.append(
            FailedRow(
                row_index=position,
                column=self._name,
                value=val,
                expected=f"exists in {other_dataset_name}.{other_column._name}",
                reason=f"Value '{val}' not found in other column",
                context={"diff_type": diff_type},
            )
        )
    return samples
|
|
831
|
+
|
|
480
832
|
def get_distinct_values(self, limit: int = 100) -> list[Any]:
|
|
481
833
|
"""
|
|
482
834
|
Get distinct values in the column.
|
|
@@ -500,6 +852,132 @@ class Column:
|
|
|
500
852
|
rows = self._dataset.engine.fetch_all(sql)
|
|
501
853
|
return [row[0] for row in rows]
|
|
502
854
|
|
|
855
|
+
# =========================================================================
|
|
856
|
+
# Distribution Drift Detection
|
|
857
|
+
# =========================================================================
|
|
858
|
+
|
|
859
|
+
def detect_drift(
    self,
    reference_column: Column,
    threshold: float = 0.05,
    method: str = "ks_test",
) -> DriftResult:
    """Detect distribution drift between this column and a reference column.

    Applies a statistical test to decide whether the value distribution
    has shifted significantly — handy for ML model monitoring and data
    pipeline validation.

    Args:
        reference_column: Column from reference/baseline dataset
        threshold: P-value threshold for drift detection (default: 0.05)
        method: Statistical test method ("ks_test" for Kolmogorov-Smirnov)

    Returns:
        DriftResult with drift detection outcome

    Example:
        current = connect("orders_today.parquet")
        baseline = connect("orders_baseline.parquet")
        result = current["amount"].detect_drift(baseline["amount"])
        if result.is_drifted:
            print(f"Distribution drift detected! p-value: {result.p_value}")
    """
    from duckguard.core.result import DriftResult

    # Materialize numeric samples from both sides.
    curr_vals = self._get_numeric_values()
    base_vals = reference_column._get_numeric_values()

    # Without data on both sides there is nothing to test.
    if not curr_vals or not base_vals:
        return DriftResult(
            is_drifted=False,
            p_value=1.0,
            statistic=0.0,
            threshold=threshold,
            method=method,
            message="Insufficient data for drift detection",
            details={"current_count": len(curr_vals), "reference_count": len(base_vals)},
        )

    # Two-sample KS test; drift is declared when p falls below threshold.
    ks_stat, p_value = self._ks_test(curr_vals, base_vals)
    is_drifted = p_value < threshold

    baseline_name = reference_column._dataset.name or reference_column._dataset.source
    message = (
        f"Distribution drift detected in '{self._name}' vs {baseline_name}.{reference_column._name} (p-value: {p_value:.4f} < {threshold})"
        if is_drifted
        else f"No significant drift in '{self._name}' vs {baseline_name}.{reference_column._name} (p-value: {p_value:.4f})"
    )

    return DriftResult(
        is_drifted=is_drifted,
        p_value=p_value,
        statistic=ks_stat,
        threshold=threshold,
        method=method,
        message=message,
        details={
            "current_column": self._name,
            "reference_column": reference_column._name,
            "reference_dataset": baseline_name,
            "current_count": len(curr_vals),
            "reference_count": len(base_vals),
        },
    )
|
|
929
|
+
|
|
930
|
+
def _get_numeric_values(self, limit: int = 10000) -> list[float]:
    """Fetch up to *limit* non-null values cast to float for statistical analysis.

    Returns an empty list when the column cannot be cast to DOUBLE.
    """
    table_ref = self._dataset.engine.get_source_reference(self._dataset.source)
    quoted = f'"{self._name}"'

    query = f"""
        SELECT CAST({quoted} AS DOUBLE) as val
        FROM {table_ref}
        WHERE {quoted} IS NOT NULL
        LIMIT {limit}
    """

    try:
        # Non-numeric columns fail the CAST; treat that as "no numeric data".
        return [
            float(value)
            for (value,) in self._dataset.engine.fetch_all(query)
            if value is not None
        ]
    except Exception:
        return []
|
|
947
|
+
|
|
948
|
+
def _ks_test(self, data1: list[float], data2: list[float]) -> tuple[float, float]:
|
|
949
|
+
"""Perform two-sample Kolmogorov-Smirnov test.
|
|
950
|
+
|
|
951
|
+
Returns (ks_statistic, p_value).
|
|
952
|
+
"""
|
|
953
|
+
import math
|
|
954
|
+
|
|
955
|
+
# Sort both datasets
|
|
956
|
+
data1_sorted = sorted(data1)
|
|
957
|
+
data2_sorted = sorted(data2)
|
|
958
|
+
n1, n2 = len(data1_sorted), len(data2_sorted)
|
|
959
|
+
|
|
960
|
+
# Compute the maximum difference between empirical CDFs
|
|
961
|
+
all_values = sorted(set(data1_sorted + data2_sorted))
|
|
962
|
+
|
|
963
|
+
max_diff = 0.0
|
|
964
|
+
for val in all_values:
|
|
965
|
+
# CDF of data1 at val
|
|
966
|
+
cdf1 = sum(1 for x in data1_sorted if x <= val) / n1
|
|
967
|
+
# CDF of data2 at val
|
|
968
|
+
cdf2 = sum(1 for x in data2_sorted if x <= val) / n2
|
|
969
|
+
max_diff = max(max_diff, abs(cdf1 - cdf2))
|
|
970
|
+
|
|
971
|
+
ks_stat = max_diff
|
|
972
|
+
|
|
973
|
+
# Approximate p-value using asymptotic formula
|
|
974
|
+
# P(D > d) ≈ 2 * exp(-2 * d^2 * n1 * n2 / (n1 + n2))
|
|
975
|
+
en = math.sqrt(n1 * n2 / (n1 + n2))
|
|
976
|
+
p_value = 2.0 * math.exp(-2.0 * (ks_stat * en) ** 2)
|
|
977
|
+
p_value = min(1.0, max(0.0, p_value))
|
|
978
|
+
|
|
979
|
+
return ks_stat, p_value
|
|
980
|
+
|
|
503
981
|
def get_value_counts(self, limit: int = 20) -> dict[Any, int]:
|
|
504
982
|
"""
|
|
505
983
|
Get value counts for the column.
|
|
@@ -524,6 +1002,378 @@ class Column:
|
|
|
524
1002
|
rows = self._dataset.engine.fetch_all(sql)
|
|
525
1003
|
return {row[0]: row[1] for row in rows}
|
|
526
1004
|
|
|
1005
|
+
# =====================================================================
|
|
1006
|
+
# Conditional Validation Methods (DuckGuard 3.0)
|
|
1007
|
+
# =====================================================================
|
|
1008
|
+
|
|
1009
|
+
def not_null_when(self, condition: str, threshold: float = 1.0) -> ValidationResult:
    """Require non-null values in rows where *condition* holds.

    Enables conditional validation such as:
    - "State must not be null when country = 'USA'"
    - "Phone is required when contact_method = 'phone'"

    Args:
        condition: SQL WHERE clause condition (without WHERE keyword)
        threshold: Maximum allowed non-null rate (0.0 to 1.0, default 1.0)

    Returns:
        ValidationResult with pass/fail status

    Raises:
        ValidationError: If condition is invalid or contains forbidden SQL

    Examples:
        >>> data = connect("customers.csv")
        >>> # State required for US customers
        >>> result = data.state.not_null_when("country = 'USA'")
        >>> assert result.passed

        >>> # Email required for registered users
        >>> result = data.email.not_null_when("user_type = 'registered'")
        >>> assert result.passed

    Security:
        Conditions are validated to prevent SQL injection. Only SELECT
        queries with WHERE clauses are allowed.
    """
    from duckguard.checks.conditional import ConditionalCheckHandler

    # Delegate the actual SQL construction and execution to the handler.
    return ConditionalCheckHandler().execute_not_null_when(
        dataset=self._dataset,
        column=self._name,
        condition=condition,
        threshold=threshold,
    )
|
|
1053
|
+
|
|
1054
|
+
def unique_when(self, condition: str, threshold: float = 1.0) -> ValidationResult:
    """Require uniqueness among rows where *condition* holds.

    Args:
        condition: SQL WHERE clause condition (without WHERE keyword)
        threshold: Minimum required uniqueness rate (0.0 to 1.0, default 1.0)

    Returns:
        ValidationResult with pass/fail status

    Examples:
        >>> # Order IDs must be unique for completed orders
        >>> result = data.order_id.unique_when("status = 'completed'")
        >>> assert result.passed

        >>> # Transaction IDs unique for successful transactions
        >>> result = data.txn_id.unique_when("success = true")
        >>> assert result.passed
    """
    from duckguard.checks.conditional import ConditionalCheckHandler

    # Delegate the actual SQL construction and execution to the handler.
    return ConditionalCheckHandler().execute_unique_when(
        dataset=self._dataset,
        column=self._name,
        condition=condition,
        threshold=threshold,
    )
|
|
1086
|
+
|
|
1087
|
+
def between_when(
    self,
    min_val: float,
    max_val: float,
    condition: str,
    threshold: float = 1.0,
) -> ValidationResult:
    """Require values in [min_val, max_val] among rows where *condition* holds.

    Args:
        min_val: Minimum allowed value
        max_val: Maximum allowed value
        condition: SQL WHERE clause condition (without WHERE keyword)
        threshold: Maximum allowed failure rate (0.0 to 1.0, default 1.0)

    Returns:
        ValidationResult with pass/fail status

    Examples:
        >>> # Discount between 0-50% for standard customers
        >>> result = data.discount.between_when(
        ...     min_val=0,
        ...     max_val=50,
        ...     condition="customer_tier = 'standard'"
        ... )
        >>> assert result.passed

        >>> # Age between 18-65 for employees
        >>> result = data.age.between_when(18, 65, "type = 'employee'")
        >>> assert result.passed
    """
    from duckguard.checks.conditional import ConditionalCheckHandler

    # Delegate the actual SQL construction and execution to the handler.
    return ConditionalCheckHandler().execute_between_when(
        dataset=self._dataset,
        column=self._name,
        min_value=min_val,
        max_value=max_val,
        condition=condition,
        threshold=threshold,
    )
|
|
1129
|
+
|
|
1130
|
+
def isin_when(
    self,
    allowed_values: list[Any],
    condition: str,
    threshold: float = 1.0,
) -> ValidationResult:
    """Require membership in *allowed_values* among rows where *condition* holds.

    Args:
        allowed_values: List of allowed values
        condition: SQL WHERE clause condition (without WHERE keyword)
        threshold: Maximum allowed failure rate (0.0 to 1.0, default 1.0)

    Returns:
        ValidationResult with pass/fail status

    Examples:
        >>> # Status must be specific values for paid orders
        >>> result = data.status.isin_when(
        ...     allowed_values=['shipped', 'delivered'],
        ...     condition="payment_status = 'paid'"
        ... )
        >>> assert result.passed

        >>> # Category restricted for active products
        >>> result = data.category.isin_when(
        ...     ['A', 'B', 'C'],
        ...     "is_active = true"
        ... )
        >>> assert result.passed
    """
    from duckguard.checks.conditional import ConditionalCheckHandler

    # Delegate the actual SQL construction and execution to the handler.
    return ConditionalCheckHandler().execute_isin_when(
        dataset=self._dataset,
        column=self._name,
        allowed_values=allowed_values,
        condition=condition,
        threshold=threshold,
    )
|
|
1171
|
+
|
|
1172
|
+
def matches_when(
    self,
    pattern: str,
    condition: str,
    threshold: float = 1.0,
) -> ValidationResult:
    """Require values matching *pattern* among rows where *condition* holds.

    Args:
        pattern: Regular expression pattern to match
        condition: SQL WHERE clause condition (without WHERE keyword)
        threshold: Maximum allowed failure rate (0.0 to 1.0, default 1.0)

    Returns:
        ValidationResult with pass/fail status

    Examples:
        >>> # Email format required for email notifications
        >>> result = data.contact.matches_when(
        ...     pattern=r'^[\\w.-]+@[\\w.-]+\\.\\w+$',
        ...     condition="notification_type = 'email'"
        ... )
        >>> assert result.passed

        >>> # Phone format required for SMS
        >>> result = data.contact.matches_when(
        ...     pattern=r'^\\+?[0-9]{10,15}$',
        ...     condition="notification_type = 'sms'"
        ... )
        >>> assert result.passed
    """
    from duckguard.checks.conditional import ConditionalCheckHandler

    # Delegate the actual SQL construction and execution to the handler.
    return ConditionalCheckHandler().execute_pattern_when(
        dataset=self._dataset,
        column=self._name,
        pattern=pattern,
        condition=condition,
        threshold=threshold,
    )
|
|
1213
|
+
|
|
1214
|
+
# =================================================================
|
|
1215
|
+
# Distributional Checks (DuckGuard 3.0)
|
|
1216
|
+
# =================================================================
|
|
1217
|
+
|
|
1218
|
+
def expect_distribution_normal(self, significance_level: float = 0.05) -> ValidationResult:
    """Test whether the column's values look normally distributed.

    Runs a Kolmogorov-Smirnov test against a normal distribution fitted
    to the data.

    Args:
        significance_level: Significance level for test (default 0.05)

    Returns:
        ValidationResult (passed if p-value > significance_level)

    Examples:
        >>> # Test if temperature measurements are normally distributed
        >>> result = data.temperature.expect_distribution_normal()
        >>> assert result.passed

        >>> # Use stricter significance level
        >>> result = data.measurement.expect_distribution_normal(
        ...     significance_level=0.01
        ... )

    Note:
        Requires scipy: pip install 'duckguard[statistics]'
        Requires minimum 30 samples for reliable results.
    """
    from duckguard.checks.distributional import DistributionalCheckHandler

    # Delegate sampling and the scipy-backed test to the handler.
    return DistributionalCheckHandler().execute_distribution_normal(
        dataset=self._dataset,
        column=self._name,
        significance_level=significance_level,
    )
|
|
1254
|
+
|
|
1255
|
+
def expect_distribution_uniform(
    self,
    significance_level: float = 0.05
) -> ValidationResult:
    """Validate that this column's values appear uniformly distributed.

    Applies a Kolmogorov-Smirnov goodness-of-fit test against a uniform
    distribution.

    Args:
        significance_level: Significance level for the test (default 0.05).

    Returns:
        ValidationResult; passes when the p-value exceeds
        ``significance_level``.

    Examples:
        >>> # Test if random numbers are uniformly distributed
        >>> result = data.random_value.expect_distribution_uniform()
        >>> assert result.passed

        >>> # Test dice rolls for fairness
        >>> result = data.dice_roll.expect_distribution_uniform()

    Note:
        Requires scipy: pip install 'duckguard[statistics]'
        Requires minimum 30 samples for reliable results.
    """
    # Deferred import: keeps the optional statistics stack out of module load.
    from duckguard.checks.distributional import DistributionalCheckHandler

    return DistributionalCheckHandler().execute_distribution_uniform(
        dataset=self._dataset,
        column=self._name,
        significance_level=significance_level,
    )
|
|
1289
|
+
|
|
1290
|
+
def expect_ks_test(
    self,
    distribution: str = "norm",
    significance_level: float = 0.05
) -> ValidationResult:
    """Run a Kolmogorov-Smirnov test against a named distribution.

    Args:
        distribution: scipy distribution name ('norm', 'uniform',
            'expon', etc.).
        significance_level: Significance level for the test (default 0.05).

    Returns:
        ValidationResult; passes when the p-value exceeds
        ``significance_level``.

    Examples:
        >>> # Test for normal distribution
        >>> result = data.values.expect_ks_test(distribution='norm')
        >>> assert result.passed

        >>> # Test for exponential distribution
        >>> result = data.wait_times.expect_ks_test(
        ...     distribution='expon',
        ...     significance_level=0.01
        ... )

    Note:
        Requires scipy: pip install 'duckguard[statistics]'
        Supported distributions: norm, uniform, expon, gamma, beta, etc.
    """
    # Deferred import: keeps the optional statistics stack out of module load.
    from duckguard.checks.distributional import DistributionalCheckHandler

    return DistributionalCheckHandler().execute_ks_test(
        dataset=self._dataset,
        column=self._name,
        distribution=distribution,
        significance_level=significance_level,
    )
|
|
1328
|
+
|
|
1329
|
+
def expect_chi_square_test(
    self,
    expected_frequencies: dict | None = None,
    significance_level: float = 0.05
) -> ValidationResult:
    """Run a chi-square goodness-of-fit test on categorical data.

    Compares observed category frequencies against expected ones.

    Args:
        expected_frequencies: Mapping of category -> expected frequency.
            When ``None``, a uniform distribution is assumed.
        significance_level: Significance level for the test (default 0.05).

    Returns:
        ValidationResult; passes when the p-value exceeds
        ``significance_level``.

    Examples:
        >>> # Test if dice is fair (uniform distribution)
        >>> result = data.dice_roll.expect_chi_square_test()
        >>> assert result.passed

        >>> # Test with specific expected frequencies
        >>> expected = {1: 1/6, 2: 1/6, 3: 1/6, 4: 1/6, 5: 1/6, 6: 1/6}
        >>> result = data.dice_roll.expect_chi_square_test(
        ...     expected_frequencies=expected
        ... )

        >>> # Test categorical distribution
        >>> expected = {'A': 0.5, 'B': 0.3, 'C': 0.2}
        >>> result = data.category.expect_chi_square_test(
        ...     expected_frequencies=expected
        ... )

    Note:
        Requires scipy: pip install 'duckguard[statistics]'
        Requires minimum 30 samples for reliable results.
    """
    # Deferred import: keeps the optional statistics stack out of module load.
    from duckguard.checks.distributional import DistributionalCheckHandler

    return DistributionalCheckHandler().execute_chi_square_test(
        dataset=self._dataset,
        column=self._name,
        expected_frequencies=expected_frequencies,
        significance_level=significance_level,
    )
|
|
1376
|
+
|
|
527
1377
|
def clear_cache(self) -> None:
    """Discard any cached statistics held for this column."""
    self._stats_cache = None
|