duckguard 2.2.0__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
duckguard/core/column.py CHANGED
@@ -4,7 +4,7 @@ from __future__ import annotations
4
4
 
5
5
  from typing import TYPE_CHECKING, Any
6
6
 
7
- from duckguard.core.result import FailedRow, ValidationResult
7
+ from duckguard.core.result import DriftResult, FailedRow, ValidationResult
8
8
 
9
9
  if TYPE_CHECKING:
10
10
  from duckguard.core.dataset import Dataset
@@ -477,6 +477,358 @@ class Column:
477
477
  message=f"Column '{self._name}' has {invalid_count} values with length outside [{min_len}, {max_len}]",
478
478
  )
479
479
 
480
+ # =========================================================================
481
+ # Cross-Dataset Validation Methods (Reference/FK Checks)
482
+ # =========================================================================
483
+
484
def exists_in(
    self,
    reference_column: Column,
    capture_failures: bool = True,
) -> ValidationResult:
    """
    Validate that every non-null value of this column appears in the reference column.

    Foreign-key style check implemented as a SQL anti-join (``NOT EXISTS``).
    Rows where this column is null are excluded from the check.

    Args:
        reference_column: Column object from the reference dataset
        capture_failures: Whether to attach a sample of orphaned rows (default: True)

    Returns:
        ValidationResult whose ``actual_value`` / ``total_failures`` is the
        number of orphan rows and whose ``failed_rows`` holds the sample.

    Example:
        orders = connect("orders.parquet")
        customers = connect("customers.parquet")
        result = orders["customer_id"].exists_in(customers["id"])
        if not result:
            print(f"Found {result.actual_value} orphan customer IDs")
    """
    engine = self._dataset.engine
    child_table = engine.get_source_reference(self._dataset.source)
    parent_dataset = reference_column._dataset
    parent_table = parent_dataset.engine.get_source_reference(parent_dataset.source)
    child_col = f'"{self._name}"'
    parent_col = f'"{reference_column._name}"'

    # Anti-join: count source rows that have no matching reference row.
    sql = f"""
        SELECT COUNT(*) as orphan_count
        FROM {child_table} s
        WHERE s.{child_col} IS NOT NULL
        AND NOT EXISTS (
            SELECT 1 FROM {parent_table} r
            WHERE r.{parent_col} = s.{child_col}
        )
    """

    # A falsy fetch result is reported as zero orphans.
    orphan_count = engine.fetch_value(sql) or 0
    has_orphans = orphan_count != 0

    # Only pay for the sample query when there is something to show.
    if has_orphans and capture_failures:
        failed_rows = self._get_failed_rows_exists_in(reference_column)
    else:
        failed_rows = []

    ref_dataset_name = parent_dataset.name or parent_dataset.source
    summary = (
        f"Column '{self._name}' has {orphan_count} values not found in "
        f"{ref_dataset_name}.{reference_column._name}"
    )
    return ValidationResult(
        passed=not has_orphans,
        actual_value=orphan_count,
        expected_value=0,
        message=summary,
        details={
            "orphan_count": orphan_count,
            "reference_dataset": ref_dataset_name,
            "reference_column": reference_column._name,
        },
        failed_rows=failed_rows,
        total_failures=orphan_count,
    )
550
+
551
def _get_failed_rows_exists_in(
    self, reference_column: Column, limit: int = DEFAULT_SAMPLE_SIZE
) -> list[FailedRow]:
    """Get a sample of rows whose value is not found in the reference column.

    Args:
        reference_column: Column in the reference dataset the values must exist in.
        limit: Maximum number of orphan rows to return.

    Returns:
        One FailedRow per sampled orphan row. ``row_index`` is the row's
        1-based position in the source as numbered by the engine before
        filtering (no ORDER BY is applied, so it follows the engine's scan order).
    """
    source_ref = self._dataset.engine.get_source_reference(self._dataset.source)
    ref_ref = reference_column._dataset.engine.get_source_reference(
        reference_column._dataset.source
    )
    source_col = f'"{self._name}"'
    ref_col = f'"{reference_column._name}"'

    # Number the rows in a subquery *before* filtering. Window functions are
    # evaluated after WHERE, so numbering in the same SELECT as the filter
    # would only yield 1..k over the orphan rows and would not locate them.
    sql = f"""
        SELECT s.row_idx, s.val
        FROM (
            SELECT row_number() OVER () AS row_idx, {source_col} AS val
            FROM {source_ref}
        ) s
        WHERE s.val IS NOT NULL
        AND NOT EXISTS (
            SELECT 1 FROM {ref_ref} r
            WHERE r.{ref_col} = s.val
        )
        LIMIT {limit}
    """

    rows = self._dataset.engine.fetch_all(sql)
    ref_dataset_name = reference_column._dataset.name or reference_column._dataset.source
    return [
        FailedRow(
            row_index=row_idx,
            column=self._name,
            value=value,
            expected=f"exists in {ref_dataset_name}.{reference_column._name}",
            reason=f"Value '{value}' not found in reference",
            context={"reference_dataset": ref_dataset_name},
        )
        for row_idx, value in rows
    ]
586
+
587
def references(
    self,
    reference_column: Column,
    allow_nulls: bool = True,
    capture_failures: bool = True,
) -> ValidationResult:
    """
    Check foreign key relationship with configurable null handling.

    Runs exists_in() and, when ``allow_nulls`` is False, additionally counts
    null values in this column as failures.

    Args:
        reference_column: Column in the reference dataset
        allow_nulls: If True (default), null values pass. If False, nulls fail.
        capture_failures: Whether to capture sample failing rows (default: True)

    Returns:
        The exists_in() result unchanged when nulls are allowed or the column
        has no nulls; otherwise a failing ValidationResult whose
        ``actual_value`` is orphan count + null count.

    Example:
        # Nulls are OK (default)
        result = orders["customer_id"].references(customers["id"])

        # Nulls should fail
        result = orders["customer_id"].references(
            customers["id"],
            allow_nulls=False,
        )
    """
    # Orphan check first (values not present in the reference column).
    orphan_result = self.exists_in(reference_column, capture_failures=capture_failures)
    if allow_nulls:
        return orphan_result

    null_count = self.null_count
    if null_count <= 0:
        # Nothing extra to report; the orphan result already is the answer.
        return orphan_result

    # From here on at least one null exists, so the check has failed
    # regardless of the orphan count.
    orphan_count = orphan_result.actual_value
    total_failures = orphan_count + null_count
    null_failed_rows = self._get_null_rows_sample() if capture_failures else []

    ref_dataset_name = reference_column._dataset.name or reference_column._dataset.source
    return ValidationResult(
        passed=False,
        actual_value=total_failures,
        expected_value=0,
        message=f"Column '{self._name}' has {orphan_count} orphans and {null_count} nulls (references {ref_dataset_name}.{reference_column._name})",
        details={
            "orphan_count": orphan_count,
            "null_count": null_count,
            "reference_dataset": ref_dataset_name,
            "reference_column": reference_column._name,
            "allow_nulls": allow_nulls,
        },
        failed_rows=orphan_result.failed_rows + null_failed_rows,
        total_failures=total_failures,
    )
651
+
652
def _get_null_rows_sample(self, limit: int = DEFAULT_SAMPLE_SIZE) -> list[FailedRow]:
    """Get a sample of rows where this column is null.

    Args:
        limit: Maximum number of null rows to return.

    Returns:
        One FailedRow per sampled null row. ``row_index`` is the row's 1-based
        position in the source as numbered by the engine before filtering
        (no ORDER BY is applied, so it follows the engine's scan order).
    """
    ref = self._dataset.engine.get_source_reference(self._dataset.source)
    col = f'"{self._name}"'

    # Number rows in a subquery first: a window function in the same SELECT
    # as the WHERE clause is evaluated after filtering and would only number
    # the null rows 1..k instead of giving their position in the source.
    sql = f"""
        SELECT numbered.row_idx
        FROM (
            SELECT row_number() OVER () AS row_idx, {col} AS val
            FROM {ref}
        ) numbered
        WHERE numbered.val IS NULL
        LIMIT {limit}
    """

    rows = self._dataset.engine.fetch_all(sql)
    return [
        FailedRow(
            row_index=row[0],
            column=self._name,
            value=None,
            expected="not null (allow_nulls=False)",
            reason="Null value not allowed",
        )
        for row in rows
    ]
675
+
676
def find_orphans(
    self,
    reference_column: Column,
    limit: int = 100,
) -> list[Any]:
    """
    Return distinct non-null values of this column that are absent from the reference column.

    Lightweight helper for inspecting orphan values directly, without
    building a validation result.

    Args:
        reference_column: Column in the reference dataset
        limit: Maximum number of orphan values to return (default: 100)

    Returns:
        List of orphan values

    Example:
        orphan_ids = orders["customer_id"].find_orphans(customers["id"])
        print(f"Invalid customer IDs: {orphan_ids}")
    """
    engine = self._dataset.engine
    child_table = engine.get_source_reference(self._dataset.source)
    parent_dataset = reference_column._dataset
    parent_table = parent_dataset.engine.get_source_reference(parent_dataset.source)
    child_col = f'"{self._name}"'
    parent_col = f'"{reference_column._name}"'

    # Same anti-join as the validation, but selecting the distinct values.
    sql = f"""
        SELECT DISTINCT s.{child_col}
        FROM {child_table} s
        WHERE s.{child_col} IS NOT NULL
        AND NOT EXISTS (
            SELECT 1 FROM {parent_table} r
            WHERE r.{parent_col} = s.{child_col}
        )
        LIMIT {limit}
    """

    orphans = []
    for record in engine.fetch_all(sql):
        orphans.append(record[0])
    return orphans
718
+
719
def matches_values(
    self,
    other_column: Column,
    capture_failures: bool = True,
) -> ValidationResult:
    """
    Verify that this column and another column contain the same set of distinct values.

    Handy for comparing reference data or checking that two datasets are in
    sync. A value present on only one side counts as a failure, in either
    direction. Null values are excluded on both sides.

    Args:
        other_column: Column to compare against
        capture_failures: Whether to capture sample mismatched values (default: True)

    Returns:
        ValidationResult indicating if value sets match; ``actual_value`` is
        the number of distinct values found on one side only.

    Example:
        result = orders["status"].matches_values(status_lookup["code"])
    """
    engine = self._dataset.engine
    this_table = engine.get_source_reference(self._dataset.source)
    other_dataset = other_column._dataset
    other_table = other_dataset.engine.get_source_reference(other_dataset.source)
    this_col = f'"{self._name}"'
    that_col = f'"{other_column._name}"'

    # Distinct values present here but absent from the other column.
    sql_missing = f"""
        SELECT COUNT(DISTINCT s.{this_col}) as missing_count
        FROM {this_table} s
        WHERE s.{this_col} IS NOT NULL
        AND NOT EXISTS (
            SELECT 1 FROM {other_table} o
            WHERE o.{that_col} = s.{this_col}
        )
    """

    # Distinct values present in the other column but absent here.
    sql_extra = f"""
        SELECT COUNT(DISTINCT o.{that_col}) as extra_count
        FROM {other_table} o
        WHERE o.{that_col} IS NOT NULL
        AND NOT EXISTS (
            SELECT 1 FROM {this_table} s
            WHERE s.{this_col} = o.{that_col}
        )
    """

    # A falsy fetch result is treated as a count of zero.
    missing_count = engine.fetch_value(sql_missing) or 0
    extra_count = engine.fetch_value(sql_extra) or 0
    total_diff = missing_count + extra_count
    sets_match = total_diff == 0

    if sets_match or not capture_failures:
        failed_rows = []
    else:
        failed_rows = self._get_failed_rows_matches_values(other_column)

    other_dataset_name = other_dataset.name or other_dataset.source
    summary = (
        f"Column '{self._name}' has {missing_count} values missing in "
        f"{other_dataset_name}.{other_column._name}, {extra_count} extra"
    )
    return ValidationResult(
        passed=sets_match,
        actual_value=total_diff,
        expected_value=0,
        message=summary,
        details={
            "missing_in_other": missing_count,
            "extra_in_other": extra_count,
            "other_dataset": other_dataset_name,
            "other_column": other_column._name,
        },
        failed_rows=failed_rows,
        total_failures=total_diff,
    )
794
+
795
def _get_failed_rows_matches_values(
    self, other_column: Column, limit: int = DEFAULT_SAMPLE_SIZE
) -> list[FailedRow]:
    """Get a sample of values that don't match between the two columns.

    Both directions are sampled: values of this column that are absent from
    ``other_column`` (``diff_type`` "missing_in_other") come first, and any
    remaining room up to ``limit`` is filled with values of ``other_column``
    that are absent from this column (``diff_type`` "extra_in_other").
    Without the second direction a comparison that fails only because of
    extra values would return an empty sample.

    Args:
        other_column: Column being compared against.
        limit: Maximum total number of sampled values.

    Returns:
        FailedRow entries with ``row_index`` numbering the sampled values
        from 1 (a position in the sample, not in the dataset) and the
        direction recorded under ``context["diff_type"]``.
    """
    engine = self._dataset.engine
    source_ref = engine.get_source_reference(self._dataset.source)
    other_ref = other_column._dataset.engine.get_source_reference(
        other_column._dataset.source
    )
    source_col = f'"{self._name}"'
    other_col = f'"{other_column._name}"'
    source_dataset_name = self._dataset.name or self._dataset.source
    other_dataset_name = other_column._dataset.name or other_column._dataset.source

    # Values in source but not in other
    sql_missing = f"""
        SELECT DISTINCT s.{source_col} as val, 'missing_in_other' as diff_type
        FROM {source_ref} s
        WHERE s.{source_col} IS NOT NULL
        AND NOT EXISTS (
            SELECT 1 FROM {other_ref} o
            WHERE o.{other_col} = s.{source_col}
        )
        LIMIT {limit}
    """
    missing_rows = engine.fetch_all(sql_missing)

    # Values in other but not in source, only if the sample still has room.
    extra_rows = []
    remaining = limit - len(missing_rows)
    if remaining > 0:
        sql_extra = f"""
            SELECT DISTINCT o.{other_col} as val, 'extra_in_other' as diff_type
            FROM {other_ref} o
            WHERE o.{other_col} IS NOT NULL
            AND NOT EXISTS (
                SELECT 1 FROM {source_ref} s
                WHERE s.{source_col} = o.{other_col}
            )
            LIMIT {remaining}
        """
        extra_rows = engine.fetch_all(sql_extra)

    failed = [
        FailedRow(
            row_index=idx + 1,
            column=self._name,
            value=row[0],
            expected=f"exists in {other_dataset_name}.{other_column._name}",
            reason=f"Value '{row[0]}' not found in other column",
            context={"diff_type": row[1]},
        )
        for idx, row in enumerate(missing_rows)
    ]
    failed.extend(
        FailedRow(
            row_index=idx,
            column=self._name,
            value=row[0],
            expected=f"exists in {source_dataset_name}.{self._name}",
            reason=f"Value '{row[0]}' from {other_dataset_name}.{other_column._name} not found in this column",
            context={"diff_type": row[1]},
        )
        for idx, row in enumerate(extra_rows, start=len(failed) + 1)
    )
    return failed
831
+
480
832
  def get_distinct_values(self, limit: int = 100) -> list[Any]:
481
833
  """
482
834
  Get distinct values in the column.
@@ -500,6 +852,132 @@ class Column:
500
852
  rows = self._dataset.engine.fetch_all(sql)
501
853
  return [row[0] for row in rows]
502
854
 
855
+ # =========================================================================
856
+ # Distribution Drift Detection
857
+ # =========================================================================
858
+
859
def detect_drift(
    self,
    reference_column: Column,
    threshold: float = 0.05,
    method: str = "ks_test",
) -> DriftResult:
    """
    Detect distribution drift between this column and a reference column.

    Runs a two-sample Kolmogorov-Smirnov test on the numeric values of both
    columns and flags drift when the p-value falls below ``threshold``.
    Useful for ML model monitoring and data pipeline validation.

    Args:
        reference_column: Column from reference/baseline dataset
        threshold: P-value threshold for drift detection (default: 0.05)
        method: Statistical test method; only "ks_test" (Kolmogorov-Smirnov)
            is implemented

    Returns:
        DriftResult with drift detection outcome. When either column yields
        no numeric values the result is "not drifted" with p-value 1.0 and
        the message "Insufficient data for drift detection".

    Raises:
        ValueError: If ``method`` is not "ks_test". Previously any string was
            accepted and echoed back in the result even though the KS test
            was always what actually ran.

    Example:
        current = connect("orders_today.parquet")
        baseline = connect("orders_baseline.parquet")
        result = current["amount"].detect_drift(baseline["amount"])
        if result.is_drifted:
            print(f"Distribution drift detected! p-value: {result.p_value}")
    """
    # Fail fast, before touching any data, so the reported method can never
    # differ from the test that produced the numbers.
    if method != "ks_test":
        raise ValueError(
            f"Unsupported drift detection method {method!r}; only 'ks_test' is implemented"
        )

    # Get values from both columns
    current_values = self._get_numeric_values()
    reference_values = reference_column._get_numeric_values()

    if not current_values or not reference_values:
        return DriftResult(
            is_drifted=False,
            p_value=1.0,
            statistic=0.0,
            threshold=threshold,
            method=method,
            message="Insufficient data for drift detection",
            details={"current_count": len(current_values), "reference_count": len(reference_values)},
        )

    # Perform KS test
    ks_stat, p_value = self._ks_test(current_values, reference_values)
    is_drifted = p_value < threshold

    ref_dataset_name = reference_column._dataset.name or reference_column._dataset.source
    if is_drifted:
        message = f"Distribution drift detected in '{self._name}' vs {ref_dataset_name}.{reference_column._name} (p-value: {p_value:.4f} < {threshold})"
    else:
        message = f"No significant drift in '{self._name}' vs {ref_dataset_name}.{reference_column._name} (p-value: {p_value:.4f})"

    return DriftResult(
        is_drifted=is_drifted,
        p_value=p_value,
        statistic=ks_stat,
        threshold=threshold,
        method=method,
        message=message,
        details={
            "current_column": self._name,
            "reference_column": reference_column._name,
            "reference_dataset": ref_dataset_name,
            "current_count": len(current_values),
            "reference_count": len(reference_values),
        },
    )
929
+
930
def _get_numeric_values(self, limit: int = 10000) -> list[float]:
    """Fetch up to ``limit`` non-null values of this column as floats.

    The column is cast to DOUBLE in SQL. If the query or the float
    conversion raises, an empty list is returned instead of propagating
    the error (best-effort: callers treat an empty list as "no data").
    """
    dataset = self._dataset
    table = dataset.engine.get_source_reference(dataset.source)
    quoted = f'"{self._name}"'

    query = f"""
        SELECT CAST({quoted} AS DOUBLE) as val
        FROM {table}
        WHERE {quoted} IS NOT NULL
        LIMIT {limit}
    """

    try:
        numbers = []
        for record in dataset.engine.fetch_all(query):
            raw = record[0]
            if raw is None:
                continue
            numbers.append(float(raw))
    except Exception:
        # Deliberately broad: any failure means "no usable numeric data".
        return []
    return numbers
947
+
948
def _ks_test(self, data1: list[float], data2: list[float]) -> tuple[float, float]:
    """Perform a two-sample Kolmogorov-Smirnov test.

    The statistic is the largest absolute difference between the two
    empirical CDFs, evaluated at every distinct observed value. The p-value
    is the one-term asymptotic approximation
    ``2 * exp(-2 * d^2 * n1 * n2 / (n1 + n2))`` clamped to [0, 1].

    Args:
        data1: First sample.
        data2: Second sample.

    Returns:
        (ks_statistic, p_value). If either sample is empty there is nothing
        to compare and (0.0, 1.0) is returned instead of dividing by zero.
    """
    import math
    from bisect import bisect_right

    n1, n2 = len(data1), len(data2)
    if n1 == 0 or n2 == 0:
        return 0.0, 1.0

    data1_sorted = sorted(data1)
    data2_sorted = sorted(data2)

    # bisect_right on a sorted sample gives the count of elements <= val,
    # i.e. the empirical CDF numerator, in O(log n) rather than rescanning
    # the whole sample for every distinct value.
    ks_stat = 0.0
    for val in set(data1_sorted).union(data2_sorted):
        cdf1 = bisect_right(data1_sorted, val) / n1
        cdf2 = bisect_right(data2_sorted, val) / n2
        ks_stat = max(ks_stat, abs(cdf1 - cdf2))

    # Approximate p-value using asymptotic formula
    # P(D > d) ≈ 2 * exp(-2 * d^2 * n1 * n2 / (n1 + n2))
    en = math.sqrt(n1 * n2 / (n1 + n2))
    p_value = 2.0 * math.exp(-2.0 * (ks_stat * en) ** 2)
    p_value = min(1.0, max(0.0, p_value))

    return ks_stat, p_value
980
+
503
981
  def get_value_counts(self, limit: int = 20) -> dict[Any, int]:
504
982
  """
505
983
  Get value counts for the column.