duckguard 2.2.0__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. duckguard/__init__.py +1 -1
  2. duckguard/anomaly/__init__.py +28 -0
  3. duckguard/anomaly/baselines.py +294 -0
  4. duckguard/anomaly/methods.py +16 -2
  5. duckguard/anomaly/ml_methods.py +724 -0
  6. duckguard/checks/__init__.py +26 -0
  7. duckguard/checks/conditional.py +796 -0
  8. duckguard/checks/distributional.py +524 -0
  9. duckguard/checks/multicolumn.py +726 -0
  10. duckguard/checks/query_based.py +643 -0
  11. duckguard/cli/main.py +257 -2
  12. duckguard/connectors/factory.py +30 -2
  13. duckguard/connectors/files.py +7 -3
  14. duckguard/core/column.py +851 -1
  15. duckguard/core/dataset.py +1035 -0
  16. duckguard/core/result.py +236 -0
  17. duckguard/freshness/__init__.py +33 -0
  18. duckguard/freshness/monitor.py +429 -0
  19. duckguard/history/schema.py +119 -1
  20. duckguard/notifications/__init__.py +20 -2
  21. duckguard/notifications/email.py +508 -0
  22. duckguard/profiler/distribution_analyzer.py +384 -0
  23. duckguard/profiler/outlier_detector.py +497 -0
  24. duckguard/profiler/pattern_matcher.py +301 -0
  25. duckguard/profiler/quality_scorer.py +445 -0
  26. duckguard/reports/html_reporter.py +1 -2
  27. duckguard/rules/executor.py +642 -0
  28. duckguard/rules/generator.py +4 -1
  29. duckguard/rules/schema.py +54 -0
  30. duckguard/schema_history/__init__.py +40 -0
  31. duckguard/schema_history/analyzer.py +414 -0
  32. duckguard/schema_history/tracker.py +288 -0
  33. duckguard/semantic/detector.py +17 -1
  34. duckguard-3.0.0.dist-info/METADATA +1072 -0
  35. {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/RECORD +38 -21
  36. duckguard-2.2.0.dist-info/METADATA +0 -351
  37. {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/WHEEL +0 -0
  38. {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/entry_points.txt +0 -0
  39. {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/licenses/LICENSE +0 -0
duckguard/core/column.py CHANGED
@@ -4,7 +4,7 @@ from __future__ import annotations
4
4
 
5
5
  from typing import TYPE_CHECKING, Any
6
6
 
7
- from duckguard.core.result import FailedRow, ValidationResult
7
+ from duckguard.core.result import DriftResult, FailedRow, ValidationResult
8
8
 
9
9
  if TYPE_CHECKING:
10
10
  from duckguard.core.dataset import Dataset
@@ -477,6 +477,358 @@ class Column:
477
477
  message=f"Column '{self._name}' has {invalid_count} values with length outside [{min_len}, {max_len}]",
478
478
  )
479
479
 
480
+ # =========================================================================
481
+ # Cross-Dataset Validation Methods (Reference/FK Checks)
482
+ # =========================================================================
483
+
484
+ def exists_in(
485
+ self,
486
+ reference_column: Column,
487
+ capture_failures: bool = True,
488
+ ) -> ValidationResult:
489
+ """
490
+ Check that all non-null values in this column exist in the reference column.
491
+
492
+ This is the core foreign key validation method using an efficient SQL anti-join.
493
+ Null values in this column are ignored (they don't need to exist in reference).
494
+
495
+ Args:
496
+ reference_column: Column object from the reference dataset
497
+ capture_failures: Whether to capture sample orphaned rows (default: True)
498
+
499
+ Returns:
500
+ ValidationResult with orphan count and sample failed rows
501
+
502
+ Example:
503
+ orders = connect("orders.parquet")
504
+ customers = connect("customers.parquet")
505
+ result = orders["customer_id"].exists_in(customers["id"])
506
+ if not result:
507
+ print(f"Found {result.actual_value} orphan customer IDs")
508
+ """
509
+ # Get source references for both datasets
510
+ source_ref = self._dataset.engine.get_source_reference(self._dataset.source)
511
+ ref_ref = reference_column._dataset.engine.get_source_reference(
512
+ reference_column._dataset.source
513
+ )
514
+ source_col = f'"{self._name}"'
515
+ ref_col = f'"{reference_column._name}"'
516
+
517
+ # Count orphans using efficient anti-join pattern
518
+ sql = f"""
519
+ SELECT COUNT(*) as orphan_count
520
+ FROM {source_ref} s
521
+ WHERE s.{source_col} IS NOT NULL
522
+ AND NOT EXISTS (
523
+ SELECT 1 FROM {ref_ref} r
524
+ WHERE r.{ref_col} = s.{source_col}
525
+ )
526
+ """
527
+
528
+ orphan_count = self._dataset.engine.fetch_value(sql) or 0
529
+ passed = orphan_count == 0
530
+
531
+ # Capture sample of orphan rows for debugging
532
+ failed_rows = []
533
+ if not passed and capture_failures:
534
+ failed_rows = self._get_failed_rows_exists_in(reference_column)
535
+
536
+ ref_dataset_name = reference_column._dataset.name or reference_column._dataset.source
537
+ return ValidationResult(
538
+ passed=passed,
539
+ actual_value=orphan_count,
540
+ expected_value=0,
541
+ message=f"Column '{self._name}' has {orphan_count} values not found in {ref_dataset_name}.{reference_column._name}",
542
+ details={
543
+ "orphan_count": orphan_count,
544
+ "reference_dataset": ref_dataset_name,
545
+ "reference_column": reference_column._name,
546
+ },
547
+ failed_rows=failed_rows,
548
+ total_failures=orphan_count,
549
+ )
550
+
551
+ def _get_failed_rows_exists_in(
552
+ self, reference_column: Column, limit: int = DEFAULT_SAMPLE_SIZE
553
+ ) -> list[FailedRow]:
554
+ """Get sample of rows with orphan values (not found in reference)."""
555
+ source_ref = self._dataset.engine.get_source_reference(self._dataset.source)
556
+ ref_ref = reference_column._dataset.engine.get_source_reference(
557
+ reference_column._dataset.source
558
+ )
559
+ source_col = f'"{self._name}"'
560
+ ref_col = f'"{reference_column._name}"'
561
+
562
+ sql = f"""
563
+ SELECT row_number() OVER () as row_idx, s.{source_col} as val
564
+ FROM {source_ref} s
565
+ WHERE s.{source_col} IS NOT NULL
566
+ AND NOT EXISTS (
567
+ SELECT 1 FROM {ref_ref} r
568
+ WHERE r.{ref_col} = s.{source_col}
569
+ )
570
+ LIMIT {limit}
571
+ """
572
+
573
+ rows = self._dataset.engine.fetch_all(sql)
574
+ ref_dataset_name = reference_column._dataset.name or reference_column._dataset.source
575
+ return [
576
+ FailedRow(
577
+ row_index=row[0],
578
+ column=self._name,
579
+ value=row[1],
580
+ expected=f"exists in {ref_dataset_name}.{reference_column._name}",
581
+ reason=f"Value '{row[1]}' not found in reference",
582
+ context={"reference_dataset": ref_dataset_name},
583
+ )
584
+ for row in rows
585
+ ]
586
+
587
+ def references(
588
+ self,
589
+ reference_column: Column,
590
+ allow_nulls: bool = True,
591
+ capture_failures: bool = True,
592
+ ) -> ValidationResult:
593
+ """
594
+ Check foreign key relationship with configurable options.
595
+
596
+ This is a more configurable version of exists_in() that allows
597
+ controlling how null values are handled.
598
+
599
+ Args:
600
+ reference_column: Column in the reference dataset
601
+ allow_nulls: If True (default), null values pass. If False, nulls fail.
602
+ capture_failures: Whether to capture sample orphaned rows (default: True)
603
+
604
+ Returns:
605
+ ValidationResult
606
+
607
+ Example:
608
+ # Nulls are OK (default)
609
+ result = orders["customer_id"].references(customers["id"])
610
+
611
+ # Nulls should fail
612
+ result = orders["customer_id"].references(
613
+ customers["id"],
614
+ allow_nulls=False,
615
+ )
616
+ """
617
+ # First, check for orphans (values not in reference)
618
+ result = self.exists_in(reference_column, capture_failures=capture_failures)
619
+
620
+ if not allow_nulls:
621
+ # Also count nulls as failures
622
+ null_count = self.null_count
623
+ if null_count > 0:
624
+ # Combine orphan failures with null failures
625
+ total_failures = result.actual_value + null_count
626
+ passed = total_failures == 0
627
+
628
+ # Add null rows to failed_rows if capturing
629
+ null_failed_rows = []
630
+ if capture_failures and null_count > 0:
631
+ null_failed_rows = self._get_null_rows_sample()
632
+
633
+ ref_dataset_name = reference_column._dataset.name or reference_column._dataset.source
634
+ return ValidationResult(
635
+ passed=passed,
636
+ actual_value=total_failures,
637
+ expected_value=0,
638
+ message=f"Column '{self._name}' has {result.actual_value} orphans and {null_count} nulls (references {ref_dataset_name}.{reference_column._name})",
639
+ details={
640
+ "orphan_count": result.actual_value,
641
+ "null_count": null_count,
642
+ "reference_dataset": ref_dataset_name,
643
+ "reference_column": reference_column._name,
644
+ "allow_nulls": allow_nulls,
645
+ },
646
+ failed_rows=result.failed_rows + null_failed_rows,
647
+ total_failures=total_failures,
648
+ )
649
+
650
+ return result
651
+
652
+ def _get_null_rows_sample(self, limit: int = DEFAULT_SAMPLE_SIZE) -> list[FailedRow]:
653
+ """Get sample of rows with null values."""
654
+ ref = self._dataset.engine.get_source_reference(self._dataset.source)
655
+ col = f'"{self._name}"'
656
+
657
+ sql = f"""
658
+ SELECT row_number() OVER () as row_idx
659
+ FROM {ref}
660
+ WHERE {col} IS NULL
661
+ LIMIT {limit}
662
+ """
663
+
664
+ rows = self._dataset.engine.fetch_all(sql)
665
+ return [
666
+ FailedRow(
667
+ row_index=row[0],
668
+ column=self._name,
669
+ value=None,
670
+ expected="not null (allow_nulls=False)",
671
+ reason="Null value not allowed",
672
+ )
673
+ for row in rows
674
+ ]
675
+
676
+ def find_orphans(
677
+ self,
678
+ reference_column: Column,
679
+ limit: int = 100,
680
+ ) -> list[Any]:
681
+ """
682
+ Find values that don't exist in the reference column.
683
+
684
+ This is a helper method to quickly identify orphan values
685
+ without running a full validation.
686
+
687
+ Args:
688
+ reference_column: Column in the reference dataset
689
+ limit: Maximum number of orphan values to return (default: 100)
690
+
691
+ Returns:
692
+ List of orphan values
693
+
694
+ Example:
695
+ orphan_ids = orders["customer_id"].find_orphans(customers["id"])
696
+ print(f"Invalid customer IDs: {orphan_ids}")
697
+ """
698
+ source_ref = self._dataset.engine.get_source_reference(self._dataset.source)
699
+ ref_ref = reference_column._dataset.engine.get_source_reference(
700
+ reference_column._dataset.source
701
+ )
702
+ source_col = f'"{self._name}"'
703
+ ref_col = f'"{reference_column._name}"'
704
+
705
+ sql = f"""
706
+ SELECT DISTINCT s.{source_col}
707
+ FROM {source_ref} s
708
+ WHERE s.{source_col} IS NOT NULL
709
+ AND NOT EXISTS (
710
+ SELECT 1 FROM {ref_ref} r
711
+ WHERE r.{ref_col} = s.{source_col}
712
+ )
713
+ LIMIT {limit}
714
+ """
715
+
716
+ rows = self._dataset.engine.fetch_all(sql)
717
+ return [row[0] for row in rows]
718
+
719
+ def matches_values(
720
+ self,
721
+ other_column: Column,
722
+ capture_failures: bool = True,
723
+ ) -> ValidationResult:
724
+ """
725
+ Check that this column's distinct values match another column's distinct values.
726
+
727
+ Useful for comparing reference data or checking data synchronization.
728
+ Both "missing in other" and "extra in other" are considered failures.
729
+
730
+ Args:
731
+ other_column: Column to compare against
732
+ capture_failures: Whether to capture sample mismatched values (default: True)
733
+
734
+ Returns:
735
+ ValidationResult indicating if value sets match
736
+
737
+ Example:
738
+ result = orders["status"].matches_values(status_lookup["code"])
739
+ """
740
+ source_ref = self._dataset.engine.get_source_reference(self._dataset.source)
741
+ other_ref = other_column._dataset.engine.get_source_reference(
742
+ other_column._dataset.source
743
+ )
744
+ source_col = f'"{self._name}"'
745
+ other_col = f'"{other_column._name}"'
746
+
747
+ # Count values in source but not in other
748
+ sql_missing = f"""
749
+ SELECT COUNT(DISTINCT s.{source_col}) as missing_count
750
+ FROM {source_ref} s
751
+ WHERE s.{source_col} IS NOT NULL
752
+ AND NOT EXISTS (
753
+ SELECT 1 FROM {other_ref} o
754
+ WHERE o.{other_col} = s.{source_col}
755
+ )
756
+ """
757
+
758
+ # Count values in other but not in source
759
+ sql_extra = f"""
760
+ SELECT COUNT(DISTINCT o.{other_col}) as extra_count
761
+ FROM {other_ref} o
762
+ WHERE o.{other_col} IS NOT NULL
763
+ AND NOT EXISTS (
764
+ SELECT 1 FROM {source_ref} s
765
+ WHERE s.{source_col} = o.{other_col}
766
+ )
767
+ """
768
+
769
+ missing_count = self._dataset.engine.fetch_value(sql_missing) or 0
770
+ extra_count = self._dataset.engine.fetch_value(sql_extra) or 0
771
+ total_diff = missing_count + extra_count
772
+ passed = total_diff == 0
773
+
774
+ # Capture sample of mismatched values
775
+ failed_rows = []
776
+ if not passed and capture_failures:
777
+ failed_rows = self._get_failed_rows_matches_values(other_column)
778
+
779
+ other_dataset_name = other_column._dataset.name or other_column._dataset.source
780
+ return ValidationResult(
781
+ passed=passed,
782
+ actual_value=total_diff,
783
+ expected_value=0,
784
+ message=f"Column '{self._name}' has {missing_count} values missing in {other_dataset_name}.{other_column._name}, {extra_count} extra",
785
+ details={
786
+ "missing_in_other": missing_count,
787
+ "extra_in_other": extra_count,
788
+ "other_dataset": other_dataset_name,
789
+ "other_column": other_column._name,
790
+ },
791
+ failed_rows=failed_rows,
792
+ total_failures=total_diff,
793
+ )
794
+
795
+ def _get_failed_rows_matches_values(
796
+ self, other_column: Column, limit: int = DEFAULT_SAMPLE_SIZE
797
+ ) -> list[FailedRow]:
798
+ """Get sample of values that don't match between columns."""
799
+ source_ref = self._dataset.engine.get_source_reference(self._dataset.source)
800
+ other_ref = other_column._dataset.engine.get_source_reference(
801
+ other_column._dataset.source
802
+ )
803
+ source_col = f'"{self._name}"'
804
+ other_col = f'"{other_column._name}"'
805
+
806
+ # Get values in source but not in other
807
+ sql = f"""
808
+ SELECT DISTINCT s.{source_col} as val, 'missing_in_other' as diff_type
809
+ FROM {source_ref} s
810
+ WHERE s.{source_col} IS NOT NULL
811
+ AND NOT EXISTS (
812
+ SELECT 1 FROM {other_ref} o
813
+ WHERE o.{other_col} = s.{source_col}
814
+ )
815
+ LIMIT {limit}
816
+ """
817
+
818
+ rows = self._dataset.engine.fetch_all(sql)
819
+ other_dataset_name = other_column._dataset.name or other_column._dataset.source
820
+ return [
821
+ FailedRow(
822
+ row_index=idx + 1,
823
+ column=self._name,
824
+ value=row[0],
825
+ expected=f"exists in {other_dataset_name}.{other_column._name}",
826
+ reason=f"Value '{row[0]}' not found in other column",
827
+ context={"diff_type": row[1]},
828
+ )
829
+ for idx, row in enumerate(rows)
830
+ ]
831
+
480
832
  def get_distinct_values(self, limit: int = 100) -> list[Any]:
481
833
  """
482
834
  Get distinct values in the column.
@@ -500,6 +852,132 @@ class Column:
500
852
  rows = self._dataset.engine.fetch_all(sql)
501
853
  return [row[0] for row in rows]
502
854
 
855
+ # =========================================================================
856
+ # Distribution Drift Detection
857
+ # =========================================================================
858
+
859
+ def detect_drift(
860
+ self,
861
+ reference_column: Column,
862
+ threshold: float = 0.05,
863
+ method: str = "ks_test",
864
+ ) -> DriftResult:
865
+ """
866
+ Detect distribution drift between this column and a reference column.
867
+
868
+ Uses statistical tests to determine if the distribution of values
869
+ has changed significantly. Useful for ML model monitoring and
870
+ data pipeline validation.
871
+
872
+ Args:
873
+ reference_column: Column from reference/baseline dataset
874
+ threshold: P-value threshold for drift detection (default: 0.05)
875
+ method: Statistical test method ("ks_test" for Kolmogorov-Smirnov)
876
+
877
+ Returns:
878
+ DriftResult with drift detection outcome
879
+
880
+ Example:
881
+ current = connect("orders_today.parquet")
882
+ baseline = connect("orders_baseline.parquet")
883
+ result = current["amount"].detect_drift(baseline["amount"])
884
+ if result.is_drifted:
885
+ print(f"Distribution drift detected! p-value: {result.p_value}")
886
+ """
887
+ from duckguard.core.result import DriftResult
888
+
889
+ # Get values from both columns
890
+ current_values = self._get_numeric_values()
891
+ reference_values = reference_column._get_numeric_values()
892
+
893
+ if len(current_values) == 0 or len(reference_values) == 0:
894
+ return DriftResult(
895
+ is_drifted=False,
896
+ p_value=1.0,
897
+ statistic=0.0,
898
+ threshold=threshold,
899
+ method=method,
900
+ message="Insufficient data for drift detection",
901
+ details={"current_count": len(current_values), "reference_count": len(reference_values)},
902
+ )
903
+
904
+ # Perform KS test
905
+ ks_stat, p_value = self._ks_test(current_values, reference_values)
906
+ is_drifted = p_value < threshold
907
+
908
+ ref_dataset_name = reference_column._dataset.name or reference_column._dataset.source
909
+ if is_drifted:
910
+ message = f"Distribution drift detected in '{self._name}' vs {ref_dataset_name}.{reference_column._name} (p-value: {p_value:.4f} < {threshold})"
911
+ else:
912
+ message = f"No significant drift in '{self._name}' vs {ref_dataset_name}.{reference_column._name} (p-value: {p_value:.4f})"
913
+
914
+ return DriftResult(
915
+ is_drifted=is_drifted,
916
+ p_value=p_value,
917
+ statistic=ks_stat,
918
+ threshold=threshold,
919
+ method=method,
920
+ message=message,
921
+ details={
922
+ "current_column": self._name,
923
+ "reference_column": reference_column._name,
924
+ "reference_dataset": ref_dataset_name,
925
+ "current_count": len(current_values),
926
+ "reference_count": len(reference_values),
927
+ },
928
+ )
929
+
930
+ def _get_numeric_values(self, limit: int = 10000) -> list[float]:
931
+ """Get numeric values from this column for statistical analysis."""
932
+ ref = self._dataset.engine.get_source_reference(self._dataset.source)
933
+ col = f'"{self._name}"'
934
+
935
+ sql = f"""
936
+ SELECT CAST({col} AS DOUBLE) as val
937
+ FROM {ref}
938
+ WHERE {col} IS NOT NULL
939
+ LIMIT {limit}
940
+ """
941
+
942
+ try:
943
+ rows = self._dataset.engine.fetch_all(sql)
944
+ return [float(row[0]) for row in rows if row[0] is not None]
945
+ except Exception:
946
+ return []
947
+
948
+ def _ks_test(self, data1: list[float], data2: list[float]) -> tuple[float, float]:
949
+ """Perform two-sample Kolmogorov-Smirnov test.
950
+
951
+ Returns (ks_statistic, p_value).
952
+ """
953
+ import math
954
+
955
+ # Sort both datasets
956
+ data1_sorted = sorted(data1)
957
+ data2_sorted = sorted(data2)
958
+ n1, n2 = len(data1_sorted), len(data2_sorted)
959
+
960
+ # Compute the maximum difference between empirical CDFs
961
+ all_values = sorted(set(data1_sorted + data2_sorted))
962
+
963
+ max_diff = 0.0
964
+ for val in all_values:
965
+ # CDF of data1 at val
966
+ cdf1 = sum(1 for x in data1_sorted if x <= val) / n1
967
+ # CDF of data2 at val
968
+ cdf2 = sum(1 for x in data2_sorted if x <= val) / n2
969
+ max_diff = max(max_diff, abs(cdf1 - cdf2))
970
+
971
+ ks_stat = max_diff
972
+
973
+ # Approximate p-value using asymptotic formula
974
+ # P(D > d) ≈ 2 * exp(-2 * d^2 * n1 * n2 / (n1 + n2))
975
+ en = math.sqrt(n1 * n2 / (n1 + n2))
976
+ p_value = 2.0 * math.exp(-2.0 * (ks_stat * en) ** 2)
977
+ p_value = min(1.0, max(0.0, p_value))
978
+
979
+ return ks_stat, p_value
980
+
503
981
  def get_value_counts(self, limit: int = 20) -> dict[Any, int]:
504
982
  """
505
983
  Get value counts for the column.
@@ -524,6 +1002,378 @@ class Column:
524
1002
  rows = self._dataset.engine.fetch_all(sql)
525
1003
  return {row[0]: row[1] for row in rows}
526
1004
 
1005
+ # =====================================================================
1006
+ # Conditional Validation Methods (DuckGuard 3.0)
1007
+ # =====================================================================
1008
+
1009
+ def not_null_when(
1010
+ self,
1011
+ condition: str,
1012
+ threshold: float = 1.0
1013
+ ) -> ValidationResult:
1014
+ """Check column is not null when condition is true.
1015
+
1016
+ This enables sophisticated conditional validation like:
1017
+ - "State must not be null when country = 'USA'"
1018
+ - "Phone is required when contact_method = 'phone'"
1019
+
1020
+ Args:
1021
+ condition: SQL WHERE clause condition (without WHERE keyword)
1022
+ threshold: Maximum allowed non-null rate (0.0 to 1.0, default 1.0)
1023
+
1024
+ Returns:
1025
+ ValidationResult with pass/fail status
1026
+
1027
+ Raises:
1028
+ ValidationError: If condition is invalid or contains forbidden SQL
1029
+
1030
+ Examples:
1031
+ >>> data = connect("customers.csv")
1032
+ >>> # State required for US customers
1033
+ >>> result = data.state.not_null_when("country = 'USA'")
1034
+ >>> assert result.passed
1035
+
1036
+ >>> # Email required for registered users
1037
+ >>> result = data.email.not_null_when("user_type = 'registered'")
1038
+ >>> assert result.passed
1039
+
1040
+ Security:
1041
+ Conditions are validated to prevent SQL injection. Only SELECT
1042
+ queries with WHERE clauses are allowed.
1043
+ """
1044
+ from duckguard.checks.conditional import ConditionalCheckHandler
1045
+
1046
+ handler = ConditionalCheckHandler()
1047
+ return handler.execute_not_null_when(
1048
+ dataset=self._dataset,
1049
+ column=self._name,
1050
+ condition=condition,
1051
+ threshold=threshold
1052
+ )
1053
+
1054
+ def unique_when(
1055
+ self,
1056
+ condition: str,
1057
+ threshold: float = 1.0
1058
+ ) -> ValidationResult:
1059
+ """Check column is unique when condition is true.
1060
+
1061
+ Args:
1062
+ condition: SQL WHERE clause condition (without WHERE keyword)
1063
+ threshold: Minimum required uniqueness rate (0.0 to 1.0, default 1.0)
1064
+
1065
+ Returns:
1066
+ ValidationResult with pass/fail status
1067
+
1068
+ Examples:
1069
+ >>> # Order IDs must be unique for completed orders
1070
+ >>> result = data.order_id.unique_when("status = 'completed'")
1071
+ >>> assert result.passed
1072
+
1073
+ >>> # Transaction IDs unique for successful transactions
1074
+ >>> result = data.txn_id.unique_when("success = true")
1075
+ >>> assert result.passed
1076
+ """
1077
+ from duckguard.checks.conditional import ConditionalCheckHandler
1078
+
1079
+ handler = ConditionalCheckHandler()
1080
+ return handler.execute_unique_when(
1081
+ dataset=self._dataset,
1082
+ column=self._name,
1083
+ condition=condition,
1084
+ threshold=threshold
1085
+ )
1086
+
1087
+ def between_when(
1088
+ self,
1089
+ min_val: float,
1090
+ max_val: float,
1091
+ condition: str,
1092
+ threshold: float = 1.0
1093
+ ) -> ValidationResult:
1094
+ """Check column is between min and max when condition is true.
1095
+
1096
+ Args:
1097
+ min_val: Minimum allowed value
1098
+ max_val: Maximum allowed value
1099
+ condition: SQL WHERE clause condition (without WHERE keyword)
1100
+ threshold: Maximum allowed failure rate (0.0 to 1.0, default 1.0)
1101
+
1102
+ Returns:
1103
+ ValidationResult with pass/fail status
1104
+
1105
+ Examples:
1106
+ >>> # Discount between 0-50% for standard customers
1107
+ >>> result = data.discount.between_when(
1108
+ ... min_val=0,
1109
+ ... max_val=50,
1110
+ ... condition="customer_tier = 'standard'"
1111
+ ... )
1112
+ >>> assert result.passed
1113
+
1114
+ >>> # Age between 18-65 for employees
1115
+ >>> result = data.age.between_when(18, 65, "type = 'employee'")
1116
+ >>> assert result.passed
1117
+ """
1118
+ from duckguard.checks.conditional import ConditionalCheckHandler
1119
+
1120
+ handler = ConditionalCheckHandler()
1121
+ return handler.execute_between_when(
1122
+ dataset=self._dataset,
1123
+ column=self._name,
1124
+ min_value=min_val,
1125
+ max_value=max_val,
1126
+ condition=condition,
1127
+ threshold=threshold
1128
+ )
1129
+
1130
+ def isin_when(
1131
+ self,
1132
+ allowed_values: list[Any],
1133
+ condition: str,
1134
+ threshold: float = 1.0
1135
+ ) -> ValidationResult:
1136
+ """Check column is in allowed values when condition is true.
1137
+
1138
+ Args:
1139
+ allowed_values: List of allowed values
1140
+ condition: SQL WHERE clause condition (without WHERE keyword)
1141
+ threshold: Maximum allowed failure rate (0.0 to 1.0, default 1.0)
1142
+
1143
+ Returns:
1144
+ ValidationResult with pass/fail status
1145
+
1146
+ Examples:
1147
+ >>> # Status must be specific values for paid orders
1148
+ >>> result = data.status.isin_when(
1149
+ ... allowed_values=['shipped', 'delivered'],
1150
+ ... condition="payment_status = 'paid'"
1151
+ ... )
1152
+ >>> assert result.passed
1153
+
1154
+ >>> # Category restricted for active products
1155
+ >>> result = data.category.isin_when(
1156
+ ... ['A', 'B', 'C'],
1157
+ ... "is_active = true"
1158
+ ... )
1159
+ >>> assert result.passed
1160
+ """
1161
+ from duckguard.checks.conditional import ConditionalCheckHandler
1162
+
1163
+ handler = ConditionalCheckHandler()
1164
+ return handler.execute_isin_when(
1165
+ dataset=self._dataset,
1166
+ column=self._name,
1167
+ allowed_values=allowed_values,
1168
+ condition=condition,
1169
+ threshold=threshold
1170
+ )
1171
+
1172
+ def matches_when(
1173
+ self,
1174
+ pattern: str,
1175
+ condition: str,
1176
+ threshold: float = 1.0
1177
+ ) -> ValidationResult:
1178
+ """Check column matches pattern when condition is true.
1179
+
1180
+ Args:
1181
+ pattern: Regular expression pattern to match
1182
+ condition: SQL WHERE clause condition (without WHERE keyword)
1183
+ threshold: Maximum allowed failure rate (0.0 to 1.0, default 1.0)
1184
+
1185
+ Returns:
1186
+ ValidationResult with pass/fail status
1187
+
1188
+ Examples:
1189
+ >>> # Email format required for email notifications
1190
+ >>> result = data.contact.matches_when(
1191
+ ... pattern=r'^[\\w.-]+@[\\w.-]+\\.\\w+$',
1192
+ ... condition="notification_type = 'email'"
1193
+ ... )
1194
+ >>> assert result.passed
1195
+
1196
+ >>> # Phone format required for SMS
1197
+ >>> result = data.contact.matches_when(
1198
+ ... pattern=r'^\\+?[0-9]{10,15}$',
1199
+ ... condition="notification_type = 'sms'"
1200
+ ... )
1201
+ >>> assert result.passed
1202
+ """
1203
+ from duckguard.checks.conditional import ConditionalCheckHandler
1204
+
1205
+ handler = ConditionalCheckHandler()
1206
+ return handler.execute_pattern_when(
1207
+ dataset=self._dataset,
1208
+ column=self._name,
1209
+ pattern=pattern,
1210
+ condition=condition,
1211
+ threshold=threshold
1212
+ )
1213
+
1214
+ # =================================================================
1215
+ # Distributional Checks (DuckGuard 3.0)
1216
+ # =================================================================
1217
+
1218
+ def expect_distribution_normal(
1219
+ self,
1220
+ significance_level: float = 0.05
1221
+ ) -> ValidationResult:
1222
+ """Check if column data follows a normal distribution.
1223
+
1224
+ Uses Kolmogorov-Smirnov test comparing data to fitted normal distribution.
1225
+
1226
+ Args:
1227
+ significance_level: Significance level for test (default 0.05)
1228
+
1229
+ Returns:
1230
+ ValidationResult (passed if p-value > significance_level)
1231
+
1232
+ Examples:
1233
+ >>> # Test if temperature measurements are normally distributed
1234
+ >>> result = data.temperature.expect_distribution_normal()
1235
+ >>> assert result.passed
1236
+
1237
+ >>> # Use stricter significance level
1238
+ >>> result = data.measurement.expect_distribution_normal(
1239
+ ... significance_level=0.01
1240
+ ... )
1241
+
1242
+ Note:
1243
+ Requires scipy: pip install 'duckguard[statistics]'
1244
+ Requires minimum 30 samples for reliable results.
1245
+ """
1246
+ from duckguard.checks.distributional import DistributionalCheckHandler
1247
+
1248
+ handler = DistributionalCheckHandler()
1249
+ return handler.execute_distribution_normal(
1250
+ dataset=self._dataset,
1251
+ column=self._name,
1252
+ significance_level=significance_level
1253
+ )
1254
+
1255
+ def expect_distribution_uniform(
1256
+ self,
1257
+ significance_level: float = 0.05
1258
+ ) -> ValidationResult:
1259
+ """Check if column data follows a uniform distribution.
1260
+
1261
+ Uses Kolmogorov-Smirnov test comparing data to uniform distribution.
1262
+
1263
+ Args:
1264
+ significance_level: Significance level for test (default 0.05)
1265
+
1266
+ Returns:
1267
+ ValidationResult (passed if p-value > significance_level)
1268
+
1269
+ Examples:
1270
+ >>> # Test if random numbers are uniformly distributed
1271
+ >>> result = data.random_value.expect_distribution_uniform()
1272
+ >>> assert result.passed
1273
+
1274
+ >>> # Test dice rolls for fairness
1275
+ >>> result = data.dice_roll.expect_distribution_uniform()
1276
+
1277
+ Note:
1278
+ Requires scipy: pip install 'duckguard[statistics]'
1279
+ Requires minimum 30 samples for reliable results.
1280
+ """
1281
+ from duckguard.checks.distributional import DistributionalCheckHandler
1282
+
1283
+ handler = DistributionalCheckHandler()
1284
+ return handler.execute_distribution_uniform(
1285
+ dataset=self._dataset,
1286
+ column=self._name,
1287
+ significance_level=significance_level
1288
+ )
1289
+
1290
+ def expect_ks_test(
1291
+ self,
1292
+ distribution: str = "norm",
1293
+ significance_level: float = 0.05
1294
+ ) -> ValidationResult:
1295
+ """Perform Kolmogorov-Smirnov test for specified distribution.
1296
+
1297
+ Args:
1298
+ distribution: Distribution name ('norm', 'uniform', 'expon', etc.)
1299
+ significance_level: Significance level for test (default 0.05)
1300
+
1301
+ Returns:
1302
+ ValidationResult (passed if p-value > significance_level)
1303
+
1304
+ Examples:
1305
+ >>> # Test for normal distribution
1306
+ >>> result = data.values.expect_ks_test(distribution='norm')
1307
+ >>> assert result.passed
1308
+
1309
+ >>> # Test for exponential distribution
1310
+ >>> result = data.wait_times.expect_ks_test(
1311
+ ... distribution='expon',
1312
+ ... significance_level=0.01
1313
+ ... )
1314
+
1315
+ Note:
1316
+ Requires scipy: pip install 'duckguard[statistics]'
1317
+ Supported distributions: norm, uniform, expon, gamma, beta, etc.
1318
+ """
1319
+ from duckguard.checks.distributional import DistributionalCheckHandler
1320
+
1321
+ handler = DistributionalCheckHandler()
1322
+ return handler.execute_ks_test(
1323
+ dataset=self._dataset,
1324
+ column=self._name,
1325
+ distribution=distribution,
1326
+ significance_level=significance_level
1327
+ )
1328
+
1329
+ def expect_chi_square_test(
1330
+ self,
1331
+ expected_frequencies: dict | None = None,
1332
+ significance_level: float = 0.05
1333
+ ) -> ValidationResult:
1334
+ """Perform chi-square goodness-of-fit test for categorical data.
1335
+
1336
+ Tests if observed frequencies match expected frequencies.
1337
+
1338
+ Args:
1339
+ expected_frequencies: Dict mapping categories to expected frequencies
1340
+ If None, assumes uniform distribution
1341
+ significance_level: Significance level for test (default 0.05)
1342
+
1343
+ Returns:
1344
+ ValidationResult (passed if p-value > significance_level)
1345
+
1346
+ Examples:
1347
+ >>> # Test if dice is fair (uniform distribution)
1348
+ >>> result = data.dice_roll.expect_chi_square_test()
1349
+ >>> assert result.passed
1350
+
1351
+ >>> # Test with specific expected frequencies
1352
+ >>> expected = {1: 1/6, 2: 1/6, 3: 1/6, 4: 1/6, 5: 1/6, 6: 1/6}
1353
+ >>> result = data.dice_roll.expect_chi_square_test(
1354
+ ... expected_frequencies=expected
1355
+ ... )
1356
+
1357
+ >>> # Test categorical distribution
1358
+ >>> expected = {'A': 0.5, 'B': 0.3, 'C': 0.2}
1359
+ >>> result = data.category.expect_chi_square_test(
1360
+ ... expected_frequencies=expected
1361
+ ... )
1362
+
1363
+ Note:
1364
+ Requires scipy: pip install 'duckguard[statistics]'
1365
+ Requires minimum 30 samples for reliable results.
1366
+ """
1367
+ from duckguard.checks.distributional import DistributionalCheckHandler
1368
+
1369
+ handler = DistributionalCheckHandler()
1370
+ return handler.execute_chi_square_test(
1371
+ dataset=self._dataset,
1372
+ column=self._name,
1373
+ expected_frequencies=expected_frequencies,
1374
+ significance_level=significance_level
1375
+ )
1376
+
527
1377
  def clear_cache(self) -> None:
528
1378
  """Clear cached statistics."""
529
1379
  self._stats_cache = None