duckguard 2.3.0__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
duckguard/core/dataset.py CHANGED
@@ -676,6 +676,336 @@ class Dataset:
676
676
 
677
677
  return GroupedDataset(self, columns)
678
678
 
679
+ # =================================================================
680
+ # Multi-Column Validation Methods (DuckGuard 3.0)
681
+ # =================================================================
682
+
683
def expect_column_pair_satisfy(
    self,
    column_a: str,
    column_b: str,
    expression: str,
    threshold: float = 1.0
) -> ValidationResult:
    """Validate a relationship between two columns using an expression.

    This method is a thin wrapper: it creates a
    ``MultiColumnCheckHandler`` and forwards all arguments, unchanged,
    to its ``execute_column_pair_satisfy`` method.

    Args:
        column_a: Name of the first column.
        column_b: Name of the second column.
        expression: Expression relating the two columns
            (e.g., "end_date >= start_date").
        threshold: Rate in the range 0.0-1.0, forwarded to the handler
            (default 1.0). NOTE(review): earlier documentation called
            this the "maximum allowed failure rate", which sits oddly
            with a default of 1.0 -- confirm the exact meaning against
            the handler implementation.

    Returns:
        The ValidationResult produced by the handler.

    Examples:
        >>> data = connect("orders.csv")
        >>> # Date range validation
        >>> result = data.expect_column_pair_satisfy(
        ...     column_a="end_date",
        ...     column_b="start_date",
        ...     expression="end_date >= start_date"
        ... )
        >>> assert result.passed

        >>> # Arithmetic validation
        >>> result = data.expect_column_pair_satisfy(
        ...     column_a="total",
        ...     column_b="subtotal",
        ...     expression="total = subtotal * 1.1"
        ... )
    """
    # Imported at call time, as in the rest of this section of the class.
    from duckguard.checks.multicolumn import MultiColumnCheckHandler

    return MultiColumnCheckHandler().execute_column_pair_satisfy(
        dataset=self,
        column_a=column_a,
        column_b=column_b,
        expression=expression,
        threshold=threshold,
    )
728
+
729
def expect_columns_unique(
    self,
    columns: list[str],
    threshold: float = 1.0
) -> ValidationResult:
    """Validate that a combination of columns acts as a composite key.

    This method is a thin wrapper: it creates a
    ``MultiColumnCheckHandler`` and forwards ``columns`` and
    ``threshold``, unchanged, to its ``execute_columns_unique`` method.

    Args:
        columns: Column names that together form the composite key.
        threshold: Minimum required uniqueness rate (0.0-1.0),
            forwarded to the handler; defaults to 1.0.

    Returns:
        The ValidationResult produced by the handler.

    Examples:
        >>> # Two-column composite key
        >>> result = data.expect_columns_unique(
        ...     columns=["user_id", "session_id"]
        ... )
        >>> assert result.passed

        >>> # Three-column composite key
        >>> result = data.expect_columns_unique(
        ...     columns=["year", "month", "product_id"]
        ... )
    """
    # Imported at call time, as in the rest of this section of the class.
    from duckguard.checks.multicolumn import MultiColumnCheckHandler

    return MultiColumnCheckHandler().execute_columns_unique(
        dataset=self,
        columns=columns,
        threshold=threshold,
    )
763
+
764
def expect_multicolumn_sum_to_equal(
    self,
    columns: list[str],
    expected_sum: float,
    threshold: float = 0.01
) -> ValidationResult:
    """Validate that the listed columns add up to an expected value.

    This method is a thin wrapper: it creates a
    ``MultiColumnCheckHandler`` and forwards all arguments, unchanged,
    to its ``execute_multicolumn_sum_equal`` method.

    Args:
        columns: Names of the columns to sum.
        expected_sum: Value the sum is expected to equal.
        threshold: Maximum allowed deviation from ``expected_sum``,
            forwarded to the handler; defaults to 0.01.

    Returns:
        The ValidationResult produced by the handler.

    Examples:
        >>> # Components must sum to 100%
        >>> result = data.expect_multicolumn_sum_to_equal(
        ...     columns=["q1_pct", "q2_pct", "q3_pct", "q4_pct"],
        ...     expected_sum=100.0
        ... )
        >>> assert result.passed

        >>> # Budget allocation check
        >>> result = data.expect_multicolumn_sum_to_equal(
        ...     columns=["marketing", "sales", "r_and_d"],
        ...     expected_sum=250000.0
        ... )
    """
    # Imported at call time, as in the rest of this section of the class.
    from duckguard.checks.multicolumn import MultiColumnCheckHandler

    return MultiColumnCheckHandler().execute_multicolumn_sum_equal(
        dataset=self,
        columns=columns,
        expected_sum=expected_sum,
        threshold=threshold,
    )
803
+
804
+ # =================================================================
805
+ # Query-Based Validation Methods (DuckGuard 3.0)
806
+ # =================================================================
807
+
808
def expect_query_to_return_no_rows(
    self,
    query: str,
    message: str | None = None
) -> ValidationResult:
    """Validate that a custom SQL query finds no rows.

    Intended use: write a query that selects violations; the check
    passes when that query comes back empty.

    This method is a thin wrapper: it creates a ``QueryCheckHandler``
    and forwards ``query`` and ``message``, unchanged, to its
    ``execute_query_no_rows`` method.

    Args:
        query: SQL SELECT query (refer to the dataset as 'table').
        message: Optional custom message.

    Returns:
        ValidationResult (passed if the query returns 0 rows).

    Examples:
        >>> data = connect("orders.csv")
        >>> # Find invalid totals (total < subtotal)
        >>> result = data.expect_query_to_return_no_rows(
        ...     query="SELECT * FROM table WHERE total < subtotal"
        ... )
        >>> assert result.passed

        >>> # Find future dates
        >>> result = data.expect_query_to_return_no_rows(
        ...     query="SELECT * FROM table WHERE order_date > CURRENT_DATE"
        ... )

    Security:
        The safeguards below are documented for the query handler;
        this wrapper performs no validation of its own.

        - Query is validated to prevent SQL injection
        - Only SELECT queries allowed
        - READ-ONLY mode enforced
        - 30-second timeout
        - 10,000 row result limit
    """
    # Imported at call time, as in the rest of this section of the class.
    from duckguard.checks.query_based import QueryCheckHandler

    return QueryCheckHandler().execute_query_no_rows(
        dataset=self,
        query=query,
        message=message,
    )
853
+
854
def expect_query_to_return_rows(
    self,
    query: str,
    message: str | None = None
) -> ValidationResult:
    """Validate that a custom SQL query finds at least one row.

    Intended use: make sure data that is expected to exist is actually
    present in the dataset.

    This method is a thin wrapper: it creates a ``QueryCheckHandler``
    and forwards ``query`` and ``message``, unchanged, to its
    ``execute_query_returns_rows`` method.

    Args:
        query: SQL SELECT query (refer to the dataset as 'table').
        message: Optional custom message.

    Returns:
        ValidationResult (passed if the query returns > 0 rows).

    Examples:
        >>> data = connect("products.csv")
        >>> # Ensure we have active products
        >>> result = data.expect_query_to_return_rows(
        ...     query="SELECT * FROM table WHERE status = 'active'"
        ... )
        >>> assert result.passed

        >>> # Ensure we have recent data
        >>> result = data.expect_query_to_return_rows(
        ...     query="SELECT * FROM table WHERE created_at >= CURRENT_DATE - 7"
        ... )

    Security:
        The safeguards below are documented for the query handler;
        this wrapper performs no validation of its own.

        - Query is validated to prevent SQL injection
        - Only SELECT queries allowed
        - READ-ONLY mode enforced
    """
    # Imported at call time, as in the rest of this section of the class.
    from duckguard.checks.query_based import QueryCheckHandler

    return QueryCheckHandler().execute_query_returns_rows(
        dataset=self,
        query=query,
        message=message,
    )
896
+
897
def expect_query_result_to_equal(
    self,
    query: str,
    expected: Any,
    tolerance: float | None = None,
    message: str | None = None
) -> ValidationResult:
    """Validate that a custom SQL query yields one specific value.

    Intended use: aggregate validation (COUNT, SUM, AVG, etc.).

    This method is a thin wrapper: it creates a ``QueryCheckHandler``
    and forwards all arguments, unchanged, to its
    ``execute_query_result_equals`` method.

    Args:
        query: SQL query returning a single value (refer to the
            dataset as 'table').
        expected: Value the query result is compared against.
        tolerance: Optional tolerance for numeric comparisons;
            ``None`` (the default) is passed through as-is.
        message: Optional custom message.

    Returns:
        ValidationResult (passed if the query result equals expected).

    Examples:
        >>> data = connect("orders.csv")
        >>> # Check pending order count
        >>> result = data.expect_query_result_to_equal(
        ...     query="SELECT COUNT(*) FROM table WHERE status = 'pending'",
        ...     expected=0
        ... )
        >>> assert result.passed

        >>> # Check average with tolerance
        >>> result = data.expect_query_result_to_equal(
        ...     query="SELECT AVG(price) FROM table",
        ...     expected=100.0,
        ...     tolerance=5.0
        ... )

        >>> # Check sum constraint
        >>> result = data.expect_query_result_to_equal(
        ...     query="SELECT SUM(quantity) FROM table WHERE category = 'electronics'",
        ...     expected=1000
        ... )

    Security:
        The constraints below are documented for the query handler;
        this wrapper performs no validation of its own.

        - Query must return exactly 1 row with 1 column
        - Query is validated to prevent SQL injection
    """
    # Imported at call time, as in the rest of this section of the class.
    from duckguard.checks.query_based import QueryCheckHandler

    return QueryCheckHandler().execute_query_result_equals(
        dataset=self,
        query=query,
        expected=expected,
        tolerance=tolerance,
        message=message,
    )
953
+
954
def expect_query_result_to_be_between(
    self,
    query: str,
    min_value: float,
    max_value: float,
    message: str | None = None
) -> ValidationResult:
    """Validate that a custom SQL query result lies within a range.

    Intended use: metric validation (e.g., an average must fall
    between X and Y).

    This method is a thin wrapper: it creates a ``QueryCheckHandler``
    and forwards all arguments, unchanged, to its
    ``execute_query_result_between`` method.

    Args:
        query: SQL query returning a single numeric value.
        min_value: Minimum allowed value (inclusive).
        max_value: Maximum allowed value (inclusive).
        message: Optional custom message.

    Returns:
        ValidationResult (passed if min_value <= result <= max_value).

    Examples:
        >>> data = connect("metrics.csv")
        >>> # Average price in range
        >>> result = data.expect_query_result_to_be_between(
        ...     query="SELECT AVG(price) FROM table",
        ...     min_value=10.0,
        ...     max_value=1000.0
        ... )
        >>> assert result.passed

        >>> # Null rate validation
        >>> result = data.expect_query_result_to_be_between(
        ...     query='''
        ...         SELECT (COUNT(*) FILTER (WHERE price IS NULL)) * 100.0 / COUNT(*)
        ...         FROM table
        ...     ''',
        ...     min_value=0.0,
        ...     max_value=5.0  # Max 5% nulls
        ... )

    Security:
        The constraints below are documented for the query handler;
        this wrapper performs no validation of its own.

        - Query must return exactly 1 row with 1 numeric column
        - Query is validated to prevent SQL injection
    """
    # Imported at call time, as in the rest of this section of the class.
    from duckguard.checks.query_based import QueryCheckHandler

    return QueryCheckHandler().execute_query_result_between(
        dataset=self,
        query=query,
        min_value=min_value,
        max_value=max_value,
        message=message,
    )
1008
+
679
1009
 
680
1010
  class GroupedDataset:
681
1011
  """