duckguard 2.3.0__py3-none-any.whl → 3.0.1__py3-none-any.whl

This diff shows the differences between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
duckguard/core/dataset.py CHANGED
@@ -676,6 +676,336 @@ class Dataset:
676
676
 
677
677
  return GroupedDataset(self, columns)
678
678
 
679
+ # =================================================================
680
+ # Multi-Column Validation Methods (DuckGuard 3.0)
681
+ # =================================================================
682
+
683
+ def expect_column_pair_satisfy(
684
+ self,
685
+ column_a: str,
686
+ column_b: str,
687
+ expression: str,
688
+ threshold: float = 1.0
689
+ ) -> ValidationResult:
690
+ """Check that column pair satisfies expression.
691
+
692
+ Args:
693
+ column_a: First column name
694
+ column_b: Second column name
695
+ expression: Expression to evaluate (e.g., "A > B", "A + B = 100")
696
+ threshold: Maximum allowed failure rate (0.0-1.0)
697
+
698
+ Returns:
699
+ ValidationResult with pass/fail status
700
+
701
+ Examples:
702
+ >>> data = connect("orders.csv")
703
+ >>> # Date range validation
704
+ >>> result = data.expect_column_pair_satisfy(
705
+ ... column_a="end_date",
706
+ ... column_b="start_date",
707
+ ... expression="end_date >= start_date"
708
+ ... )
709
+ >>> assert result.passed
710
+
711
+ >>> # Arithmetic validation
712
+ >>> result = data.expect_column_pair_satisfy(
713
+ ... column_a="total",
714
+ ... column_b="subtotal",
715
+ ... expression="total = subtotal * 1.1"
716
+ ... )
717
+ """
718
+ from duckguard.checks.multicolumn import MultiColumnCheckHandler
719
+
720
+ handler = MultiColumnCheckHandler()
721
+ return handler.execute_column_pair_satisfy(
722
+ dataset=self,
723
+ column_a=column_a,
724
+ column_b=column_b,
725
+ expression=expression,
726
+ threshold=threshold
727
+ )
728
+
729
+ def expect_columns_unique(
730
+ self,
731
+ columns: list[str],
732
+ threshold: float = 1.0
733
+ ) -> ValidationResult:
734
+ """Check that combination of columns is unique (composite key).
735
+
736
+ Args:
737
+ columns: List of column names forming composite key
738
+ threshold: Minimum required uniqueness rate (0.0-1.0)
739
+
740
+ Returns:
741
+ ValidationResult with pass/fail status
742
+
743
+ Examples:
744
+ >>> # Two-column composite key
745
+ >>> result = data.expect_columns_unique(
746
+ ... columns=["user_id", "session_id"]
747
+ ... )
748
+ >>> assert result.passed
749
+
750
+ >>> # Three-column composite key
751
+ >>> result = data.expect_columns_unique(
752
+ ... columns=["year", "month", "product_id"]
753
+ ... )
754
+ """
755
+ from duckguard.checks.multicolumn import MultiColumnCheckHandler
756
+
757
+ handler = MultiColumnCheckHandler()
758
+ return handler.execute_columns_unique(
759
+ dataset=self,
760
+ columns=columns,
761
+ threshold=threshold
762
+ )
763
+
764
+ def expect_multicolumn_sum_to_equal(
765
+ self,
766
+ columns: list[str],
767
+ expected_sum: float,
768
+ threshold: float = 0.01
769
+ ) -> ValidationResult:
770
+ """Check that sum of columns equals expected value.
771
+
772
+ Args:
773
+ columns: List of columns to sum
774
+ expected_sum: Expected sum value
775
+ threshold: Maximum allowed deviation
776
+
777
+ Returns:
778
+ ValidationResult with pass/fail status
779
+
780
+ Examples:
781
+ >>> # Components must sum to 100%
782
+ >>> result = data.expect_multicolumn_sum_to_equal(
783
+ ... columns=["q1_pct", "q2_pct", "q3_pct", "q4_pct"],
784
+ ... expected_sum=100.0
785
+ ... )
786
+ >>> assert result.passed
787
+
788
+ >>> # Budget allocation check
789
+ >>> result = data.expect_multicolumn_sum_to_equal(
790
+ ... columns=["marketing", "sales", "r_and_d"],
791
+ ... expected_sum=data.total_budget
792
+ ... )
793
+ """
794
+ from duckguard.checks.multicolumn import MultiColumnCheckHandler
795
+
796
+ handler = MultiColumnCheckHandler()
797
+ return handler.execute_multicolumn_sum_equal(
798
+ dataset=self,
799
+ columns=columns,
800
+ expected_sum=expected_sum,
801
+ threshold=threshold
802
+ )
803
+
804
+ # =================================================================
805
+ # Query-Based Validation Methods (DuckGuard 3.0)
806
+ # =================================================================
807
+
808
+ def expect_query_to_return_no_rows(
809
+ self,
810
+ query: str,
811
+ message: str | None = None
812
+ ) -> ValidationResult:
813
+ """Check that custom SQL query returns no rows (finds no violations).
814
+
815
+ Use case: Write a query that finds violations. The check passes if
816
+ the query returns no rows (no violations found).
817
+
818
+ Args:
819
+ query: SQL SELECT query (use 'table' to reference the dataset)
820
+ message: Optional custom message
821
+
822
+ Returns:
823
+ ValidationResult (passed if query returns 0 rows)
824
+
825
+ Examples:
826
+ >>> data = connect("orders.csv")
827
+ >>> # Find invalid totals (total < subtotal)
828
+ >>> result = data.expect_query_to_return_no_rows(
829
+ ... query="SELECT * FROM table WHERE total < subtotal"
830
+ ... )
831
+ >>> assert result.passed
832
+
833
+ >>> # Find future dates
834
+ >>> result = data.expect_query_to_return_no_rows(
835
+ ... query="SELECT * FROM table WHERE order_date > CURRENT_DATE"
836
+ ... )
837
+
838
+ Security:
839
+ - Query is validated to prevent SQL injection
840
+ - Only SELECT queries allowed
841
+ - READ-ONLY mode enforced
842
+ - 30-second timeout
843
+ - 10,000 row result limit
844
+ """
845
+ from duckguard.checks.query_based import QueryCheckHandler
846
+
847
+ handler = QueryCheckHandler()
848
+ return handler.execute_query_no_rows(
849
+ dataset=self,
850
+ query=query,
851
+ message=message
852
+ )
853
+
854
+ def expect_query_to_return_rows(
855
+ self,
856
+ query: str,
857
+ message: str | None = None
858
+ ) -> ValidationResult:
859
+ """Check that custom SQL query returns at least one row.
860
+
861
+ Use case: Ensure expected data exists in the dataset.
862
+
863
+ Args:
864
+ query: SQL SELECT query (use 'table' to reference the dataset)
865
+ message: Optional custom message
866
+
867
+ Returns:
868
+ ValidationResult (passed if query returns > 0 rows)
869
+
870
+ Examples:
871
+ >>> data = connect("products.csv")
872
+ >>> # Ensure we have active products
873
+ >>> result = data.expect_query_to_return_rows(
874
+ ... query="SELECT * FROM table WHERE status = 'active'"
875
+ ... )
876
+ >>> assert result.passed
877
+
878
+ >>> # Ensure we have recent data
879
+ >>> result = data.expect_query_to_return_rows(
880
+ ... query="SELECT * FROM table WHERE created_at >= CURRENT_DATE - 7"
881
+ ... )
882
+
883
+ Security:
884
+ - Query is validated to prevent SQL injection
885
+ - Only SELECT queries allowed
886
+ - READ-ONLY mode enforced
887
+ """
888
+ from duckguard.checks.query_based import QueryCheckHandler
889
+
890
+ handler = QueryCheckHandler()
891
+ return handler.execute_query_returns_rows(
892
+ dataset=self,
893
+ query=query,
894
+ message=message
895
+ )
896
+
897
+ def expect_query_result_to_equal(
898
+ self,
899
+ query: str,
900
+ expected: Any,
901
+ tolerance: float | None = None,
902
+ message: str | None = None
903
+ ) -> ValidationResult:
904
+ """Check that custom SQL query returns a specific value.
905
+
906
+ Use case: Aggregate validation (COUNT, SUM, AVG, etc.)
907
+
908
+ Args:
909
+ query: SQL query returning a single value (use 'table' to reference the dataset)
910
+ expected: Expected value
911
+ tolerance: Optional tolerance for numeric comparisons
912
+ message: Optional custom message
913
+
914
+ Returns:
915
+ ValidationResult (passed if query result equals expected)
916
+
917
+ Examples:
918
+ >>> data = connect("orders.csv")
919
+ >>> # Check pending order count
920
+ >>> result = data.expect_query_result_to_equal(
921
+ ... query="SELECT COUNT(*) FROM table WHERE status = 'pending'",
922
+ ... expected=0
923
+ ... )
924
+ >>> assert result.passed
925
+
926
+ >>> # Check average with tolerance
927
+ >>> result = data.expect_query_result_to_equal(
928
+ ... query="SELECT AVG(price) FROM table",
929
+ ... expected=100.0,
930
+ ... tolerance=5.0
931
+ ... )
932
+
933
+ >>> # Check sum constraint
934
+ >>> result = data.expect_query_result_to_equal(
935
+ ... query="SELECT SUM(quantity) FROM table WHERE category = 'electronics'",
936
+ ... expected=1000
937
+ ... )
938
+
939
+ Security:
940
+ - Query must return exactly 1 row with 1 column
941
+ - Query is validated to prevent SQL injection
942
+ """
943
+ from duckguard.checks.query_based import QueryCheckHandler
944
+
945
+ handler = QueryCheckHandler()
946
+ return handler.execute_query_result_equals(
947
+ dataset=self,
948
+ query=query,
949
+ expected=expected,
950
+ tolerance=tolerance,
951
+ message=message
952
+ )
953
+
954
+ def expect_query_result_to_be_between(
955
+ self,
956
+ query: str,
957
+ min_value: float,
958
+ max_value: float,
959
+ message: str | None = None
960
+ ) -> ValidationResult:
961
+ """Check that custom SQL query result is within a range.
962
+
963
+ Use case: Metric validation (e.g., average must be between X and Y)
964
+
965
+ Args:
966
+ query: SQL query returning a single numeric value (use 'table' to reference the dataset)
967
+ min_value: Minimum allowed value (inclusive)
968
+ max_value: Maximum allowed value (inclusive)
969
+ message: Optional custom message
970
+
971
+ Returns:
972
+ ValidationResult (passed if min_value <= result <= max_value)
973
+
974
+ Examples:
975
+ >>> data = connect("metrics.csv")
976
+ >>> # Average price in range
977
+ >>> result = data.expect_query_result_to_be_between(
978
+ ... query="SELECT AVG(price) FROM table",
979
+ ... min_value=10.0,
980
+ ... max_value=1000.0
981
+ ... )
982
+ >>> assert result.passed
983
+
984
+ >>> # Null rate validation
985
+ >>> result = data.expect_query_result_to_be_between(
986
+ ... query='''
987
+ ... SELECT (COUNT(*) FILTER (WHERE price IS NULL)) * 100.0 / COUNT(*)
988
+ ... FROM table
989
+ ... ''',
990
+ ... min_value=0.0,
991
+ ... max_value=5.0 # Max 5% nulls
992
+ ... )
993
+
994
+ Security:
995
+ - Query must return exactly 1 row with 1 numeric column
996
+ - Query is validated to prevent SQL injection
997
+ """
998
+ from duckguard.checks.query_based import QueryCheckHandler
999
+
1000
+ handler = QueryCheckHandler()
1001
+ return handler.execute_query_result_between(
1002
+ dataset=self,
1003
+ query=query,
1004
+ min_value=min_value,
1005
+ max_value=max_value,
1006
+ message=message
1007
+ )
1008
+
679
1009
 
680
1010
  class GroupedDataset:
681
1011
  """
duckguard/core/result.py CHANGED
@@ -37,6 +37,11 @@ class FailedRow:
37
37
  reason: str = ""
38
38
  context: dict[str, Any] = field(default_factory=dict)
39
39
 
40
+ @property
41
+ def row_number(self) -> int:
42
+ """Alias for row_index (backward compatibility)."""
43
+ return self.row_index
44
+
40
45
  def __repr__(self) -> str:
41
46
  return f"FailedRow(row={self.row_index}, column='{self.column}', value={self.value!r})"
42
47
 
@@ -138,6 +138,15 @@ class EmailNotifier(BaseNotifier):
138
138
  if not self.email_config.to_addresses:
139
139
  raise ValueError("At least one recipient address (to_addresses) is required")
140
140
 
141
+ # Populate NotificationConfig with email settings for easy access
142
+ self.config.smtp_host = self.email_config.smtp_host
143
+ self.config.smtp_port = self.email_config.smtp_port
144
+ self.config.from_address = self.email_config.from_address
145
+ self.config.to_addresses = self.email_config.to_addresses
146
+ self.config.use_tls = self.email_config.use_tls
147
+ self.config.use_ssl = self.email_config.use_ssl
148
+ self.config.subject_prefix = self.email_config.subject_prefix
149
+
141
150
  # Set webhook_url to a placeholder (not used for email)
142
151
  self.webhook_url = "email://smtp"
143
152
 
@@ -40,6 +40,16 @@ class NotificationConfig:
40
40
  max_failures_shown: int = 10
41
41
  mention_users: list[str] = field(default_factory=list)
42
42
  channel: str | None = None
43
+ username: str | None = None # Slack bot username
44
+
45
+ # Email-specific attributes (set by EmailNotifier)
46
+ smtp_host: str | None = None
47
+ smtp_port: int | None = None
48
+ from_address: str | None = None
49
+ to_addresses: list[str] | None = None
50
+ use_tls: bool | None = None
51
+ use_ssl: bool | None = None
52
+ subject_prefix: str | None = None
43
53
 
44
54
 
45
55
  class BaseNotifier(ABC):
@@ -143,13 +153,39 @@ class SlackNotifier(BaseNotifier):
143
153
  """Slack webhook notifier.
144
154
 
145
155
  Usage:
146
- notifier = SlackNotifier(webhook_url="https://hooks.slack.com/...")
156
+ notifier = SlackNotifier(
157
+ webhook_url="https://hooks.slack.com/...",
158
+ channel="#data-quality",
159
+ username="DuckGuard Bot"
160
+ )
147
161
  # or set DUCKGUARD_SLACK_WEBHOOK environment variable
148
162
 
149
163
  result = execute_rules(rules, "data.csv")
150
164
  notifier.send_results(result)
151
165
  """
152
166
 
167
+ def __init__(
168
+ self,
169
+ webhook_url: str | None = None,
170
+ channel: str | None = None,
171
+ username: str | None = None,
172
+ config: NotificationConfig | None = None,
173
+ ):
174
+ """Initialize Slack notifier.
175
+
176
+ Args:
177
+ webhook_url: Slack webhook URL
178
+ channel: Override default channel (e.g., "#data-quality")
179
+ username: Bot username to display
180
+ config: Notification configuration
181
+ """
182
+ super().__init__(webhook_url=webhook_url, config=config)
183
+ # Only override if explicitly provided (don't overwrite config values with None)
184
+ if channel is not None:
185
+ self.config.channel = channel
186
+ if username is not None:
187
+ self.config.username = username
188
+
153
189
  @property
154
190
  def _env_var_name(self) -> str:
155
191
  return "DUCKGUARD_SLACK_WEBHOOK"
@@ -211,6 +247,8 @@ class SlackNotifier(BaseNotifier):
211
247
 
212
248
  if self.config.channel:
213
249
  message["channel"] = self.config.channel
250
+ if self.config.username:
251
+ message["username"] = self.config.username
214
252
 
215
253
  return message
216
254