duckguard 2.3.0__py3-none-any.whl → 3.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +1 -1
- duckguard/anomaly/methods.py +47 -0
- duckguard/anomaly/ml_methods.py +146 -21
- duckguard/checks/__init__.py +26 -0
- duckguard/checks/conditional.py +796 -0
- duckguard/checks/distributional.py +524 -0
- duckguard/checks/multicolumn.py +726 -0
- duckguard/checks/query_based.py +643 -0
- duckguard/connectors/factory.py +30 -2
- duckguard/connectors/files.py +7 -3
- duckguard/core/column.py +372 -0
- duckguard/core/dataset.py +330 -0
- duckguard/core/result.py +5 -0
- duckguard/notifications/email.py +9 -0
- duckguard/notifications/notifiers.py +39 -1
- duckguard/profiler/distribution_analyzer.py +384 -0
- duckguard/profiler/outlier_detector.py +497 -0
- duckguard/profiler/pattern_matcher.py +301 -0
- duckguard/profiler/quality_scorer.py +445 -0
- duckguard/rules/executor.py +642 -0
- duckguard/rules/schema.py +31 -0
- {duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/METADATA +120 -1
- {duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/RECORD +26 -17
- {duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/WHEEL +0 -0
- {duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/entry_points.txt +0 -0
- {duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/licenses/LICENSE +0 -0
duckguard/core/dataset.py
CHANGED
|
@@ -676,6 +676,336 @@ class Dataset:
|
|
|
676
676
|
|
|
677
677
|
return GroupedDataset(self, columns)
|
|
678
678
|
|
|
679
|
+
# =================================================================
|
|
680
|
+
# Multi-Column Validation Methods (DuckGuard 3.0)
|
|
681
|
+
# =================================================================
|
|
682
|
+
|
|
683
|
+
def expect_column_pair_satisfy(
    self,
    column_a: str,
    column_b: str,
    expression: str,
    threshold: float = 1.0
) -> ValidationResult:
    """Validate a relationship between two columns of this dataset.

    Thin wrapper: every argument is forwarded by keyword to
    ``MultiColumnCheckHandler.execute_column_pair_satisfy``, which
    performs the actual evaluation.

    Args:
        column_a: Name of the first column.
        column_b: Name of the second column.
        expression: Expression to evaluate
            (e.g., "A > B", "A + B = 100").
        threshold: Forwarded to the handler unchanged. Previously
            documented as the maximum allowed failure rate (0.0-1.0).
            NOTE(review): a default of 1.0 reads more like a required
            pass rate -- confirm against MultiColumnCheckHandler.

    Returns:
        The ValidationResult produced by the handler.

    Examples:
        >>> data = connect("orders.csv")
        >>> # Date range validation
        >>> result = data.expect_column_pair_satisfy(
        ...     column_a="end_date",
        ...     column_b="start_date",
        ...     expression="end_date >= start_date"
        ... )
        >>> assert result.passed

        >>> # Arithmetic validation
        >>> result = data.expect_column_pair_satisfy(
        ...     column_a="total",
        ...     column_b="subtotal",
        ...     expression="total = subtotal * 1.1"
        ... )
    """
    # Imported at call time rather than at module level (presumably to
    # avoid a circular import with the checks package -- confirm).
    from duckguard.checks.multicolumn import MultiColumnCheckHandler

    outcome = MultiColumnCheckHandler().execute_column_pair_satisfy(
        dataset=self,
        column_a=column_a,
        column_b=column_b,
        expression=expression,
        threshold=threshold,
    )
    return outcome
|
|
728
|
+
|
|
729
|
+
def expect_columns_unique(
    self,
    columns: list[str],
    threshold: float = 1.0
) -> ValidationResult:
    """Validate that a combination of columns acts as a composite key.

    Thin wrapper: the arguments are forwarded by keyword to
    ``MultiColumnCheckHandler.execute_columns_unique``.

    Args:
        columns: Column names that together form the composite key.
        threshold: Minimum required uniqueness rate (0.0-1.0); passed
            to the handler unchanged.

    Returns:
        The ValidationResult produced by the handler.

    Examples:
        >>> # Two-column composite key
        >>> result = data.expect_columns_unique(
        ...     columns=["user_id", "session_id"]
        ... )
        >>> assert result.passed

        >>> # Three-column composite key
        >>> result = data.expect_columns_unique(
        ...     columns=["year", "month", "product_id"]
        ... )
    """
    # Imported at call time rather than at module level (presumably to
    # avoid a circular import with the checks package -- confirm).
    from duckguard.checks.multicolumn import MultiColumnCheckHandler

    outcome = MultiColumnCheckHandler().execute_columns_unique(
        dataset=self,
        columns=columns,
        threshold=threshold,
    )
    return outcome
|
|
763
|
+
|
|
764
|
+
def expect_multicolumn_sum_to_equal(
    self,
    columns: list[str],
    expected_sum: float,
    threshold: float = 0.01
) -> ValidationResult:
    """Validate that the listed columns add up to an expected value.

    Thin wrapper: the arguments are forwarded by keyword to
    ``MultiColumnCheckHandler.execute_multicolumn_sum_equal``.

    Args:
        columns: Names of the columns to sum.
        expected_sum: Value the sum is expected to equal.
        threshold: Maximum allowed deviation; passed to the handler
            unchanged.

    Returns:
        The ValidationResult produced by the handler.

    Examples:
        >>> # Components must sum to 100%
        >>> result = data.expect_multicolumn_sum_to_equal(
        ...     columns=["q1_pct", "q2_pct", "q3_pct", "q4_pct"],
        ...     expected_sum=100.0
        ... )
        >>> assert result.passed

        >>> # Budget allocation check
        >>> result = data.expect_multicolumn_sum_to_equal(
        ...     columns=["marketing", "sales", "r_and_d"],
        ...     expected_sum=data.total_budget
        ... )
    """
    # Imported at call time rather than at module level (presumably to
    # avoid a circular import with the checks package -- confirm).
    from duckguard.checks.multicolumn import MultiColumnCheckHandler

    outcome = MultiColumnCheckHandler().execute_multicolumn_sum_equal(
        dataset=self,
        columns=columns,
        expected_sum=expected_sum,
        threshold=threshold,
    )
    return outcome
|
|
803
|
+
|
|
804
|
+
# =================================================================
|
|
805
|
+
# Query-Based Validation Methods (DuckGuard 3.0)
|
|
806
|
+
# =================================================================
|
|
807
|
+
|
|
808
|
+
def expect_query_to_return_no_rows(
    self,
    query: str,
    message: str | None = None
) -> ValidationResult:
    """Validate that a custom SQL query finds no violating rows.

    Intended usage: write a query that selects violations; the check
    passes when that query yields nothing. This method only forwards
    its arguments by keyword to
    ``QueryCheckHandler.execute_query_no_rows``.

    Args:
        query: SQL SELECT query (use 'table' to reference the dataset).
        message: Optional custom message.

    Returns:
        ValidationResult (passed if the query returns 0 rows).

    Examples:
        >>> data = connect("orders.csv")
        >>> # Find invalid totals (total < subtotal)
        >>> result = data.expect_query_to_return_no_rows(
        ...     query="SELECT * FROM table WHERE total < subtotal"
        ... )
        >>> assert result.passed

        >>> # Find future dates
        >>> result = data.expect_query_to_return_no_rows(
        ...     query="SELECT * FROM table WHERE order_date > CURRENT_DATE"
        ... )

    Security (as documented for the handler; nothing below is enforced
    by this wrapper itself):
        - Query is validated to prevent SQL injection
        - Only SELECT queries allowed
        - READ-ONLY mode enforced
        - 30-second timeout
        - 10,000 row result limit
    """
    # Imported at call time rather than at module level (presumably to
    # avoid a circular import with the checks package -- confirm).
    from duckguard.checks.query_based import QueryCheckHandler

    outcome = QueryCheckHandler().execute_query_no_rows(
        dataset=self,
        query=query,
        message=message,
    )
    return outcome
|
|
853
|
+
|
|
854
|
+
def expect_query_to_return_rows(
    self,
    query: str,
    message: str | None = None
) -> ValidationResult:
    """Validate that a custom SQL query yields at least one row.

    Intended usage: confirm that expected data exists in the dataset.
    This method only forwards its arguments by keyword to
    ``QueryCheckHandler.execute_query_returns_rows``.

    Args:
        query: SQL SELECT query (use 'table' to reference the dataset).
        message: Optional custom message.

    Returns:
        ValidationResult (passed if the query returns > 0 rows).

    Examples:
        >>> data = connect("products.csv")
        >>> # Ensure we have active products
        >>> result = data.expect_query_to_return_rows(
        ...     query="SELECT * FROM table WHERE status = 'active'"
        ... )
        >>> assert result.passed

        >>> # Ensure we have recent data
        >>> result = data.expect_query_to_return_rows(
        ...     query="SELECT * FROM table WHERE created_at >= CURRENT_DATE - 7"
        ... )

    Security (as documented for the handler; nothing below is enforced
    by this wrapper itself):
        - Query is validated to prevent SQL injection
        - Only SELECT queries allowed
        - READ-ONLY mode enforced
    """
    # Imported at call time rather than at module level (presumably to
    # avoid a circular import with the checks package -- confirm).
    from duckguard.checks.query_based import QueryCheckHandler

    outcome = QueryCheckHandler().execute_query_returns_rows(
        dataset=self,
        query=query,
        message=message,
    )
    return outcome
|
|
896
|
+
|
|
897
|
+
def expect_query_result_to_equal(
    self,
    query: str,
    expected: Any,
    tolerance: float | None = None,
    message: str | None = None
) -> ValidationResult:
    """Validate that a custom SQL query returns one specific value.

    Intended usage: aggregate validation (COUNT, SUM, AVG, etc.). This
    method only forwards its arguments by keyword to
    ``QueryCheckHandler.execute_query_result_equals``.

    Args:
        query: SQL query returning a single value (use 'table' to
            reference the dataset).
        expected: Expected value.
        tolerance: Optional tolerance for numeric comparisons.
        message: Optional custom message.

    Returns:
        ValidationResult (passed if the query result equals expected).

    Examples:
        >>> data = connect("orders.csv")
        >>> # Check pending order count
        >>> result = data.expect_query_result_to_equal(
        ...     query="SELECT COUNT(*) FROM table WHERE status = 'pending'",
        ...     expected=0
        ... )
        >>> assert result.passed

        >>> # Check average with tolerance
        >>> result = data.expect_query_result_to_equal(
        ...     query="SELECT AVG(price) FROM table",
        ...     expected=100.0,
        ...     tolerance=5.0
        ... )

        >>> # Check sum constraint
        >>> result = data.expect_query_result_to_equal(
        ...     query="SELECT SUM(quantity) FROM table WHERE category = 'electronics'",
        ...     expected=1000
        ... )

    Security (as documented for the handler; nothing below is enforced
    by this wrapper itself):
        - Query must return exactly 1 row with 1 column
        - Query is validated to prevent SQL injection
    """
    # Imported at call time rather than at module level (presumably to
    # avoid a circular import with the checks package -- confirm).
    from duckguard.checks.query_based import QueryCheckHandler

    outcome = QueryCheckHandler().execute_query_result_equals(
        dataset=self,
        query=query,
        expected=expected,
        tolerance=tolerance,
        message=message,
    )
    return outcome
|
|
953
|
+
|
|
954
|
+
def expect_query_result_to_be_between(
    self,
    query: str,
    min_value: float,
    max_value: float,
    message: str | None = None
) -> ValidationResult:
    """Validate that a custom SQL query result falls inside a range.

    Intended usage: metric validation (e.g., an average must lie
    between X and Y). This method only forwards its arguments by
    keyword to ``QueryCheckHandler.execute_query_result_between``.

    Args:
        query: SQL query returning a single numeric value.
        min_value: Minimum allowed value (inclusive).
        max_value: Maximum allowed value (inclusive).
        message: Optional custom message.

    Returns:
        ValidationResult (passed if min_value <= result <= max_value).

    Examples:
        >>> data = connect("metrics.csv")
        >>> # Average price in range
        >>> result = data.expect_query_result_to_be_between(
        ...     query="SELECT AVG(price) FROM table",
        ...     min_value=10.0,
        ...     max_value=1000.0
        ... )
        >>> assert result.passed

        >>> # Null rate validation
        >>> result = data.expect_query_result_to_be_between(
        ...     query='''
        ...         SELECT (COUNT(*) FILTER (WHERE price IS NULL)) * 100.0 / COUNT(*)
        ...         FROM table
        ...     ''',
        ...     min_value=0.0,
        ...     max_value=5.0  # Max 5% nulls
        ... )

    Security (as documented for the handler; nothing below is enforced
    by this wrapper itself):
        - Query must return exactly 1 row with 1 numeric column
        - Query is validated to prevent SQL injection
    """
    # Imported at call time rather than at module level (presumably to
    # avoid a circular import with the checks package -- confirm).
    from duckguard.checks.query_based import QueryCheckHandler

    outcome = QueryCheckHandler().execute_query_result_between(
        dataset=self,
        query=query,
        min_value=min_value,
        max_value=max_value,
        message=message,
    )
    return outcome
|
|
1008
|
+
|
|
679
1009
|
|
|
680
1010
|
class GroupedDataset:
|
|
681
1011
|
"""
|
duckguard/core/result.py
CHANGED
|
@@ -37,6 +37,11 @@ class FailedRow:
|
|
|
37
37
|
reason: str = ""
|
|
38
38
|
context: dict[str, Any] = field(default_factory=dict)
|
|
39
39
|
|
|
40
|
+
@property
def row_number(self) -> int:
    """Return ``row_index`` under its older name.

    Read-only alias kept for backward compatibility; it yields exactly
    the value stored in ``self.row_index``.
    """
    index = self.row_index
    return index
|
|
44
|
+
|
|
40
45
|
def __repr__(self) -> str:
    """Return a compact debug string with the row index, column and value.

    The column name is wrapped in single quotes as-is; the value is
    rendered with ``repr()``.
    """
    row, col, val = self.row_index, self.column, self.value
    return f"FailedRow(row={row}, column='{col}', value={val!r})"
|
|
42
47
|
|
duckguard/notifications/email.py
CHANGED
|
@@ -138,6 +138,15 @@ class EmailNotifier(BaseNotifier):
|
|
|
138
138
|
if not self.email_config.to_addresses:
|
|
139
139
|
raise ValueError("At least one recipient address (to_addresses) is required")
|
|
140
140
|
|
|
141
|
+
# Populate NotificationConfig with email settings for easy access
|
|
142
|
+
self.config.smtp_host = self.email_config.smtp_host
|
|
143
|
+
self.config.smtp_port = self.email_config.smtp_port
|
|
144
|
+
self.config.from_address = self.email_config.from_address
|
|
145
|
+
self.config.to_addresses = self.email_config.to_addresses
|
|
146
|
+
self.config.use_tls = self.email_config.use_tls
|
|
147
|
+
self.config.use_ssl = self.email_config.use_ssl
|
|
148
|
+
self.config.subject_prefix = self.email_config.subject_prefix
|
|
149
|
+
|
|
141
150
|
# Set webhook_url to a placeholder (not used for email)
|
|
142
151
|
self.webhook_url = "email://smtp"
|
|
143
152
|
|
duckguard/notifications/notifiers.py
CHANGED
|
@@ -40,6 +40,16 @@ class NotificationConfig:
|
|
|
40
40
|
max_failures_shown: int = 10
|
|
41
41
|
mention_users: list[str] = field(default_factory=list)
|
|
42
42
|
channel: str | None = None
|
|
43
|
+
username: str | None = None # Slack bot username
|
|
44
|
+
|
|
45
|
+
# Email-specific attributes (set by EmailNotifier)
|
|
46
|
+
smtp_host: str | None = None
|
|
47
|
+
smtp_port: int | None = None
|
|
48
|
+
from_address: str | None = None
|
|
49
|
+
to_addresses: list[str] | None = None
|
|
50
|
+
use_tls: bool | None = None
|
|
51
|
+
use_ssl: bool | None = None
|
|
52
|
+
subject_prefix: str | None = None
|
|
43
53
|
|
|
44
54
|
|
|
45
55
|
class BaseNotifier(ABC):
|
|
@@ -143,13 +153,39 @@ class SlackNotifier(BaseNotifier):
|
|
|
143
153
|
"""Slack webhook notifier.
|
|
144
154
|
|
|
145
155
|
Usage:
|
|
146
|
-
notifier = SlackNotifier(
|
|
156
|
+
notifier = SlackNotifier(
|
|
157
|
+
webhook_url="https://hooks.slack.com/...",
|
|
158
|
+
channel="#data-quality",
|
|
159
|
+
username="DuckGuard Bot"
|
|
160
|
+
)
|
|
147
161
|
# or set DUCKGUARD_SLACK_WEBHOOK environment variable
|
|
148
162
|
|
|
149
163
|
result = execute_rules(rules, "data.csv")
|
|
150
164
|
notifier.send_results(result)
|
|
151
165
|
"""
|
|
152
166
|
|
|
167
|
+
def __init__(
    self,
    webhook_url: str | None = None,
    channel: str | None = None,
    username: str | None = None,
    config: NotificationConfig | None = None,
):
    """Initialize Slack notifier.

    ``webhook_url`` and ``config`` are handed to the base-class
    initializer; ``channel`` and ``username`` are then written onto
    ``self.config`` when they were supplied.

    Args:
        webhook_url: Slack webhook URL
        channel: Override default channel (e.g., "#data-quality")
        username: Bot username to display
        config: Notification configuration
    """
    super().__init__(webhook_url=webhook_url, config=config)
    # Only explicitly supplied values replace what the config already
    # holds; a None argument must not wipe out a configured value.
    # NOTE(review): if the base class stores the caller's ``config``
    # object as-is, these writes mutate the caller's instance -- confirm.
    for attr_name, override in (("channel", channel), ("username", username)):
        if override is not None:
            setattr(self.config, attr_name, override)
|
|
188
|
+
|
|
153
189
|
@property
def _env_var_name(self) -> str:
    """Name of the environment variable associated with the Slack webhook."""
    env_var = "DUCKGUARD_SLACK_WEBHOOK"
    return env_var
|
|
@@ -211,6 +247,8 @@ class SlackNotifier(BaseNotifier):
|
|
|
211
247
|
|
|
212
248
|
if self.config.channel:
|
|
213
249
|
message["channel"] = self.config.channel
|
|
250
|
+
if self.config.username:
|
|
251
|
+
message["username"] = self.config.username
|
|
214
252
|
|
|
215
253
|
return message
|
|
216
254
|
|