PyPI - duckguard - Versions diffs - 2.3.0__py3-none-any.whl → 3.0.1__py3-none-any.whl - Mend

duckguard-2.3.0-py3-none-any.whl → duckguard-3.0.1-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (26)

duckguard/__init__.py +1 -1
duckguard/anomaly/methods.py +47 -0
duckguard/anomaly/ml_methods.py +146 -21
duckguard/checks/__init__.py +26 -0
duckguard/checks/conditional.py +796 -0
duckguard/checks/distributional.py +524 -0
duckguard/checks/multicolumn.py +726 -0
duckguard/checks/query_based.py +643 -0
duckguard/connectors/factory.py +30 -2
duckguard/connectors/files.py +7 -3
duckguard/core/column.py +372 -0
duckguard/core/dataset.py +330 -0
duckguard/core/result.py +5 -0
duckguard/notifications/email.py +9 -0
duckguard/notifications/notifiers.py +39 -1
duckguard/profiler/distribution_analyzer.py +384 -0
duckguard/profiler/outlier_detector.py +497 -0
duckguard/profiler/pattern_matcher.py +301 -0
duckguard/profiler/quality_scorer.py +445 -0
duckguard/rules/executor.py +642 -0
duckguard/rules/schema.py +31 -0
{duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/METADATA +120 -1
{duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/RECORD +26 -17
{duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/WHEEL +0 -0
{duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/entry_points.txt +0 -0
{duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/licenses/LICENSE +0 -0

duckguard/core/dataset.py CHANGED Viewed

@@ -676,6 +676,336 @@ class Dataset:
         return GroupedDataset(self, columns)
+    # =================================================================
+    # Multi-Column Validation Methods (DuckGuard 3.0)
+    # =================================================================
+    def expect_column_pair_satisfy(
+        self,
+        column_a: str,
+        column_b: str,
+        expression: str,
+        threshold: float = 1.0
+    ) -> ValidationResult:
+        """Check that column pair satisfies expression.
+        Args:
+            column_a: First column name
+            column_b: Second column name
+            expression: Expression to evaluate (e.g., "A > B", "A + B = 100")
+            threshold: Maximum allowed failure rate (0.0-1.0)
+        Returns:
+            ValidationResult with pass/fail status
+        Examples:
+            >>> data = connect("orders.csv")
+            >>> # Date range validation
+            >>> result = data.expect_column_pair_satisfy(
+            ...     column_a="end_date",
+            ...     column_b="start_date",
+            ...     expression="end_date >= start_date"
+            ... )
+            >>> assert result.passed
+            >>> # Arithmetic validation
+            >>> result = data.expect_column_pair_satisfy(
+            ...     column_a="total",
+            ...     column_b="subtotal",
+            ...     expression="total = subtotal * 1.1"
+            ... )
+        """
+        from duckguard.checks.multicolumn import MultiColumnCheckHandler
+        handler = MultiColumnCheckHandler()
+        return handler.execute_column_pair_satisfy(
+            dataset=self,
+            column_a=column_a,
+            column_b=column_b,
+            expression=expression,
+            threshold=threshold
+        )
+    def expect_columns_unique(
+        self,
+        columns: list[str],
+        threshold: float = 1.0
+    ) -> ValidationResult:
+        """Check that combination of columns is unique (composite key).
+        Args:
+            columns: List of column names forming composite key
+            threshold: Minimum required uniqueness rate (0.0-1.0)
+        Returns:
+            ValidationResult with pass/fail status
+        Examples:
+            >>> # Two-column composite key
+            >>> result = data.expect_columns_unique(
+            ...     columns=["user_id", "session_id"]
+            ... )
+            >>> assert result.passed
+            >>> # Three-column composite key
+            >>> result = data.expect_columns_unique(
+            ...     columns=["year", "month", "product_id"]
+            ... )
+        """
+        from duckguard.checks.multicolumn import MultiColumnCheckHandler
+        handler = MultiColumnCheckHandler()
+        return handler.execute_columns_unique(
+            dataset=self,
+            columns=columns,
+            threshold=threshold
+        )
+    def expect_multicolumn_sum_to_equal(
+        self,
+        columns: list[str],
+        expected_sum: float,
+        threshold: float = 0.01
+    ) -> ValidationResult:
+        """Check that sum of columns equals expected value.
+        Args:
+            columns: List of columns to sum
+            expected_sum: Expected sum value
+            threshold: Maximum allowed deviation
+        Returns:
+            ValidationResult with pass/fail status
+        Examples:
+            >>> # Components must sum to 100%
+            >>> result = data.expect_multicolumn_sum_to_equal(
+            ...     columns=["q1_pct", "q2_pct", "q3_pct", "q4_pct"],
+            ...     expected_sum=100.0
+            ... )
+            >>> assert result.passed
+            >>> # Budget allocation check
+            >>> result = data.expect_multicolumn_sum_to_equal(
+            ...     columns=["marketing", "sales", "r_and_d"],
+            ...     expected_sum=data.total_budget
+            ... )
+        """
+        from duckguard.checks.multicolumn import MultiColumnCheckHandler
+        handler = MultiColumnCheckHandler()
+        return handler.execute_multicolumn_sum_equal(
+            dataset=self,
+            columns=columns,
+            expected_sum=expected_sum,
+            threshold=threshold
+        )
+    # =================================================================
+    # Query-Based Validation Methods (DuckGuard 3.0)
+    # =================================================================
+    def expect_query_to_return_no_rows(
+        self,
+        query: str,
+        message: str | None = None
+    ) -> ValidationResult:
+        """Check that custom SQL query returns no rows (finds no violations).
+        Use case: Write a query that finds violations. The check passes if
+        the query returns no rows (no violations found).
+        Args:
+            query: SQL SELECT query (use 'table' to reference the dataset)
+            message: Optional custom message
+        Returns:
+            ValidationResult (passed if query returns 0 rows)
+        Examples:
+            >>> data = connect("orders.csv")
+            >>> # Find invalid totals (total < subtotal)
+            >>> result = data.expect_query_to_return_no_rows(
+            ...     query="SELECT * FROM table WHERE total < subtotal"
+            ... )
+            >>> assert result.passed
+            >>> # Find future dates
+            >>> result = data.expect_query_to_return_no_rows(
+            ...     query="SELECT * FROM table WHERE order_date > CURRENT_DATE"
+            ... )
+        Security:
+            - Query is validated to prevent SQL injection
+            - Only SELECT queries allowed
+            - READ-ONLY mode enforced
+            - 30-second timeout
+            - 10,000 row result limit
+        """
+        from duckguard.checks.query_based import QueryCheckHandler
+        handler = QueryCheckHandler()
+        return handler.execute_query_no_rows(
+            dataset=self,
+            query=query,
+            message=message
+        )
+    def expect_query_to_return_rows(
+        self,
+        query: str,
+        message: str | None = None
+    ) -> ValidationResult:
+        """Check that custom SQL query returns at least one row.
+        Use case: Ensure expected data exists in the dataset.
+        Args:
+            query: SQL SELECT query (use 'table' to reference the dataset)
+            message: Optional custom message
+        Returns:
+            ValidationResult (passed if query returns > 0 rows)
+        Examples:
+            >>> data = connect("products.csv")
+            >>> # Ensure we have active products
+            >>> result = data.expect_query_to_return_rows(
+            ...     query="SELECT * FROM table WHERE status = 'active'"
+            ... )
+            >>> assert result.passed
+            >>> # Ensure we have recent data
+            >>> result = data.expect_query_to_return_rows(
+            ...     query="SELECT * FROM table WHERE created_at >= CURRENT_DATE - 7"
+            ... )
+        Security:
+            - Query is validated to prevent SQL injection
+            - Only SELECT queries allowed
+            - READ-ONLY mode enforced
+        """
+        from duckguard.checks.query_based import QueryCheckHandler
+        handler = QueryCheckHandler()
+        return handler.execute_query_returns_rows(
+            dataset=self,
+            query=query,
+            message=message
+        )
+    def expect_query_result_to_equal(
+        self,
+        query: str,
+        expected: Any,
+        tolerance: float | None = None,
+        message: str | None = None
+    ) -> ValidationResult:
+        """Check that custom SQL query returns a specific value.
+        Use case: Aggregate validation (COUNT, SUM, AVG, etc.)
+        Args:
+            query: SQL query returning single value (use 'table' to reference dataset)
+            expected: Expected value
+            tolerance: Optional tolerance for numeric comparisons
+            message: Optional custom message
+        Returns:
+            ValidationResult (passed if query result equals expected)
+        Examples:
+            >>> data = connect("orders.csv")
+            >>> # Check pending order count
+            >>> result = data.expect_query_result_to_equal(
+            ...     query="SELECT COUNT(*) FROM table WHERE status = 'pending'",
+            ...     expected=0
+            ... )
+            >>> assert result.passed
+            >>> # Check average with tolerance
+            >>> result = data.expect_query_result_to_equal(
+            ...     query="SELECT AVG(price) FROM table",
+            ...     expected=100.0,
+            ...     tolerance=5.0
+            ... )
+            >>> # Check sum constraint
+            >>> result = data.expect_query_result_to_equal(
+            ...     query="SELECT SUM(quantity) FROM table WHERE category = 'electronics'",
+            ...     expected=1000
+            ... )
+        Security:
+            - Query must return exactly 1 row with 1 column
+            - Query is validated to prevent SQL injection
+        """
+        from duckguard.checks.query_based import QueryCheckHandler
+        handler = QueryCheckHandler()
+        return handler.execute_query_result_equals(
+            dataset=self,
+            query=query,
+            expected=expected,
+            tolerance=tolerance,
+            message=message
+        )
+    def expect_query_result_to_be_between(
+        self,
+        query: str,
+        min_value: float,
+        max_value: float,
+        message: str | None = None
+    ) -> ValidationResult:
+        """Check that custom SQL query result is within a range.
+        Use case: Metric validation (e.g., average must be between X and Y)
+        Args:
+            query: SQL query returning single numeric value
+            min_value: Minimum allowed value (inclusive)
+            max_value: Maximum allowed value (inclusive)
+            message: Optional custom message
+        Returns:
+            ValidationResult (passed if min_value <= result <= max_value)
+        Examples:
+            >>> data = connect("metrics.csv")
+            >>> # Average price in range
+            >>> result = data.expect_query_result_to_be_between(
+            ...     query="SELECT AVG(price) FROM table",
+            ...     min_value=10.0,
+            ...     max_value=1000.0
+            ... )
+            >>> assert result.passed
+            >>> # Null rate validation
+            >>> result = data.expect_query_result_to_be_between(
+            ...     query='''
+            ...         SELECT (COUNT(*) FILTER (WHERE price IS NULL)) * 100.0 / COUNT(*)
+            ...         FROM table
+            ...     ''',
+            ...     min_value=0.0,
+            ...     max_value=5.0  # Max 5% nulls
+            ... )
+        Security:
+            - Query must return exactly 1 row with 1 numeric column
+            - Query is validated to prevent SQL injection
+        """
+        from duckguard.checks.query_based import QueryCheckHandler
+        handler = QueryCheckHandler()
+        return handler.execute_query_result_between(
+            dataset=self,
+            query=query,
+            min_value=min_value,
+            max_value=max_value,
+            message=message
+        )
 class GroupedDataset:
     """

duckguard/core/result.py CHANGED Viewed

@@ -37,6 +37,11 @@ class FailedRow:
     reason: str = ""
     context: dict[str, Any] = field(default_factory=dict)
+    @property
+    def row_number(self) -> int:
+        """Alias for row_index (backward compatibility)."""
+        return self.row_index
     def __repr__(self) -> str:
         return f"FailedRow(row={self.row_index}, column='{self.column}', value={self.value!r})"

duckguard/notifications/email.py CHANGED Viewed

@@ -138,6 +138,15 @@ class EmailNotifier(BaseNotifier):
         if not self.email_config.to_addresses:
             raise ValueError("At least one recipient address (to_addresses) is required")
+        # Populate NotificationConfig with email settings for easy access
+        self.config.smtp_host = self.email_config.smtp_host
+        self.config.smtp_port = self.email_config.smtp_port
+        self.config.from_address = self.email_config.from_address
+        self.config.to_addresses = self.email_config.to_addresses
+        self.config.use_tls = self.email_config.use_tls
+        self.config.use_ssl = self.email_config.use_ssl
+        self.config.subject_prefix = self.email_config.subject_prefix
         # Set webhook_url to a placeholder (not used for email)
         self.webhook_url = "email://smtp"

duckguard/notifications/notifiers.py CHANGED Viewed

@@ -40,6 +40,16 @@ class NotificationConfig:
     max_failures_shown: int = 10
     mention_users: list[str] = field(default_factory=list)
     channel: str | None = None
+    username: str | None = None  # Slack bot username
+    # Email-specific attributes (set by EmailNotifier)
+    smtp_host: str | None = None
+    smtp_port: int | None = None
+    from_address: str | None = None
+    to_addresses: list[str] | None = None
+    use_tls: bool | None = None
+    use_ssl: bool | None = None
+    subject_prefix: str | None = None
 class BaseNotifier(ABC):
@@ -143,13 +153,39 @@ class SlackNotifier(BaseNotifier):
     """Slack webhook notifier.
     Usage:
-        notifier = SlackNotifier(webhook_url="https://hooks.slack.com/...")
+        notifier = SlackNotifier(
+            webhook_url="https://hooks.slack.com/...",
+            channel="#data-quality",
+            username="DuckGuard Bot"
+        )
         # or set DUCKGUARD_SLACK_WEBHOOK environment variable
         result = execute_rules(rules, "data.csv")
         notifier.send_results(result)
     """
+    def __init__(
+        self,
+        webhook_url: str | None = None,
+        channel: str | None = None,
+        username: str | None = None,
+        config: NotificationConfig | None = None,
+    ):
+        """Initialize Slack notifier.
+        Args:
+            webhook_url: Slack webhook URL
+            channel: Override default channel (e.g., "#data-quality")
+            username: Bot username to display
+            config: Notification configuration
+        """
+        super().__init__(webhook_url=webhook_url, config=config)
+        # Only override if explicitly provided (don't overwrite config values with None)
+        if channel is not None:
+            self.config.channel = channel
+        if username is not None:
+            self.config.username = username
     @property
     def _env_var_name(self) -> str:
         return "DUCKGUARD_SLACK_WEBHOOK"
@@ -211,6 +247,8 @@ class SlackNotifier(BaseNotifier):
         if self.config.channel:
             message["channel"] = self.config.channel
+        if self.config.username:
+            message["username"] = self.config.username
         return message

duckguard 2.3.0__py3-none-any.whl → 3.0.1__py3-none-any.whl

duckguard-2.3.0-py3-none-any.whl → duckguard-3.0.1-py3-none-any.whl