duckguard 2.3.0__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,7 +31,7 @@ def register_connector(connector_class: type[Connector]) -> None:
 
 
 def connect(
-    source: str,
+    source: Any,
     *,
     table: str | None = None,
     schema: str | None = None,
@@ -46,7 +46,7 @@ def connect(
     It automatically detects the source type and uses the appropriate connector.
 
     Args:
-        source: Path to file, connection string, or URL
+        source: Path to file, connection string, URL, or DataFrame (pandas/polars/pyarrow)
         table: Table name (for database connections)
         schema: Schema name (for database connections)
         database: Database name (for database connections)
@@ -60,6 +60,9 @@ def connect(
         # Connect to a CSV file
         orders = connect("data/orders.csv")
 
+        # Connect to a DataFrame
+        orders = connect(df)
+
         # Connect to a Parquet file on S3
         orders = connect("s3://bucket/orders.parquet")
 
@@ -72,6 +75,23 @@ def connect(
     Raises:
         ValueError: If no connector can handle the source
     """
+    # Handle DataFrame sources (pandas, polars, pyarrow)
+    if not isinstance(source, str):
+        # Check if it's a DataFrame-like object
+        if hasattr(source, '__dataframe__') or hasattr(source, 'to_pandas') or \
+           (hasattr(source, 'shape') and hasattr(source, 'columns')):
+            # Register DataFrame with engine
+            if engine is None:
+                engine = DuckGuardEngine.get_instance()
+
+            # Generate a unique name for the DataFrame
+            import hashlib
+            import time
+            df_name = f"df_{hashlib.md5(str(time.time()).encode()).hexdigest()[:8]}"
+
+            engine.register_dataframe(df_name, source)
+            return Dataset(source=df_name, engine=engine, name="dataframe")
+
     config = ConnectionConfig(
         source=source,
         table=table,
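With this change, connect() accepts an in-memory DataFrame directly instead of a path or connection string. A minimal usage sketch, assuming pandas is installed and that connect is importable from the top-level duckguard package (the exact import path is an assumption, not shown in this diff):

    import pandas as pd

    from duckguard import connect  # assumed top-level export

    # Any object exposing __dataframe__, to_pandas, or shape/columns is
    # treated as a DataFrame, registered with the DuckDB engine under a
    # generated name, and wrapped in a Dataset.
    df = pd.DataFrame({"order_id": [1, 2, 3], "amount": [10.0, 25.5, 7.2]})
    orders = connect(df)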
@@ -99,6 +119,10 @@ def connect(
 
 def _is_database_connection(source: str) -> bool:
     """Check if source is a database connection string."""
+    # Only handle string sources
+    if not isinstance(source, str):
+        return False
+
     db_prefixes = (
         "postgres://",
         "postgresql://",
@@ -143,6 +167,10 @@ def _handle_database_connection(
     engine: DuckGuardEngine | None,
 ) -> Dataset:
     """Handle database connection strings."""
+    # Validate source is a string
+    if not isinstance(source, str):
+        raise ValueError(f"Expected string source, got {type(source).__name__}")
+
     source_lower = source.lower()
 
     # PostgreSQL
@@ -65,6 +65,10 @@ class FileConnector(Connector):
     @classmethod
     def can_handle(cls, source: str) -> bool:
         """Check if this connector can handle the source."""
+        # Only handle string paths
+        if not isinstance(source, str):
+            return False
+
         # Check for file extensions
         path = Path(source)
         ext = path.suffix.lower()
@@ -99,7 +103,7 @@ class S3Connector(FileConnector):
     @classmethod
     def can_handle(cls, source: str) -> bool:
         """Check if this is an S3 path."""
-        return source.lower().startswith("s3://")
+        return isinstance(source, str) and source.lower().startswith("s3://")
 
     @classmethod
     def get_priority(cls) -> int:
@@ -113,7 +117,7 @@ class GCSConnector(FileConnector):
     @classmethod
     def can_handle(cls, source: str) -> bool:
         """Check if this is a GCS path."""
-        return source.lower().startswith(("gs://", "gcs://"))
+        return isinstance(source, str) and source.lower().startswith(("gs://", "gcs://"))
 
     @classmethod
     def get_priority(cls) -> int:
@@ -127,7 +131,7 @@ class AzureConnector(FileConnector):
     @classmethod
     def can_handle(cls, source: str) -> bool:
         """Check if this is an Azure path."""
-        return source.lower().startswith(("az://", "abfs://"))
+        return isinstance(source, str) and source.lower().startswith(("az://", "abfs://"))
 
     @classmethod
     def get_priority(cls) -> int:
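Each can_handle() change follows the same pattern: a connector asked about a non-string source now declines instead of raising, so DataFrame inputs fall through to the DataFrame branch in connect(). A toy sketch of that guard, using a hypothetical DemoFileConnector and an invented extension list rather than the package's real registry:

    from pathlib import Path

    class DemoFileConnector:
        """Mimics the isinstance guard added to FileConnector.can_handle()."""

        @classmethod
        def can_handle(cls, source) -> bool:
            # Non-string sources (e.g. DataFrames) are simply declined.
            if not isinstance(source, str):
                return False
            return Path(source).suffix.lower() in {".csv", ".parquet", ".json"}

    class FakeFrame:
        shape, columns = (3, 2), ["a", "b"]  # DataFrame-like attributes

    assert DemoFileConnector.can_handle("orders.csv") is True
    assert DemoFileConnector.can_handle(FakeFrame()) is False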
duckguard/core/column.py CHANGED
@@ -1002,6 +1002,378 @@ class Column:
         rows = self._dataset.engine.fetch_all(sql)
         return {row[0]: row[1] for row in rows}
 
+    # =====================================================================
+    # Conditional Validation Methods (DuckGuard 3.0)
+    # =====================================================================
+
+    def not_null_when(
+        self,
+        condition: str,
+        threshold: float = 1.0
+    ) -> ValidationResult:
+        """Check column is not null when condition is true.
+
+        This enables sophisticated conditional validation like:
+        - "State must not be null when country = 'USA'"
+        - "Phone is required when contact_method = 'phone'"
+
+        Args:
+            condition: SQL WHERE clause condition (without WHERE keyword)
+            threshold: Minimum required non-null rate (0.0 to 1.0, default 1.0)
+
+        Returns:
+            ValidationResult with pass/fail status
+
+        Raises:
+            ValidationError: If condition is invalid or contains forbidden SQL
+
+        Examples:
+            >>> data = connect("customers.csv")
+            >>> # State required for US customers
+            >>> result = data.state.not_null_when("country = 'USA'")
+            >>> assert result.passed
+
+            >>> # Email required for registered users
+            >>> result = data.email.not_null_when("user_type = 'registered'")
+            >>> assert result.passed
+
+        Security:
+            Conditions are validated to prevent SQL injection. Only SELECT
+            queries with WHERE clauses are allowed.
+        """
+        from duckguard.checks.conditional import ConditionalCheckHandler
+
+        handler = ConditionalCheckHandler()
+        return handler.execute_not_null_when(
+            dataset=self._dataset,
+            column=self._name,
+            condition=condition,
+            threshold=threshold
+        )
+
+    def unique_when(
+        self,
+        condition: str,
+        threshold: float = 1.0
+    ) -> ValidationResult:
+        """Check column is unique when condition is true.
+
+        Args:
+            condition: SQL WHERE clause condition (without WHERE keyword)
+            threshold: Minimum required uniqueness rate (0.0 to 1.0, default 1.0)
+
+        Returns:
+            ValidationResult with pass/fail status
+
+        Examples:
+            >>> # Order IDs must be unique for completed orders
+            >>> result = data.order_id.unique_when("status = 'completed'")
+            >>> assert result.passed
+
+            >>> # Transaction IDs unique for successful transactions
+            >>> result = data.txn_id.unique_when("success = true")
+            >>> assert result.passed
+        """
+        from duckguard.checks.conditional import ConditionalCheckHandler
+
+        handler = ConditionalCheckHandler()
+        return handler.execute_unique_when(
+            dataset=self._dataset,
+            column=self._name,
+            condition=condition,
+            threshold=threshold
+        )
+
+    def between_when(
+        self,
+        min_val: float,
+        max_val: float,
+        condition: str,
+        threshold: float = 1.0
+    ) -> ValidationResult:
+        """Check column is between min and max when condition is true.
+
+        Args:
+            min_val: Minimum allowed value
+            max_val: Maximum allowed value
+            condition: SQL WHERE clause condition (without WHERE keyword)
+            threshold: Minimum required pass rate (0.0 to 1.0, default 1.0)
+
+        Returns:
+            ValidationResult with pass/fail status
+
+        Examples:
+            >>> # Discount between 0-50% for standard customers
+            >>> result = data.discount.between_when(
+            ...     min_val=0,
+            ...     max_val=50,
+            ...     condition="customer_tier = 'standard'"
+            ... )
+            >>> assert result.passed
+
+            >>> # Age between 18-65 for employees
+            >>> result = data.age.between_when(18, 65, "type = 'employee'")
+            >>> assert result.passed
+        """
+        from duckguard.checks.conditional import ConditionalCheckHandler
+
+        handler = ConditionalCheckHandler()
+        return handler.execute_between_when(
+            dataset=self._dataset,
+            column=self._name,
+            min_value=min_val,
+            max_value=max_val,
+            condition=condition,
+            threshold=threshold
+        )
+
+    def isin_when(
+        self,
+        allowed_values: list[Any],
+        condition: str,
+        threshold: float = 1.0
+    ) -> ValidationResult:
+        """Check column is in allowed values when condition is true.
+
+        Args:
+            allowed_values: List of allowed values
+            condition: SQL WHERE clause condition (without WHERE keyword)
+            threshold: Minimum required pass rate (0.0 to 1.0, default 1.0)
+
+        Returns:
+            ValidationResult with pass/fail status
+
+        Examples:
+            >>> # Status must be specific values for paid orders
+            >>> result = data.status.isin_when(
+            ...     allowed_values=['shipped', 'delivered'],
+            ...     condition="payment_status = 'paid'"
+            ... )
+            >>> assert result.passed
+
+            >>> # Category restricted for active products
+            >>> result = data.category.isin_when(
+            ...     ['A', 'B', 'C'],
+            ...     "is_active = true"
+            ... )
+            >>> assert result.passed
+        """
+        from duckguard.checks.conditional import ConditionalCheckHandler
+
+        handler = ConditionalCheckHandler()
+        return handler.execute_isin_when(
+            dataset=self._dataset,
+            column=self._name,
+            allowed_values=allowed_values,
+            condition=condition,
+            threshold=threshold
+        )
+
+    def matches_when(
+        self,
+        pattern: str,
+        condition: str,
+        threshold: float = 1.0
+    ) -> ValidationResult:
+        """Check column matches pattern when condition is true.
+
+        Args:
+            pattern: Regular expression pattern to match
+            condition: SQL WHERE clause condition (without WHERE keyword)
+            threshold: Minimum required pass rate (0.0 to 1.0, default 1.0)
+
+        Returns:
+            ValidationResult with pass/fail status
+
+        Examples:
+            >>> # Email format required for email notifications
+            >>> result = data.contact.matches_when(
+            ...     pattern=r'^[\\w.-]+@[\\w.-]+\\.\\w+$',
+            ...     condition="notification_type = 'email'"
+            ... )
+            >>> assert result.passed
+
+            >>> # Phone format required for SMS
+            >>> result = data.contact.matches_when(
+            ...     pattern=r'^\\+?[0-9]{10,15}$',
+            ...     condition="notification_type = 'sms'"
+            ... )
+            >>> assert result.passed
+        """
+        from duckguard.checks.conditional import ConditionalCheckHandler
+
+        handler = ConditionalCheckHandler()
+        return handler.execute_pattern_when(
+            dataset=self._dataset,
+            column=self._name,
+            pattern=pattern,
+            condition=condition,
+            threshold=threshold
+        )
+
+    # =================================================================
+    # Distributional Checks (DuckGuard 3.0)
+    # =================================================================
+
+    def expect_distribution_normal(
+        self,
+        significance_level: float = 0.05
+    ) -> ValidationResult:
+        """Check if column data follows a normal distribution.
+
+        Uses Kolmogorov-Smirnov test comparing data to fitted normal distribution.
+
+        Args:
+            significance_level: Significance level for test (default 0.05)
+
+        Returns:
+            ValidationResult (passed if p-value > significance_level)
+
+        Examples:
+            >>> # Test if temperature measurements are normally distributed
+            >>> result = data.temperature.expect_distribution_normal()
+            >>> assert result.passed
+
+            >>> # Use stricter significance level
+            >>> result = data.measurement.expect_distribution_normal(
+            ...     significance_level=0.01
+            ... )
+
+        Note:
+            Requires scipy: pip install 'duckguard[statistics]'
+            Requires minimum 30 samples for reliable results.
+        """
+        from duckguard.checks.distributional import DistributionalCheckHandler
+
+        handler = DistributionalCheckHandler()
+        return handler.execute_distribution_normal(
+            dataset=self._dataset,
+            column=self._name,
+            significance_level=significance_level
+        )
+
+    def expect_distribution_uniform(
+        self,
+        significance_level: float = 0.05
+    ) -> ValidationResult:
+        """Check if column data follows a uniform distribution.
+
+        Uses Kolmogorov-Smirnov test comparing data to uniform distribution.
+
+        Args:
+            significance_level: Significance level for test (default 0.05)
+
+        Returns:
+            ValidationResult (passed if p-value > significance_level)
+
+        Examples:
+            >>> # Test if random numbers are uniformly distributed
+            >>> result = data.random_value.expect_distribution_uniform()
+            >>> assert result.passed
+
+            >>> # Test dice rolls for fairness
+            >>> result = data.dice_roll.expect_distribution_uniform()
+
+        Note:
+            Requires scipy: pip install 'duckguard[statistics]'
+            Requires minimum 30 samples for reliable results.
+        """
+        from duckguard.checks.distributional import DistributionalCheckHandler
+
+        handler = DistributionalCheckHandler()
+        return handler.execute_distribution_uniform(
+            dataset=self._dataset,
+            column=self._name,
+            significance_level=significance_level
+        )
+
+    def expect_ks_test(
+        self,
+        distribution: str = "norm",
+        significance_level: float = 0.05
+    ) -> ValidationResult:
+        """Perform Kolmogorov-Smirnov test for specified distribution.
+
+        Args:
+            distribution: Distribution name ('norm', 'uniform', 'expon', etc.)
+            significance_level: Significance level for test (default 0.05)
+
+        Returns:
+            ValidationResult (passed if p-value > significance_level)
+
+        Examples:
+            >>> # Test for normal distribution
+            >>> result = data.values.expect_ks_test(distribution='norm')
+            >>> assert result.passed
+
+            >>> # Test for exponential distribution
+            >>> result = data.wait_times.expect_ks_test(
+            ...     distribution='expon',
+            ...     significance_level=0.01
+            ... )
+
+        Note:
+            Requires scipy: pip install 'duckguard[statistics]'
+            Supported distributions: norm, uniform, expon, gamma, beta, etc.
+        """
+        from duckguard.checks.distributional import DistributionalCheckHandler
+
+        handler = DistributionalCheckHandler()
+        return handler.execute_ks_test(
+            dataset=self._dataset,
+            column=self._name,
+            distribution=distribution,
+            significance_level=significance_level
+        )
+
+    def expect_chi_square_test(
+        self,
+        expected_frequencies: dict | None = None,
+        significance_level: float = 0.05
+    ) -> ValidationResult:
+        """Perform chi-square goodness-of-fit test for categorical data.
+
+        Tests if observed frequencies match expected frequencies.
+
+        Args:
+            expected_frequencies: Dict mapping categories to expected proportions
+                (see examples). If None, a uniform distribution is assumed.
+            significance_level: Significance level for test (default 0.05)
+
+        Returns:
+            ValidationResult (passed if p-value > significance_level)
+
+        Examples:
+            >>> # Test if dice is fair (uniform distribution)
+            >>> result = data.dice_roll.expect_chi_square_test()
+            >>> assert result.passed
+
+            >>> # Test with specific expected frequencies
+            >>> expected = {1: 1/6, 2: 1/6, 3: 1/6, 4: 1/6, 5: 1/6, 6: 1/6}
+            >>> result = data.dice_roll.expect_chi_square_test(
+            ...     expected_frequencies=expected
+            ... )
+
+            >>> # Test categorical distribution
+            >>> expected = {'A': 0.5, 'B': 0.3, 'C': 0.2}
+            >>> result = data.category.expect_chi_square_test(
+            ...     expected_frequencies=expected
+            ... )
+
+        Note:
+            Requires scipy: pip install 'duckguard[statistics]'
+            Requires minimum 30 samples for reliable results.
+        """
+        from duckguard.checks.distributional import DistributionalCheckHandler
+
+        handler = DistributionalCheckHandler()
+        return handler.execute_chi_square_test(
+            dataset=self._dataset,
+            column=self._name,
+            expected_frequencies=expected_frequencies,
+            significance_level=significance_level
+        )
+
     def clear_cache(self) -> None:
         """Clear cached statistics."""
         self._stats_cache = None
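The column-level additions above split into two groups: conditional checks that apply a rule only to rows matching a SQL predicate, and distributional checks that run statistical tests over the whole column. A short usage sketch, assuming a local customers.csv with the columns used in the docstring examples (state, country, age, type) plus a hypothetical signup_delay column, and the optional scipy extra for the distributional part:

    from duckguard import connect  # assumed top-level export

    data = connect("customers.csv")

    # Conditional rule: state is mandatory only for US rows.
    print(data.state.not_null_when("country = 'USA'").passed)

    # Conditional range: employee ages must fall between 18 and 65.
    print(data.age.between_when(18, 65, "type = 'employee'").passed)

    # Distributional check: are signup delays roughly exponential?
    # Requires: pip install 'duckguard[statistics]'
    print(data.signup_delay.expect_ks_test(distribution="expon").passed)  # hypothetical column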