duckguard-2.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +110 -0
- duckguard/anomaly/__init__.py +34 -0
- duckguard/anomaly/detector.py +394 -0
- duckguard/anomaly/methods.py +432 -0
- duckguard/cli/__init__.py +5 -0
- duckguard/cli/main.py +706 -0
- duckguard/connectors/__init__.py +58 -0
- duckguard/connectors/base.py +80 -0
- duckguard/connectors/bigquery.py +171 -0
- duckguard/connectors/databricks.py +201 -0
- duckguard/connectors/factory.py +292 -0
- duckguard/connectors/files.py +135 -0
- duckguard/connectors/kafka.py +343 -0
- duckguard/connectors/mongodb.py +236 -0
- duckguard/connectors/mysql.py +121 -0
- duckguard/connectors/oracle.py +196 -0
- duckguard/connectors/postgres.py +99 -0
- duckguard/connectors/redshift.py +154 -0
- duckguard/connectors/snowflake.py +226 -0
- duckguard/connectors/sqlite.py +112 -0
- duckguard/connectors/sqlserver.py +242 -0
- duckguard/contracts/__init__.py +48 -0
- duckguard/contracts/diff.py +432 -0
- duckguard/contracts/generator.py +334 -0
- duckguard/contracts/loader.py +367 -0
- duckguard/contracts/schema.py +242 -0
- duckguard/contracts/validator.py +453 -0
- duckguard/core/__init__.py +8 -0
- duckguard/core/column.py +437 -0
- duckguard/core/dataset.py +284 -0
- duckguard/core/engine.py +261 -0
- duckguard/core/result.py +119 -0
- duckguard/core/scoring.py +508 -0
- duckguard/profiler/__init__.py +5 -0
- duckguard/profiler/auto_profile.py +350 -0
- duckguard/pytest_plugin/__init__.py +5 -0
- duckguard/pytest_plugin/plugin.py +161 -0
- duckguard/reporting/__init__.py +6 -0
- duckguard/reporting/console.py +88 -0
- duckguard/reporting/json_report.py +96 -0
- duckguard/rules/__init__.py +28 -0
- duckguard/rules/executor.py +616 -0
- duckguard/rules/generator.py +341 -0
- duckguard/rules/loader.py +483 -0
- duckguard/rules/schema.py +289 -0
- duckguard/semantic/__init__.py +31 -0
- duckguard/semantic/analyzer.py +270 -0
- duckguard/semantic/detector.py +459 -0
- duckguard/semantic/validators.py +354 -0
- duckguard/validators/__init__.py +7 -0
- duckguard-2.0.0.dist-info/METADATA +221 -0
- duckguard-2.0.0.dist-info/RECORD +55 -0
- duckguard-2.0.0.dist-info/WHEEL +4 -0
- duckguard-2.0.0.dist-info/entry_points.txt +5 -0
- duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
duckguard/profiler/auto_profile.py
@@ -0,0 +1,350 @@
"""Auto-profiling and rule suggestion engine."""

from __future__ import annotations

import re
from dataclasses import dataclass, field
from typing import Any

from duckguard.core.dataset import Dataset
from duckguard.core.result import ProfileResult, ColumnProfile


@dataclass
class RuleSuggestion:
    """A suggested validation rule."""

    rule: str
    confidence: float  # 0-1
    reason: str
    category: str  # null, unique, range, pattern, enum


class AutoProfiler:
    """
    Automatically profiles datasets and suggests validation rules.

    The profiler analyzes data patterns and generates Python assertions
    that can be used directly in test files.
    """

    # Thresholds for rule generation
    NULL_THRESHOLD_SUGGEST = 1.0  # Suggest not_null if nulls < 1%
    UNIQUE_THRESHOLD_SUGGEST = 99.0  # Suggest unique if > 99% unique
    ENUM_MAX_VALUES = 20  # Max distinct values to suggest enum check
    PATTERN_SAMPLE_SIZE = 1000  # Sample size for pattern detection

    # Common patterns to detect
    PATTERNS = {
        "email": r"^[\w\.-]+@[\w\.-]+\.\w+$",
        "uuid": r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$",
        "phone": r"^\+?[\d\s\-\(\)]{10,}$",
        "url": r"^https?://[\w\.-]+",
        "ip_address": r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$",
        "date_iso": r"^\d{4}-\d{2}-\d{2}$",
        "datetime_iso": r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}",
    }
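    # Illustrative matches (not exhaustive): "user@example.com" -> "email",
    # "2024-01-31" -> "date_iso", "192.168.0.1" -> "ip_address"; matching is
    # case-insensitive (see _detect_pattern below).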

    def __init__(self, dataset_var_name: str = "data"):
        """
        Initialize the profiler.

        Args:
            dataset_var_name: Variable name to use in generated rules
        """
        self.dataset_var_name = dataset_var_name

    def profile(self, dataset: Dataset) -> ProfileResult:
        """
        Generate a comprehensive profile of the dataset.

        Args:
            dataset: Dataset to profile

        Returns:
            ProfileResult with statistics and suggested rules
        """
        column_profiles = []
        all_suggestions: list[str] = []

        for col_name in dataset.columns:
            col = dataset[col_name]
            col_profile = self._profile_column(col)
            column_profiles.append(col_profile)
            all_suggestions.extend(col_profile.suggested_rules)

        return ProfileResult(
            source=dataset.source,
            row_count=dataset.row_count,
            column_count=dataset.column_count,
            columns=column_profiles,
            suggested_rules=all_suggestions,
        )

    def _profile_column(self, col) -> ColumnProfile:
        """Profile a single column."""
        # Get basic stats
        stats = col._get_stats()
        numeric_stats = col._get_numeric_stats()

        # Get sample values for pattern detection
        sample_values = col.get_distinct_values(limit=self.PATTERN_SAMPLE_SIZE)

        # Generate suggestions
        suggestions = self._generate_suggestions(col, stats, numeric_stats, sample_values)

        return ColumnProfile(
            name=col.name,
            dtype=self._infer_dtype(stats, sample_values),
            null_count=stats.get("null_count", 0),
            null_percent=stats.get("null_percent", 0.0),
            unique_count=stats.get("unique_count", 0),
            unique_percent=stats.get("unique_percent", 0.0),
            min_value=stats.get("min_value"),
            max_value=stats.get("max_value"),
            mean_value=numeric_stats.get("mean"),
            stddev_value=numeric_stats.get("stddev"),
            sample_values=sample_values[:10],
            suggested_rules=[s.rule for s in suggestions],
        )

    def _generate_suggestions(
        self,
        col,
        stats: dict[str, Any],
        numeric_stats: dict[str, Any],
        sample_values: list[Any],
    ) -> list[RuleSuggestion]:
        """Generate rule suggestions for a column."""
        suggestions = []
        col_name = col.name
        var = self.dataset_var_name

        # 1. Null check suggestions
        null_pct = stats.get("null_percent", 0.0)
        if null_pct == 0:
            suggestions.append(
                RuleSuggestion(
                    rule=f"assert {var}.{col_name}.null_percent == 0",
                    confidence=1.0,
                    reason="Column has no null values",
                    category="null",
                )
            )
        elif null_pct < self.NULL_THRESHOLD_SUGGEST:
            threshold = max(1, round(null_pct * 2))  # 2x buffer
            suggestions.append(
                RuleSuggestion(
                    rule=f"assert {var}.{col_name}.null_percent < {threshold}",
                    confidence=0.9,
                    reason=f"Column has only {null_pct:.2f}% nulls",
                    category="null",
                )
            )

        # 2. Uniqueness suggestions
        unique_pct = stats.get("unique_percent", 0.0)
        if unique_pct == 100:
            suggestions.append(
                RuleSuggestion(
                    rule=f"assert {var}.{col_name}.has_no_duplicates()",
                    confidence=1.0,
                    reason="All values are unique",
                    category="unique",
                )
            )
        elif unique_pct > self.UNIQUE_THRESHOLD_SUGGEST:
            suggestions.append(
                RuleSuggestion(
                    rule=f"assert {var}.{col_name}.unique_percent > 99",
                    confidence=0.8,
                    reason=f"Column has {unique_pct:.2f}% unique values",
                    category="unique",
                )
            )

        # 3. Range suggestions for numeric columns
        if numeric_stats.get("mean") is not None:
            min_val = stats.get("min_value")
            max_val = stats.get("max_value")

            if min_val is not None and max_val is not None:
                # Add buffer for range
                range_size = max_val - min_val
                buffer = range_size * 0.1 if range_size > 0 else 1

                suggested_min = self._round_nice(min_val - buffer)
                suggested_max = self._round_nice(max_val + buffer)

                suggestions.append(
                    RuleSuggestion(
                        rule=f"assert {var}.{col_name}.between({suggested_min}, {suggested_max})",
                        confidence=0.7,
                        reason=f"Values range from {min_val} to {max_val}",
                        category="range",
                    )
                )

            # Non-negative check
            if min_val is not None and min_val >= 0:
                suggestions.append(
                    RuleSuggestion(
                        rule=f"assert {var}.{col_name}.min >= 0",
                        confidence=0.9,
                        reason="All values are non-negative",
                        category="range",
                    )
                )

        # 4. Enum suggestions for low-cardinality columns
        unique_count = stats.get("unique_count", 0)
        total_count = stats.get("total_count", 0)

        if 0 < unique_count <= self.ENUM_MAX_VALUES and total_count > unique_count * 2:
            # Get all distinct values
            distinct_values = col.get_distinct_values(limit=self.ENUM_MAX_VALUES + 1)
            if len(distinct_values) <= self.ENUM_MAX_VALUES:
                # Format values for Python code
                formatted_values = self._format_values(distinct_values)
                suggestions.append(
                    RuleSuggestion(
                        rule=f"assert {var}.{col_name}.isin({formatted_values})",
                        confidence=0.85,
                        reason=f"Column has only {unique_count} distinct values",
                        category="enum",
                    )
                )

        # 5. Pattern suggestions for string columns
        string_values = [v for v in sample_values if isinstance(v, str)]
        if string_values:
            detected_pattern = self._detect_pattern(string_values)
            if detected_pattern:
                pattern_name, pattern = detected_pattern
                suggestions.append(
                    RuleSuggestion(
                        rule=f'assert {var}.{col_name}.matches(r"{pattern}")',
                        confidence=0.75,
                        reason=f"Values appear to be {pattern_name}",
                        category="pattern",
                    )
                )

        return suggestions
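    # For example, a "status" column with no nulls and the two values "active" and
    # "inactive" would yield (illustrative): "assert data.status.null_percent == 0"
    # and "assert data.status.isin(['active', 'inactive'])".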

    def _detect_pattern(self, values: list[str]) -> tuple[str, str] | None:
        """Detect common patterns in string values."""
        if not values:
            return None

        # Sample for pattern detection
        sample = values[: min(100, len(values))]

        for pattern_name, pattern in self.PATTERNS.items():
            matches = sum(1 for v in sample if re.match(pattern, str(v), re.IGNORECASE))
            match_rate = matches / len(sample)

            if match_rate > 0.9:  # 90% match threshold
                return pattern_name, pattern

        return None

    def _infer_dtype(self, stats: dict[str, Any], sample_values: list[Any]) -> str:
        """Infer the data type from statistics and samples."""
        if not sample_values:
            return "unknown"

        # Get first non-null value
        first_val = next((v for v in sample_values if v is not None), None)

        if first_val is None:
            return "unknown"

        if isinstance(first_val, bool):
            return "boolean"
        if isinstance(first_val, int):
            return "integer"
        if isinstance(first_val, float):
            return "float"
        if isinstance(first_val, str):
            return "string"

        return type(first_val).__name__

    def _round_nice(self, value: float) -> int | float:
        """Round to a nice human-readable number."""
        if abs(value) < 1:
            return round(value, 2)
        if abs(value) < 100:
            return round(value)
        if abs(value) < 1000:
            return round(value / 10) * 10
        return round(value / 100) * 100
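    # e.g. (illustrative): _round_nice(0.237) -> 0.24, _round_nice(42.6) -> 43,
    # _round_nice(412.0) -> 410, _round_nice(8675.0) -> 8700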

    def _format_values(self, values: list[Any]) -> str:
        """Format a list of values for Python code."""
        formatted = []
        for v in values:
            if v is None:
                continue
            if isinstance(v, str):
                # Escape quotes
                escaped = v.replace("'", "\\'")
                formatted.append(f"'{escaped}'")
            else:
                formatted.append(str(v))

        return "[" + ", ".join(formatted) + "]"
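    # e.g. (illustrative): _format_values(["US", "EU", None]) -> "['US', 'EU']"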

    def generate_test_file(self, dataset: Dataset, output_var: str = "data") -> str:
        """
        Generate a complete test file from profiling results.

        Args:
            dataset: Dataset to profile
            output_var: Variable name to use for the dataset

        Returns:
            Python code string for a test file
        """
        self.dataset_var_name = output_var
        profile = self.profile(dataset)

        lines = [
            '"""Auto-generated data quality tests by DuckGuard."""',
            "",
            "from duckguard import connect",
            "",
            "",
            f'def test_{dataset.name.replace("-", "_").replace(".", "_")}():',
            f'    {output_var} = connect("{dataset.source}")',
            "",
            "    # Basic dataset checks",
            f"    assert {output_var}.row_count > 0",
            "",
        ]

        # Group suggestions by column
        for col_profile in profile.columns:
            if col_profile.suggested_rules:
                lines.append(f"    # {col_profile.name} validations")
                for rule in col_profile.suggested_rules:
                    lines.append(f"    {rule}")
                lines.append("")

        return "\n".join(lines)


def profile(dataset: Dataset, dataset_var_name: str = "data") -> ProfileResult:
    """
    Convenience function to profile a dataset.

    Args:
        dataset: Dataset to profile
        dataset_var_name: Variable name for generated rules

    Returns:
        ProfileResult
    """
    profiler = AutoProfiler(dataset_var_name=dataset_var_name)
    return profiler.profile(dataset)
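
A minimal usage sketch for the module above (the file path and column name are illustrative; it assumes connect() returns a Dataset, as the plugin docstring below also suggests):

    from duckguard import connect
    from duckguard.profiler.auto_profile import AutoProfiler, profile

    data = connect("data/orders.csv")        # illustrative source
    result = profile(data)                   # convenience wrapper around AutoProfiler
    for rule in result.suggested_rules:
        print(rule)                          # e.g. assert data.order_id.has_no_duplicates()
    print(AutoProfiler().generate_test_file(data, output_var="orders"))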
duckguard/pytest_plugin/plugin.py
@@ -0,0 +1,161 @@
"""pytest plugin for DuckGuard data quality testing.

This plugin provides fixtures and hooks for seamless pytest integration.

Usage in conftest.py:
    import pytest
    from duckguard import connect

    @pytest.fixture
    def orders():
        return connect("data/orders.csv")

Usage in tests:
    def test_orders_not_empty(orders):
        assert orders.row_count > 0

    def test_customer_id_valid(orders):
        assert orders.customer_id.null_percent < 5
"""

from __future__ import annotations

import pytest

from duckguard.core.engine import DuckGuardEngine
from duckguard.connectors import connect as duckguard_connect


@pytest.fixture(scope="session")
def duckguard_engine():
    """
    Provide a DuckGuard engine instance for the test session.

    This fixture provides a shared DuckDB engine that persists
    across all tests in the session.

    Usage:
        def test_something(duckguard_engine):
            result = duckguard_engine.execute("SELECT 1")
    """
    engine = DuckGuardEngine()
    yield engine
    engine.close()


@pytest.fixture
def duckguard_dataset(request):
    """
    Factory fixture for creating datasets from markers.

    Usage with marker:
        @pytest.mark.duckguard_source("data/orders.csv")
        def test_orders(duckguard_dataset):
            assert duckguard_dataset.row_count > 0

    Usage with parametrize:
        @pytest.mark.parametrize("source", ["data/orders.csv", "data/customers.csv"])
        def test_multiple_sources(duckguard_dataset, source):
            dataset = duckguard_dataset(source)
            assert dataset.row_count > 0
    """
    # Check for marker
    marker = request.node.get_closest_marker("duckguard_source")

    if marker:
        source = marker.args[0] if marker.args else None
        table = marker.kwargs.get("table")
        if source:
            return duckguard_connect(source, table=table)

    # Return factory function
    def _create_dataset(source: str, **kwargs):
        return duckguard_connect(source, **kwargs)

    return _create_dataset


def pytest_configure(config):
    """Register DuckGuard markers."""
    config.addinivalue_line(
        "markers",
        "duckguard_source(source, table=None): Mark test with a DuckGuard data source",
    )
    config.addinivalue_line(
        "markers",
        "duckguard_skip_slow: Skip slow DuckGuard tests",
    )


def pytest_collection_modifyitems(config, items):
    """Modify test collection based on DuckGuard options."""
    # Check if slow tests should be skipped
    skip_slow = config.getoption("--duckguard-skip-slow", default=False)

    if skip_slow:
        skip_marker = pytest.mark.skip(reason="Skipping slow DuckGuard tests")
        for item in items:
            if "duckguard_skip_slow" in item.keywords:
                item.add_marker(skip_marker)


def pytest_addoption(parser):
    """Add DuckGuard-specific command line options."""
    group = parser.getgroup("duckguard", "DuckGuard data quality testing")
    group.addoption(
        "--duckguard-skip-slow",
        action="store_true",
        default=False,
        help="Skip slow DuckGuard tests",
    )


# Custom assertion helpers for better error messages
class DuckGuardAssertionHelper:
    """Helper class for custom DuckGuard assertions with better error messages."""

    @staticmethod
    def assert_not_null(column, threshold: float = 0.0):
        """Assert column null percentage is below threshold."""
        actual = column.null_percent
        if actual > threshold:
            pytest.fail(
                f"Column '{column.name}' has {actual:.2f}% null values, "
                f"expected <= {threshold}%"
            )

    @staticmethod
    def assert_unique(column, threshold: float = 100.0):
        """Assert column unique percentage is at or above threshold."""
        actual = column.unique_percent
        if actual < threshold:
            pytest.fail(
                f"Column '{column.name}' has {actual:.2f}% unique values, "
                f"expected >= {threshold}%"
            )

    @staticmethod
    def assert_in_range(column, min_val, max_val):
        """Assert all column values are within range."""
        result = column.between(min_val, max_val)
        if not result:
            pytest.fail(
                f"Column '{column.name}' has {result.actual_value} values "
                f"outside range [{min_val}, {max_val}]"
            )

    @staticmethod
    def assert_matches_pattern(column, pattern: str):
        """Assert all column values match pattern."""
        result = column.matches(pattern)
        if not result:
            pytest.fail(
                f"Column '{column.name}' has {result.actual_value} values "
                f"not matching pattern '{pattern}'"
            )


@pytest.fixture
def duckguard_assert():
    """Provide DuckGuard assertion helpers with better error messages."""
    return DuckGuardAssertionHelper()
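
A short sketch of how the plugin's fixtures, markers, and assertion helpers compose in a test suite (file and column names are illustrative):

    # conftest.py
    import pytest
    from duckguard import connect

    @pytest.fixture
    def orders():
        return connect("data/orders.csv")

    # test_orders.py
    def test_order_totals(orders, duckguard_assert):
        duckguard_assert.assert_not_null(orders.order_total)            # rich failure message
        duckguard_assert.assert_in_range(orders.order_total, 0, 100000)

    @pytest.mark.duckguard_source("data/orders.csv")
    def test_orders_via_marker(duckguard_dataset):
        assert duckguard_dataset.row_count > 0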
duckguard/reporting/console.py
@@ -0,0 +1,88 @@
"""Console reporter using Rich."""

from __future__ import annotations

from rich.console import Console
from rich.table import Table
from rich.panel import Panel

from duckguard.core.result import ProfileResult, ScanResult, CheckResult, CheckStatus


class ConsoleReporter:
    """Reporter that outputs to the console using Rich."""

    def __init__(self):
        self.console = Console()

    def report_profile(self, profile: ProfileResult) -> None:
        """Display a profile result."""
        self.console.print(
            Panel(
                f"[bold]Source:[/bold] {profile.source}\n"
                f"[bold]Rows:[/bold] {profile.row_count:,}\n"
                f"[bold]Columns:[/bold] {profile.column_count}",
                title="Profile Summary",
            )
        )

        # Column table
        table = Table(title="Columns")
        table.add_column("Name", style="cyan")
        table.add_column("Type")
        table.add_column("Nulls %", justify="right")
        table.add_column("Unique %", justify="right")

        for col in profile.columns:
            table.add_row(
                col.name,
                col.dtype,
                f"{col.null_percent:.1f}%",
                f"{col.unique_percent:.1f}%",
            )

        self.console.print(table)

    def report_scan(self, scan: ScanResult) -> None:
        """Display a scan result."""
        status_color = "green" if scan.passed else "red"

        self.console.print(
            Panel(
                f"[bold]Source:[/bold] {scan.source}\n"
                f"[bold]Rows:[/bold] {scan.row_count:,}\n"
                f"[bold]Checks:[/bold] {scan.checks_passed}/{scan.checks_run} passed "
                f"([{status_color}]{scan.pass_rate:.1f}%[/{status_color}])",
                title="Scan Summary",
            )
        )

        if scan.results:
            table = Table(title="Check Results")
            table.add_column("Check", style="cyan")
            table.add_column("Status", justify="center")
            table.add_column("Value")
            table.add_column("Message")

            for result in scan.results:
                status_style = {
                    CheckStatus.PASSED: "[green]PASS[/green]",
                    CheckStatus.FAILED: "[red]FAIL[/red]",
                    CheckStatus.WARNING: "[yellow]WARN[/yellow]",
                    CheckStatus.ERROR: "[red]ERROR[/red]",
                }
                table.add_row(
                    result.name,
                    status_style.get(result.status, str(result.status)),
                    str(result.actual_value),
                    result.message,
                )

            self.console.print(table)

    def report_check(self, result: CheckResult) -> None:
        """Display a single check result."""
        if result.passed:
            self.console.print(f"[green]PASS[/green] {result.name}: {result.message}")
        else:
            self.console.print(f"[red]FAIL[/red] {result.name}: {result.message}")