duckguard 2.2.0__py3-none-any.whl → 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +1 -1
- duckguard/anomaly/__init__.py +28 -0
- duckguard/anomaly/baselines.py +294 -0
- duckguard/anomaly/methods.py +16 -2
- duckguard/anomaly/ml_methods.py +724 -0
- duckguard/checks/__init__.py +26 -0
- duckguard/checks/conditional.py +796 -0
- duckguard/checks/distributional.py +524 -0
- duckguard/checks/multicolumn.py +726 -0
- duckguard/checks/query_based.py +643 -0
- duckguard/cli/main.py +257 -2
- duckguard/connectors/factory.py +30 -2
- duckguard/connectors/files.py +7 -3
- duckguard/core/column.py +851 -1
- duckguard/core/dataset.py +1035 -0
- duckguard/core/result.py +236 -0
- duckguard/freshness/__init__.py +33 -0
- duckguard/freshness/monitor.py +429 -0
- duckguard/history/schema.py +119 -1
- duckguard/notifications/__init__.py +20 -2
- duckguard/notifications/email.py +508 -0
- duckguard/profiler/distribution_analyzer.py +384 -0
- duckguard/profiler/outlier_detector.py +497 -0
- duckguard/profiler/pattern_matcher.py +301 -0
- duckguard/profiler/quality_scorer.py +445 -0
- duckguard/reports/html_reporter.py +1 -2
- duckguard/rules/executor.py +642 -0
- duckguard/rules/generator.py +4 -1
- duckguard/rules/schema.py +54 -0
- duckguard/schema_history/__init__.py +40 -0
- duckguard/schema_history/analyzer.py +414 -0
- duckguard/schema_history/tracker.py +288 -0
- duckguard/semantic/detector.py +17 -1
- duckguard-3.0.0.dist-info/METADATA +1072 -0
- {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/RECORD +38 -21
- duckguard-2.2.0.dist-info/METADATA +0 -351
- {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/WHEEL +0 -0
- {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/entry_points.txt +0 -0
- {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/licenses/LICENSE +0 -0
duckguard/core/result.py
CHANGED
@@ -208,3 +208,239 @@ class ScanResult:
         if self.checks_run == 0:
             return 100.0
         return (self.checks_passed / self.checks_run) * 100
+
+
+# =========================================================================
+# Distribution Drift Results
+# =========================================================================
+
+
+@dataclass
+class DriftResult:
+    """Result of distribution drift detection between two columns.
+
+    Attributes:
+        is_drifted: Whether significant drift was detected
+        p_value: Statistical p-value from the test
+        statistic: Test statistic value
+        threshold: P-value threshold used for detection
+        method: Statistical method used (e.g., "ks_test")
+        message: Human-readable summary
+        details: Additional metadata
+    """
+
+    is_drifted: bool
+    p_value: float
+    statistic: float
+    threshold: float = 0.05
+    method: str = "ks_test"
+    message: str = ""
+    details: dict[str, Any] = field(default_factory=dict)
+
+    def __bool__(self) -> bool:
+        """Returns True if NO drift detected (data is stable)."""
+        return not self.is_drifted
+
+    def __repr__(self) -> str:
+        status = "DRIFT DETECTED" if self.is_drifted else "STABLE"
+        return f"DriftResult({status}, p_value={self.p_value:.4f}, threshold={self.threshold})"
+
+    def summary(self) -> str:
+        """Get a human-readable summary."""
+        status = "DRIFT DETECTED" if self.is_drifted else "No significant drift"
+        return f"{status} (p-value: {self.p_value:.4f}, threshold: {self.threshold}, method: {self.method})"
+
+
+# =========================================================================
+# Reconciliation Results
+# =========================================================================
+
+
+@dataclass
+class ReconciliationMismatch:
+    """Represents a single row mismatch in reconciliation.
+
+    Attributes:
+        key_values: Dictionary of key column values that identify the row
+        column: Column name where mismatch occurred
+        source_value: Value in source dataset
+        target_value: Value in target dataset
+        mismatch_type: Type of mismatch ("value_diff", "missing_in_target", "extra_in_target")
+    """
+
+    key_values: dict[str, Any]
+    column: str
+    source_value: Any = None
+    target_value: Any = None
+    mismatch_type: str = "value_diff"
+
+    def __repr__(self) -> str:
+        keys = ", ".join(f"{k}={v}" for k, v in self.key_values.items())
+        return f"ReconciliationMismatch({keys}, {self.column}: {self.source_value} vs {self.target_value})"
+
+
+@dataclass
+class ReconciliationResult:
+    """Result of reconciling two datasets.
+
+    Attributes:
+        passed: Whether reconciliation passed (datasets match)
+        source_row_count: Number of rows in source dataset
+        target_row_count: Number of rows in target dataset
+        missing_in_target: Rows in source but not in target
+        extra_in_target: Rows in target but not in source
+        value_mismatches: Count of value mismatches by column
+        match_percentage: Percentage of rows that match
+        key_columns: Columns used as keys for matching
+        compared_columns: Columns compared for values
+        mismatches: Sample of actual mismatches
+        details: Additional metadata
+    """
+
+    passed: bool
+    source_row_count: int
+    target_row_count: int
+    missing_in_target: int = 0
+    extra_in_target: int = 0
+    value_mismatches: dict[str, int] = field(default_factory=dict)
+    match_percentage: float = 100.0
+    key_columns: list[str] = field(default_factory=list)
+    compared_columns: list[str] = field(default_factory=list)
+    mismatches: list[ReconciliationMismatch] = field(default_factory=list)
+    details: dict[str, Any] = field(default_factory=dict)
+
+    def __bool__(self) -> bool:
+        """Allow using ReconciliationResult in boolean context."""
+        return self.passed
+
+    def __repr__(self) -> str:
+        status = "MATCHED" if self.passed else "MISMATCHED"
+        return f"ReconciliationResult({status}, match={self.match_percentage:.1f}%, missing={self.missing_in_target}, extra={self.extra_in_target})"
+
+    @property
+    def total_mismatches(self) -> int:
+        """Total number of mismatches across all columns."""
+        return self.missing_in_target + self.extra_in_target + sum(self.value_mismatches.values())
+
+    def summary(self) -> str:
+        """Get a human-readable summary."""
+        lines = [
+            f"Reconciliation: {'PASSED' if self.passed else 'FAILED'} ({self.match_percentage:.1f}% match)",
+            f"Source rows: {self.source_row_count}, Target rows: {self.target_row_count}",
+        ]
+
+        if self.missing_in_target > 0:
+            lines.append(f"Missing in target: {self.missing_in_target} rows")
+        if self.extra_in_target > 0:
+            lines.append(f"Extra in target: {self.extra_in_target} rows")
+        if self.value_mismatches:
+            lines.append("Column mismatches:")
+            for col, count in self.value_mismatches.items():
+                lines.append(f"  {col}: {count} differences")
+
+        if self.mismatches:
+            lines.append(f"\nSample mismatches ({len(self.mismatches)} shown):")
+            for m in self.mismatches[:5]:
+                keys = ", ".join(f"{k}={v}" for k, v in m.key_values.items())
+                lines.append(f"  [{keys}] {m.column}: {m.source_value!r} vs {m.target_value!r}")
+
+        return "\n".join(lines)
+
+
+# =========================================================================
+# Group By Results
+# =========================================================================
+
+
+@dataclass
+class GroupResult:
+    """Validation result for a single group.
+
+    Attributes:
+        group_key: Dictionary of group column values
+        row_count: Number of rows in this group
+        passed: Whether all checks passed for this group
+        check_results: List of individual check results
+        stats: Group-level statistics
+    """
+
+    group_key: dict[str, Any]
+    row_count: int
+    passed: bool = True
+    check_results: list[ValidationResult] = field(default_factory=list)
+    stats: dict[str, Any] = field(default_factory=dict)
+
+    def __bool__(self) -> bool:
+        """Allow using GroupResult in boolean context."""
+        return self.passed
+
+    def __repr__(self) -> str:
+        keys = ", ".join(f"{k}={v}" for k, v in self.group_key.items())
+        status = "PASSED" if self.passed else "FAILED"
+        return f"GroupResult({keys}, rows={self.row_count}, {status})"
+
+    @property
+    def key_string(self) -> str:
+        """Get a string representation of the group key."""
+        return ", ".join(f"{k}={v}" for k, v in self.group_key.items())
+
+
+@dataclass
+class GroupByResult:
+    """Result of group-by validation across all groups.
+
+    Attributes:
+        passed: Whether all groups passed validation
+        total_groups: Total number of groups
+        passed_groups: Number of groups that passed
+        failed_groups: Number of groups that failed
+        group_results: Individual results per group
+        group_columns: Columns used for grouping
+        details: Additional metadata
+    """
+
+    passed: bool
+    total_groups: int
+    passed_groups: int = 0
+    failed_groups: int = 0
+    group_results: list[GroupResult] = field(default_factory=list)
+    group_columns: list[str] = field(default_factory=list)
+    details: dict[str, Any] = field(default_factory=dict)
+
+    def __bool__(self) -> bool:
+        """Allow using GroupByResult in boolean context."""
+        return self.passed
+
+    def __repr__(self) -> str:
+        status = "PASSED" if self.passed else "FAILED"
+        return f"GroupByResult({status}, groups={self.total_groups}, passed={self.passed_groups}, failed={self.failed_groups})"
+
+    @property
+    def pass_rate(self) -> float:
+        """Calculate the pass rate as a percentage."""
+        if self.total_groups == 0:
+            return 100.0
+        return (self.passed_groups / self.total_groups) * 100
+
+    def get_failed_groups(self) -> list[GroupResult]:
+        """Get list of groups that failed validation."""
+        return [g for g in self.group_results if not g.passed]
+
+    def summary(self) -> str:
+        """Get a human-readable summary."""
+        lines = [
+            f"Group By Validation: {'PASSED' if self.passed else 'FAILED'}",
+            f"Groups: {self.total_groups} total, {self.passed_groups} passed, {self.failed_groups} failed ({self.pass_rate:.1f}%)",
+            f"Grouped by: {', '.join(self.group_columns)}",
+        ]

+        failed = self.get_failed_groups()
+        if failed:
+            lines.append(f"\nFailed groups ({len(failed)}):")
+            for g in failed[:5]:
+                lines.append(f"  [{g.key_string}]: {g.row_count} rows")
+                for cr in g.check_results:
+                    if not cr.passed:
+                        lines.append(f"    - {cr.message}")
+
+        return "\n".join(lines)
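The new result classes are plain dataclasses, so their behavior is easy to verify in isolation. Below is a minimal sketch of how they might be consumed, assuming the import path duckguard.core.result shown in the hunk above; all sample values are invented for illustration:

    from duckguard.core.result import DriftResult, ReconciliationMismatch, ReconciliationResult

    # Illustrative values, not output from a real scan.
    drift = DriftResult(is_drifted=True, p_value=0.003, statistic=0.41)
    assert not drift          # __bool__ is True only when the data is stable
    print(drift.summary())    # DRIFT DETECTED (p-value: 0.0030, threshold: 0.05, method: ks_test)

    recon = ReconciliationResult(
        passed=False,
        source_row_count=1000,
        target_row_count=998,
        missing_in_target=2,
        value_mismatches={"amount": 3},
        match_percentage=99.5,
        key_columns=["order_id"],
        mismatches=[
            ReconciliationMismatch(
                key_values={"order_id": 17},
                column="amount",
                source_value=10.0,
                target_value=12.5,
            )
        ],
    )
    assert recon.total_mismatches == 5   # missing (2) + extra (0) + value diffs (3)
    print(recon.summary())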
duckguard/freshness/__init__.py
ADDED

@@ -0,0 +1,33 @@
+"""Freshness monitoring for DuckGuard.
+
+This module provides functionality to check data freshness by monitoring
+file modification times and timestamp columns.
+
+Usage:
+    from duckguard.freshness import FreshnessMonitor, FreshnessResult
+    from datetime import timedelta
+
+    # Check file freshness
+    monitor = FreshnessMonitor(threshold=timedelta(hours=24))
+    result = monitor.check("data.csv")
+
+    if not result.is_fresh:
+        print(f"Data is stale! Last updated: {result.age_human}")
+
+    # Check column freshness
+    from duckguard import connect
+    data = connect("data.csv")
+    result = monitor.check_column_timestamp(data, "updated_at")
+"""
+
+from duckguard.freshness.monitor import (
+    FreshnessMethod,
+    FreshnessMonitor,
+    FreshnessResult,
+)
+
+__all__ = [
+    "FreshnessMonitor",
+    "FreshnessResult",
+    "FreshnessMethod",
+]
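Combined with the re-exports above, a freshness gate at the start of a pipeline might look like the following sketch; the file name and the printed output are hypothetical:

    from datetime import timedelta
    from duckguard.freshness import FreshnessMethod, FreshnessMonitor

    monitor = FreshnessMonitor(threshold=timedelta(hours=6))
    result = monitor.check("data.csv")   # hypothetical local file

    print(result)  # e.g. [FRESH] data.csv: 2 hours ago (method: file_mtime)
    if result.method is FreshnessMethod.UNKNOWN:
        # check() defaults to fresh when it cannot determine an age
        print("freshness unknown; treated as fresh")
    if not result.is_fresh:
        raise SystemExit(f"stale input: {result.to_dict()}")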

duckguard/freshness/monitor.py
ADDED

@@ -0,0 +1,429 @@
+"""Freshness monitoring implementation.
+
+Provides functionality to check data freshness via file modification times
+and timestamp columns in the data.
+"""
+
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+from enum import Enum
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+from urllib.parse import urlparse
+
+if TYPE_CHECKING:
+    from duckguard.core.dataset import Dataset
+
+
+class FreshnessMethod(str, Enum):
+    """Methods for checking freshness."""
+
+    FILE_MTIME = "file_mtime"
+    COLUMN_MAX = "column_max"
+    COLUMN_MIN = "column_min"
+    METADATA = "metadata"
+    UNKNOWN = "unknown"
+
+
+@dataclass
+class FreshnessResult:
+    """Result of a freshness check.
+
+    Attributes:
+        source: Data source path
+        last_modified: Timestamp of last modification
+        age_seconds: Age in seconds (None if unknown)
+        age_human: Human-readable age string
+        is_fresh: Whether the data meets freshness threshold
+        threshold_seconds: Threshold used (None if no threshold)
+        method: Method used to determine freshness
+        details: Additional details about the check
+    """
+
+    source: str
+    last_modified: datetime | None
+    age_seconds: float | None
+    age_human: str
+    is_fresh: bool
+    threshold_seconds: float | None
+    method: FreshnessMethod
+    details: dict[str, Any] | None = None
+
+    def __str__(self) -> str:
+        """Human-readable string representation."""
+        status = "FRESH" if self.is_fresh else "STALE"
+        return f"[{status}] {self.source}: {self.age_human} (method: {self.method.value})"
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary."""
+        return {
+            "source": self.source,
+            "last_modified": self.last_modified.isoformat() if self.last_modified else None,
+            "age_seconds": self.age_seconds,
+            "age_human": self.age_human,
+            "is_fresh": self.is_fresh,
+            "threshold_seconds": self.threshold_seconds,
+            "method": self.method.value,
+            "details": self.details,
+        }
+
+
+class FreshnessMonitor:
+    """Monitor data freshness.
+
+    Usage:
+        from duckguard.freshness import FreshnessMonitor
+        from datetime import timedelta
+
+        # Create monitor with default 24-hour threshold
+        monitor = FreshnessMonitor()
+
+        # Check file freshness
+        result = monitor.check("data.csv")
+        print(f"Fresh: {result.is_fresh}, Age: {result.age_human}")
+
+        # Check with custom threshold
+        monitor = FreshnessMonitor(threshold=timedelta(hours=6))
+        result = monitor.check("data.csv")
+
+        # Check column timestamp
+        from duckguard import connect
+        data = connect("data.csv")
+        result = monitor.check_column_timestamp(data, "updated_at")
+    """
+
+    def __init__(self, threshold: timedelta | None = None):
+        """Initialize freshness monitor.
+
+        Args:
+            threshold: Maximum acceptable age for data to be considered fresh.
+                Defaults to 24 hours.
+        """
+        self.threshold = threshold or timedelta(hours=24)
+
+    @property
+    def threshold_seconds(self) -> float:
+        """Get threshold in seconds."""
+        return self.threshold.total_seconds()
+
+    def check(
+        self,
+        source: str | Dataset,
+        column: str | None = None,
+    ) -> FreshnessResult:
+        """Check freshness using the most appropriate method.
+
+        Args:
+            source: Data source path or Dataset object
+            column: Optional timestamp column to check
+
+        Returns:
+            FreshnessResult with freshness information
+        """
+        # Import here to avoid circular imports
+        from duckguard.core.dataset import Dataset
+
+        if isinstance(source, Dataset):
+            dataset = source
+            source_path = dataset.source
+        else:
+            source_path = source
+            dataset = None
+
+        # If column specified, use column method
+        if column and dataset:
+            return self.check_column_timestamp(dataset, column)
+
+        # Try to determine best method
+        if self._is_local_file(source_path):
+            return self.check_file_mtime(source_path)
+        elif dataset:
+            # Try to auto-detect timestamp column
+            timestamp_col = self._detect_timestamp_column(dataset)
+            if timestamp_col:
+                return self.check_column_timestamp(dataset, timestamp_col)
+
+        # Return unknown result
+        return FreshnessResult(
+            source=source_path,
+            last_modified=None,
+            age_seconds=None,
+            age_human="unknown",
+            is_fresh=True,  # Default to fresh if can't determine
+            threshold_seconds=self.threshold_seconds,
+            method=FreshnessMethod.UNKNOWN,
+            details={"reason": "Cannot determine freshness for this source type"},
+        )
+
+    def check_file_mtime(self, path: str | Path) -> FreshnessResult:
+        """Check freshness via file modification time.
+
+        Args:
+            path: Path to the file
+
+        Returns:
+            FreshnessResult with file modification information
+        """
+        path = Path(path)
+        source_str = str(path)
+
+        if not path.exists():
+            return FreshnessResult(
+                source=source_str,
+                last_modified=None,
+                age_seconds=None,
+                age_human="file not found",
+                is_fresh=False,
+                threshold_seconds=self.threshold_seconds,
+                method=FreshnessMethod.FILE_MTIME,
+                details={"error": "File does not exist"},
+            )
+
+        try:
+            mtime = os.path.getmtime(path)
+            last_modified = datetime.fromtimestamp(mtime)
+            now = datetime.now()
+            age = now - last_modified
+            age_seconds = age.total_seconds()
+
+            is_fresh = age_seconds <= self.threshold_seconds
+
+            return FreshnessResult(
+                source=source_str,
+                last_modified=last_modified,
+                age_seconds=age_seconds,
+                age_human=self._format_age(age),
+                is_fresh=is_fresh,
+                threshold_seconds=self.threshold_seconds,
+                method=FreshnessMethod.FILE_MTIME,
+                details={
+                    "file_size": path.stat().st_size,
+                    "threshold_human": self._format_age(self.threshold),
+                },
+            )
+        except OSError as e:
+            return FreshnessResult(
+                source=source_str,
+                last_modified=None,
+                age_seconds=None,
+                age_human="error reading file",
+                is_fresh=False,
+                threshold_seconds=self.threshold_seconds,
+                method=FreshnessMethod.FILE_MTIME,
+                details={"error": str(e)},
+            )
+
+    def check_column_timestamp(
+        self,
+        dataset: Dataset,
+        column: str,
+        use_max: bool = True,
+    ) -> FreshnessResult:
+        """Check freshness via timestamp column.
+
+        Args:
+            dataset: Dataset to check
+            column: Timestamp column name
+            use_max: Use MAX (most recent) if True, MIN (oldest) if False
+
+        Returns:
+            FreshnessResult with column timestamp information
+        """
+        source_str = dataset.source
+        method = FreshnessMethod.COLUMN_MAX if use_max else FreshnessMethod.COLUMN_MIN
+
+        # Verify column exists
+        if column not in dataset.columns:
+            return FreshnessResult(
+                source=source_str,
+                last_modified=None,
+                age_seconds=None,
+                age_human="column not found",
+                is_fresh=False,
+                threshold_seconds=self.threshold_seconds,
+                method=method,
+                details={"error": f"Column '{column}' not found in dataset"},
+            )
+
+        try:
+            # Get max/min timestamp from column
+            ref = dataset.engine.get_source_reference(dataset.source)
+            agg_func = "MAX" if use_max else "MIN"
+            sql = f"SELECT {agg_func}({column}) as ts FROM {ref}"
+            result = dataset.engine.fetch_all(sql)
+
+            if not result or result[0][0] is None:
+                return FreshnessResult(
+                    source=source_str,
+                    last_modified=None,
+                    age_seconds=None,
+                    age_human="no data",
+                    is_fresh=False,
+                    threshold_seconds=self.threshold_seconds,
+                    method=method,
+                    details={"error": "Column contains no timestamp values"},
+                )
+
+            timestamp_value = result[0][0]
+
+            # Parse timestamp
+            if isinstance(timestamp_value, datetime):
+                last_modified = timestamp_value
+            elif isinstance(timestamp_value, str):
+                # Try common formats
+                for fmt in [
+                    "%Y-%m-%d %H:%M:%S",
+                    "%Y-%m-%d %H:%M:%S.%f",
+                    "%Y-%m-%dT%H:%M:%S",
+                    "%Y-%m-%dT%H:%M:%S.%f",
+                    "%Y-%m-%d",
+                ]:
+                    try:
+                        last_modified = datetime.strptime(timestamp_value, fmt)
+                        break
+                    except ValueError:
+                        continue
+                else:
+                    return FreshnessResult(
+                        source=source_str,
+                        last_modified=None,
+                        age_seconds=None,
+                        age_human="invalid timestamp format",
+                        is_fresh=False,
+                        threshold_seconds=self.threshold_seconds,
+                        method=method,
+                        details={"error": f"Cannot parse timestamp: {timestamp_value}"},
+                    )
+            else:
+                return FreshnessResult(
+                    source=source_str,
+                    last_modified=None,
+                    age_seconds=None,
+                    age_human="unsupported type",
+                    is_fresh=False,
+                    threshold_seconds=self.threshold_seconds,
+                    method=method,
+                    details={"error": f"Unsupported timestamp type: {type(timestamp_value)}"},
+                )
+
+            now = datetime.now()
+            age = now - last_modified
+            age_seconds = age.total_seconds()
+
+            is_fresh = age_seconds <= self.threshold_seconds
+
+            return FreshnessResult(
+                source=source_str,
+                last_modified=last_modified,
+                age_seconds=age_seconds,
+                age_human=self._format_age(age),
+                is_fresh=is_fresh,
+                threshold_seconds=self.threshold_seconds,
+                method=method,
+                details={
+                    "column": column,
+                    "aggregation": agg_func,
+                    "threshold_human": self._format_age(self.threshold),
+                },
+            )
+
+        except Exception as e:
+            return FreshnessResult(
+                source=source_str,
+                last_modified=None,
+                age_seconds=None,
+                age_human="query error",
+                is_fresh=False,
+                threshold_seconds=self.threshold_seconds,
+                method=method,
+                details={"error": str(e)},
+            )
+
+    def _is_local_file(self, source: str) -> bool:
+        """Check if source is a local file path."""
+        # Check for URL schemes
+        parsed = urlparse(source)
+        if parsed.scheme and parsed.scheme not in ("", "file"):
+            return False
+
+        # Check for connection strings
+        if "://" in source and not source.startswith("file://"):
+            return False
+
+        # Check if path exists
+        path = Path(source)
+        return path.exists() and path.is_file()
+
+    def _detect_timestamp_column(self, dataset: Dataset) -> str | None:
+        """Try to auto-detect a timestamp column."""
+        timestamp_patterns = [
+            "updated_at", "modified_at", "last_modified", "modified",
+            "created_at", "timestamp", "date", "datetime", "time",
+            "update_time", "modify_time", "last_update",
+        ]
+
+        columns_lower = {c.lower(): c for c in dataset.columns}
+
+        for pattern in timestamp_patterns:
+            if pattern in columns_lower:
+                return columns_lower[pattern]
+
+        return None
+
+    def _format_age(self, age: timedelta) -> str:
+        """Format a timedelta as human-readable string."""
+        total_seconds = int(age.total_seconds())
+
+        if total_seconds < 0:
+            return "in the future"
+        elif total_seconds < 60:
+            return f"{total_seconds} seconds ago"
+        elif total_seconds < 3600:
+            minutes = total_seconds // 60
+            return f"{minutes} minute{'s' if minutes != 1 else ''} ago"
+        elif total_seconds < 86400:
+            hours = total_seconds // 3600
+            return f"{hours} hour{'s' if hours != 1 else ''} ago"
+        elif total_seconds < 604800:
+            days = total_seconds // 86400
+            return f"{days} day{'s' if days != 1 else ''} ago"
+        elif total_seconds < 2592000:
+            weeks = total_seconds // 604800
+            return f"{weeks} week{'s' if weeks != 1 else ''} ago"
+        else:
+            months = total_seconds // 2592000
+            return f"{months} month{'s' if months != 1 else ''} ago"
+
+
+def parse_age_string(age_str: str) -> timedelta:
+    """Parse an age string like '24h', '7d', '1w' into timedelta.
+
+    Args:
+        age_str: Age string with unit (s, m, h, d, w)
+
+    Returns:
+        timedelta representing the age
+
+    Examples:
+        parse_age_string("24h") -> timedelta(hours=24)
+        parse_age_string("7d") -> timedelta(days=7)
+        parse_age_string("1w") -> timedelta(weeks=1)
+    """
+    age_str = age_str.strip().lower()
+
+    if age_str.endswith("s"):
+        return timedelta(seconds=int(age_str[:-1]))
+    elif age_str.endswith("m"):
+        return timedelta(minutes=int(age_str[:-1]))
+    elif age_str.endswith("h"):
+        return timedelta(hours=int(age_str[:-1]))
+    elif age_str.endswith("d"):
+        return timedelta(days=int(age_str[:-1]))
+    elif age_str.endswith("w"):
+        return timedelta(weeks=int(age_str[:-1]))
+    else:
+        # Assume hours if no unit
+        return timedelta(hours=int(age_str))
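parse_age_string is defined in monitor.py but not re-exported from duckguard.freshness, so a sketch exercising it imports from duckguard.freshness.monitor directly. The assertions simply restate the parsing rules implemented above; the config-string scenario in the last lines is hypothetical:

    from datetime import timedelta
    from duckguard.freshness.monitor import FreshnessMonitor, parse_age_string

    assert parse_age_string("90s") == timedelta(seconds=90)
    assert parse_age_string("15m") == timedelta(minutes=15)
    assert parse_age_string("24h") == timedelta(hours=24)
    assert parse_age_string("7d") == timedelta(days=7)
    assert parse_age_string("1w") == timedelta(weeks=1)
    assert parse_age_string("36") == timedelta(hours=36)  # bare numbers fall back to hours

    # A threshold parsed from a config value, e.g. freshness: "2d"
    monitor = FreshnessMonitor(threshold=parse_age_string("2d"))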