duckguard 2.0.0__py3-none-any.whl → 2.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +55 -28
- duckguard/anomaly/__init__.py +29 -1
- duckguard/anomaly/baselines.py +294 -0
- duckguard/anomaly/detector.py +1 -5
- duckguard/anomaly/methods.py +17 -5
- duckguard/anomaly/ml_methods.py +724 -0
- duckguard/cli/main.py +561 -56
- duckguard/connectors/__init__.py +2 -2
- duckguard/connectors/bigquery.py +1 -1
- duckguard/connectors/databricks.py +1 -1
- duckguard/connectors/factory.py +2 -3
- duckguard/connectors/files.py +1 -1
- duckguard/connectors/kafka.py +2 -2
- duckguard/connectors/mongodb.py +1 -1
- duckguard/connectors/mysql.py +1 -1
- duckguard/connectors/oracle.py +1 -1
- duckguard/connectors/postgres.py +1 -2
- duckguard/connectors/redshift.py +1 -1
- duckguard/connectors/snowflake.py +1 -2
- duckguard/connectors/sqlite.py +1 -1
- duckguard/connectors/sqlserver.py +10 -13
- duckguard/contracts/__init__.py +6 -6
- duckguard/contracts/diff.py +1 -1
- duckguard/contracts/generator.py +5 -6
- duckguard/contracts/loader.py +4 -4
- duckguard/contracts/validator.py +3 -4
- duckguard/core/__init__.py +3 -3
- duckguard/core/column.py +588 -5
- duckguard/core/dataset.py +708 -3
- duckguard/core/result.py +328 -1
- duckguard/core/scoring.py +1 -2
- duckguard/errors.py +362 -0
- duckguard/freshness/__init__.py +33 -0
- duckguard/freshness/monitor.py +429 -0
- duckguard/history/__init__.py +44 -0
- duckguard/history/schema.py +301 -0
- duckguard/history/storage.py +479 -0
- duckguard/history/trends.py +348 -0
- duckguard/integrations/__init__.py +31 -0
- duckguard/integrations/airflow.py +387 -0
- duckguard/integrations/dbt.py +458 -0
- duckguard/notifications/__init__.py +61 -0
- duckguard/notifications/email.py +508 -0
- duckguard/notifications/formatter.py +118 -0
- duckguard/notifications/notifiers.py +357 -0
- duckguard/profiler/auto_profile.py +3 -3
- duckguard/pytest_plugin/__init__.py +1 -1
- duckguard/pytest_plugin/plugin.py +1 -1
- duckguard/reporting/console.py +2 -2
- duckguard/reports/__init__.py +42 -0
- duckguard/reports/html_reporter.py +514 -0
- duckguard/reports/pdf_reporter.py +114 -0
- duckguard/rules/__init__.py +3 -3
- duckguard/rules/executor.py +3 -4
- duckguard/rules/generator.py +8 -5
- duckguard/rules/loader.py +5 -5
- duckguard/rules/schema.py +23 -0
- duckguard/schema_history/__init__.py +40 -0
- duckguard/schema_history/analyzer.py +414 -0
- duckguard/schema_history/tracker.py +288 -0
- duckguard/semantic/__init__.py +1 -1
- duckguard/semantic/analyzer.py +0 -2
- duckguard/semantic/detector.py +17 -1
- duckguard/semantic/validators.py +2 -1
- duckguard-2.3.0.dist-info/METADATA +953 -0
- duckguard-2.3.0.dist-info/RECORD +77 -0
- duckguard-2.0.0.dist-info/METADATA +0 -221
- duckguard-2.0.0.dist-info/RECORD +0 -55
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/WHEEL +0 -0
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/entry_points.txt +0 -0
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
"""Schema tracking implementation.
|
|
2
|
+
|
|
3
|
+
Provides functionality to capture and store schema snapshots over time.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
import uuid
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from datetime import datetime
|
|
12
|
+
from typing import TYPE_CHECKING, Any
|
|
13
|
+
|
|
14
|
+
from duckguard.history.schema import QUERIES
|
|
15
|
+
from duckguard.history.storage import HistoryStorage
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from duckguard.core.dataset import Dataset
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class ColumnSchema:
    """Schema metadata for a single table column.

    Attributes:
        name: Column name.
        dtype: Data type rendered as a string.
        nullable: Whether NULL values are permitted.
        position: Zero-based position of the column in the table.
    """

    name: str
    dtype: str
    nullable: bool
    position: int

    def to_dict(self) -> dict[str, Any]:
        """Serialize this column schema to a plain dictionary."""
        keys = ("name", "dtype", "nullable", "position")
        return {key: getattr(self, key) for key in keys}

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ColumnSchema:
        """Build a ColumnSchema from a dictionary; absent optional keys get defaults."""
        return cls(
            data["name"],
            data["dtype"],
            data.get("nullable", True),
            data.get("position", 0),
        )
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass
class SchemaSnapshot:
    """A captured table schema at a single point in time.

    Attributes:
        source: Data source path the schema was read from.
        snapshot_id: Unique identifier for this snapshot.
        captured_at: Timestamp of capture.
        columns: Ordered column schemas.
        row_count: Row count at capture time, if known.
    """

    source: str
    snapshot_id: str
    captured_at: datetime
    columns: list[ColumnSchema]
    row_count: int | None = None

    @property
    def column_count(self) -> int:
        """Number of columns in the snapshot."""
        return len(self.columns)

    @property
    def column_names(self) -> list[str]:
        """Column names in table order."""
        return [col.name for col in self.columns]

    def get_column(self, name: str) -> ColumnSchema | None:
        """Return the column schema named *name*, or None if absent."""
        matches = (col for col in self.columns if col.name == name)
        return next(matches, None)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-compatible dictionary."""
        return {
            "source": self.source,
            "snapshot_id": self.snapshot_id,
            "captured_at": self.captured_at.isoformat(),
            "columns": [col.to_dict() for col in self.columns],
            "row_count": self.row_count,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> SchemaSnapshot:
        """Rebuild a snapshot from a dictionary produced by ``to_dict``."""
        return cls(
            source=data["source"],
            snapshot_id=data["snapshot_id"],
            captured_at=datetime.fromisoformat(data["captured_at"]),
            columns=[ColumnSchema.from_dict(item) for item in data["columns"]],
            row_count=data.get("row_count"),
        )

    def __eq__(self, other: object) -> bool:
        """Structural equality: same source and column layout.

        snapshot_id and captured_at are deliberately excluded so two
        captures of an unchanged schema compare equal.
        """
        if not isinstance(other, SchemaSnapshot):
            return False
        if self.source != other.source:
            return False
        if len(self.columns) != len(other.columns):
            return False
        return all(
            a.name == b.name and a.dtype == b.dtype and a.nullable == b.nullable
            for a, b in zip(self.columns, other.columns)
        )
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
class SchemaTracker:
    """Capture and query schema snapshots over time.

    Usage:
        from duckguard import connect
        from duckguard.schema_history import SchemaTracker

        tracker = SchemaTracker()
        data = connect("data.csv")

        # Capture current schema
        snapshot = tracker.capture(data)

        # Get history
        history = tracker.get_history(data.source)

        # Get latest snapshot
        latest = tracker.get_latest(data.source)
    """

    def __init__(self, storage: HistoryStorage | None = None):
        """Initialize the tracker.

        Args:
            storage: HistoryStorage to persist snapshots in; a default
                instance is created when omitted.
        """
        self._storage = storage or HistoryStorage()

    @property
    def storage(self) -> HistoryStorage:
        """The underlying HistoryStorage."""
        return self._storage

    def capture(self, dataset: Dataset) -> SchemaSnapshot:
        """Capture the dataset's current schema, persist it, and return it.

        Args:
            dataset: Dataset to read schema information from.

        Returns:
            The newly stored SchemaSnapshot.
        """
        snapshot = SchemaSnapshot(
            source=dataset.source,
            snapshot_id=str(uuid.uuid4()),
            # NOTE(review): naive local timestamp — confirm whether UTC is intended.
            captured_at=datetime.now(),
            columns=self._get_column_schemas(dataset),
            row_count=dataset.row_count,
        )
        self._store_snapshot(snapshot)
        return snapshot

    def get_history(
        self,
        source: str,
        limit: int = 50,
    ) -> list[SchemaSnapshot]:
        """Return snapshot history for a source, most recent first.

        Args:
            source: Data source path.
            limit: Maximum number of snapshots to return.

        Returns:
            List of SchemaSnapshot objects, most recent first.
        """
        conn = self._storage._get_connection()
        rows = conn.execute(QUERIES["get_schema_snapshots"], (source, limit)).fetchall()
        return [self._row_to_snapshot(record) for record in rows]

    def get_latest(self, source: str) -> SchemaSnapshot | None:
        """Return the most recent snapshot for a source.

        Args:
            source: Data source path.

        Returns:
            SchemaSnapshot, or None when no snapshots exist.
        """
        conn = self._storage._get_connection()
        row = conn.execute(QUERIES["get_latest_schema_snapshot"], (source,)).fetchone()
        if not row:
            return None
        return self._row_to_snapshot(row)

    def get_snapshot(self, snapshot_id: str) -> SchemaSnapshot | None:
        """Look up a single snapshot by its ID.

        Args:
            snapshot_id: Snapshot ID.

        Returns:
            SchemaSnapshot, or None when not found.
        """
        conn = self._storage._get_connection()
        row = conn.execute(QUERIES["get_schema_snapshot_by_id"], (snapshot_id,)).fetchone()
        if not row:
            return None
        return self._row_to_snapshot(row)

    def _get_column_schemas(self, dataset: Dataset) -> list[ColumnSchema]:
        """Read column metadata for the dataset via a DESCRIBE query."""
        ref = dataset.engine.get_source_reference(dataset.source)
        described = dataset.engine.execute(f"DESCRIBE {ref}").fetchall()

        schemas: list[ColumnSchema] = []
        for position, row in enumerate(described):
            # DESCRIBE rows are (name, type, null, ...); when the null
            # column is absent, treat the column as nullable.
            is_nullable = row[2] == "YES" if len(row) > 2 else True
            schemas.append(
                ColumnSchema(
                    name=row[0],
                    dtype=row[1],
                    nullable=is_nullable,
                    position=position,
                )
            )
        return schemas

    def _store_snapshot(self, snapshot: SchemaSnapshot) -> None:
        """Persist a snapshot row in the history database."""
        payload = json.dumps({
            "columns": [c.to_dict() for c in snapshot.columns]
        })
        conn = self._storage._get_connection()
        conn.execute(
            QUERIES["insert_schema_snapshot"],
            (
                snapshot.source,
                snapshot.snapshot_id,
                snapshot.captured_at.isoformat(),
                payload,
                snapshot.column_count,
                snapshot.row_count,
            ),
        )
        conn.commit()

    def _row_to_snapshot(self, row) -> SchemaSnapshot:
        """Hydrate a SchemaSnapshot from a stored database row."""
        parsed = json.loads(row["schema_json"])
        return SchemaSnapshot(
            source=row["source"],
            snapshot_id=row["snapshot_id"],
            captured_at=datetime.fromisoformat(row["captured_at"]),
            columns=[ColumnSchema.from_dict(c) for c in parsed["columns"]],
            row_count=row["row_count"],
        )
|
duckguard/semantic/__init__.py
CHANGED
|
@@ -12,13 +12,13 @@ Example:
|
|
|
12
12
|
print(result.confidence) # 0.95
|
|
13
13
|
"""
|
|
14
14
|
|
|
15
|
+
from duckguard.semantic.analyzer import SemanticAnalyzer
|
|
15
16
|
from duckguard.semantic.detector import (
|
|
16
17
|
SemanticType,
|
|
17
18
|
SemanticTypeResult,
|
|
18
19
|
detect_type,
|
|
19
20
|
detect_types_for_dataset,
|
|
20
21
|
)
|
|
21
|
-
from duckguard.semantic.analyzer import SemanticAnalyzer
|
|
22
22
|
from duckguard.semantic.validators import get_validator_for_type
|
|
23
23
|
|
|
24
24
|
__all__ = [
|
duckguard/semantic/analyzer.py
CHANGED
duckguard/semantic/detector.py
CHANGED
|
@@ -73,6 +73,7 @@ class SemanticType(Enum):
|
|
|
73
73
|
TITLE = "title"
|
|
74
74
|
SLUG = "slug"
|
|
75
75
|
CODE = "code"
|
|
76
|
+
IDENTIFIER = "identifier"
|
|
76
77
|
|
|
77
78
|
# Unknown
|
|
78
79
|
UNKNOWN = "unknown"
|
|
@@ -216,6 +217,9 @@ NAME_PATTERNS: dict[SemanticType, list[str]] = {
|
|
|
216
217
|
SemanticType.CODE: [
|
|
217
218
|
r"code", r".*_code$"
|
|
218
219
|
],
|
|
220
|
+
SemanticType.IDENTIFIER: [
|
|
221
|
+
r".*_id$", r".*_key$", r".*_code$", r".*_num(ber)?$", r".*_no$"
|
|
222
|
+
],
|
|
219
223
|
}
|
|
220
224
|
|
|
221
225
|
# Value patterns for detection
|
|
@@ -235,6 +239,15 @@ VALUE_PATTERNS: dict[SemanticType, str] = {
|
|
|
235
239
|
SemanticType.SLUG: r"^[a-z0-9]+(?:-[a-z0-9]+)*$",
|
|
236
240
|
SemanticType.LATITUDE: r"^-?([1-8]?\d(\.\d+)?|90(\.0+)?)$",
|
|
237
241
|
SemanticType.LONGITUDE: r"^-?(1[0-7]\d(\.\d+)?|180(\.0+)?|\d{1,2}(\.\d+)?)$",
|
|
242
|
+
# Identifier pattern: PREFIX-NUMBER, ABC123, etc. (uppercase or mixed case with numbers)
|
|
243
|
+
SemanticType.IDENTIFIER: r"^[A-Z][A-Z0-9]*[-_]?\d+$|^[A-Z]{2,}[-_][A-Z0-9]+$",
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
# Patterns that must be matched case-sensitively (not using IGNORECASE)
|
|
247
|
+
CASE_SENSITIVE_PATTERNS = {
|
|
248
|
+
SemanticType.SLUG, # Slugs must be lowercase
|
|
249
|
+
SemanticType.IDENTIFIER, # Identifiers are typically uppercase
|
|
250
|
+
SemanticType.COUNTRY_CODE, # Country codes are uppercase
|
|
238
251
|
}
|
|
239
252
|
|
|
240
253
|
# PII types that should be flagged
|
|
@@ -269,6 +282,7 @@ TYPE_VALIDATIONS: dict[SemanticType, list[str]] = {
|
|
|
269
282
|
SemanticType.LONGITUDE: ["range: [-180, 180]"],
|
|
270
283
|
SemanticType.BOOLEAN: ["allowed_values: [true, false]"],
|
|
271
284
|
SemanticType.COUNTRY_CODE: ["pattern: country_code"],
|
|
285
|
+
SemanticType.IDENTIFIER: ["not_null"],
|
|
272
286
|
}
|
|
273
287
|
|
|
274
288
|
|
|
@@ -386,9 +400,11 @@ class SemanticTypeDetector:
|
|
|
386
400
|
string_values = [str(v) for v in sample_values if v is not None]
|
|
387
401
|
if string_values:
|
|
388
402
|
for sem_type, pattern in self.value_patterns.items():
|
|
403
|
+
# Use case-sensitive matching for certain patterns
|
|
404
|
+
flags = 0 if sem_type in CASE_SENSITIVE_PATTERNS else re.IGNORECASE
|
|
389
405
|
match_count = sum(
|
|
390
406
|
1 for v in string_values[:50]
|
|
391
|
-
if re.match(pattern, v,
|
|
407
|
+
if re.match(pattern, v, flags)
|
|
392
408
|
)
|
|
393
409
|
match_rate = match_count / min(len(string_values), 50)
|
|
394
410
|
|
duckguard/semantic/validators.py
CHANGED
|
@@ -6,8 +6,9 @@ Provides validation functions specific to each semantic type.
|
|
|
6
6
|
from __future__ import annotations
|
|
7
7
|
|
|
8
8
|
import re
|
|
9
|
+
from collections.abc import Callable
|
|
9
10
|
from dataclasses import dataclass
|
|
10
|
-
from typing import Any
|
|
11
|
+
from typing import Any
|
|
11
12
|
|
|
12
13
|
from duckguard.semantic.detector import SemanticType
|
|
13
14
|
|