duckguard-2.0.0-py3-none-any.whl → duckguard-2.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +55 -28
- duckguard/anomaly/__init__.py +1 -1
- duckguard/anomaly/detector.py +1 -5
- duckguard/anomaly/methods.py +1 -3
- duckguard/cli/main.py +304 -54
- duckguard/connectors/__init__.py +2 -2
- duckguard/connectors/bigquery.py +1 -1
- duckguard/connectors/databricks.py +1 -1
- duckguard/connectors/factory.py +2 -3
- duckguard/connectors/files.py +1 -1
- duckguard/connectors/kafka.py +2 -2
- duckguard/connectors/mongodb.py +1 -1
- duckguard/connectors/mysql.py +1 -1
- duckguard/connectors/oracle.py +1 -1
- duckguard/connectors/postgres.py +1 -2
- duckguard/connectors/redshift.py +1 -1
- duckguard/connectors/snowflake.py +1 -2
- duckguard/connectors/sqlite.py +1 -1
- duckguard/connectors/sqlserver.py +10 -13
- duckguard/contracts/__init__.py +6 -6
- duckguard/contracts/diff.py +1 -1
- duckguard/contracts/generator.py +5 -6
- duckguard/contracts/loader.py +4 -4
- duckguard/contracts/validator.py +3 -4
- duckguard/core/__init__.py +3 -3
- duckguard/core/column.py +110 -5
- duckguard/core/dataset.py +3 -3
- duckguard/core/result.py +92 -1
- duckguard/core/scoring.py +1 -2
- duckguard/errors.py +362 -0
- duckguard/history/__init__.py +44 -0
- duckguard/history/schema.py +183 -0
- duckguard/history/storage.py +479 -0
- duckguard/history/trends.py +348 -0
- duckguard/integrations/__init__.py +31 -0
- duckguard/integrations/airflow.py +387 -0
- duckguard/integrations/dbt.py +458 -0
- duckguard/notifications/__init__.py +43 -0
- duckguard/notifications/formatter.py +118 -0
- duckguard/notifications/notifiers.py +357 -0
- duckguard/profiler/auto_profile.py +3 -3
- duckguard/pytest_plugin/__init__.py +1 -1
- duckguard/pytest_plugin/plugin.py +1 -1
- duckguard/reporting/console.py +2 -2
- duckguard/reports/__init__.py +42 -0
- duckguard/reports/html_reporter.py +515 -0
- duckguard/reports/pdf_reporter.py +114 -0
- duckguard/rules/__init__.py +3 -3
- duckguard/rules/executor.py +3 -4
- duckguard/rules/generator.py +4 -4
- duckguard/rules/loader.py +5 -5
- duckguard/semantic/__init__.py +1 -1
- duckguard/semantic/analyzer.py +0 -2
- duckguard/semantic/validators.py +2 -1
- {duckguard-2.0.0.dist-info → duckguard-2.2.0.dist-info}/METADATA +135 -5
- duckguard-2.2.0.dist-info/RECORD +69 -0
- duckguard-2.0.0.dist-info/RECORD +0 -55
- {duckguard-2.0.0.dist-info → duckguard-2.2.0.dist-info}/WHEEL +0 -0
- {duckguard-2.0.0.dist-info → duckguard-2.2.0.dist-info}/entry_points.txt +0 -0
- {duckguard-2.0.0.dist-info → duckguard-2.2.0.dist-info}/licenses/LICENSE +0 -0
duckguard/errors.py
ADDED
@@ -0,0 +1,362 @@
"""Enhanced error classes for DuckGuard with helpful suggestions.

Provides user-friendly error messages with:
- Clear descriptions of what went wrong
- Suggestions for how to fix the issue
- Links to relevant documentation
- Context about the data being validated
"""

from __future__ import annotations

from typing import Any

# Documentation base URL
DOCS_BASE_URL = "https://github.com/XDataHubAI/duckguard"


class DuckGuardError(Exception):
    """Base exception for all DuckGuard errors.

    Attributes:
        message: Human-readable error description
        suggestion: Helpful suggestion for fixing the issue
        docs_url: Link to relevant documentation
        context: Additional context about the error
    """

    def __init__(
        self,
        message: str,
        suggestion: str | None = None,
        docs_url: str | None = None,
        context: dict[str, Any] | None = None,
    ):
        self.message = message
        self.suggestion = suggestion
        self.docs_url = docs_url
        self.context = context or {}
        super().__init__(self._format_message())

    def _format_message(self) -> str:
        """Format the full error message with suggestions."""
        parts = [self.message]

        if self.suggestion:
            parts.append(f"\n\nSuggestion: {self.suggestion}")

        if self.docs_url:
            parts.append(f"\n\nDocs: {self.docs_url}")

        if self.context:
            context_str = "\n".join(f" {k}: {v}" for k, v in self.context.items())
            parts.append(f"\n\nContext:\n{context_str}")

        return "".join(parts)


class ConnectionError(DuckGuardError):
    """Error connecting to a data source."""

    def __init__(
        self,
        source: str,
        original_error: Exception | None = None,
        **context: Any,
    ):
        super().__init__(
            message=f"Failed to connect to data source: {source}",
            suggestion=self._get_suggestion(source, original_error),
            docs_url=f"{DOCS_BASE_URL}#connectors",
            context={"source": source, **context},
        )
        self.source = source
        self.original_error = original_error

    def _get_suggestion(self, source: str, error: Exception | None) -> str:
        """Get a helpful suggestion based on the source type."""
        suggestions = []

        if source.endswith(".csv"):
            suggestions.append("Verify the CSV file exists and is readable")
            suggestions.append("Check file permissions")
        elif source.endswith(".parquet"):
            suggestions.append("Verify the Parquet file exists and is not corrupted")
            suggestions.append("Try: pip install pyarrow")
        elif "postgres" in source or "postgresql" in source:
            suggestions.append("Verify PostgreSQL connection string format: postgresql://user:pass@host:port/db")
            suggestions.append("Check if the database server is running")
        elif "mysql" in source:
            suggestions.append("Verify MySQL connection string format: mysql://user:pass@host:port/db")
        elif "s3://" in source:
            suggestions.append("Verify AWS credentials are configured")
            suggestions.append("Check S3 bucket permissions")
        else:
            suggestions.append("Verify the data source path or connection string")

        if error:
            suggestions.append(f"Original error: {error}")

        return "\n - ".join([""] + suggestions).strip()


class FileNotFoundError(DuckGuardError):
    """File not found error with helpful context."""

    def __init__(self, path: str, **context: Any):
        import os

        cwd = os.getcwd()
        super().__init__(
            message=f"File not found: {path}",
            suggestion=f"Check if the file exists. Current directory: {cwd}",
            docs_url=f"{DOCS_BASE_URL}#file-connectors",
            context={"path": path, "cwd": cwd, **context},
        )


class ColumnNotFoundError(DuckGuardError):
    """Column not found in dataset."""

    def __init__(self, column: str, available_columns: list[str], **context: Any):
        # Find similar column names
        similar = self._find_similar(column, available_columns)

        suggestion = "Available columns: " + ", ".join(available_columns[:10])
        if len(available_columns) > 10:
            suggestion += f" (and {len(available_columns) - 10} more)"

        if similar:
            suggestion = f"Did you mean: {similar}?\n\n{suggestion}"

        super().__init__(
            message=f"Column '{column}' not found in dataset",
            suggestion=suggestion,
            docs_url=f"{DOCS_BASE_URL}#working-with-columns",
            context={"column": column, "similar": similar, **context},
        )

    def _find_similar(self, target: str, candidates: list[str]) -> str | None:
        """Find a similar column name using simple string matching."""
        target_lower = target.lower()

        # Exact match ignoring case
        for c in candidates:
            if c.lower() == target_lower:
                return c

        # Prefix match
        for c in candidates:
            if c.lower().startswith(target_lower) or target_lower.startswith(c.lower()):
                return c

        # Contains match
        for c in candidates:
            if target_lower in c.lower() or c.lower() in target_lower:
                return c

        return None


class ValidationError(DuckGuardError):
    """Validation check failed with detailed information."""

    def __init__(
        self,
        check_name: str,
        column: str | None = None,
        actual_value: Any = None,
        expected_value: Any = None,
        failed_rows: list | None = None,
        **context: Any,
    ):
        col_str = f" for column '{column}'" if column else ""
        message = f"Validation check '{check_name}' failed{col_str}"

        suggestion_parts = []
        if actual_value is not None and expected_value is not None:
            suggestion_parts.append(f"Expected: {expected_value}, Got: {actual_value}")

        if failed_rows:
            sample = failed_rows[:3]
            suggestion_parts.append(f"Sample failing values: {sample}")
            if len(failed_rows) > 3:
                suggestion_parts.append(f"({len(failed_rows)} total failures)")

        suggestion = "\n".join(suggestion_parts) if suggestion_parts else None

        super().__init__(
            message=message,
            suggestion=suggestion,
            docs_url=f"{DOCS_BASE_URL}#validation-methods",
            context={
                "check_name": check_name,
                "column": column,
                "actual_value": actual_value,
                "expected_value": expected_value,
                **context,
            },
        )


class RuleParseError(DuckGuardError):
    """Error parsing validation rules."""

    def __init__(
        self,
        message: str,
        file_path: str | None = None,
        line_number: int | None = None,
        **context: Any,
    ):
        location = ""
        if file_path:
            location = f" in {file_path}"
        if line_number:
            location += f" at line {line_number}"

        suggestion = "Check your YAML syntax and rule format.\n"
        suggestion += "Example valid rule:\n"
        suggestion += """
columns:
  order_id:
    checks:
      - type: not_null
      - type: unique
  amount:
    checks:
      - type: between
        value: [0, 10000]
"""

        super().__init__(
            message=f"Failed to parse rules{location}: {message}",
            suggestion=suggestion,
            docs_url=f"{DOCS_BASE_URL}#yaml-rules",
            context={"file_path": file_path, "line_number": line_number, **context},
        )


class ContractViolationError(DuckGuardError):
    """Data contract was violated."""

    def __init__(
        self,
        violations: list[str],
        contract_path: str | None = None,
        **context: Any,
    ):
        message = f"Data contract violated with {len(violations)} issue(s)"
        if contract_path:
            message += f" (contract: {contract_path})"

        suggestion = "Violations:\n - " + "\n - ".join(violations[:5])
        if len(violations) > 5:
            suggestion += f"\n ... and {len(violations) - 5} more"

        suggestion += "\n\nConsider updating the contract or fixing the data issues."

        super().__init__(
            message=message,
            suggestion=suggestion,
            docs_url=f"{DOCS_BASE_URL}#data-contracts",
            context={"violations": violations, "contract_path": contract_path, **context},
        )


class UnsupportedConnectorError(DuckGuardError):
    """No connector available for the data source."""

    def __init__(self, source: str, **context: Any):
        supported = [
            "CSV (.csv)",
            "Parquet (.parquet, .pq)",
            "JSON (.json, .jsonl, .ndjson)",
            "PostgreSQL (postgres://, postgresql://)",
            "MySQL (mysql://)",
            "SQLite (sqlite://)",
            "S3 (s3://)",
            "Snowflake (snowflake://)",
            "BigQuery (bigquery://)",
        ]

        suggestion = "Supported formats:\n - " + "\n - ".join(supported)

        super().__init__(
            message=f"No connector found for: {source}",
            suggestion=suggestion,
            docs_url=f"{DOCS_BASE_URL}#supported-connectors",
            context={"source": source, **context},
        )


# Error formatting utilities

def format_validation_failure(
    check_name: str,
    column: str | None,
    actual: Any,
    expected: Any,
    failed_rows: list | None = None,
) -> str:
    """Format a validation failure message with context.

    Args:
        check_name: Name of the failed check
        column: Column name (if column-level)
        actual: Actual value found
        expected: Expected value
        failed_rows: Sample of failing rows

    Returns:
        Formatted error message
    """
    parts = []

    if column:
        parts.append(f"Check '{check_name}' failed for column '{column}'")
    else:
        parts.append(f"Check '{check_name}' failed")

    parts.append(f" Expected: {expected}")
    parts.append(f" Actual: {actual}")

    if failed_rows:
        parts.append("")
        parts.append(" Sample failing rows:")
        for row in failed_rows[:5]:
            if hasattr(row, "value"):
                parts.append(f" Row {row.row_index}: {row.value}")
            else:
                parts.append(f" {row}")

        if len(failed_rows) > 5:
            parts.append(f" ... and {len(failed_rows) - 5} more")

    return "\n".join(parts)


def format_multiple_failures(failures: list) -> str:
    """Format multiple validation failures into a summary.

    Args:
        failures: List of failure objects

    Returns:
        Formatted summary string
    """
    if not failures:
        return "All checks passed!"

    parts = [f"{len(failures)} validation check(s) failed:"]
    parts.append("")

    for i, failure in enumerate(failures[:10], 1):
        col = f"[{failure.column}]" if hasattr(failure, "column") and failure.column else "[table]"
        msg = failure.message if hasattr(failure, "message") else str(failure)
        parts.append(f" {i}. {col} {msg}")

    if len(failures) > 10:
        parts.append(f" ... and {len(failures) - 10} more failures")

    return "\n".join(parts)
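All of these exceptions funnel through DuckGuardError._format_message, so the suggestion, docs link, and context block all land in the rendered error text. A minimal sketch of what raising one looks like in practice (the column names below are made up for illustration, not taken from the package):

from duckguard.errors import ColumnNotFoundError

# Hypothetical dataset columns, for illustration only.
available = ["order_id", "amount", "customer_id"]

try:
    raise ColumnNotFoundError("order_idd", available)
except ColumnNotFoundError as exc:
    # Prints the base message, a "Did you mean: order_id?" suggestion
    # (found by _find_similar's prefix match), the docs URL, and the
    # context block assembled by _format_message().
    print(exc)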
duckguard/history/__init__.py
ADDED
@@ -0,0 +1,44 @@
"""Historical result storage and trend analysis for DuckGuard.

This module provides persistent storage for validation results,
enabling trend analysis and historical comparison.

Usage:
    from duckguard.history import HistoryStorage, TrendAnalyzer

    # Store validation results
    storage = HistoryStorage()
    storage.store(result)

    # Query history
    runs = storage.get_runs("data.csv", limit=10)

    # Analyze trends
    analyzer = TrendAnalyzer(storage)
    analysis = analyzer.analyze("data.csv", days=30)
    print(analysis.summary())
"""

from duckguard.history.storage import (
    HistoryStorage,
    StoredCheckResult,
    StoredRun,
    TrendDataPoint,
)
from duckguard.history.trends import (
    TrendAnalysis,
    TrendAnalyzer,
    analyze_trends,
)

__all__ = [
    # Storage
    "HistoryStorage",
    "StoredRun",
    "StoredCheckResult",
    "TrendDataPoint",
    # Trends
    "TrendAnalyzer",
    "TrendAnalysis",
    "analyze_trends",
]
duckguard/history/schema.py
ADDED
@@ -0,0 +1,183 @@
"""Database schema for historical result storage.

Defines the SQLite schema for storing validation results over time,
enabling trend analysis and historical comparison.
"""

from __future__ import annotations

# Schema version for migrations
SCHEMA_VERSION = 1

# SQL to create all tables
CREATE_TABLES_SQL = """
-- Validation runs table: stores metadata for each validation execution
CREATE TABLE IF NOT EXISTS runs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    run_id TEXT UNIQUE NOT NULL,
    source TEXT NOT NULL,
    ruleset_name TEXT,
    started_at TEXT NOT NULL,
    finished_at TEXT,
    quality_score REAL NOT NULL,
    total_checks INTEGER NOT NULL,
    passed_count INTEGER NOT NULL,
    failed_count INTEGER NOT NULL,
    warning_count INTEGER NOT NULL,
    passed INTEGER NOT NULL,
    metadata TEXT,
    created_at TEXT DEFAULT (datetime('now'))
);

-- Individual check results table
CREATE TABLE IF NOT EXISTS check_results (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    run_id TEXT NOT NULL,
    check_type TEXT NOT NULL,
    column_name TEXT,
    passed INTEGER NOT NULL,
    severity TEXT NOT NULL,
    actual_value TEXT,
    expected_value TEXT,
    message TEXT,
    details TEXT,
    created_at TEXT DEFAULT (datetime('now')),
    FOREIGN KEY (run_id) REFERENCES runs(run_id)
);

-- Sample of failed rows (limited to avoid large storage)
CREATE TABLE IF NOT EXISTS failed_rows_sample (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    run_id TEXT NOT NULL,
    check_id INTEGER,
    row_index INTEGER NOT NULL,
    column_name TEXT NOT NULL,
    value TEXT,
    expected TEXT,
    reason TEXT,
    context TEXT,
    created_at TEXT DEFAULT (datetime('now')),
    FOREIGN KEY (run_id) REFERENCES runs(run_id),
    FOREIGN KEY (check_id) REFERENCES check_results(id)
);

-- Quality score trends (aggregated daily for efficient queries)
CREATE TABLE IF NOT EXISTS quality_trends (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    source TEXT NOT NULL,
    date TEXT NOT NULL,
    avg_quality_score REAL NOT NULL,
    min_quality_score REAL NOT NULL,
    max_quality_score REAL NOT NULL,
    run_count INTEGER NOT NULL,
    passed_count INTEGER NOT NULL,
    failed_count INTEGER NOT NULL,
    UNIQUE(source, date)
);

-- Schema metadata table
CREATE TABLE IF NOT EXISTS schema_info (
    key TEXT PRIMARY KEY,
    value TEXT NOT NULL
);

-- Indexes for common query patterns
CREATE INDEX IF NOT EXISTS idx_runs_source ON runs(source);
CREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at);
CREATE INDEX IF NOT EXISTS idx_runs_source_started ON runs(source, started_at);
CREATE INDEX IF NOT EXISTS idx_check_results_run_id ON check_results(run_id);
CREATE INDEX IF NOT EXISTS idx_failed_rows_run_id ON failed_rows_sample(run_id);
CREATE INDEX IF NOT EXISTS idx_quality_trends_source_date ON quality_trends(source, date);
"""

# Pre-built queries for common operations
QUERIES = {
    "insert_run": """
        INSERT INTO runs (
            run_id, source, ruleset_name, started_at, finished_at,
            quality_score, total_checks, passed_count, failed_count,
            warning_count, passed, metadata
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """,
    "insert_check_result": """
        INSERT INTO check_results (
            run_id, check_type, column_name, passed, severity,
            actual_value, expected_value, message, details
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
    """,
    "insert_failed_row": """
        INSERT INTO failed_rows_sample (
            run_id, check_id, row_index, column_name, value, expected, reason, context
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    """,
    "get_runs_for_source": """
        SELECT * FROM runs
        WHERE source = ?
        ORDER BY started_at DESC
        LIMIT ?
    """,
    "get_runs_in_period": """
        SELECT * FROM runs
        WHERE source = ?
          AND started_at >= ?
          AND started_at <= ?
        ORDER BY started_at DESC
    """,
    "get_all_runs": """
        SELECT * FROM runs
        ORDER BY started_at DESC
        LIMIT ?
    """,
    "get_quality_trend": """
        SELECT date, avg_quality_score, min_quality_score, max_quality_score,
               run_count, passed_count, failed_count
        FROM quality_trends
        WHERE source = ?
          AND date >= ?
        ORDER BY date
    """,
    "get_latest_run": """
        SELECT * FROM runs
        WHERE source = ?
        ORDER BY started_at DESC
        LIMIT 1
    """,
    "get_check_results_for_run": """
        SELECT * FROM check_results
        WHERE run_id = ?
        ORDER BY id
    """,
    "get_failed_rows_for_run": """
        SELECT * FROM failed_rows_sample
        WHERE run_id = ?
        ORDER BY id
    """,
    "upsert_trend": """
        INSERT INTO quality_trends (
            source, date, avg_quality_score, min_quality_score,
            max_quality_score, run_count, passed_count, failed_count
        ) VALUES (?, ?, ?, ?, ?, 1, ?, ?)
        ON CONFLICT(source, date) DO UPDATE SET
            avg_quality_score = (
                (avg_quality_score * run_count + excluded.avg_quality_score)
                / (run_count + 1)
            ),
            min_quality_score = MIN(min_quality_score, excluded.min_quality_score),
            max_quality_score = MAX(max_quality_score, excluded.max_quality_score),
            run_count = run_count + 1,
            passed_count = passed_count + excluded.passed_count,
            failed_count = failed_count + excluded.failed_count
    """,
    "get_unique_sources": """
        SELECT DISTINCT source FROM runs
        ORDER BY source
    """,
    "delete_old_runs": """
        DELETE FROM runs
        WHERE started_at < ?
    """,
    "get_run_by_id": """
        SELECT * FROM runs
        WHERE run_id = ?
    """,
}