duckguard-2.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +110 -0
- duckguard/anomaly/__init__.py +34 -0
- duckguard/anomaly/detector.py +394 -0
- duckguard/anomaly/methods.py +432 -0
- duckguard/cli/__init__.py +5 -0
- duckguard/cli/main.py +706 -0
- duckguard/connectors/__init__.py +58 -0
- duckguard/connectors/base.py +80 -0
- duckguard/connectors/bigquery.py +171 -0
- duckguard/connectors/databricks.py +201 -0
- duckguard/connectors/factory.py +292 -0
- duckguard/connectors/files.py +135 -0
- duckguard/connectors/kafka.py +343 -0
- duckguard/connectors/mongodb.py +236 -0
- duckguard/connectors/mysql.py +121 -0
- duckguard/connectors/oracle.py +196 -0
- duckguard/connectors/postgres.py +99 -0
- duckguard/connectors/redshift.py +154 -0
- duckguard/connectors/snowflake.py +226 -0
- duckguard/connectors/sqlite.py +112 -0
- duckguard/connectors/sqlserver.py +242 -0
- duckguard/contracts/__init__.py +48 -0
- duckguard/contracts/diff.py +432 -0
- duckguard/contracts/generator.py +334 -0
- duckguard/contracts/loader.py +367 -0
- duckguard/contracts/schema.py +242 -0
- duckguard/contracts/validator.py +453 -0
- duckguard/core/__init__.py +8 -0
- duckguard/core/column.py +437 -0
- duckguard/core/dataset.py +284 -0
- duckguard/core/engine.py +261 -0
- duckguard/core/result.py +119 -0
- duckguard/core/scoring.py +508 -0
- duckguard/profiler/__init__.py +5 -0
- duckguard/profiler/auto_profile.py +350 -0
- duckguard/pytest_plugin/__init__.py +5 -0
- duckguard/pytest_plugin/plugin.py +161 -0
- duckguard/reporting/__init__.py +6 -0
- duckguard/reporting/console.py +88 -0
- duckguard/reporting/json_report.py +96 -0
- duckguard/rules/__init__.py +28 -0
- duckguard/rules/executor.py +616 -0
- duckguard/rules/generator.py +341 -0
- duckguard/rules/loader.py +483 -0
- duckguard/rules/schema.py +289 -0
- duckguard/semantic/__init__.py +31 -0
- duckguard/semantic/analyzer.py +270 -0
- duckguard/semantic/detector.py +459 -0
- duckguard/semantic/validators.py +354 -0
- duckguard/validators/__init__.py +7 -0
- duckguard-2.0.0.dist-info/METADATA +221 -0
- duckguard-2.0.0.dist-info/RECORD +55 -0
- duckguard-2.0.0.dist-info/WHEEL +4 -0
- duckguard-2.0.0.dist-info/entry_points.txt +5 -0
- duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
duckguard/contracts/generator.py

@@ -0,0 +1,334 @@
"""Data contract generator for DuckGuard.

Auto-generates data contracts from existing data sources.
"""

from __future__ import annotations

from datetime import datetime
from pathlib import Path
from typing import Any

from duckguard.core.dataset import Dataset
from duckguard.connectors import connect
from duckguard.contracts.schema import (
    DataContract,
    SchemaField,
    FieldType,
    FieldConstraint,
    QualitySLA,
    ContractMetadata,
)
from duckguard.contracts.loader import contract_to_yaml
from duckguard.semantic import SemanticAnalyzer, SemanticType


class ContractGenerator:
    """Generates data contracts from data analysis."""

    # Type mapping from Python/DB types to FieldType
    TYPE_MAPPING = {
        "int": FieldType.INTEGER,
        "int64": FieldType.INTEGER,
        "int32": FieldType.INTEGER,
        "integer": FieldType.INTEGER,
        "bigint": FieldType.INTEGER,
        "float": FieldType.FLOAT,
        "float64": FieldType.FLOAT,
        "double": FieldType.FLOAT,
        "decimal": FieldType.DECIMAL,
        "numeric": FieldType.DECIMAL,
        "bool": FieldType.BOOLEAN,
        "boolean": FieldType.BOOLEAN,
        "str": FieldType.STRING,
        "string": FieldType.STRING,
        "varchar": FieldType.STRING,
        "text": FieldType.STRING,
        "date": FieldType.DATE,
        "datetime": FieldType.DATETIME,
        "timestamp": FieldType.TIMESTAMP,
        "time": FieldType.TIME,
    }

    # Semantic type to field type mapping
    SEMANTIC_TYPE_MAPPING = {
        SemanticType.EMAIL: FieldType.STRING,
        SemanticType.PHONE: FieldType.STRING,
        SemanticType.URL: FieldType.STRING,
        SemanticType.UUID: FieldType.UUID,
        SemanticType.DATE: FieldType.DATE,
        SemanticType.DATETIME: FieldType.DATETIME,
        SemanticType.TIMESTAMP: FieldType.TIMESTAMP,
        SemanticType.TIME: FieldType.TIME,
        SemanticType.CURRENCY: FieldType.DECIMAL,
        SemanticType.PERCENTAGE: FieldType.FLOAT,
        SemanticType.BOOLEAN: FieldType.BOOLEAN,
        SemanticType.LATITUDE: FieldType.FLOAT,
        SemanticType.LONGITUDE: FieldType.FLOAT,
        SemanticType.AGE: FieldType.INTEGER,
        SemanticType.YEAR: FieldType.INTEGER,
    }

    def __init__(self):
        self._analyzer = SemanticAnalyzer()

    def generate(
        self,
        source: str | Dataset,
        name: str | None = None,
        owner: str | None = None,
        include_constraints: bool = True,
        include_quality_sla: bool = True,
    ) -> DataContract:
        """Generate a contract from a data source.

        Args:
            source: Data source path or Dataset
            name: Contract name (defaults to source name)
            owner: Contract owner
            include_constraints: Include inferred constraints
            include_quality_sla: Include quality SLA

        Returns:
            Generated DataContract
        """
        if isinstance(source, str):
            dataset = connect(source)
            source_path = source
        else:
            dataset = source
            source_path = dataset.source

        # Determine name
        if not name:
            name = Path(source_path).stem if source_path else "dataset"

        contract = DataContract(
            name=name,
            version="1.0.0",
            created_at=datetime.now(),
            metadata=ContractMetadata(
                owner=owner,
                source_system=source_path,
            ),
        )

        # Analyze dataset semantically
        analysis = self._analyzer.analyze(dataset)

        # Generate schema fields
        for col_analysis in analysis.columns:
            field_def = self._generate_field(
                col_analysis,
                dataset,
                include_constraints
            )
            contract.schema.append(field_def)

        # Generate quality SLA
        if include_quality_sla:
            contract.quality = self._generate_quality_sla(dataset, analysis)

        # Add warnings to metadata
        if analysis.warnings:
            contract.metadata.tags.append("has_pii")

        return contract

    def _generate_field(
        self,
        col_analysis,
        dataset: Dataset,
        include_constraints: bool
    ) -> SchemaField:
        """Generate a schema field from column analysis."""
        col = dataset[col_analysis.name]

        # Determine field type
        field_type = self._infer_type(col_analysis)

        # Determine if required
        required = col.null_count == 0

        # Determine if unique
        unique = col.unique_percent == 100 and col.null_count == 0

        field_def = SchemaField(
            name=col_analysis.name,
            type=field_type,
            required=required,
            unique=unique,
            semantic_type=col_analysis.semantic_type.value if col_analysis.semantic_type != SemanticType.UNKNOWN else None,
            pii=col_analysis.is_pii,
        )

        # Add constraints
        if include_constraints:
            constraints = self._generate_constraints(col_analysis, col)
            field_def.constraints = constraints

        return field_def

    def _infer_type(self, col_analysis) -> FieldType:
        """Infer field type from analysis."""
        # Try semantic type first
        if col_analysis.semantic_type in self.SEMANTIC_TYPE_MAPPING:
            return self.SEMANTIC_TYPE_MAPPING[col_analysis.semantic_type]

        # Fall back to statistics-based inference
        stats = col_analysis.statistics
        if "mean" in stats and stats.get("mean") is not None:
            # Numeric type
            min_val = stats.get("min")
            max_val = stats.get("max")

            # Check if integer
            if min_val is not None and max_val is not None:
                if isinstance(min_val, int) and isinstance(max_val, int):
                    return FieldType.INTEGER

            return FieldType.FLOAT

        # Default to string
        return FieldType.STRING

    def _generate_constraints(self, col_analysis, col) -> list[FieldConstraint]:
        """Generate constraints for a field."""
        constraints = []
        stats = col_analysis.statistics

        # Range constraint for numeric fields
        if "mean" in stats and stats.get("mean") is not None:
            min_val = stats.get("min")
            max_val = stats.get("max")

            if min_val is not None and max_val is not None:
                # Add buffer
                range_size = max_val - min_val
                buffer = range_size * 0.1 if range_size > 0 else abs(max_val) * 0.1 or 1

                constraints.append(FieldConstraint(
                    type="range",
                    value=[
                        self._round_nice(min_val - buffer),
                        self._round_nice(max_val + buffer)
                    ]
                ))

            # Non-negative constraint
            if min_val is not None and min_val >= 0:
                constraints.append(FieldConstraint(type="non_negative"))

        # Pattern constraint for semantic types
        if col_analysis.semantic_type in (
            SemanticType.EMAIL,
            SemanticType.PHONE,
            SemanticType.URL,
            SemanticType.UUID,
            SemanticType.IP_ADDRESS,
        ):
            constraints.append(FieldConstraint(
                type="pattern",
                value=col_analysis.semantic_type.value,
            ))

        # Enum constraint for low cardinality
        unique_count = stats.get("unique_count", 0)
        unique_pct = stats.get("unique_percent", 100)

        if 0 < unique_count <= 20 and unique_pct < 5:
            try:
                distinct_values = col.get_distinct_values(limit=25)
                if len(distinct_values) <= 20:
                    allowed = [v for v in distinct_values if v is not None]
                    if allowed:
                        constraints.append(FieldConstraint(
                            type="allowed_values",
                            value=allowed,
                        ))
            except Exception:
                pass

        return constraints

    def _generate_quality_sla(self, dataset: Dataset, analysis) -> QualitySLA:
        """Generate quality SLA from dataset analysis."""
        # Calculate overall completeness
        total_cells = dataset.row_count * dataset.column_count
        total_nulls = sum(
            col.statistics.get("null_count", 0)
            for col in analysis.columns
        )
        actual_completeness = 100 - (total_nulls / total_cells * 100) if total_cells > 0 else 100

        # Set completeness SLA slightly below actual
        completeness_sla = max(95.0, round(actual_completeness - 1, 1))

        # Uniqueness SLAs for unique columns
        uniqueness = {}
        for col in analysis.columns:
            unique_pct = col.statistics.get("unique_percent", 0)
            if unique_pct == 100:
                uniqueness[col.name] = 100.0

        # Row count minimum (80% of current)
        row_count_min = int(dataset.row_count * 0.8) if dataset.row_count > 100 else None

        return QualitySLA(
            completeness=completeness_sla,
            uniqueness=uniqueness,
            row_count_min=row_count_min,
        )

    def _round_nice(self, value: float) -> int | float:
        """Round to a nice human-readable number."""
        if value is None:
            return 0
        if abs(value) < 1:
            return round(value, 2)
        if abs(value) < 100:
            return round(value)
        if abs(value) < 1000:
            return round(value / 10) * 10
        return round(value / 100) * 100


def generate_contract(
    source: str | Dataset,
    output: str | Path | None = None,
    name: str | None = None,
    owner: str | None = None,
    dataset_name: str | None = None,
    as_yaml: bool = False,
) -> DataContract | str:
    """Generate a data contract from a data source.

    Args:
        source: Data source path or Dataset
        output: Optional output file path (.yaml)
        name: Contract name (can also use dataset_name)
        owner: Contract owner
        dataset_name: Alias for name parameter
        as_yaml: If True and output is None, return YAML string instead of DataContract

    Returns:
        DataContract if as_yaml=False, YAML string if as_yaml=True,
        or file path if output is specified
    """
    # Support both name and dataset_name
    contract_name = name or dataset_name

    generator = ContractGenerator()
    contract = generator.generate(source, name=contract_name, owner=owner)

    if output is not None:
        # Write to file
        yaml_content = contract_to_yaml(contract)
        output_path = Path(output)
        output_path.write_text(yaml_content, encoding="utf-8")
        return str(output_path)

    if as_yaml:
        return contract_to_yaml(contract)

    return contract
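For orientation, here is a short usage sketch of the `generate_contract` helper above. It is not taken from the package's documentation: the paths "orders.csv" and "orders.contract.yaml" are hypothetical, and it assumes `duckguard.connectors.connect` accepts a file path (the `connectors/files.py` module in the file list suggests file sources are supported).

    from duckguard.contracts.generator import generate_contract

    # Write a generated contract straight to YAML; returns the output path.
    # "orders.csv" and "orders.contract.yaml" are hypothetical example paths.
    written = generate_contract("orders.csv", output="orders.contract.yaml", owner="platform-team")

    # Or keep the result in memory: a DataContract object, or its YAML text.
    contract = generate_contract("orders.csv", name="orders")
    yaml_text = generate_contract("orders.csv", as_yaml=True)

Note that `output` takes precedence over `as_yaml`: the function returns the written path before the `as_yaml` branch is reached.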
duckguard/contracts/loader.py

@@ -0,0 +1,367 @@
"""Data contract loader for DuckGuard.

Parses YAML contract files into DataContract objects.

Example contract YAML:
    contract:
      name: orders
      version: "1.2.0"

      schema:
        - name: order_id
          type: string
          required: true
          unique: true

        - name: amount
          type: decimal
          required: true
          constraints:
            - type: range
              value: [0, 100000]

        - name: email
          type: string
          semantic_type: email
          pii: true

      quality:
        completeness: 99.5
        freshness: "24h"
        row_count_min: 1000

      metadata:
        owner: platform-team
        description: Order transactions from checkout
        consumers:
          - analytics
          - finance
"""

from __future__ import annotations

from datetime import datetime
from pathlib import Path
from typing import Any

import yaml

from duckguard.contracts.schema import (
    DataContract,
    SchemaField,
    FieldType,
    FieldConstraint,
    QualitySLA,
    ContractMetadata,
)


class ContractParseError(Exception):
    """Raised when contract parsing fails."""

    def __init__(self, message: str, location: str | None = None):
        self.location = location
        full_message = f"{message}" if not location else f"{message} (at {location})"
        super().__init__(full_message)


def load_contract(path: str | Path) -> DataContract:
    """Load a data contract from a YAML file.

    Args:
        path: Path to the contract YAML file

    Returns:
        Parsed DataContract

    Raises:
        FileNotFoundError: If file doesn't exist
        ContractParseError: If YAML is invalid
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Contract file not found: {path}")

    with open(path, "r", encoding="utf-8") as f:
        content = f.read()

    return load_contract_from_string(content, source_file=str(path))


def load_contract_from_string(
    content: str,
    source_file: str | None = None
) -> DataContract:
    """Load a data contract from a YAML string.

    Args:
        content: YAML content
        source_file: Optional source file for error messages

    Returns:
        Parsed DataContract
    """
    try:
        data = yaml.safe_load(content)
    except yaml.YAMLError as e:
        raise ContractParseError(f"Invalid YAML: {e}", source_file)

    if not data:
        raise ContractParseError("Empty contract file", source_file)

    # Support both root-level and nested 'contract' key
    if "contract" in data:
        data = data["contract"]

    return _parse_contract(data, source_file)


def _parse_contract(data: dict[str, Any], source_file: str | None = None) -> DataContract:
    """Parse dictionary into DataContract."""
    # Required: name
    name = data.get("name")
    if not name:
        raise ContractParseError("Contract must have a 'name'", source_file)

    contract = DataContract(
        name=name,
        version=str(data.get("version", "1.0.0")),
    )

    # Parse timestamps
    if "created_at" in data:
        contract.created_at = _parse_datetime(data["created_at"])
    if "updated_at" in data:
        contract.updated_at = _parse_datetime(data["updated_at"])

    # Parse schema
    schema_data = data.get("schema", [])
    if isinstance(schema_data, list):
        for i, field_data in enumerate(schema_data):
            try:
                field_obj = _parse_schema_field(field_data)
                contract.schema.append(field_obj)
            except Exception as e:
                raise ContractParseError(
                    f"Invalid schema field at index {i}: {e}",
                    source_file
                )

    # Parse quality SLA
    quality_data = data.get("quality", {})
    if quality_data:
        contract.quality = _parse_quality_sla(quality_data)

    # Parse metadata
    metadata_data = data.get("metadata", {})
    if metadata_data:
        contract.metadata = _parse_metadata(metadata_data)

    return contract


def _parse_schema_field(data: dict[str, Any] | str) -> SchemaField:
    """Parse a schema field definition."""
    # Handle simple string format: "field_name: type"
    if isinstance(data, str):
        parts = data.split(":")
        name = parts[0].strip()
        type_str = parts[1].strip() if len(parts) > 1 else "string"
        return SchemaField(name=name, type=type_str)

    if not isinstance(data, dict):
        raise ValueError(f"Invalid field format: {data}")

    name = data.get("name")
    if not name:
        raise ValueError("Field must have a 'name'")

    # Parse type
    type_value = data.get("type", "string")
    try:
        if isinstance(type_value, str):
            field_type = FieldType(type_value.lower())
        else:
            field_type = type_value
    except ValueError:
        field_type = type_value  # Keep as string for custom types

    # Parse constraints
    constraints = []
    constraints_data = data.get("constraints", [])
    for c in constraints_data:
        if isinstance(c, dict):
            constraints.append(FieldConstraint(
                type=c.get("type", "custom"),
                value=c.get("value"),
                params=c.get("params", {}),
            ))
        elif isinstance(c, str):
            constraints.append(FieldConstraint(type=c))

    return SchemaField(
        name=name,
        type=field_type,
        required=data.get("required", False),
        unique=data.get("unique", False),
        description=data.get("description"),
        semantic_type=data.get("semantic_type"),
        constraints=constraints,
        tags=data.get("tags", []),
        pii=data.get("pii", False),
        deprecated=data.get("deprecated", False),
        default=data.get("default"),
    )


def _parse_quality_sla(data: dict[str, Any]) -> QualitySLA:
    """Parse quality SLA definition."""
    # Parse uniqueness dict
    uniqueness = {}
    uniqueness_data = data.get("uniqueness", {})
    if isinstance(uniqueness_data, dict):
        uniqueness = {k: float(v) for k, v in uniqueness_data.items()}
    elif isinstance(uniqueness_data, list):
        # Handle list format: ["col1", "col2"] means 100% unique
        uniqueness = {col: 100.0 for col in uniqueness_data}

    return QualitySLA(
        completeness=_parse_percentage(data.get("completeness")),
        freshness=data.get("freshness"),
        uniqueness=uniqueness,
        row_count_min=data.get("row_count_min") or data.get("min_rows"),
        row_count_max=data.get("row_count_max") or data.get("max_rows"),
        custom=data.get("custom", {}),
    )


def _parse_metadata(data: dict[str, Any]) -> ContractMetadata:
    """Parse contract metadata."""
    return ContractMetadata(
        owner=data.get("owner"),
        description=data.get("description"),
        source_system=data.get("source_system") or data.get("source"),
        consumers=data.get("consumers", []),
        schedule=data.get("schedule"),
        tags=data.get("tags", []),
        links=data.get("links", {}),
    )


def _parse_percentage(value: Any) -> float | None:
    """Parse a percentage value."""
    if value is None:
        return None

    if isinstance(value, (int, float)):
        return float(value)

    if isinstance(value, str):
        # Handle "99.5%" format
        value = value.strip().rstrip("%")
        return float(value)

    return None


def _parse_datetime(value: Any) -> datetime | None:
    """Parse a datetime value."""
    if value is None:
        return None

    if isinstance(value, datetime):
        return value

    if isinstance(value, str):
        # Try common formats
        formats = [
            "%Y-%m-%d %H:%M:%S",
            "%Y-%m-%dT%H:%M:%S",
            "%Y-%m-%d",
        ]
        for fmt in formats:
            try:
                return datetime.strptime(value, fmt)
            except ValueError:
                continue

    return None


def contract_to_yaml(contract: DataContract) -> str:
    """Convert a DataContract to YAML string.

    Args:
        contract: Contract to convert

    Returns:
        YAML string
    """
    data: dict[str, Any] = {
        "contract": {
            "name": contract.name,
            "version": contract.version,
        }
    }

    # Add schema
    if contract.schema:
        data["contract"]["schema"] = []
        for field_obj in contract.schema:
            field_dict: dict[str, Any] = {
                "name": field_obj.name,
                "type": field_obj.type.value if isinstance(field_obj.type, FieldType) else str(field_obj.type),
            }
            if field_obj.required:
                field_dict["required"] = True
            if field_obj.unique:
                field_dict["unique"] = True
            if field_obj.description:
                field_dict["description"] = field_obj.description
            if field_obj.semantic_type:
                field_dict["semantic_type"] = field_obj.semantic_type
            if field_obj.pii:
                field_dict["pii"] = True
            if field_obj.constraints:
                field_dict["constraints"] = [
                    {"type": c.type, "value": c.value} if c.value else {"type": c.type}
                    for c in field_obj.constraints
                ]

            data["contract"]["schema"].append(field_dict)

    # Add quality
    quality_dict: dict[str, Any] = {}
    if contract.quality.completeness is not None:
        quality_dict["completeness"] = contract.quality.completeness
    if contract.quality.freshness:
        quality_dict["freshness"] = contract.quality.freshness
    if contract.quality.uniqueness:
        quality_dict["uniqueness"] = contract.quality.uniqueness
    if contract.quality.row_count_min is not None:
        quality_dict["row_count_min"] = contract.quality.row_count_min
    if contract.quality.row_count_max is not None:
        quality_dict["row_count_max"] = contract.quality.row_count_max

    if quality_dict:
        data["contract"]["quality"] = quality_dict

    # Add metadata
    meta_dict: dict[str, Any] = {}
    if contract.metadata.owner:
        meta_dict["owner"] = contract.metadata.owner
    if contract.metadata.description:
        meta_dict["description"] = contract.metadata.description
    if contract.metadata.source_system:
        meta_dict["source_system"] = contract.metadata.source_system
    if contract.metadata.consumers:
        meta_dict["consumers"] = contract.metadata.consumers
    if contract.metadata.schedule:
        meta_dict["schedule"] = contract.metadata.schedule
    if contract.metadata.tags:
        meta_dict["tags"] = contract.metadata.tags

    if meta_dict:
        data["contract"]["metadata"] = meta_dict

    return yaml.dump(data, default_flow_style=False, sort_keys=False, allow_unicode=True)
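A minimal round-trip sketch for the loader above, using only the functions defined in this file. The inline YAML is a trimmed version of the module docstring's example, and the expected values follow from the parsing rules shown (e.g. `_parse_percentage` strips a trailing "%"); it assumes `DataContract` provides default `quality` and `metadata` objects, as `contract_to_yaml` implies.

    from duckguard.contracts.loader import load_contract_from_string, contract_to_yaml

    yaml_text = """
    contract:
      name: orders
      version: "1.2.0"
      schema:
        - name: order_id
          type: string
          required: true
          unique: true
      quality:
        completeness: "99.5%"  # normalized to 99.5 by _parse_percentage
    """

    contract = load_contract_from_string(yaml_text)
    print(contract.name, contract.version)  # orders 1.2.0
    print(contract.quality.completeness)    # 99.5
    print(contract_to_yaml(contract))       # re-emits the nested 'contract:' layout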