PyPI - databricks4py - Versions diffs - 0.2.0__py3-none-any.whl - Mend

databricks4py 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48)

databricks4py/__init__.py +56 -0
databricks4py/catalog.py +65 -0
databricks4py/config/__init__.py +6 -0
databricks4py/config/base.py +119 -0
databricks4py/config/unity.py +72 -0
databricks4py/filters/__init__.py +17 -0
databricks4py/filters/base.py +154 -0
databricks4py/io/__init__.py +40 -0
databricks4py/io/checkpoint.py +98 -0
databricks4py/io/dbfs.py +91 -0
databricks4py/io/delta.py +564 -0
databricks4py/io/merge.py +176 -0
databricks4py/io/streaming.py +281 -0
databricks4py/logging.py +39 -0
databricks4py/metrics/__init__.py +22 -0
databricks4py/metrics/base.py +66 -0
databricks4py/metrics/delta_sink.py +75 -0
databricks4py/metrics/logging_sink.py +20 -0
databricks4py/migrations/__init__.py +27 -0
databricks4py/migrations/alter.py +114 -0
databricks4py/migrations/runner.py +241 -0
databricks4py/migrations/schema_diff.py +136 -0
databricks4py/migrations/validators.py +195 -0
databricks4py/observability/__init__.py +24 -0
databricks4py/observability/_utils.py +24 -0
databricks4py/observability/batch_context.py +134 -0
databricks4py/observability/health.py +223 -0
databricks4py/observability/query_listener.py +236 -0
databricks4py/py.typed +0 -0
databricks4py/quality/__init__.py +26 -0
databricks4py/quality/base.py +54 -0
databricks4py/quality/expectations.py +184 -0
databricks4py/quality/gate.py +90 -0
databricks4py/retry.py +102 -0
databricks4py/secrets.py +69 -0
databricks4py/spark_session.py +68 -0
databricks4py/testing/__init__.py +35 -0
databricks4py/testing/assertions.py +111 -0
databricks4py/testing/builders.py +127 -0
databricks4py/testing/fixtures.py +134 -0
databricks4py/testing/mocks.py +106 -0
databricks4py/testing/temp_table.py +73 -0
databricks4py/workflow.py +219 -0
databricks4py-0.2.0.dist-info/METADATA +589 -0
databricks4py-0.2.0.dist-info/RECORD +48 -0
databricks4py-0.2.0.dist-info/WHEEL +5 -0
databricks4py-0.2.0.dist-info/licenses/LICENSE +21 -0
databricks4py-0.2.0.dist-info/top_level.txt +1 -0

databricks4py/migrations/validators.py ADDED Viewed

@@ -0,0 +1,195 @@
+"""Table structure validation for migrations."""
+from __future__ import annotations
+import logging
+from collections.abc import Sequence
+from dataclasses import dataclass, field
+from pyspark.sql import SparkSession
+from databricks4py.io.delta import GeneratedColumn
+from databricks4py.spark_session import active_fallback
+__all__ = ["MigrationError", "TableValidator", "ValidationResult"]
+logger = logging.getLogger(__name__)
class MigrationError(Exception):
    """Raised when table validation fails during migration.

    The exception message names the table and lists every validation
    error on its own ``  - `` prefixed line.

    Attributes:
        table_name: The table that failed validation.
        errors: List of validation error messages (a copy of the list
            passed to the constructor).

    Args:
        table_name: Name of the table that failed validation.
        errors: Validation error messages to report.
    """

    def __init__(self, table_name: str, errors: list[str]) -> None:
        self.table_name = table_name
        # Copy so that later mutation of the caller's list cannot change
        # what this exception reports.
        self.errors = list(errors)
        message = f"Migration validation failed for '{table_name}':\n" + "\n".join(
            f"  - {e}" for e in self.errors
        )
        super().__init__(message)

    def __reduce__(self):
        """Support pickling and copying.

        The default exception reduction rebuilds the object from ``args``,
        which holds only the formatted message and does not match this
        two-argument constructor. Rebuild from the original arguments
        instead.
        """
        return (type(self), (self.table_name, self.errors))
@dataclass
class ValidationResult:
    """Outcome of validating a table.

    Attributes:
        is_valid: True when every check passed.
        errors: Messages describing the checks that failed.
        warnings: Messages for findings that do not fail validation.
    """

    is_valid: bool
    errors: list[str] = field(default_factory=list)
    warnings: list[str] = field(default_factory=list)

    def raise_if_invalid(self, table_name: str) -> None:
        """Convert a failed result into an exception; do nothing on success.

        Args:
            table_name: Table name reported in the raised error.

        Raises:
            MigrationError: When ``is_valid`` is False; it carries ``errors``.
        """
        if self.is_valid:
            return
        raise MigrationError(table_name, self.errors)
class TableValidator:
    """Validates Delta table structure against expected configuration.

    Used in migration workflows to verify that a table matches
    expected schema, partitioning, and structure before and after
    migration steps.

    Only expectations that are supplied and non-empty are checked; an
    omitted or empty expectation skips the corresponding check.

    Example::

        validator = TableValidator(
            table_name="catalog.schema.events",
            expected_columns=["id", "name", "event_date"],
            expected_partition_columns=["event_date"],
        )
        result = validator.validate()
        result.raise_if_invalid("catalog.schema.events")

    Args:
        table_name: Fully qualified table name. It is interpolated
            verbatim into ``DESCRIBE TABLE`` statements.
        expected_columns: Columns that must exist in the table. Columns
            found in the table but not listed here produce a warning,
            not an error.
        expected_partition_columns: Expected partition columns. Both sides
            are sorted before comparison, so column order is ignored.
        expected_generated_columns: Expected generated column definitions.
            Only the presence of each column name is verified.
        expected_location_contains: Substring that must appear in table location.
        spark: Optional SparkSession; passed through ``active_fallback``.
    """

    def __init__(
        self,
        table_name: str,
        *,
        expected_columns: Sequence[str] | None = None,
        expected_partition_columns: Sequence[str] | None = None,
        expected_generated_columns: Sequence[GeneratedColumn] | None = None,
        expected_location_contains: str | None = None,
        spark: SparkSession | None = None,
    ) -> None:
        self._spark = active_fallback(spark)
        self._table_name = table_name
        # Normalise every expectation to a list so ``None`` and an empty
        # sequence both mean "skip this check".
        self._expected_columns = list(expected_columns or [])
        self._expected_partition_columns = list(expected_partition_columns or [])
        self._expected_generated_columns = list(expected_generated_columns or [])
        self._expected_location_contains = expected_location_contains

    def _table_exists(self) -> bool:
        """Check if the table exists in the catalog.

        Returns:
            True when ``DESCRIBE TABLE`` succeeds, False when it raises
            ``AnalysisException``.
        """
        from pyspark.errors import AnalysisException

        try:
            self._spark.sql(f"DESCRIBE TABLE {self._table_name}")
            return True
        except AnalysisException:
            return False

    def _get_actual_columns(self) -> set[str]:
        """Get column names from the table.

        Returns:
            The ``col_name`` values of the ``DESCRIBE TABLE`` rows that
            precede the first empty or ``#``-prefixed row.
        """
        rows = self._spark.sql(f"DESCRIBE TABLE {self._table_name}").collect()
        columns: set[str] = set()
        for row in rows:
            col_name = row["col_name"]
            # Stop at the first blank or "#" row; later rows are not treated
            # as columns (presumably the metadata section of the output).
            if col_name is None or col_name == "" or col_name.startswith("#"):
                break
            columns.add(col_name)
        return columns

    def _get_actual_partitions(self) -> list[str]:
        """Get partition columns from Delta DETAIL.

        Returns:
            The ``partitionColumns`` value of the detail row, or an empty
            list when no row is returned.
        """
        from delta.tables import DeltaTable

        dt = DeltaTable.forName(self._spark, self._table_name)
        row = dt.detail().select("partitionColumns").first()
        return list(row["partitionColumns"]) if row else []

    def _get_actual_location(self) -> str:
        """Get the table's physical location.

        Returns:
            The ``location`` value of the detail row, or an empty string
            when no row is returned.
        """
        from delta.tables import DeltaTable

        dt = DeltaTable.forName(self._spark, self._table_name)
        row = dt.detail().select("location").first()
        return row["location"] if row else ""

    def validate(self) -> ValidationResult:
        """Run all configured validations.

        A missing table short-circuits with a single error. Otherwise the
        column, partition, location, and generated-column checks run in
        that order and their errors are accumulated.

        Returns:
            ValidationResult with any errors and warnings.
        """
        errors: list[str] = []
        warnings: list[str] = []

        if not self._table_exists():
            errors.append(f"Table '{self._table_name}' does not exist")
            return ValidationResult(is_valid=False, errors=errors)

        logger.info("Validating table %s", self._table_name)

        # The column check and the generated-column check need the same
        # listing; fetch it once instead of describing the table twice.
        actual_columns: set[str] = set()
        if self._expected_columns or self._expected_generated_columns:
            actual_columns = self._get_actual_columns()

        if self._expected_columns:
            expected = set(self._expected_columns)
            missing = expected - actual_columns
            if missing:
                errors.append(f"Missing required columns: {sorted(missing)}")
            extra = actual_columns - expected
            if extra:
                warnings.append(f"Unexpected extra columns: {sorted(extra)}")

        if self._expected_partition_columns:
            actual_partitions = self._get_actual_partitions()
            # Sorted comparison: partition column order is not enforced.
            if sorted(actual_partitions) != sorted(self._expected_partition_columns):
                errors.append(
                    f"Partition mismatch: expected {self._expected_partition_columns}, "
                    f"got {actual_partitions}"
                )

        if self._expected_location_contains:
            actual_location = self._get_actual_location()
            if self._expected_location_contains not in actual_location:
                errors.append(
                    f"Location '{actual_location}' does not contain "
                    f"'{self._expected_location_contains}'"
                )

        # Generated columns are verified by name only.
        for gc in self._expected_generated_columns:
            if gc.name not in actual_columns:
                errors.append(f"Missing generated column: '{gc.name}'")

        is_valid = not errors
        if is_valid:
            logger.info("Table %s validation passed", self._table_name)
        else:
            logger.warning("Table %s validation failed: %s", self._table_name, errors)
        return ValidationResult(is_valid=is_valid, errors=errors, warnings=warnings)

databricks4py/observability/__init__.py ADDED Viewed

@@ -0,0 +1,24 @@
+"""Observability: structured batch logging, query listeners, and health checks."""
+from databricks4py.observability.batch_context import BatchContext, BatchLogger
+from databricks4py.observability.health import (
+    CheckDetail,
+    HealthResult,
+    HealthStatus,
+    StreamingHealthCheck,
+)
+from databricks4py.observability.query_listener import (
+    QueryProgressObserver,
+    QueryProgressSnapshot,
+)
+__all__ = [
+    "BatchContext",
+    "BatchLogger",
+    "CheckDetail",
+    "HealthResult",
+    "HealthStatus",
+    "QueryProgressObserver",
+    "QueryProgressSnapshot",
+    "StreamingHealthCheck",
+]

databricks4py/observability/_utils.py ADDED Viewed

@@ -0,0 +1,24 @@
+"""Shared utilities for the observability subpackage."""
+from __future__ import annotations
def parse_duration_ms(val: str | int) -> int:
    """Parse a Spark duration string to integer milliseconds.

    Handles ``'250 ms'``, ``'1 s'``, ``'2 m'``, and bare numbers (taken as
    milliseconds). Fractional values are truncated toward zero after unit
    conversion, e.g. ``'1.5 s'`` gives 1500 and ``'1.5 ms'`` gives 1.

    Args:
        val: An integer (returned unchanged), a float number of
            milliseconds, or a string with an optional ``ms``/``s``/``m``
            suffix.

    Returns:
        The duration in whole milliseconds, or 0 if the value cannot be
        parsed (wrong type, malformed text, NaN, or infinity).
    """
    if isinstance(val, int):
        return val
    try:
        if isinstance(val, float):
            return int(val)
        text = val.strip()
        number, factor = text, 1
        # "ms" must be tested before "s" and "m", which are also suffixes of it.
        for suffix, multiplier in (("ms", 1), ("s", 1000), ("m", 60_000)):
            if text.endswith(suffix):
                number, factor = text[: -len(suffix)].strip(), multiplier
                break
        try:
            # Exact integer arithmetic whenever the magnitude is an integer.
            return int(number) * factor
        except ValueError:
            return int(float(number) * factor)
    except (ValueError, AttributeError, TypeError, OverflowError):
        # int() raises ValueError for NaN and OverflowError for infinity;
        # non-string inputs surface as AttributeError or TypeError.
        return 0

databricks4py/observability/batch_context.py ADDED Viewed

@@ -0,0 +1,134 @@
+"""Structured per-batch logging with correlation IDs.
+Produces JSON-structured log records for each batch lifecycle event
+(start, complete, error, skip). Designed for use inside
+:class:`~databricks4py.io.streaming.StreamingTableReader` or any
+``foreachBatch`` processor where you need queryable, machine-parseable logs.
+Example::
+    logger = BatchLogger()
+    ctx = BatchContext.create(batch_id=42, source_table="catalog.schema.events")
+    logger.batch_start(ctx)
+    # ... process ...
+    logger.batch_complete(ctx, row_count=1000, duration_ms=345.2)
+"""
+from __future__ import annotations
+import json
+import logging
+import uuid
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from typing import Any
+__all__ = ["BatchContext", "BatchLogger"]
@dataclass(frozen=True)
class BatchContext:
    """Frozen record describing one streaming micro-batch.

    Holds the batch identifier, the source table, a correlation ID, and
    the time the context was created. Fields cannot be reassigned after
    construction.

    Args:
        batch_id: Spark-assigned batch identifier.
        source_table: Fully qualified source table or path.
        correlation_id: ID used to tie together logs, metrics, and DLQ
            records. Defaults to the first 12 hex characters of a
            random UUID4.
        start_time: Timezone-aware UTC timestamp; defaults to the moment
            the instance is created.
    """

    batch_id: int
    source_table: str
    correlation_id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
    start_time: datetime = field(default_factory=lambda: datetime.now(tz=timezone.utc))

    @classmethod
    def create(
        cls,
        batch_id: int,
        source_table: str,
        *,
        correlation_id: str | None = None,
    ) -> BatchContext:
        """Build a context, generating a correlation ID unless one is given."""
        # Only forward the ID when supplied so the field default applies otherwise.
        overrides = {} if correlation_id is None else {"correlation_id": correlation_id}
        return cls(batch_id=batch_id, source_table=source_table, **overrides)

    def elapsed_ms(self) -> float:
        """Return the milliseconds elapsed since ``start_time``."""
        now = datetime.now(tz=timezone.utc)
        delta = now - self.start_time
        return delta.total_seconds() * 1000
class BatchLogger:
    """Structured JSON logger for streaming batch lifecycle events.

    Each log record is a single-line JSON object with a consistent schema,
    making it easy to query in log aggregation systems (Datadog, Splunk,
    CloudWatch, etc.).

    Every record carries ``event``, ``batch_id``, ``source_table``,
    ``correlation_id`` and a UTC ISO-8601 ``timestamp``. Static extra
    fields and per-event fields are merged on top, in that order, so a
    later key with the same name replaces an earlier one.

    Args:
        logger_name: Python logger name. Defaults to ``"databricks4py.batch"``.
        extra_fields: Static fields added to every log record (e.g. environment,
            pipeline name). The mapping is copied.
    """

    def __init__(
        self,
        logger_name: str = "databricks4py.batch",
        extra_fields: dict[str, Any] | None = None,
    ) -> None:
        self._logger = logging.getLogger(logger_name)
        self._extra = dict(extra_fields) if extra_fields else {}

    def _emit(self, event: str, ctx: BatchContext, level: int, **fields: Any) -> None:
        """Serialize one lifecycle event to JSON and log it at ``level``."""
        # Skip building and serializing the record when the level is disabled.
        if not self._logger.isEnabledFor(level):
            return
        record = {
            "event": event,
            "batch_id": ctx.batch_id,
            "source_table": ctx.source_table,
            "correlation_id": ctx.correlation_id,
            "timestamp": datetime.now(tz=timezone.utc).isoformat(),
            **self._extra,
            **fields,
        }
        # default=str stringifies any value json cannot encode natively.
        self._logger.log(level, json.dumps(record, default=str))

    def batch_start(self, ctx: BatchContext) -> None:
        """Log a ``batch_start`` event at INFO."""
        self._emit("batch_start", ctx, logging.INFO)

    def batch_complete(
        self,
        ctx: BatchContext,
        row_count: int,
        duration_ms: float,
    ) -> None:
        """Log a ``batch_complete`` event at INFO.

        Args:
            ctx: Context of the finished batch.
            row_count: Number of rows processed.
            duration_ms: Processing time; rounded to two decimals in the record.
        """
        self._emit(
            "batch_complete",
            ctx,
            logging.INFO,
            row_count=row_count,
            duration_ms=round(duration_ms, 2),
        )

    def batch_error(self, ctx: BatchContext, error: str) -> None:
        """Log a ``batch_error`` event at ERROR.

        Args:
            ctx: Context of the failed batch.
            error: Error description, truncated to 2000 characters. An
                exception object is accepted and converted with ``str()``.
        """
        # str() first: slicing an exception object would raise TypeError
        # inside the caller's error handler.
        self._emit("batch_error", ctx, logging.ERROR, error=str(error)[:2000])

    def batch_skip(self, ctx: BatchContext, reason: str) -> None:
        """Log a ``batch_skip`` event at DEBUG.

        Args:
            ctx: Context of the skipped batch.
            reason: Why the batch was skipped, truncated to 500 characters.
        """
        self._emit("batch_skip", ctx, logging.DEBUG, reason=str(reason)[:500])

    def batch_dlq(self, ctx: BatchContext, dlq_table: str, error: str) -> None:
        """Log that a failed batch was routed to the dead-letter queue.

        Emitted as a ``batch_dlq`` event at WARNING.

        Args:
            ctx: Context of the failed batch.
            dlq_table: Dead-letter table the batch was written to.
            error: Error description, truncated to 2000 characters. An
                exception object is accepted and converted with ``str()``.
        """
        self._emit(
            "batch_dlq",
            ctx,
            logging.WARNING,
            dlq_table=dlq_table,
            error=str(error)[:2000],
        )

databricks4py/observability/health.py ADDED Viewed

@@ -0,0 +1,223 @@
+"""Health checks for streaming queries and checkpoints.
+Polls a ``StreamingQuery`` or ``QueryProgressObserver`` and evaluates
+configurable thresholds to produce a health status. Use in monitoring
+dashboards, alerting hooks, or as a pre-flight check before scaling down.
+Example::
+    check = StreamingHealthCheck(
+        query,
+        max_batch_duration_ms=60_000,
+        min_processing_rate=100.0,
+        stale_timeout_seconds=300,
+    )
+    result = check.evaluate()
+    if result.status == HealthStatus.UNHEALTHY:
+        alert(result.summary())
+"""
+from __future__ import annotations
+import json as _json
+import logging
+import time
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from enum import Enum
+from typing import TYPE_CHECKING, Any
+from databricks4py.observability._utils import parse_duration_ms
+if TYPE_CHECKING:
+    from pyspark.sql.streaming import StreamingQuery
+__all__ = ["CheckDetail", "HealthResult", "HealthStatus", "StreamingHealthCheck"]
+logger = logging.getLogger(__name__)
class HealthStatus(Enum):
    """Health level of a monitored component.

    Each member's value is its lowercase name.
    """

    # Listed from best to worst.
    HEALTHY = "healthy"
    DEGRADED = "degraded"
    UNHEALTHY = "unhealthy"
@dataclass(frozen=True)
class CheckDetail:
    """Outcome of one health check rule (frozen after construction).

    Attributes:
        name: Short identifier of the rule, e.g. ``"stuck_query"``.
        status: Health level this rule reported.
        message: Explanation intended for human readers.
    """

    name: str
    status: HealthStatus
    message: str
@dataclass(frozen=True)
class HealthResult:
    """Combined outcome of a health evaluation.

    ``status`` is meant to be the worst status found in ``checks``:
    UNHEALTHY if any check is UNHEALTHY, otherwise DEGRADED if any is
    DEGRADED. The dataclass stores whatever status it is given; it does
    not derive it from ``checks``.

    Attributes:
        status: Overall status of the evaluation.
        checks: Results of the individual rules.
        timestamp: UTC time; defaults to the moment of construction.
    """

    status: HealthStatus
    checks: list[CheckDetail] = field(default_factory=list)
    timestamp: datetime = field(default_factory=lambda: datetime.now(tz=timezone.utc))

    def summary(self) -> str:
        """Render the overall status followed by one indented line per check."""
        header = f"Overall: {self.status.value}"
        per_check = [f"  [{c.status.value}] {c.name}: {c.message}" for c in self.checks]
        return "\n".join([header, *per_check])
def _worst(statuses: list[HealthStatus]) -> HealthStatus:
    """Return the most severe status present, or HEALTHY if none is worse."""
    # Probe from most to least severe; the first one found wins.
    for candidate in (HealthStatus.UNHEALTHY, HealthStatus.DEGRADED):
        if candidate in statuses:
            return candidate
    return HealthStatus.HEALTHY
class StreamingHealthCheck:
    """Judges the health of a streaming query from its latest progress.

    Rules applied by :meth:`evaluate`:

    - **Query inactive**: UNHEALTHY when the query is no longer active;
      nothing else is evaluated in that case.
    - **Stuck query**: UNHEALTHY when there has been no progress event, or
      no change of batch ID, for more than ``stale_timeout_seconds``.
    - **Slow batches**: DEGRADED when the reported batch duration exceeds
      ``max_batch_duration_ms`` (skipped when that threshold is None).
    - **Low throughput**: DEGRADED when ``processedRowsPerSecond`` is
      below ``min_processing_rate`` (skipped when that threshold is None).

    The instance remembers the last batch ID it saw and when it changed,
    so staleness is measured across successive ``evaluate()`` calls.

    Args:
        query: A PySpark ``StreamingQuery`` to monitor.
        max_batch_duration_ms: DEGRADED if last batch took longer than this.
        min_processing_rate: DEGRADED if processed rows/sec drops below this.
        stale_timeout_seconds: UNHEALTHY if no progress for this many seconds.
    """

    def __init__(
        self,
        query: StreamingQuery,
        *,
        max_batch_duration_ms: int | None = None,
        min_processing_rate: float | None = None,
        stale_timeout_seconds: int = 600,
    ) -> None:
        self._query = query
        # Thresholds.
        self._max_batch_duration_ms = max_batch_duration_ms
        self._min_processing_rate = min_processing_rate
        self._stale_timeout_seconds = stale_timeout_seconds
        # Staleness tracking: the clock starts at construction.
        self._last_batch_id: int | None = None
        self._last_progress_time: float = time.monotonic()

    def _get_progress(self) -> dict[str, Any] | None:
        """Return the query's last progress as a dict, or None if there is none."""
        latest = self._query.lastProgress
        if latest is None or isinstance(latest, dict):
            return latest
        # Anything that is not a dict is decoded from its ``json`` attribute.
        return _json.loads(latest.json)

    def evaluate(self) -> HealthResult:
        """Apply every configured rule and return the combined result.

        Returns:
            A HealthResult whose status is the worst status among the
            recorded checks.
        """
        checks: list[CheckDetail] = []

        def record(name: str, status: HealthStatus, message: str) -> None:
            checks.append(CheckDetail(name=name, status=status, message=message))

        def finish() -> HealthResult:
            return HealthResult(status=_worst([c.status for c in checks]), checks=checks)

        # Rule: the query must still be active.
        if not self._query.isActive:
            record("query_active", HealthStatus.UNHEALTHY, "Query is no longer active")
            return finish()
        record("query_active", HealthStatus.HEALTHY, "Query is running")

        progress = self._get_progress()
        if progress is None:
            # No progress event yet: only staleness can be judged.
            idle = time.monotonic() - self._last_progress_time
            if idle > self._stale_timeout_seconds:
                record(
                    "stale_progress",
                    HealthStatus.UNHEALTHY,
                    f"No progress events for {idle:.0f}s "
                    f"(threshold: {self._stale_timeout_seconds}s)",
                )
            else:
                record(
                    "stale_progress",
                    HealthStatus.HEALTHY,
                    "Waiting for first progress event",
                )
            return finish()

        # Rule: the batch ID must keep advancing.
        batch_id = progress.get("batchId", -1)
        if batch_id == self._last_batch_id:
            idle = time.monotonic() - self._last_progress_time
            if idle > self._stale_timeout_seconds:
                record(
                    "stale_progress",
                    HealthStatus.UNHEALTHY,
                    f"Batch {batch_id} unchanged for {idle:.0f}s",
                )
        else:
            # A new batch resets the staleness clock.
            self._last_progress_time = time.monotonic()
            self._last_batch_id = batch_id

        # Rule: batch duration.
        max_ms = self._max_batch_duration_ms
        if max_ms is not None:
            duration = parse_duration_ms(progress.get("batchDuration", "0 ms"))
            if duration > max_ms:
                record(
                    "batch_duration",
                    HealthStatus.DEGRADED,
                    f"Batch took {duration}ms (max: {max_ms}ms)",
                )
            else:
                record("batch_duration", HealthStatus.HEALTHY, f"Batch took {duration}ms")

        # Rule: processing rate.
        min_rate = self._min_processing_rate
        if min_rate is not None:
            rate = progress.get("processedRowsPerSecond", 0.0)
            if rate < min_rate:
                record(
                    "processing_rate",
                    HealthStatus.DEGRADED,
                    f"Processing {rate:.1f} rows/s (min: {min_rate:.1f})",
                )
            else:
                record(
                    "processing_rate",
                    HealthStatus.HEALTHY,
                    f"Processing {rate:.1f} rows/s",
                )

        return finish()