PyPI - contractforge-databricks - Versions diffs - 0.1.0__py3-none-any.whl - Mend

contractforge-databricks 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (220) hide show

contractforge_databricks/__init__.py +172 -0
contractforge_databricks/adapter.py +69 -0
contractforge_databricks/annotations/__init__.py +10 -0
contractforge_databricks/annotations/application.py +52 -0
contractforge_databricks/annotations/audit.py +49 -0
contractforge_databricks/annotations/sql.py +142 -0
contractforge_databricks/api.py +65 -0
contractforge_databricks/bundles/__init__.py +9 -0
contractforge_databricks/bundles/assets.py +47 -0
contractforge_databricks/bundles/project.py +213 -0
contractforge_databricks/bundles/project_config.py +133 -0
contractforge_databricks/capabilities/__init__.py +17 -0
contractforge_databricks/capabilities/builders.py +43 -0
contractforge_databricks/capabilities/evaluate.py +162 -0
contractforge_databricks/capabilities/mapping.py +36 -0
contractforge_databricks/capabilities/models.py +44 -0
contractforge_databricks/capabilities/runtime.py +111 -0
contractforge_databricks/capabilities/uc.py +47 -0
contractforge_databricks/cli.py +196 -0
contractforge_databricks/cli_deploy.py +98 -0
contractforge_databricks/cli_governance.py +142 -0
contractforge_databricks/cli_io.py +91 -0
contractforge_databricks/cli_maintenance.py +69 -0
contractforge_databricks/coercion.py +31 -0
contractforge_databricks/contract_extensions.py +70 -0
contractforge_databricks/cost/__init__.py +11 -0
contractforge_databricks/cost/model.py +22 -0
contractforge_databricks/cost/report.py +65 -0
contractforge_databricks/cost/sql.py +136 -0
contractforge_databricks/dashboards/__init__.py +15 -0
contractforge_databricks/dashboards/control_tables.py +150 -0
contractforge_databricks/diagnostics/__init__.py +7 -0
contractforge_databricks/diagnostics/explain.py +40 -0
contractforge_databricks/environment.py +53 -0
contractforge_databricks/evidence/__init__.py +98 -0
contractforge_databricks/evidence/ddl.py +35 -0
contractforge_databricks/evidence/governance_log.py +175 -0
contractforge_databricks/evidence/helpers.py +29 -0
contractforge_databricks/evidence/ops_log.py +210 -0
contractforge_databricks/evidence/records.py +27 -0
contractforge_databricks/evidence/run_log.py +74 -0
contractforge_databricks/evidence/schemas.py +7 -0
contractforge_databricks/evidence/sql.py +144 -0
contractforge_databricks/evidence/tables.py +20 -0
contractforge_databricks/evidence/writer.py +118 -0
contractforge_databricks/execution/__init__.py +70 -0
contractforge_databricks/execution/delta_basic.py +57 -0
contractforge_databricks/execution/hash_diff.py +126 -0
contractforge_databricks/execution/hash_diff_latest.py +142 -0
contractforge_databricks/execution/replace_partitions.py +40 -0
contractforge_databricks/execution/results.py +5 -0
contractforge_databricks/execution/retry.py +36 -0
contractforge_databricks/execution/scd2.py +213 -0
contractforge_databricks/execution/scd2_deletes.py +65 -0
contractforge_databricks/execution/scd2_late.py +30 -0
contractforge_databricks/execution/snapshot.py +77 -0
contractforge_databricks/execution/sql_merge.py +85 -0
contractforge_databricks/execution/tables.py +98 -0
contractforge_databricks/execution/windows.py +58 -0
contractforge_databricks/governance/__init__.py +30 -0
contractforge_databricks/governance/access.py +185 -0
contractforge_databricks/governance/application.py +93 -0
contractforge_databricks/governance/drift.py +49 -0
contractforge_databricks/governance/runtime.py +60 -0
contractforge_databricks/governance/sql.py +31 -0
contractforge_databricks/governance/validation.py +135 -0
contractforge_databricks/lakeflow/__init__.py +21 -0
contractforge_databricks/lakeflow/compatibility.py +194 -0
contractforge_databricks/lakeflow/rendering.py +175 -0
contractforge_databricks/lineage/__init__.py +7 -0
contractforge_databricks/lineage/openlineage.py +182 -0
contractforge_databricks/maintenance/__init__.py +27 -0
contractforge_databricks/maintenance/retention.py +90 -0
contractforge_databricks/maintenance/sql.py +68 -0
contractforge_databricks/metrics/__init__.py +19 -0
contractforge_databricks/metrics/history.py +21 -0
contractforge_databricks/metrics/write.py +63 -0
contractforge_databricks/operations/__init__.py +4 -0
contractforge_databricks/operations/application.py +38 -0
contractforge_databricks/operations/sql.py +95 -0
contractforge_databricks/parity/__init__.py +18 -0
contractforge_databricks/parity/catalog.py +59 -0
contractforge_databricks/parity/models.py +7 -0
contractforge_databricks/parity/scenarios.py +111 -0
contractforge_databricks/partitioning/__init__.py +3 -0
contractforge_databricks/partitioning/predicates.py +28 -0
contractforge_databricks/preparation/__init__.py +47 -0
contractforge_databricks/preparation/deduplicate.py +87 -0
contractforge_databricks/preparation/encoding.py +37 -0
contractforge_databricks/preparation/hashing.py +18 -0
contractforge_databricks/preparation/pyspark.py +178 -0
contractforge_databricks/preparation/pyspark_staging.py +70 -0
contractforge_databricks/preparation/shape.py +209 -0
contractforge_databricks/preparation/shape_validation.py +94 -0
contractforge_databricks/preparation/staging.py +17 -0
contractforge_databricks/preparation/zip_arrays.py +51 -0
contractforge_databricks/presets/__init__.py +3 -0
contractforge_databricks/presets/base.py +24 -0
contractforge_databricks/presets/bronze.py +57 -0
contractforge_databricks/presets/catalog.py +22 -0
contractforge_databricks/presets/core.py +134 -0
contractforge_databricks/presets/gold.py +62 -0
contractforge_databricks/presets/modifiers.py +51 -0
contractforge_databricks/presets/runtime.py +22 -0
contractforge_databricks/presets/silver.py +101 -0
contractforge_databricks/presets/write_engine.py +57 -0
contractforge_databricks/quality/__init__.py +41 -0
contractforge_databricks/quality/evaluation.py +178 -0
contractforge_databricks/quality/persistence.py +81 -0
contractforge_databricks/quality/registry.py +134 -0
contractforge_databricks/quality/results.py +17 -0
contractforge_databricks/quality/sql.py +113 -0
contractforge_databricks/rendering/__init__.py +11 -0
contractforge_databricks/rendering/bundle.py +93 -0
contractforge_databricks/rendering/markdown.py +50 -0
contractforge_databricks/rendering/names.py +56 -0
contractforge_databricks/results.py +15 -0
contractforge_databricks/runtime/__init__.py +101 -0
contractforge_databricks/runtime/available_now.py +147 -0
contractforge_databricks/runtime/bundles.py +211 -0
contractforge_databricks/runtime/cache.py +20 -0
contractforge_databricks/runtime/control_tables.py +19 -0
contractforge_databricks/runtime/deploy.py +197 -0
contractforge_databricks/runtime/detection.py +114 -0
contractforge_databricks/runtime/dry_run.py +46 -0
contractforge_databricks/runtime/errors.py +54 -0
contractforge_databricks/runtime/file_selection.py +109 -0
contractforge_databricks/runtime/finalization.py +168 -0
contractforge_databricks/runtime/governance.py +37 -0
contractforge_databricks/runtime/hooks.py +45 -0
contractforge_databricks/runtime/http_file.py +37 -0
contractforge_databricks/runtime/http_retry.py +15 -0
contractforge_databricks/runtime/http_safety.py +9 -0
contractforge_databricks/runtime/json_materialization.py +97 -0
contractforge_databricks/runtime/lineage.py +164 -0
contractforge_databricks/runtime/maintenance.py +43 -0
contractforge_databricks/runtime/merge_validation.py +98 -0
contractforge_databricks/runtime/metadata.py +21 -0
contractforge_databricks/runtime/metrics.py +34 -0
contractforge_databricks/runtime/models.py +32 -0
contractforge_databricks/runtime/options.py +33 -0
contractforge_databricks/runtime/orchestration_context.py +185 -0
contractforge_databricks/runtime/orchestrator.py +147 -0
contractforge_databricks/runtime/partitioning.py +93 -0
contractforge_databricks/runtime/quality_quarantine.py +92 -0
contractforge_databricks/runtime/rest_api.py +46 -0
contractforge_databricks/runtime/rest_auth.py +21 -0
contractforge_databricks/runtime/rest_pagination.py +21 -0
contractforge_databricks/runtime/run_payload.py +177 -0
contractforge_databricks/runtime/schema.py +106 -0
contractforge_databricks/runtime/source_metadata.py +30 -0
contractforge_databricks/runtime/source_registry.py +43 -0
contractforge_databricks/runtime/source_schema.py +24 -0
contractforge_databricks/runtime/sources.py +208 -0
contractforge_databricks/runtime/spark.py +183 -0
contractforge_databricks/runtime/spark_defaults.py +35 -0
contractforge_databricks/runtime/storage_auth.py +132 -0
contractforge_databricks/runtime/streaming.py +131 -0
contractforge_databricks/runtime/success.py +104 -0
contractforge_databricks/runtime/utils.py +52 -0
contractforge_databricks/runtime/watermark.py +71 -0
contractforge_databricks/runtime/windows.py +184 -0
contractforge_databricks/runtime/write.py +66 -0
contractforge_databricks/runtime/write_flow.py +146 -0
contractforge_databricks/runtime/write_strategy.py +40 -0
contractforge_databricks/schema/__init__.py +21 -0
contractforge_databricks/schema/diff.py +11 -0
contractforge_databricks/schema/policy.py +33 -0
contractforge_databricks/schema/sync.py +23 -0
contractforge_databricks/security/__init__.py +21 -0
contractforge_databricks/security/errors.py +5 -0
contractforge_databricks/security/redaction.py +5 -0
contractforge_databricks/security/secrets.py +114 -0
contractforge_databricks/security/source_policy.py +17 -0
contractforge_databricks/shapes/__init__.py +3 -0
contractforge_databricks/shapes/sql.py +123 -0
contractforge_databricks/sources/__init__.py +67 -0
contractforge_databricks/sources/artifacts.py +100 -0
contractforge_databricks/sources/autoloader.py +48 -0
contractforge_databricks/sources/bounded_streams.py +44 -0
contractforge_databricks/sources/classification.py +115 -0
contractforge_databricks/sources/delta_share.py +21 -0
contractforge_databricks/sources/files.py +48 -0
contractforge_databricks/sources/http_file.py +46 -0
contractforge_databricks/sources/interpret.py +76 -0
contractforge_databricks/sources/jdbc.py +32 -0
contractforge_databricks/sources/metadata.py +18 -0
contractforge_databricks/sources/native_passthrough.py +33 -0
contractforge_databricks/sources/rds_iam.py +15 -0
contractforge_databricks/sources/rds_iam_runtime.py +191 -0
contractforge_databricks/sources/rest_api.py +33 -0
contractforge_databricks/sources/support.py +50 -0
contractforge_databricks/sources/table_refs.py +65 -0
contractforge_databricks/sql/__init__.py +4 -0
contractforge_databricks/sql/identifiers.py +17 -0
contractforge_databricks/sql/literals.py +36 -0
contractforge_databricks/state/__init__.py +39 -0
contractforge_databricks/state/ddl.py +24 -0
contractforge_databricks/state/migrations.py +146 -0
contractforge_databricks/state/queries.py +149 -0
contractforge_databricks/state/sql.py +116 -0
contractforge_databricks/state/tables.py +9 -0
contractforge_databricks/state/writer.py +83 -0
contractforge_databricks/templates/__init__.py +15 -0
contractforge_databricks/templates/catalog.py +205 -0
contractforge_databricks/templates/catalog_parity.py +85 -0
contractforge_databricks/templates/core.py +83 -0
contractforge_databricks/templates/enrichment.py +175 -0
contractforge_databricks/transforms/__init__.py +3 -0
contractforge_databricks/transforms/sql.py +118 -0
contractforge_databricks/watermark/__init__.py +6 -0
contractforge_databricks/watermark/sql.py +91 -0
contractforge_databricks/write_modes/__init__.py +20 -0
contractforge_databricks/write_modes/registry.py +44 -0
contractforge_databricks/write_modes/sql.py +33 -0
contractforge_databricks/write_modes/strategy.py +192 -0
contractforge_databricks-0.1.0.dist-info/METADATA +34 -0
contractforge_databricks-0.1.0.dist-info/RECORD +220 -0
contractforge_databricks-0.1.0.dist-info/WHEEL +4 -0
contractforge_databricks-0.1.0.dist-info/entry_points.txt +2 -0

contractforge_databricks/runtime/file_selection.py ADDED Viewed

@@ -0,0 +1,109 @@
+"""Databricks runtime file path selection helpers."""
+from __future__ import annotations
+import os
+import re
+from typing import Any
+def selected_file_load_path(spark: Any, source: dict[str, Any], options: dict[str, str]) -> object:
+    path = source.get("path")
+    if not path:
+        return path
+    read = source.get("read") if isinstance(source.get("read"), dict) else {}
+    pattern_text = str(read.get("file_regex") or "").strip()
+    if not pattern_text:
+        return path
+    try:
+        pattern = re.compile(pattern_text)
+    except re.error as exc:
+        raise ValueError(f"source.read.file_regex is invalid: {exc}") from exc
+    scope = str(read.get("file_regex_scope") or "relative_path").strip().lower()
+    if scope not in {"filename", "relative_path"}:
+        raise ValueError("source.read.file_regex_scope must be 'filename' or 'relative_path'")
+    max_listed = _positive_int(read.get("file_regex_max_listed"), "source.read.file_regex_max_listed", 10000)
+    recursive = _bool(read.get("file_regex_recursive"), _bool(options.get("recursiveFileLookup"), False))
+    listed = _listed_files(spark, str(path), recursive=recursive, max_files=max_listed, declared=read.get("files"))
+    root = str(path).rstrip("/")
+    matched = []
+    for file_path in listed:
+        file_text = str(file_path)
+        relative = file_text[len(root) :].lstrip("/") if file_text.startswith(root) else os.path.basename(file_text)
+        candidate = os.path.basename(file_text) if scope == "filename" else relative
+        if pattern.search(candidate):
+            matched.append(file_text)
+    if not matched:
+        raise ValueError(
+            "source.read.file_regex found no matching files. "
+            f"pattern={pattern_text!r}, scope={scope}, listed_files={len(listed)}"
+        )
+    return matched
+def _listed_files(
+    spark: Any,
+    path: str,
+    *,
+    recursive: bool,
+    max_files: int,
+    declared: object,
+) -> list[str]:
+    if isinstance(declared, (list, tuple)):
+        files = [str(item) for item in declared]
+        if len(files) > max_files:
+            raise ValueError(f"source.read.file_regex exceeded source.read.file_regex_max_listed={max_files}")
+        return files
+    jvm = getattr(spark, "_jvm", None)
+    jsc = getattr(spark, "_jsc", None)
+    if jvm is None or jsc is None:
+        raise RuntimeError(
+            "source.read.file_regex requires Hadoop FileSystem access through classic PySpark. "
+            "In Spark Connect/serverless, use pathGlobFilter, a filtered External Location/Volume path, "
+            "or provide an explicit source.read.files list."
+        )
+    return _hadoop_list_files(jvm, jsc, path, recursive=recursive, max_files=max_files)
+def _hadoop_list_files(jvm: Any, jsc: Any, path: str, *, recursive: bool, max_files: int) -> list[str]:
+    root = jvm.org.apache.hadoop.fs.Path(path)
+    fs = root.getFileSystem(jsc.hadoopConfiguration())
+    files: list[str] = []
+    def visit(current_path: Any) -> None:
+        status = fs.getFileStatus(current_path)
+        if status.isFile():
+            _append(files, str(status.getPath().toString()), max_files)
+            return
+        for child in fs.listStatus(current_path):
+            if child.isDirectory():
+                if recursive:
+                    visit(child.getPath())
+                continue
+            _append(files, str(child.getPath().toString()), max_files)
+    visit(root)
+    return files
+def _append(files: list[str], path: str, max_files: int) -> None:
+    files.append(path)
+    if len(files) > max_files:
+        raise ValueError(f"source.read.file_regex exceeded source.read.file_regex_max_listed={max_files}")
+def _bool(value: object, default: bool) -> bool:
+    if value is None:
+        return default
+    if isinstance(value, bool):
+        return value
+    return str(value).strip().lower() in {"1", "true", "yes", "y"}
+def _positive_int(value: object, field: str, default: int) -> int:
+    if value in (None, ""):
+        return default
+    parsed = int(value)
+    if parsed <= 0:
+        raise ValueError(f"{field} must be a positive integer")
+    return parsed

contractforge_databricks/runtime/finalization.py ADDED Viewed

@@ -0,0 +1,168 @@
+"""Finalize Databricks runtime ingestion with evidence and state."""
+from __future__ import annotations
+from datetime import datetime, timezone
+from typing import Any
+from contractforge_core.config import CTRL_SCHEMA_VERSION, FRAMEWORK_VERSION
+from contractforge_core.quality import QualityRuleResult
+from contractforge_core.runtime import PreparedInput, QuarantineReference
+from contractforge_core.semantic import SemanticContract
+from contractforge_databricks.evidence import EvidenceWriter, SourceMetadataEvidenceRecord
+from contractforge_databricks.quality import render_quality_result_insert_sql, render_quarantine_reference_insert_sql
+from contractforge_databricks.runtime.models import DatabricksIngestOptions
+from contractforge_databricks.runtime.run_payload import run_payload
+from contractforge_databricks.runtime.utils import utc_now_str
+from contractforge_databricks.state import StateWriter
+def finalize_ingest(
+    evidence: EvidenceWriter,
+    state: StateWriter,
+    contract: SemanticContract,
+    prepared: PreparedInput,
+    opts: DatabricksIngestOptions,
+    run_id: str,
+    target: str,
+    status: str,
+    started: str,
+    *,
+    rows_written: int,
+    quality_status_value: str,
+    quality_results: tuple[QualityRuleResult, ...] = (),
+    operation_metrics: dict[str, Any] | None = None,
+    schema_changes: dict[str, Any] | None = None,
+    governance_results: dict[str, Any] | None = None,
+    write_started_at: str | None = None,
+    write_finished_at: str | None = None,
+    stage_durations: dict[str, float] | None = None,
+    watermark_column: str | None = None,
+    watermark_previous: str | None = None,
+    watermark_current: str | None = None,
+    diagnostics: dict[str, bool] | None = None,
+    error_message: str | None = None,
+    skip_reason: str | None = None,
+    skipped_by_run_id: str | None = None,
+) -> dict[str, Any]:
+    finished = _utc_now()
+    payload = run_payload(
+        contract,
+        prepared,
+        opts,
+        run_id,
+        target,
+        status,
+        started,
+        finished,
+        rows_written,
+        quality_status_value,
+        operation_metrics or {},
+        schema_changes or {},
+        governance_results or {},
+        write_started_at,
+        write_finished_at,
+        stage_durations or {},
+        watermark_column,
+        watermark_previous,
+        watermark_current,
+        diagnostics or {},
+        error_message,
+        skip_reason,
+        skipped_by_run_id,
+    )
+    if not opts.dry_run:
+        operations = contract.operations.metadata if contract.operations and contract.operations.metadata else {}
+        evidence.write_run_log(payload)
+        _write_quality_results(evidence, run_id, target, quality_results, payload["finished_at_utc"], opts)
+        _write_quarantine_references(evidence, run_id, target, prepared.quarantine_records, payload["finished_at_utc"], opts)
+        if prepared.source_metadata:
+            evidence.write_source_metadata(
+                SourceMetadataEvidenceRecord(
+                    run_id=run_id,
+                    target_table=target,
+                    source_metadata=dict(prepared.source_metadata),
+                    captured_at_utc=_parse_utc(finished=payload["finished_at_utc"]),
+                )
+            )
+        state.record_control_metadata(
+            framework_version=FRAMEWORK_VERSION,
+            ctrl_schema_version=CTRL_SCHEMA_VERSION,
+        )
+        state.upsert_state(
+            target_table=target,
+            run_id=run_id,
+            status=status,
+            rows_written=rows_written,
+            watermark_column=watermark_column,
+            watermark_value=watermark_current,
+            success_at_utc=finished if status == "SUCCESS" else None,
+            watermark_candidate=watermark_current,
+            table_version=payload.get("table_version_after"),
+            write_completed_at_utc=write_finished_at if status == "SUCCESS" else None,
+            error_message=error_message,
+            parent_run_id=operations.get("parent_run_id"),
+            run_group_id=operations.get("run_group_id"),
+            master_job_id=operations.get("master_job_id"),
+            master_run_id=operations.get("master_run_id"),
+        )
+    return payload
+def _utc_now() -> str:
+    return utc_now_str()
+def _write_quality_results(
+    evidence: EvidenceWriter,
+    run_id: str,
+    target: str,
+    results: tuple[QualityRuleResult, ...],
+    checked_at: object,
+    opts: DatabricksIngestOptions,
+) -> None:
+    checked_at_utc = _parse_utc(finished=checked_at)
+    for result in results:
+        evidence.runner.sql(
+            render_quality_result_insert_sql(
+                run_id=run_id,
+                target_table=target,
+                result=result,
+                checked_at_utc=checked_at_utc,
+                catalog=opts.catalog,
+                schema=opts.schema,
+            )
+        )
+def _write_quarantine_references(
+    evidence: EvidenceWriter,
+    run_id: str,
+    target: str,
+    records: tuple[QuarantineReference, ...],
+    quarantined_at: object,
+    opts: DatabricksIngestOptions,
+) -> None:
+    quarantined_at_utc = _parse_utc(finished=quarantined_at)
+    for record in records:
+        reason = f"{record.rule_name}: {record.reason}" if record.rule_name else record.reason
+        evidence.runner.sql(
+            render_quarantine_reference_insert_sql(
+                run_id=run_id,
+                target_table=target,
+                record_ref=record.record_ref,
+                reason=reason,
+                quarantined_at_utc=quarantined_at_utc,
+                catalog=opts.catalog,
+                schema=opts.schema,
+            )
+        )
+def _parse_utc(*, finished: object) -> datetime:
+    if isinstance(finished, datetime):
+        return finished
+    try:
+        return datetime.strptime(str(finished), "%Y-%m-%d %H:%M:%S").replace(tzinfo=timezone.utc)
+    except ValueError:
+        return datetime.now(timezone.utc)

contractforge_databricks/runtime/governance.py ADDED Viewed

@@ -0,0 +1,37 @@
+"""Runtime governance side effects for Databricks ingestion."""
+from __future__ import annotations
+from dataclasses import asdict
+from typing import Any
+from contractforge_core.semantic import SemanticContract
+from contractforge_databricks.annotations import apply_annotations_contract
+from contractforge_databricks.environment import DatabricksEnvironment
+from contractforge_databricks.execution import SqlRunner
+from contractforge_databricks.operations import record_operations_contract
+def apply_runtime_governance(
+    *,
+    runner: SqlRunner,
+    contract: SemanticContract,
+    run_id: str,
+    evidence_catalog: str,
+    evidence_schema: str,
+) -> dict[str, Any]:
+    operations = record_operations_contract(
+        runner=runner,
+        contract=contract,
+        environment=DatabricksEnvironment(evidence_catalog=evidence_catalog, evidence_schema=evidence_schema),
+        run_id=run_id,
+    )
+    annotations = apply_annotations_contract(runner=runner, contract=contract)
+    result = {
+        "operations": asdict(operations),
+        "annotations": asdict(annotations),
+        "access": {"status": "DEFERRED"} if contract.governance and contract.governance.access else {"status": "NOT_CONFIGURED"},
+    }
+    if annotations.status == "FAILED":
+        raise ValueError(f"Databricks annotations failed: {list(annotations.errors)}")
+    return result

contractforge_databricks/runtime/hooks.py ADDED Viewed

@@ -0,0 +1,45 @@
+"""Programmatic hooks for Databricks runtime orchestration."""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Callable, Optional
+from contractforge_core.execution import ExecutionOutcome
+from contractforge_core.runtime import PreparedInput
+from contractforge_core.semantic import SemanticContract
+# Hook that receives the contract and the prepared input; it may return a
+# replacement PreparedInput or None.
+PreparedHook = Callable[[SemanticContract, PreparedInput], Optional[PreparedInput]]
+# Hook that receives the contract, the prepared input and an optional
+# ExecutionOutcome; it returns nothing.
+AfterWriteHook = Callable[[SemanticContract, PreparedInput, Optional[ExecutionOutcome]], None]
+# Hook that receives the contract and a string-keyed dict; it returns nothing.
+AfterFinalizeHook = Callable[[SemanticContract, dict[str, object]], None]
+@dataclass(frozen=True)
+class DatabricksIngestionHooks:
+    """Optional callbacks around the Databricks prepared-view runtime boundary."""
+    after_prepare: PreparedHook | None = None
+    before_write: PreparedHook | None = None
+    after_write: AfterWriteHook | None = None
+    after_finalize: AfterFinalizeHook | None = None
+    def __post_init__(self) -> None:
+        for name in ("after_prepare", "before_write", "after_write", "after_finalize"):
+            hook = getattr(self, name)
+            if hook is not None and not callable(hook):
+                raise ValueError(f"DatabricksIngestionHooks.{name} must be callable")
+def apply_prepared_hook(
+    hook: PreparedHook | None,
+    contract: SemanticContract,
+    prepared: PreparedInput,
+) -> PreparedInput:
+    if hook is None:
+        return prepared
+    result = hook(contract, prepared)
+    if result is None:
+        return prepared
+    if not isinstance(result, PreparedInput):
+        raise ValueError("Databricks prepared hooks must return PreparedInput or None")
+    return result

contractforge_databricks/runtime/http_file.py ADDED Viewed

@@ -0,0 +1,37 @@
+"""Databricks runtime execution for bounded HTTP file sources."""
+from __future__ import annotations
+from typing import Any
+from contractforge_core.connectors import (
+    cleanup_http_file_downloads as cleanup_http_file_downloads,
+    download_http_file,
+    http_file_format,
+    http_file_reader_options,
+    is_http_file_source,
+)
+from contractforge_databricks.runtime.source_schema import apply_declared_schema
+def resolve_http_file_dataframe(spark: Any, source: dict[str, Any]) -> Any:
+    """Download a bounded HTTP file and load it with Spark's native reader."""
+    if not is_http_file_source(source):
+        raise ValueError("HTTP file runtime resolution requires source.type http_file/http_csv/http_json/http_text")
+    local_path = download_http_file(source)
+    reader = spark.read.format(http_file_format(source))
+    for key, value in sorted(http_file_reader_options(source).items()):
+        reader = reader.option(key, value)
+    reader = apply_declared_schema(reader, source)
+    df = reader.load(local_path)
+    _enforce_max_records(df, source)
+    return df
+def _enforce_max_records(df: Any, source: dict[str, Any]) -> None:
+    max_records = source.get("limits", {}).get("max_records")
+    if max_records is None or not hasattr(df, "count"):
+        return
+    count = int(df.count())
+    if count > int(max_records):
+        raise ValueError(f"HTTP file response exceeds source.limits.max_records={int(max_records)}")

contractforge_databricks/runtime/http_retry.py ADDED Viewed

@@ -0,0 +1,15 @@
+"""Compatibility re-exports for the core HTTP retry policy."""
+from contractforge_core.connectors.api.rest.retry import (
+    RETRYABLE_HTTP_STATUS,
+    is_retryable_http_error,
+    is_retryable_network_error,
+    sleep_retry_backoff,
+)
+# Names re-exported from contractforge_core.connectors.api.rest.retry.
+__all__ = [
+    "RETRYABLE_HTTP_STATUS",
+    "is_retryable_http_error",
+    "is_retryable_network_error",
+    "sleep_retry_backoff",
+]

contractforge_databricks/runtime/http_safety.py ADDED Viewed

@@ -0,0 +1,9 @@
+"""Compatibility re-exports for the core HTTP target safety policy."""
+from contractforge_core.connectors.api.rest.safety import (
+    ALLOWED_SCHEMES,
+    ALLOW_PRIVATE_FLAG,
+    validate_http_target,
+)
+# Names re-exported from contractforge_core.connectors.api.rest.safety.
+__all__ = ["ALLOWED_SCHEMES", "ALLOW_PRIVATE_FLAG", "validate_http_target"]

contractforge_databricks/runtime/json_materialization.py ADDED Viewed

@@ -0,0 +1,97 @@
+"""JSON record materialization helpers for Databricks runtime connectors."""
+from __future__ import annotations
+import json
+import os
+import uuid
+from collections.abc import Mapping
+from typing import Any
+def materialize_json_records(
+    spark: Any,
+    records: list[Any],
+    *,
+    schema: str | None = None,
+    read_options: Mapping[str, Any] | None = None,
+    staging_path: str | None = None,
+) -> Any:
+    if not records:
+        return spark.createDataFrame([], schema or "value string").limit(0)
+    normalized = [record if isinstance(record, Mapping) else {"value": record} for record in records]
+    if hasattr(spark, "sparkContext") and hasattr(spark, "read"):
+        json_lines = [json.dumps(record, default=str, ensure_ascii=False) for record in normalized]
+        return _json_reader(spark, read_options, schema=schema).json(spark.sparkContext.parallelize(json_lines))
+    staging_dir = _json_staging_dir(staging_path)
+    if staging_dir and hasattr(spark, "read"):
+        return _json_reader(spark, read_options, schema=schema).json(_write_json_lines_file(normalized, staging_dir))
+    try:
+        return _create_dataframe(spark, normalized, schema)
+    except Exception as exc:
+        if hasattr(spark, "read"):
+            raise ValueError(
+                "Could not materialize complex JSON records with createDataFrame. "
+                "Declare source.read.staging_path or CONTRACTFORGE_SOURCE_JSON_STAGING_DIR with a local path "
+                "accessible to the Python driver and Spark reader, or use source.response.mode=raw with shape.parse_json."
+            ) from exc
+        return _create_dataframe(spark, [_json_safe_record(record) for record in normalized], schema)
+def _create_dataframe(spark: Any, records: list[Any], schema: str | None) -> Any:
+    if schema is None:
+        return spark.createDataFrame(records)
+    try:
+        return spark.createDataFrame(records, schema=schema)
+    except TypeError as exc:
+        if "schema" not in str(exc):
+            raise
+        return spark.createDataFrame(records, schema)
+def _json_reader(spark: Any, options: Mapping[str, Any] | None, *, schema: str | None = None) -> Any:
+    reader = spark.read
+    if schema:
+        reader = reader.schema(schema)
+    if options is None:
+        return reader
+    if not isinstance(options, Mapping):
+        raise ValueError("source.read.json_options must be an object")
+    for key, value in options.items():
+        option_key = str(key).strip()
+        if not option_key:
+            raise ValueError("source.read.json_options cannot contain an empty key")
+        reader = reader.option(option_key, str(value).lower() if isinstance(value, bool) else str(value))
+    return reader
+def _json_staging_dir(staging_path: str | None) -> str | None:
+    raw = str(staging_path or os.environ.get("CONTRACTFORGE_SOURCE_JSON_STAGING_DIR") or "").strip()
+    if not raw:
+        return None
+    if "://" in raw and not raw.startswith("file:"):
+        raise ValueError(
+            "source.read.staging_path for JSON materialization must be a local filesystem path "
+            "accessible to the Python driver and Spark reader, for example /Volumes/... or file:/..."
+        )
+    return raw
+def _write_json_lines_file(records: list[Mapping[str, Any]], staging_dir: str) -> str:
+    use_file_uri = staging_dir.startswith("file:")
+    local_dir = staging_dir[5:] if use_file_uri else staging_dir
+    os.makedirs(local_dir, exist_ok=True)
+    path = os.path.join(local_dir, f"{uuid.uuid4().hex}.jsonl")
+    with open(path, "w", encoding="utf-8") as handle:
+        for record in records:
+            handle.write(json.dumps(record, default=str, ensure_ascii=False))
+            handle.write("\n")
+    return f"file:{path}" if use_file_uri else path
+def _json_safe_record(value: Any) -> Any:
+    if isinstance(value, Mapping):
+        return {str(key): _json_safe_record(item) for key, item in value.items()}
+    if isinstance(value, list):
+        return json.dumps(value, default=str, ensure_ascii=False)
+    return value

contractforge_databricks/runtime/lineage.py ADDED Viewed

@@ -0,0 +1,164 @@
+"""Runtime explain and OpenLineage evidence for Databricks."""
+from __future__ import annotations
+from datetime import datetime
+from typing import Any
+from contractforge_core.diagnostics import ExplainPlanRecord
+from contractforge_core.runtime import PreparedInput, QueryOne
+from contractforge_core.semantic import SemanticContract
+from contractforge_databricks.contract_extensions import databricks_extensions
+from contractforge_databricks.diagnostics import render_explain_insert_sql
+from contractforge_databricks.execution import SqlRunner
+from contractforge_databricks.lineage import render_openlineage_insert_sql
+from contractforge_databricks.sql import quote_table_name
def write_runtime_diagnostics(
    *,
    runner: SqlRunner,
    contract: SemanticContract,
    prepared: PreparedInput,
    run_id: str,
    target: str,
    status: str,
    started: str,
    finished: str,
    rows_written: int,
    operation_metrics: dict[str, Any],
    catalog: str,
    schema: str,
    query_one: QueryOne | None,
    runtime_metadata: dict[str, Any] | None = None,
) -> dict[str, bool]:
    """Write the explain-plan and OpenLineage evidence for one run.

    The Databricks extensions are resolved from ``contract`` once and handed
    to both writers.  The explain writer runs first, then the OpenLineage
    writer.

    Returns:
        ``{"explain_captured": bool, "openlineage_event_emitted": bool}``,
        each flag being the corresponding writer's return value.
    """
    extensions = databricks_extensions(contract)
    # Keyword arguments that both writers take unchanged.
    shared = {
        "runner": runner,
        "contract": contract,
        "prepared": prepared,
        "run_id": run_id,
        "target": target,
        "extensions": extensions,
        "catalog": catalog,
        "schema": schema,
    }
    explain_captured = _write_explain(query_one=query_one, **shared)
    lineage_emitted = _write_openlineage(
        status=status,
        started=started,
        finished=finished,
        rows_written=rows_written,
        operation_metrics=operation_metrics,
        runtime_metadata=runtime_metadata,
        **shared,
    )
    return {"explain_captured": explain_captured, "openlineage_event_emitted": lineage_emitted}
+def _write_explain(
+    *,
+    runner: SqlRunner,
+    contract: SemanticContract,
+    prepared: PreparedInput,
+    run_id: str,
+    target: str,
+    extensions: dict[str, Any],
+    catalog: str,
+    schema: str,
+    query_one: QueryOne | None,
+) -> bool:
+    if not extensions.get("explain_mode") or query_one is None:
+        return False
+    explain_format = str(extensions.get("explain_format") or "formatted")
+    row = query_one(f"EXPLAIN {explain_format.upper()} SELECT * FROM {quote_table_name(prepared.source_view)}")
+    plan_text = _row_value(row, "plan_text") or _row_value(row, "plan") or _row_value(row, "explain")
+    if plan_text is None:
+        return False
+    runner.sql(
+        render_explain_insert_sql(
+            ExplainPlanRecord(run_id, target, prepared.source_name or prepared.source_view, contract.write.mode, explain_format, str(plan_text)),
+            catalog=catalog,
+            schema=schema,
+        )
+    )
+    return True
+def _write_openlineage(
+    *,
+    runner: SqlRunner,
+    contract: SemanticContract,
+    prepared: PreparedInput,
+    run_id: str,
+    target: str,
+    status: str,
+    started: str,
+    finished: str,
+    rows_written: int,
+    operation_metrics: dict[str, Any],
+    extensions: dict[str, Any],
+    catalog: str,
+    schema: str,
+    runtime_metadata: dict[str, Any] | None,
+) -> bool:
+    if not extensions.get("openlineage_enabled"):
+        return False
+    operations = contract.operations.metadata if contract.operations and contract.operations.metadata else {}
+    runtime = dict(runtime_metadata or {})
+    runner.sql(
+        render_openlineage_insert_sql(
+            contract,
+            run_id=run_id,
+            source_name=prepared.source_name or prepared.source_view,
+            status=status,
+            started_at_utc=_parse_ts(started),
+            finished_at_utc=_parse_ts(finished),
+            rows_read=prepared.rows_read,
+            rows_written=rows_written,
+            input_schema=_schema_fields(prepared.source_schema),
+            output_schema=_schema_fields(prepared.source_schema),
+            delta_version_after=_int_or_none(operation_metrics.get("version")),
+            operation_metrics=operation_metrics,
+            namespace=extensions.get("openlineage_namespace"),
+            producer=str(extensions.get("openlineage_producer") or "contractforge-databricks"),
+            parent_run_id=operations.get("parent_run_id"),
+            spark_version=runtime.get("spark_version"),
+            source_code_url=runtime.get("notebook_name"),
+            catalog=catalog,
+            schema=schema,
+        )
+    )
+    return True
+def _schema_fields(schema: dict[str, str] | None) -> tuple[tuple[str, str], ...]:
+    return tuple((name, dtype) for name, dtype in (schema or {}).items())
+def _row_value(row: Any, key: str) -> Any:
+    if row is None:
+        return None
+    if isinstance(row, dict):
+        return row.get(key)
+    if hasattr(row, "asDict"):
+        return row.asDict().get(key)
+    return getattr(row, key, None)
+def _parse_ts(value: str) -> datetime:
+    return datetime.strptime(value, "%Y-%m-%d %H:%M:%S")
+def _int_or_none(value: object) -> int | None:
+    try:
+        return None if value is None else int(value)
+    except (TypeError, ValueError):
+        return None