contractforge-databricks 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contractforge_databricks/__init__.py +172 -0
- contractforge_databricks/adapter.py +69 -0
- contractforge_databricks/annotations/__init__.py +10 -0
- contractforge_databricks/annotations/application.py +52 -0
- contractforge_databricks/annotations/audit.py +49 -0
- contractforge_databricks/annotations/sql.py +142 -0
- contractforge_databricks/api.py +65 -0
- contractforge_databricks/bundles/__init__.py +9 -0
- contractforge_databricks/bundles/assets.py +47 -0
- contractforge_databricks/bundles/project.py +213 -0
- contractforge_databricks/bundles/project_config.py +133 -0
- contractforge_databricks/capabilities/__init__.py +17 -0
- contractforge_databricks/capabilities/builders.py +43 -0
- contractforge_databricks/capabilities/evaluate.py +162 -0
- contractforge_databricks/capabilities/mapping.py +36 -0
- contractforge_databricks/capabilities/models.py +44 -0
- contractforge_databricks/capabilities/runtime.py +111 -0
- contractforge_databricks/capabilities/uc.py +47 -0
- contractforge_databricks/cli.py +196 -0
- contractforge_databricks/cli_deploy.py +98 -0
- contractforge_databricks/cli_governance.py +142 -0
- contractforge_databricks/cli_io.py +91 -0
- contractforge_databricks/cli_maintenance.py +69 -0
- contractforge_databricks/coercion.py +31 -0
- contractforge_databricks/contract_extensions.py +70 -0
- contractforge_databricks/cost/__init__.py +11 -0
- contractforge_databricks/cost/model.py +22 -0
- contractforge_databricks/cost/report.py +65 -0
- contractforge_databricks/cost/sql.py +136 -0
- contractforge_databricks/dashboards/__init__.py +15 -0
- contractforge_databricks/dashboards/control_tables.py +150 -0
- contractforge_databricks/diagnostics/__init__.py +7 -0
- contractforge_databricks/diagnostics/explain.py +40 -0
- contractforge_databricks/environment.py +53 -0
- contractforge_databricks/evidence/__init__.py +98 -0
- contractforge_databricks/evidence/ddl.py +35 -0
- contractforge_databricks/evidence/governance_log.py +175 -0
- contractforge_databricks/evidence/helpers.py +29 -0
- contractforge_databricks/evidence/ops_log.py +210 -0
- contractforge_databricks/evidence/records.py +27 -0
- contractforge_databricks/evidence/run_log.py +74 -0
- contractforge_databricks/evidence/schemas.py +7 -0
- contractforge_databricks/evidence/sql.py +144 -0
- contractforge_databricks/evidence/tables.py +20 -0
- contractforge_databricks/evidence/writer.py +118 -0
- contractforge_databricks/execution/__init__.py +70 -0
- contractforge_databricks/execution/delta_basic.py +57 -0
- contractforge_databricks/execution/hash_diff.py +126 -0
- contractforge_databricks/execution/hash_diff_latest.py +142 -0
- contractforge_databricks/execution/replace_partitions.py +40 -0
- contractforge_databricks/execution/results.py +5 -0
- contractforge_databricks/execution/retry.py +36 -0
- contractforge_databricks/execution/scd2.py +213 -0
- contractforge_databricks/execution/scd2_deletes.py +65 -0
- contractforge_databricks/execution/scd2_late.py +30 -0
- contractforge_databricks/execution/snapshot.py +77 -0
- contractforge_databricks/execution/sql_merge.py +85 -0
- contractforge_databricks/execution/tables.py +98 -0
- contractforge_databricks/execution/windows.py +58 -0
- contractforge_databricks/governance/__init__.py +30 -0
- contractforge_databricks/governance/access.py +185 -0
- contractforge_databricks/governance/application.py +93 -0
- contractforge_databricks/governance/drift.py +49 -0
- contractforge_databricks/governance/runtime.py +60 -0
- contractforge_databricks/governance/sql.py +31 -0
- contractforge_databricks/governance/validation.py +135 -0
- contractforge_databricks/lakeflow/__init__.py +21 -0
- contractforge_databricks/lakeflow/compatibility.py +194 -0
- contractforge_databricks/lakeflow/rendering.py +175 -0
- contractforge_databricks/lineage/__init__.py +7 -0
- contractforge_databricks/lineage/openlineage.py +182 -0
- contractforge_databricks/maintenance/__init__.py +27 -0
- contractforge_databricks/maintenance/retention.py +90 -0
- contractforge_databricks/maintenance/sql.py +68 -0
- contractforge_databricks/metrics/__init__.py +19 -0
- contractforge_databricks/metrics/history.py +21 -0
- contractforge_databricks/metrics/write.py +63 -0
- contractforge_databricks/operations/__init__.py +4 -0
- contractforge_databricks/operations/application.py +38 -0
- contractforge_databricks/operations/sql.py +95 -0
- contractforge_databricks/parity/__init__.py +18 -0
- contractforge_databricks/parity/catalog.py +59 -0
- contractforge_databricks/parity/models.py +7 -0
- contractforge_databricks/parity/scenarios.py +111 -0
- contractforge_databricks/partitioning/__init__.py +3 -0
- contractforge_databricks/partitioning/predicates.py +28 -0
- contractforge_databricks/preparation/__init__.py +47 -0
- contractforge_databricks/preparation/deduplicate.py +87 -0
- contractforge_databricks/preparation/encoding.py +37 -0
- contractforge_databricks/preparation/hashing.py +18 -0
- contractforge_databricks/preparation/pyspark.py +178 -0
- contractforge_databricks/preparation/pyspark_staging.py +70 -0
- contractforge_databricks/preparation/shape.py +209 -0
- contractforge_databricks/preparation/shape_validation.py +94 -0
- contractforge_databricks/preparation/staging.py +17 -0
- contractforge_databricks/preparation/zip_arrays.py +51 -0
- contractforge_databricks/presets/__init__.py +3 -0
- contractforge_databricks/presets/base.py +24 -0
- contractforge_databricks/presets/bronze.py +57 -0
- contractforge_databricks/presets/catalog.py +22 -0
- contractforge_databricks/presets/core.py +134 -0
- contractforge_databricks/presets/gold.py +62 -0
- contractforge_databricks/presets/modifiers.py +51 -0
- contractforge_databricks/presets/runtime.py +22 -0
- contractforge_databricks/presets/silver.py +101 -0
- contractforge_databricks/presets/write_engine.py +57 -0
- contractforge_databricks/quality/__init__.py +41 -0
- contractforge_databricks/quality/evaluation.py +178 -0
- contractforge_databricks/quality/persistence.py +81 -0
- contractforge_databricks/quality/registry.py +134 -0
- contractforge_databricks/quality/results.py +17 -0
- contractforge_databricks/quality/sql.py +113 -0
- contractforge_databricks/rendering/__init__.py +11 -0
- contractforge_databricks/rendering/bundle.py +93 -0
- contractforge_databricks/rendering/markdown.py +50 -0
- contractforge_databricks/rendering/names.py +56 -0
- contractforge_databricks/results.py +15 -0
- contractforge_databricks/runtime/__init__.py +101 -0
- contractforge_databricks/runtime/available_now.py +147 -0
- contractforge_databricks/runtime/bundles.py +211 -0
- contractforge_databricks/runtime/cache.py +20 -0
- contractforge_databricks/runtime/control_tables.py +19 -0
- contractforge_databricks/runtime/deploy.py +197 -0
- contractforge_databricks/runtime/detection.py +114 -0
- contractforge_databricks/runtime/dry_run.py +46 -0
- contractforge_databricks/runtime/errors.py +54 -0
- contractforge_databricks/runtime/file_selection.py +109 -0
- contractforge_databricks/runtime/finalization.py +168 -0
- contractforge_databricks/runtime/governance.py +37 -0
- contractforge_databricks/runtime/hooks.py +45 -0
- contractforge_databricks/runtime/http_file.py +37 -0
- contractforge_databricks/runtime/http_retry.py +15 -0
- contractforge_databricks/runtime/http_safety.py +9 -0
- contractforge_databricks/runtime/json_materialization.py +97 -0
- contractforge_databricks/runtime/lineage.py +164 -0
- contractforge_databricks/runtime/maintenance.py +43 -0
- contractforge_databricks/runtime/merge_validation.py +98 -0
- contractforge_databricks/runtime/metadata.py +21 -0
- contractforge_databricks/runtime/metrics.py +34 -0
- contractforge_databricks/runtime/models.py +32 -0
- contractforge_databricks/runtime/options.py +33 -0
- contractforge_databricks/runtime/orchestration_context.py +185 -0
- contractforge_databricks/runtime/orchestrator.py +147 -0
- contractforge_databricks/runtime/partitioning.py +93 -0
- contractforge_databricks/runtime/quality_quarantine.py +92 -0
- contractforge_databricks/runtime/rest_api.py +46 -0
- contractforge_databricks/runtime/rest_auth.py +21 -0
- contractforge_databricks/runtime/rest_pagination.py +21 -0
- contractforge_databricks/runtime/run_payload.py +177 -0
- contractforge_databricks/runtime/schema.py +106 -0
- contractforge_databricks/runtime/source_metadata.py +30 -0
- contractforge_databricks/runtime/source_registry.py +43 -0
- contractforge_databricks/runtime/source_schema.py +24 -0
- contractforge_databricks/runtime/sources.py +208 -0
- contractforge_databricks/runtime/spark.py +183 -0
- contractforge_databricks/runtime/spark_defaults.py +35 -0
- contractforge_databricks/runtime/storage_auth.py +132 -0
- contractforge_databricks/runtime/streaming.py +131 -0
- contractforge_databricks/runtime/success.py +104 -0
- contractforge_databricks/runtime/utils.py +52 -0
- contractforge_databricks/runtime/watermark.py +71 -0
- contractforge_databricks/runtime/windows.py +184 -0
- contractforge_databricks/runtime/write.py +66 -0
- contractforge_databricks/runtime/write_flow.py +146 -0
- contractforge_databricks/runtime/write_strategy.py +40 -0
- contractforge_databricks/schema/__init__.py +21 -0
- contractforge_databricks/schema/diff.py +11 -0
- contractforge_databricks/schema/policy.py +33 -0
- contractforge_databricks/schema/sync.py +23 -0
- contractforge_databricks/security/__init__.py +21 -0
- contractforge_databricks/security/errors.py +5 -0
- contractforge_databricks/security/redaction.py +5 -0
- contractforge_databricks/security/secrets.py +114 -0
- contractforge_databricks/security/source_policy.py +17 -0
- contractforge_databricks/shapes/__init__.py +3 -0
- contractforge_databricks/shapes/sql.py +123 -0
- contractforge_databricks/sources/__init__.py +67 -0
- contractforge_databricks/sources/artifacts.py +100 -0
- contractforge_databricks/sources/autoloader.py +48 -0
- contractforge_databricks/sources/bounded_streams.py +44 -0
- contractforge_databricks/sources/classification.py +115 -0
- contractforge_databricks/sources/delta_share.py +21 -0
- contractforge_databricks/sources/files.py +48 -0
- contractforge_databricks/sources/http_file.py +46 -0
- contractforge_databricks/sources/interpret.py +76 -0
- contractforge_databricks/sources/jdbc.py +32 -0
- contractforge_databricks/sources/metadata.py +18 -0
- contractforge_databricks/sources/native_passthrough.py +33 -0
- contractforge_databricks/sources/rds_iam.py +15 -0
- contractforge_databricks/sources/rds_iam_runtime.py +191 -0
- contractforge_databricks/sources/rest_api.py +33 -0
- contractforge_databricks/sources/support.py +50 -0
- contractforge_databricks/sources/table_refs.py +65 -0
- contractforge_databricks/sql/__init__.py +4 -0
- contractforge_databricks/sql/identifiers.py +17 -0
- contractforge_databricks/sql/literals.py +36 -0
- contractforge_databricks/state/__init__.py +39 -0
- contractforge_databricks/state/ddl.py +24 -0
- contractforge_databricks/state/migrations.py +146 -0
- contractforge_databricks/state/queries.py +149 -0
- contractforge_databricks/state/sql.py +116 -0
- contractforge_databricks/state/tables.py +9 -0
- contractforge_databricks/state/writer.py +83 -0
- contractforge_databricks/templates/__init__.py +15 -0
- contractforge_databricks/templates/catalog.py +205 -0
- contractforge_databricks/templates/catalog_parity.py +85 -0
- contractforge_databricks/templates/core.py +83 -0
- contractforge_databricks/templates/enrichment.py +175 -0
- contractforge_databricks/transforms/__init__.py +3 -0
- contractforge_databricks/transforms/sql.py +118 -0
- contractforge_databricks/watermark/__init__.py +6 -0
- contractforge_databricks/watermark/sql.py +91 -0
- contractforge_databricks/write_modes/__init__.py +20 -0
- contractforge_databricks/write_modes/registry.py +44 -0
- contractforge_databricks/write_modes/sql.py +33 -0
- contractforge_databricks/write_modes/strategy.py +192 -0
- contractforge_databricks-0.1.0.dist-info/METADATA +34 -0
- contractforge_databricks-0.1.0.dist-info/RECORD +220 -0
- contractforge_databricks-0.1.0.dist-info/WHEEL +4 -0
- contractforge_databricks-0.1.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""Runtime payload helpers for Databricks available-now streaming."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from contractforge_core.config import CTRL_SCHEMA_VERSION, FRAMEWORK_VERSION
|
|
9
|
+
from contractforge_core.semantic import SemanticContract
|
|
10
|
+
from contractforge_databricks.rendering.names import target_full_name
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def stream_metrics_from_batches(batch_results: list[dict[str, Any]]) -> dict[str, int]:
    """Roll child batch ingestion results up into stream-level totals.

    Each entry of ``batch_results`` is read for ``rows_read``,
    ``rows_written`` and ``rows_quarantined``; a missing or falsy value
    counts as zero and every other value is converted with ``int``.
    """

    def _total(metric_key: str) -> int:
        # Same coercion as the module's per-batch metric reader.
        return sum(int(batch.get(metric_key) or 0) for batch in batch_results)

    totals: dict[str, int] = {"batches_processed": len(batch_results)}
    totals["total_rows_read"] = _total("rows_read")
    totals["total_rows_written"] = _total("rows_written")
    totals["total_rows_quarantined"] = _total("rows_quarantined")
    return totals
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def prefer_child_stream_metrics(local: dict[str, int], child: dict[str, int]) -> bool:
    """Tell whether persisted child-run metrics are more complete than local ones.

    Child metrics are never preferred when they report no processed batch.
    Otherwise they win when the local metrics report zero batches, when the
    child saw more batches, or when the child's combined row counters
    (read + written + quarantined) exceed the local ones.
    """
    child_batches = child.get("batches_processed", 0)
    if child_batches <= 0:
        return False

    row_keys = ("total_rows_read", "total_rows_written", "total_rows_quarantined")
    local_rows = sum(local.get(key, 0) for key in row_keys)
    child_rows = sum(child.get(key, 0) for key in row_keys)

    local_batches = local.get("batches_processed", 0)
    if local_batches == 0:
        return True
    if child_batches > local_batches:
        return True
    return child_rows > local_rows
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def stream_start_payload(
    contract: SemanticContract,
    *,
    stream_run_id: str,
    status: str = "RUNNING",
    started_at_utc: datetime | None = None,
    idempotency_key: str | None = None,
    idempotency_policy: str = "always_run",
    runtime_metadata: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Build the record describing the start of a stream run.

    Values are taken from the contract's raw source mapping, its operations
    metadata and the keyword arguments. An explicit ``idempotency_key``
    wins over the one in operations metadata, whereas an
    ``idempotency_policy`` found in operations metadata wins over the
    argument. All row counters start at zero.

    Entries of ``runtime_metadata`` are merged last, so they override any
    computed key of the same name and add any key not already present.
    """
    source_options = contract.source.raw or {}
    ops_meta = _operations_metadata(contract)
    runtime_overrides = runtime_metadata or {}

    if idempotency_key is None:
        effective_key = ops_meta.get("idempotency_key")
    else:
        effective_key = idempotency_key

    payload: dict[str, Any] = {
        "stream_run_id": stream_run_id,
        "idempotency_key": effective_key,
        "idempotency_policy": ops_meta.get("idempotency_policy") or idempotency_policy,
        "target_table": target_full_name(contract),
        "target_catalog": _target_catalog(contract),
        "target_layer": contract.target.layer,
        "runtime_entrypoint": runtime_overrides.get("notebook_name") or ops_meta.get("notebook_name"),
        "source_type": source_options.get("type") or contract.source.kind,
        "source_path": (
            source_options.get("path")
            or source_options.get("url")
            or source_options.get("table")
            or contract.source.location
        ),
        "trigger": source_options.get("trigger") or "available_now",
        "checkpoint_location": (
            source_options.get("progress_location") or source_options.get("checkpoint_location")
        ),
        "status": status,
        "started_at_utc": _timestamp(started_at_utc),
        "batches_processed": 0,
        "total_rows_read": 0,
        "total_rows_written": 0,
        "total_rows_quarantined": 0,
        "framework_version": FRAMEWORK_VERSION,
        "ctrl_schema_version": CTRL_SCHEMA_VERSION,
    }
    # Lineage identifiers are copied straight from operations metadata.
    for lineage_key in ("master_job_id", "master_run_id", "parent_run_id", "run_group_id"):
        payload[lineage_key] = ops_meta.get(lineage_key)
    # Caller-supplied runtime metadata has the final word.
    payload.update(runtime_overrides)
    return payload
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def stream_result_payload(
    contract: SemanticContract,
    *,
    stream_run_id: str,
    status: str,
    started_at_utc: datetime,
    batch_results: list[dict[str, Any]],
    stage_durations: dict[str, float] | None = None,
    error_message: str | None = None,
    skip_reason: str | None = None,
    skipped_by_stream_run_id: str | None = None,
    stream_metrics: dict[str, int] | None = None,
    runtime_metadata: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Build the record describing the end of a stream run.

    The result starts from ``stream_start_payload`` and then sets the end
    timestamp, the duration, the aggregated counters, the raw batch results
    and the error / skip details. Keys set here override same-named keys
    coming from the start payload, including ones injected through
    ``runtime_metadata``.

    Args:
        contract: Contract whose target and source describe the stream.
        stream_run_id: Identifier of the stream run being reported.
        status: Final status value stored in the payload.
        started_at_utc: Start of the run. A naive value is interpreted as
            UTC, matching how ``_timestamp`` renders it.
        batch_results: Per-batch result mappings; aggregated into counters
            when ``stream_metrics`` is not supplied (or is empty).
        stage_durations: Optional stage timings; defaults to an empty dict.
        error_message: Optional failure description.
        skip_reason: Optional reason the run was skipped.
        skipped_by_stream_run_id: Optional id of the run that caused a skip.
        stream_metrics: Optional pre-computed counters; must contain the
            four counter keys read below.
        runtime_metadata: Optional overrides forwarded to the start payload.

    Returns:
        The combined payload dictionary.
    """
    finished = datetime.now(timezone.utc)
    # Subtracting a naive datetime from the aware ``finished`` raises
    # TypeError. ``_timestamp`` already treats naive inputs as UTC, so apply
    # the same convention before computing the duration.
    if started_at_utc.tzinfo is None:
        started_aware = started_at_utc.replace(tzinfo=timezone.utc)
    else:
        started_aware = started_at_utc
    metrics = stream_metrics or stream_metrics_from_batches(batch_results)
    return {
        **stream_start_payload(
            contract,
            stream_run_id=stream_run_id,
            status=status,
            started_at_utc=started_at_utc,
            runtime_metadata=runtime_metadata,
        ),
        "ended_at_utc": _timestamp(finished),
        "duration_seconds": (finished - started_aware).total_seconds(),
        "batches_processed": metrics["batches_processed"],
        "total_rows_read": metrics["total_rows_read"],
        "total_rows_written": metrics["total_rows_written"],
        "total_rows_quarantined": metrics["total_rows_quarantined"],
        "batch_results": batch_results,
        "stage_durations": stage_durations or {},
        "error_message": error_message,
        "skip_reason": skip_reason,
        "skipped_by_stream_run_id": skipped_by_stream_run_id,
    }
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _int_metric(payload: dict[str, Any], key: str) -> int:
    """Read ``key`` from ``payload`` as an int, treating missing/falsy values as 0."""
    raw = payload.get(key)
    if not raw:
        return 0
    return int(raw)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _timestamp(value: datetime | None) -> str:
    """Format ``value`` as ``YYYY-MM-DD HH:MM:SS`` in UTC.

    ``None`` means "now". A naive datetime is taken to already be in UTC;
    an aware one is converted to UTC before formatting.
    """
    moment = value if value else datetime.now(timezone.utc)
    if moment.tzinfo is None:
        moment = moment.replace(tzinfo=timezone.utc)
    else:
        moment = moment.astimezone(timezone.utc)
    return moment.strftime("%Y-%m-%d %H:%M:%S")
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _operations_metadata(contract: SemanticContract) -> dict[str, Any]:
    """Return a shallow copy of the contract's operations metadata.

    An empty dict is returned when the contract has no operations section or
    when that section's metadata is missing or empty.
    """
    operations = contract.operations
    if not operations:
        return {}
    metadata = operations.metadata
    if not metadata:
        return {}
    return dict(metadata)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _target_catalog(contract: SemanticContract) -> str | None:
    """Return the part of the target namespace before its first dot.

    Returns ``None`` when the target has no (or an empty) namespace.
    """
    namespace = contract.target.namespace
    if namespace:
        catalog, _, _ = namespace.partition(".")
        return catalog
    return None
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Successful Databricks runtime finalization."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from contractforge_core.execution import ExecutionOutcome
|
|
8
|
+
from contractforge_core.quality import QualityRuleResult
|
|
9
|
+
from contractforge_core.runtime import PreparedInput, QueryOne
|
|
10
|
+
from contractforge_core.semantic import SemanticContract
|
|
11
|
+
from contractforge_databricks.evidence import EvidenceWriter
|
|
12
|
+
from contractforge_databricks.runtime.finalization import finalize_ingest
|
|
13
|
+
from contractforge_databricks.runtime.lineage import write_runtime_diagnostics
|
|
14
|
+
from contractforge_databricks.runtime.metrics import collect_write_metrics
|
|
15
|
+
from contractforge_databricks.runtime.models import DatabricksIngestOptions
|
|
16
|
+
from contractforge_databricks.runtime.utils import utc_now_str
|
|
17
|
+
from contractforge_databricks.runtime.watermark import collect_previous_watermark, collect_watermark_candidate
|
|
18
|
+
from contractforge_databricks.state import StateWriter
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def finalize_success(
    *,
    evidence: EvidenceWriter,
    state: StateWriter,
    contract: SemanticContract,
    prepared: PreparedInput,
    opts: DatabricksIngestOptions,
    run_id: str,
    target: str,
    started: str,
    outcome: ExecutionOutcome | None,
    logical_rows_written: int,
    quality_status_value: str,
    schema_changes: dict[str, Any],
    governance_results: dict[str, Any],
    query_one: QueryOne | None,
    quality_results: tuple[QualityRuleResult, ...] = (),
    write_started_at: str | None = None,
    write_finished_at: str | None = None,
    stage_durations: dict[str, float] | None = None,
) -> dict[str, Any]:
    """Gather post-write facts for a successful run and finalize it.

    Steps, in order: collect write metrics, collect the watermark candidate,
    collect the previously stored watermark, write runtime diagnostics with
    status ``SUCCESS``, then delegate to ``finalize_ingest`` and return its
    result.

    A ``watermark_previous`` entry in ``prepared.source_metadata`` takes
    precedence over the value collected from state when it is truthy.
    ``outcome`` is accepted but not read by this function.
    """
    final_status = "SUCCESS"

    rows_written, operation_metrics = collect_write_metrics(
        contract=contract,
        target_table=target,
        rows_written=logical_rows_written,
        query_one=query_one,
    )
    watermark_column, watermark_current = collect_watermark_candidate(
        contract=contract,
        prepared=prepared,
        query_one=query_one,
    )
    _ignored_column, stored_watermark = collect_previous_watermark(
        contract=contract,
        query_one=query_one,
        catalog=opts.catalog,
        schema=opts.schema,
    )

    # Source-provided metadata overrides the watermark read from state.
    source_metadata = prepared.source_metadata or {}
    watermark_previous = source_metadata.get("watermark_previous") or stored_watermark

    diagnostics = write_runtime_diagnostics(
        runner=evidence.runner,
        contract=contract,
        prepared=prepared,
        run_id=run_id,
        target=target,
        status=final_status,
        started=started,
        finished=_utc_now(),
        rows_written=rows_written,
        operation_metrics=operation_metrics,
        catalog=opts.catalog,
        schema=opts.schema,
        query_one=query_one,
        runtime_metadata=opts.runtime_metadata,
    )

    return finalize_ingest(
        evidence,
        state,
        contract,
        prepared,
        opts,
        run_id,
        target,
        final_status,
        started,
        rows_written=rows_written,
        quality_status_value=quality_status_value,
        quality_results=quality_results,
        operation_metrics=operation_metrics,
        schema_changes=schema_changes,
        governance_results=governance_results,
        write_started_at=write_started_at,
        write_finished_at=write_finished_at,
        stage_durations=stage_durations,
        watermark_column=watermark_column,
        watermark_previous=watermark_previous,
        watermark_current=watermark_current,
        diagnostics=diagnostics,
    )
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _utc_now() -> str:
    """Return the current UTC time string produced by ``utc_now_str``."""
    now_text = utc_now_str()
    return now_text
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Small Databricks runtime utility helpers without Spark import requirements."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import uuid
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
from typing import Any, Callable, Iterable
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def utc_now_ts() -> datetime:
    """Return the current moment as a timezone-aware UTC ``datetime``."""
    return datetime.now(tz=timezone.utc)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def utc_now_str() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    return f"{utc_now_ts():%Y-%m-%d %H:%M:%S}"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def today_str() -> str:
    """Return the current UTC date formatted as ``YYYY-MM-DD``."""
    return f"{utc_now_ts():%Y-%m-%d}"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def new_run_id() -> str:
    """Return a freshly generated random (version 4) UUID as a string."""
    return f"{uuid.uuid4()}"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def resolve_run_id(run_id: str | None, run_id_factory: Callable[[], str] | None = None) -> str:
    """Pick the run id to use.

    A truthy ``run_id`` is returned unchanged. Otherwise the stringified
    result of ``run_id_factory`` is used when a factory is given, and a
    ``run-<uuid4>`` value is generated as the last resort.
    """
    if not run_id:
        if not run_id_factory:
            return f"run-{uuid.uuid4()}"
        return str(run_id_factory())
    return run_id
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def safe_truncate(text: str | None, max_len: int = 100_000) -> str | None:
    """Cut ``text`` to ``max_len`` characters and append a truncation marker.

    ``None`` and text that already fits are returned unchanged. The marker
    is appended after the cut, so a truncated result is longer than
    ``max_len``.
    """
    if text is not None and len(text) > max_len:
        return text[:max_len] + "\n...TRUNCATED..."
    return text
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def as_list(value: str | Iterable[Any] | None, sep: str = "|") -> list[str]:
    """Normalise ``value`` into a list of non-empty, stripped strings.

    A string is split on ``sep``; any other iterable has each item
    converted with ``str``. Falsy input yields an empty list, and items
    that are empty after stripping are dropped.
    """
    if not value:
        return []
    raw_items = value.split(sep) if isinstance(value, str) else map(str, value)
    stripped = (item.strip() for item in raw_items)
    return [item for item in stripped if item]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def validate_columns(df: Any, columns: Iterable[str], context: str = "columns") -> None:
    """Raise ``ValueError`` when any of ``columns`` is absent from ``df.columns``.

    ``df`` only needs a ``columns`` attribute; a missing or falsy attribute
    is treated as "no columns available". The error message lists the
    missing names in the order they were requested.
    """
    present = set(getattr(df, "columns", ()) or ())
    absent = []
    for name in columns:
        if name not in present:
            absent.append(name)
    if absent:
        raise ValueError(f"{context} not found: {absent}")
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""Runtime watermark collection for Databricks prepared views."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from contractforge_core.runtime import PreparedInput, QueryOne
|
|
8
|
+
from contractforge_core.semantic import SemanticContract
|
|
9
|
+
from contractforge_databricks.rendering.names import target_full_name
|
|
10
|
+
from contractforge_databricks.state.queries import render_select_previous_watermark_sql
|
|
11
|
+
from contractforge_databricks.watermark import render_select_watermark_candidate_sql
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def collect_watermark_candidate(
    *,
    contract: SemanticContract,
    prepared: PreparedInput,
    query_one: QueryOne | None,
) -> tuple[str | None, str | None]:
    """Query the prepared source view for the watermark candidate value.

    Returns ``(None, None)`` when the contract declares no watermark columns
    or no ``query_one`` callable is available. Otherwise returns the
    watermark column names joined with ``|`` and the ``watermark_value``
    of the queried row as a string (``None`` if the row has no value).
    """
    watermark_cols = _watermark_columns(contract)
    if query_one is None or not watermark_cols:
        return (None, None)

    candidate_sql = render_select_watermark_candidate_sql(
        table_name=prepared.source_view,
        columns=watermark_cols,
        types=prepared.source_schema,
    )
    candidate = _row_value(query_one(candidate_sql), "watermark_value")
    if candidate is not None:
        candidate = str(candidate)
    return ("|".join(watermark_cols), candidate)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def collect_previous_watermark(
    *,
    contract: SemanticContract,
    query_one: QueryOne | None,
    catalog: str = "main",
    schema: str = "ops",
) -> tuple[str | None, str | None]:
    """Query the ingestion state table for the previously stored watermark.

    The state table is ``<catalog>.<schema>.ctrl_ingestion_state``. Returns
    ``(None, None)`` when the contract declares no watermark columns or no
    ``query_one`` callable is available. Otherwise returns the watermark
    column names joined with ``|`` and the ``watermark_value`` of the
    queried row as a string (``None`` if the row has no value).
    """
    watermark_cols = _watermark_columns(contract)
    if query_one is None or not watermark_cols:
        return (None, None)

    previous_sql = render_select_previous_watermark_sql(
        target_table=target_full_name(contract),
        state_table=f"{catalog}.{schema}.ctrl_ingestion_state",
    )
    previous = _row_value(query_one(previous_sql), "watermark_value")
    if previous is not None:
        previous = str(previous)
    return ("|".join(watermark_cols), previous)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _watermark_columns(contract: SemanticContract) -> tuple[str, ...]:
    """Read ``watermark_columns`` from the contract's operations metadata.

    A string value is split on commas; a list or tuple has each item
    converted with ``str``. Names are stripped and empty ones dropped. Any
    other value, or missing metadata, yields an empty tuple.
    """
    operations = contract.operations
    metadata = operations.metadata if operations and operations.metadata else {}
    raw = metadata.get("watermark_columns")

    if isinstance(raw, str):
        pieces = raw.split(",")
    elif isinstance(raw, (list, tuple)):
        pieces = [str(piece) for piece in raw]
    else:
        return ()
    return tuple(piece.strip() for piece in pieces if piece.strip())
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _row_value(row: Any, key: str) -> Any:
    """Fetch ``key`` from a query result row of unknown shape.

    Handles ``None`` (returns ``None``), plain dicts, objects exposing an
    ``asDict()`` method, and finally falls back to attribute access with a
    ``None`` default.
    """
    if row is None:
        return None
    if isinstance(row, dict):
        mapping = row
    elif hasattr(row, "asDict"):
        mapping = row.asDict()
    else:
        return getattr(row, key, None)
    return mapping.get(key)
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
"""Databricks runtime orchestration for execution windows."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import replace
|
|
6
|
+
from typing import Any
|
|
7
|
+
from uuid import uuid4
|
|
8
|
+
|
|
9
|
+
from contractforge_core.contracts import semantic_contract_from_mapping
|
|
10
|
+
from contractforge_core.execution import ExecutionWindow, build_time_windows, summarize_window_results
|
|
11
|
+
from contractforge_core.quality import QualityRuleResult
|
|
12
|
+
from contractforge_core.runtime import QueryOne
|
|
13
|
+
from contractforge_core.watermark import extract_watermark_field_value
|
|
14
|
+
from contractforge_databricks.execution import build_child_window_plan
|
|
15
|
+
from contractforge_databricks.rendering.names import target_full_name
|
|
16
|
+
from contractforge_databricks.runtime.models import DatabricksIngestOptions
|
|
17
|
+
from contractforge_databricks.runtime.orchestrator import ingest_databricks_contract
|
|
18
|
+
from contractforge_databricks.runtime.sources import prepare_contract_source_view
|
|
19
|
+
from contractforge_databricks.state.queries import render_select_previous_watermark_sql
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def has_windowed_execution(contract_mapping: dict[str, Any]) -> bool:
    """Tell whether a contract mapping asks for windowed execution.

    True when ``execution.window`` is a dict, or when ``execution.catchup``
    is a dict whose ``enabled`` entry is truthy. False when ``execution``
    is absent or not a dict.
    """
    execution = contract_mapping.get("execution")
    if not isinstance(execution, dict):
        return False
    if isinstance(execution.get("window"), dict):
        return True
    catchup = execution.get("catchup")
    if not isinstance(catchup, dict):
        return False
    return bool(catchup.get("enabled"))
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def ingest_windowed_databricks_contract(
    contract_mapping: dict[str, Any],
    *,
    spark: Any,
    runner: Any,
    options: DatabricksIngestOptions,
    query_one: QueryOne | None = None,
    quality_results: tuple[QualityRuleResult, ...] = (),
    view_name: str | None = None,
    collect_metrics: bool = False,
) -> dict[str, Any]:
    """Run one child ingestion per execution window and summarise the results.

    The window configuration is resolved from the contract mapping and
    expanded into windows. For each window, numbered from 1, a child plan,
    child contract and child options are derived; the child run id is
    ``<parent_run_id>:window:<index zero-padded to 4 digits>``. The source
    view is prepared, the child is ingested, and the result is tagged with
    an ``execution_window`` entry.

    Processing stops after the first child whose status is ``FAILED``
    unless the window configuration sets ``stop_on_failure`` to a falsy
    value. The parent run id is ``options.run_id`` or a generated
    ``run-<uuid4>``.
    """
    window_config = _window_config(contract_mapping, options=options, query_one=query_one)
    windows = _windows(window_config)
    parent_run_id = options.run_id or f"run-{uuid4()}"

    results: list[dict[str, Any]] = []
    for index, window in enumerate(windows, start=1):
        # Looked up per iteration so an empty window list never touches it.
        window_column = str(window_config["column"])
        child_plan = build_child_window_plan(
            parent_run_id=parent_run_id,
            column=window_column,
            window=window,
            index=index,
            existing_filter=contract_mapping.get("filter_expression"),
            base_idempotency_key=contract_mapping.get("idempotency_key"),
        )
        child_contract = semantic_contract_from_mapping(
            _child_contract_mapping(contract_mapping, child_plan)
        )
        child_opts = replace(
            options,
            run_id=f"{parent_run_id}:window:{index:04d}",
            idempotency_key=child_plan.idempotency_key or options.idempotency_key,
        )
        prepared = prepare_contract_source_view(
            spark,
            child_contract,
            view_name=_child_view_name(child_contract, view_name, index),
            collect_metrics=collect_metrics,
            query_one=query_one,
            evidence_catalog=child_opts.catalog,
            evidence_schema=child_opts.schema,
        )
        result = ingest_databricks_contract(
            child_contract,
            runner=runner,
            prepared=prepared,
            options=child_opts,
            query_one=query_one,
            quality_results=quality_results,
        )
        result["execution_window"] = _window_payload(child_plan.window, window_column)
        results.append(result)

        child_failed = result.get("status") == "FAILED"
        if child_failed and window_config.get("stop_on_failure", True):
            break

    return _summary(parent_run_id, windows, results)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _window_config(
|
|
86
|
+
contract_mapping: dict[str, Any],
|
|
87
|
+
*,
|
|
88
|
+
options: DatabricksIngestOptions,
|
|
89
|
+
query_one: QueryOne | None,
|
|
90
|
+
) -> dict[str, Any]:
|
|
91
|
+
execution = dict(contract_mapping.get("execution") or {})
|
|
92
|
+
if isinstance(execution.get("window"), dict):
|
|
93
|
+
return dict(execution["window"])
|
|
94
|
+
catchup = dict(execution.get("catchup") or {})
|
|
95
|
+
if not catchup.get("enabled"):
|
|
96
|
+
raise ValueError("windowed execution requires execution.window or enabled execution.catchup")
|
|
97
|
+
start = catchup.get("start") or _previous_watermark_start(contract_mapping, options, query_one, catchup)
|
|
98
|
+
return {
|
|
99
|
+
"column": catchup.get("column") or _single_watermark_column(contract_mapping),
|
|
100
|
+
"start": start,
|
|
101
|
+
"end": catchup.get("end"),
|
|
102
|
+
"every": catchup.get("every"),
|
|
103
|
+
"stop_on_failure": catchup.get("stop_on_failure", True),
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _windows(config: dict[str, Any]) -> tuple[ExecutionWindow, ...]:
|
|
108
|
+
explicit = config.get("windows")
|
|
109
|
+
if explicit:
|
|
110
|
+
return tuple(
|
|
111
|
+
ExecutionWindow(start=str(item["start"]), end=str(item["end"]), label=str(item.get("label") or ""))
|
|
112
|
+
for item in explicit
|
|
113
|
+
)
|
|
114
|
+
missing = [key for key in ("column", "start", "end", "every") if not config.get(key)]
|
|
115
|
+
if missing:
|
|
116
|
+
raise ValueError(f"execution window requires: {', '.join(missing)}")
|
|
117
|
+
return build_time_windows(str(config["start"]), str(config["end"]), str(config["every"]))
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _child_contract_mapping(contract_mapping: dict[str, Any], child_plan: Any) -> dict[str, Any]:
|
|
121
|
+
child = dict(contract_mapping)
|
|
122
|
+
child["filter_expression"] = child_plan.filter_expression
|
|
123
|
+
child["parent_run_id"] = child_plan.parent_run_id
|
|
124
|
+
if child_plan.idempotency_key:
|
|
125
|
+
child["idempotency_key"] = child_plan.idempotency_key
|
|
126
|
+
runtime = dict(child.get("runtime_parameters") or {})
|
|
127
|
+
runtime.update(child_plan.runtime_parameters)
|
|
128
|
+
child["runtime_parameters"] = runtime
|
|
129
|
+
child.pop("execution", None)
|
|
130
|
+
return child
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _summary(parent_run_id: str, windows: tuple[ExecutionWindow, ...], results: list[dict[str, Any]]) -> dict[str, Any]:
    """Assemble the parent-run summary for a windowed execution.

    Starts from a copy of whatever ``summarize_window_results`` returns for
    ``results`` and then sets the parent run identifiers, the planned and
    processed window counts, and the per-window results themselves.
    """
    report = dict(summarize_window_results(results))
    report["run_id"] = parent_run_id
    report["parent_run_id"] = parent_run_id
    # The two counts differ when the loop stopped before all windows ran.
    report["windows_total"] = len(windows)
    report["windows_processed"] = len(results)
    report["window_results"] = results
    return report
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _previous_watermark_start(
|
|
148
|
+
contract_mapping: dict[str, Any],
|
|
149
|
+
options: DatabricksIngestOptions,
|
|
150
|
+
query_one: QueryOne | None,
|
|
151
|
+
catchup: dict[str, Any],
|
|
152
|
+
) -> str:
|
|
153
|
+
if query_one is None:
|
|
154
|
+
raise ValueError("execution.catchup.start is required when query_one is not provided")
|
|
155
|
+
contract = semantic_contract_from_mapping(contract_mapping)
|
|
156
|
+
row = query_one(
|
|
157
|
+
render_select_previous_watermark_sql(
|
|
158
|
+
target_table=target_full_name(contract),
|
|
159
|
+
state_table=f"{options.catalog}.{options.schema}.ctrl_ingestion_state",
|
|
160
|
+
)
|
|
161
|
+
)
|
|
162
|
+
raw = row.get("watermark_value") if isinstance(row, dict) else None
|
|
163
|
+
start = extract_watermark_field_value(raw, catchup.get("column") or _single_watermark_column(contract_mapping))
|
|
164
|
+
if not start:
|
|
165
|
+
raise ValueError("execution.catchup.start is required when no previous watermark exists")
|
|
166
|
+
return start
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _single_watermark_column(contract_mapping: dict[str, Any]) -> str:
|
|
170
|
+
value = contract_mapping.get("watermark_columns")
|
|
171
|
+
columns = [value] if isinstance(value, str) else list(value or ())
|
|
172
|
+
columns = [str(item).strip() for item in columns if str(item).strip()]
|
|
173
|
+
if len(columns) != 1:
|
|
174
|
+
raise ValueError("execution.catchup.column is required unless exactly one watermark column is configured")
|
|
175
|
+
return columns[0]
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _child_view_name(contract: Any, view_name: str | None, index: int) -> str:
|
|
179
|
+
base = view_name or f"cf_source_{target_full_name(contract).replace('`', '').replace('.', '_')}"
|
|
180
|
+
return f"{base}_{index:04d}"
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _window_payload(window: ExecutionWindow, column: str) -> dict[str, str]:
|
|
184
|
+
return {"label": window.label, "column": column, "start": window.start, "end": window.end}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Dispatch prepared Databricks views to write-mode executors."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from contractforge_core.runtime import PreparedInput, QueryOne
|
|
6
|
+
from contractforge_core.semantic import SemanticContract
|
|
7
|
+
from contractforge_databricks.execution import (
|
|
8
|
+
ExecutionOutcome,
|
|
9
|
+
SqlRunner,
|
|
10
|
+
execute_append,
|
|
11
|
+
execute_hash_diff_insert,
|
|
12
|
+
execute_overwrite,
|
|
13
|
+
execute_replace_partitions,
|
|
14
|
+
execute_scd1_merge,
|
|
15
|
+
execute_scd2_merge,
|
|
16
|
+
execute_snapshot_soft_delete,
|
|
17
|
+
)
|
|
18
|
+
from contractforge_databricks.write_modes.registry import execute_registered_write_mode
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def execute_prepared_write(
    *,
    runner: SqlRunner,
    contract: SemanticContract,
    prepared: PreparedInput,
    target_partition_predicate: str | None = None,
    replace_partition_predicate: str | None = None,
    target_schema: dict[str, str] | None = None,
    query_one: QueryOne | None = None,
) -> ExecutionOutcome:
    """Run the executor that matches ``contract.write.mode``.

    Dispatch by mode:

    * ``scd0_append`` -> ``execute_append``
    * ``scd0_overwrite`` -> ``execute_overwrite``
    * ``scd1_upsert`` -> ``execute_replace_partitions`` when
      ``replace_partition_predicate`` is truthy, else ``execute_scd1_merge``
    * ``scd1_hash_diff`` -> ``execute_hash_diff_insert``
    * ``scd2_historical`` -> ``execute_scd2_merge``
    * ``snapshot_soft_delete`` -> ``execute_snapshot_soft_delete``
    * ``custom:*`` -> ``execute_registered_write_mode``

    Args:
        runner: SQL runner handed to the selected executor.
        contract: Contract whose ``write.mode`` selects the executor.
        prepared: Prepared input supplying ``source_view`` and
            ``source_columns``.
        target_partition_predicate: Forwarded to the SCD1 merge and to
            custom modes.
        replace_partition_predicate: Switches ``scd1_upsert`` to partition
            replacement; also forwarded to custom modes.
        target_schema: Forwarded to the hash-diff executor and custom modes.
        query_one: Forwarded to the hash-diff executor and custom modes.

    Returns:
        The outcome returned by the selected executor.

    Raises:
        ValueError: If the write mode matches none of the above.
    """
    # Built before the mode is read, so a malformed ``prepared`` fails first.
    shared = dict(runner=runner, contract=contract, source_view=prepared.source_view)
    mode = contract.write.mode

    if mode == "scd0_append":
        return execute_append(**shared)

    if mode == "scd0_overwrite":
        return execute_overwrite(**shared)

    if mode == "scd1_upsert":
        if not replace_partition_predicate:
            return execute_scd1_merge(
                **shared,
                source_columns=prepared.source_columns,
                target_partition_predicate=target_partition_predicate,
            )
        return execute_replace_partitions(**shared, predicate=replace_partition_predicate)

    if mode == "scd1_hash_diff":
        return execute_hash_diff_insert(
            **shared,
            source_columns=prepared.source_columns,
            target_schema=target_schema,
            query_one=query_one,
        )

    if mode == "scd2_historical":
        return execute_scd2_merge(**shared, insert_columns=prepared.source_columns)

    if mode == "snapshot_soft_delete":
        return execute_snapshot_soft_delete(**shared, source_columns=prepared.source_columns)

    if mode.startswith("custom:"):
        # Custom modes get the whole prepared input and every optional hint.
        return execute_registered_write_mode(
            mode,
            runner=runner,
            contract=contract,
            prepared=prepared,
            target_partition_predicate=target_partition_predicate,
            replace_partition_predicate=replace_partition_predicate,
            target_schema=target_schema,
            query_one=query_one,
        )

    raise ValueError(f"Unsupported Databricks write mode: {mode}")
|