contractforge-databricks 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. contractforge_databricks/__init__.py +172 -0
  2. contractforge_databricks/adapter.py +69 -0
  3. contractforge_databricks/annotations/__init__.py +10 -0
  4. contractforge_databricks/annotations/application.py +52 -0
  5. contractforge_databricks/annotations/audit.py +49 -0
  6. contractforge_databricks/annotations/sql.py +142 -0
  7. contractforge_databricks/api.py +65 -0
  8. contractforge_databricks/bundles/__init__.py +9 -0
  9. contractforge_databricks/bundles/assets.py +47 -0
  10. contractforge_databricks/bundles/project.py +213 -0
  11. contractforge_databricks/bundles/project_config.py +133 -0
  12. contractforge_databricks/capabilities/__init__.py +17 -0
  13. contractforge_databricks/capabilities/builders.py +43 -0
  14. contractforge_databricks/capabilities/evaluate.py +162 -0
  15. contractforge_databricks/capabilities/mapping.py +36 -0
  16. contractforge_databricks/capabilities/models.py +44 -0
  17. contractforge_databricks/capabilities/runtime.py +111 -0
  18. contractforge_databricks/capabilities/uc.py +47 -0
  19. contractforge_databricks/cli.py +196 -0
  20. contractforge_databricks/cli_deploy.py +98 -0
  21. contractforge_databricks/cli_governance.py +142 -0
  22. contractforge_databricks/cli_io.py +91 -0
  23. contractforge_databricks/cli_maintenance.py +69 -0
  24. contractforge_databricks/coercion.py +31 -0
  25. contractforge_databricks/contract_extensions.py +70 -0
  26. contractforge_databricks/cost/__init__.py +11 -0
  27. contractforge_databricks/cost/model.py +22 -0
  28. contractforge_databricks/cost/report.py +65 -0
  29. contractforge_databricks/cost/sql.py +136 -0
  30. contractforge_databricks/dashboards/__init__.py +15 -0
  31. contractforge_databricks/dashboards/control_tables.py +150 -0
  32. contractforge_databricks/diagnostics/__init__.py +7 -0
  33. contractforge_databricks/diagnostics/explain.py +40 -0
  34. contractforge_databricks/environment.py +53 -0
  35. contractforge_databricks/evidence/__init__.py +98 -0
  36. contractforge_databricks/evidence/ddl.py +35 -0
  37. contractforge_databricks/evidence/governance_log.py +175 -0
  38. contractforge_databricks/evidence/helpers.py +29 -0
  39. contractforge_databricks/evidence/ops_log.py +210 -0
  40. contractforge_databricks/evidence/records.py +27 -0
  41. contractforge_databricks/evidence/run_log.py +74 -0
  42. contractforge_databricks/evidence/schemas.py +7 -0
  43. contractforge_databricks/evidence/sql.py +144 -0
  44. contractforge_databricks/evidence/tables.py +20 -0
  45. contractforge_databricks/evidence/writer.py +118 -0
  46. contractforge_databricks/execution/__init__.py +70 -0
  47. contractforge_databricks/execution/delta_basic.py +57 -0
  48. contractforge_databricks/execution/hash_diff.py +126 -0
  49. contractforge_databricks/execution/hash_diff_latest.py +142 -0
  50. contractforge_databricks/execution/replace_partitions.py +40 -0
  51. contractforge_databricks/execution/results.py +5 -0
  52. contractforge_databricks/execution/retry.py +36 -0
  53. contractforge_databricks/execution/scd2.py +213 -0
  54. contractforge_databricks/execution/scd2_deletes.py +65 -0
  55. contractforge_databricks/execution/scd2_late.py +30 -0
  56. contractforge_databricks/execution/snapshot.py +77 -0
  57. contractforge_databricks/execution/sql_merge.py +85 -0
  58. contractforge_databricks/execution/tables.py +98 -0
  59. contractforge_databricks/execution/windows.py +58 -0
  60. contractforge_databricks/governance/__init__.py +30 -0
  61. contractforge_databricks/governance/access.py +185 -0
  62. contractforge_databricks/governance/application.py +93 -0
  63. contractforge_databricks/governance/drift.py +49 -0
  64. contractforge_databricks/governance/runtime.py +60 -0
  65. contractforge_databricks/governance/sql.py +31 -0
  66. contractforge_databricks/governance/validation.py +135 -0
  67. contractforge_databricks/lakeflow/__init__.py +21 -0
  68. contractforge_databricks/lakeflow/compatibility.py +194 -0
  69. contractforge_databricks/lakeflow/rendering.py +175 -0
  70. contractforge_databricks/lineage/__init__.py +7 -0
  71. contractforge_databricks/lineage/openlineage.py +182 -0
  72. contractforge_databricks/maintenance/__init__.py +27 -0
  73. contractforge_databricks/maintenance/retention.py +90 -0
  74. contractforge_databricks/maintenance/sql.py +68 -0
  75. contractforge_databricks/metrics/__init__.py +19 -0
  76. contractforge_databricks/metrics/history.py +21 -0
  77. contractforge_databricks/metrics/write.py +63 -0
  78. contractforge_databricks/operations/__init__.py +4 -0
  79. contractforge_databricks/operations/application.py +38 -0
  80. contractforge_databricks/operations/sql.py +95 -0
  81. contractforge_databricks/parity/__init__.py +18 -0
  82. contractforge_databricks/parity/catalog.py +59 -0
  83. contractforge_databricks/parity/models.py +7 -0
  84. contractforge_databricks/parity/scenarios.py +111 -0
  85. contractforge_databricks/partitioning/__init__.py +3 -0
  86. contractforge_databricks/partitioning/predicates.py +28 -0
  87. contractforge_databricks/preparation/__init__.py +47 -0
  88. contractforge_databricks/preparation/deduplicate.py +87 -0
  89. contractforge_databricks/preparation/encoding.py +37 -0
  90. contractforge_databricks/preparation/hashing.py +18 -0
  91. contractforge_databricks/preparation/pyspark.py +178 -0
  92. contractforge_databricks/preparation/pyspark_staging.py +70 -0
  93. contractforge_databricks/preparation/shape.py +209 -0
  94. contractforge_databricks/preparation/shape_validation.py +94 -0
  95. contractforge_databricks/preparation/staging.py +17 -0
  96. contractforge_databricks/preparation/zip_arrays.py +51 -0
  97. contractforge_databricks/presets/__init__.py +3 -0
  98. contractforge_databricks/presets/base.py +24 -0
  99. contractforge_databricks/presets/bronze.py +57 -0
  100. contractforge_databricks/presets/catalog.py +22 -0
  101. contractforge_databricks/presets/core.py +134 -0
  102. contractforge_databricks/presets/gold.py +62 -0
  103. contractforge_databricks/presets/modifiers.py +51 -0
  104. contractforge_databricks/presets/runtime.py +22 -0
  105. contractforge_databricks/presets/silver.py +101 -0
  106. contractforge_databricks/presets/write_engine.py +57 -0
  107. contractforge_databricks/quality/__init__.py +41 -0
  108. contractforge_databricks/quality/evaluation.py +178 -0
  109. contractforge_databricks/quality/persistence.py +81 -0
  110. contractforge_databricks/quality/registry.py +134 -0
  111. contractforge_databricks/quality/results.py +17 -0
  112. contractforge_databricks/quality/sql.py +113 -0
  113. contractforge_databricks/rendering/__init__.py +11 -0
  114. contractforge_databricks/rendering/bundle.py +93 -0
  115. contractforge_databricks/rendering/markdown.py +50 -0
  116. contractforge_databricks/rendering/names.py +56 -0
  117. contractforge_databricks/results.py +15 -0
  118. contractforge_databricks/runtime/__init__.py +101 -0
  119. contractforge_databricks/runtime/available_now.py +147 -0
  120. contractforge_databricks/runtime/bundles.py +211 -0
  121. contractforge_databricks/runtime/cache.py +20 -0
  122. contractforge_databricks/runtime/control_tables.py +19 -0
  123. contractforge_databricks/runtime/deploy.py +197 -0
  124. contractforge_databricks/runtime/detection.py +114 -0
  125. contractforge_databricks/runtime/dry_run.py +46 -0
  126. contractforge_databricks/runtime/errors.py +54 -0
  127. contractforge_databricks/runtime/file_selection.py +109 -0
  128. contractforge_databricks/runtime/finalization.py +168 -0
  129. contractforge_databricks/runtime/governance.py +37 -0
  130. contractforge_databricks/runtime/hooks.py +45 -0
  131. contractforge_databricks/runtime/http_file.py +37 -0
  132. contractforge_databricks/runtime/http_retry.py +15 -0
  133. contractforge_databricks/runtime/http_safety.py +9 -0
  134. contractforge_databricks/runtime/json_materialization.py +97 -0
  135. contractforge_databricks/runtime/lineage.py +164 -0
  136. contractforge_databricks/runtime/maintenance.py +43 -0
  137. contractforge_databricks/runtime/merge_validation.py +98 -0
  138. contractforge_databricks/runtime/metadata.py +21 -0
  139. contractforge_databricks/runtime/metrics.py +34 -0
  140. contractforge_databricks/runtime/models.py +32 -0
  141. contractforge_databricks/runtime/options.py +33 -0
  142. contractforge_databricks/runtime/orchestration_context.py +185 -0
  143. contractforge_databricks/runtime/orchestrator.py +147 -0
  144. contractforge_databricks/runtime/partitioning.py +93 -0
  145. contractforge_databricks/runtime/quality_quarantine.py +92 -0
  146. contractforge_databricks/runtime/rest_api.py +46 -0
  147. contractforge_databricks/runtime/rest_auth.py +21 -0
  148. contractforge_databricks/runtime/rest_pagination.py +21 -0
  149. contractforge_databricks/runtime/run_payload.py +177 -0
  150. contractforge_databricks/runtime/schema.py +106 -0
  151. contractforge_databricks/runtime/source_metadata.py +30 -0
  152. contractforge_databricks/runtime/source_registry.py +43 -0
  153. contractforge_databricks/runtime/source_schema.py +24 -0
  154. contractforge_databricks/runtime/sources.py +208 -0
  155. contractforge_databricks/runtime/spark.py +183 -0
  156. contractforge_databricks/runtime/spark_defaults.py +35 -0
  157. contractforge_databricks/runtime/storage_auth.py +132 -0
  158. contractforge_databricks/runtime/streaming.py +131 -0
  159. contractforge_databricks/runtime/success.py +104 -0
  160. contractforge_databricks/runtime/utils.py +52 -0
  161. contractforge_databricks/runtime/watermark.py +71 -0
  162. contractforge_databricks/runtime/windows.py +184 -0
  163. contractforge_databricks/runtime/write.py +66 -0
  164. contractforge_databricks/runtime/write_flow.py +146 -0
  165. contractforge_databricks/runtime/write_strategy.py +40 -0
  166. contractforge_databricks/schema/__init__.py +21 -0
  167. contractforge_databricks/schema/diff.py +11 -0
  168. contractforge_databricks/schema/policy.py +33 -0
  169. contractforge_databricks/schema/sync.py +23 -0
  170. contractforge_databricks/security/__init__.py +21 -0
  171. contractforge_databricks/security/errors.py +5 -0
  172. contractforge_databricks/security/redaction.py +5 -0
  173. contractforge_databricks/security/secrets.py +114 -0
  174. contractforge_databricks/security/source_policy.py +17 -0
  175. contractforge_databricks/shapes/__init__.py +3 -0
  176. contractforge_databricks/shapes/sql.py +123 -0
  177. contractforge_databricks/sources/__init__.py +67 -0
  178. contractforge_databricks/sources/artifacts.py +100 -0
  179. contractforge_databricks/sources/autoloader.py +48 -0
  180. contractforge_databricks/sources/bounded_streams.py +44 -0
  181. contractforge_databricks/sources/classification.py +115 -0
  182. contractforge_databricks/sources/delta_share.py +21 -0
  183. contractforge_databricks/sources/files.py +48 -0
  184. contractforge_databricks/sources/http_file.py +46 -0
  185. contractforge_databricks/sources/interpret.py +76 -0
  186. contractforge_databricks/sources/jdbc.py +32 -0
  187. contractforge_databricks/sources/metadata.py +18 -0
  188. contractforge_databricks/sources/native_passthrough.py +33 -0
  189. contractforge_databricks/sources/rds_iam.py +15 -0
  190. contractforge_databricks/sources/rds_iam_runtime.py +191 -0
  191. contractforge_databricks/sources/rest_api.py +33 -0
  192. contractforge_databricks/sources/support.py +50 -0
  193. contractforge_databricks/sources/table_refs.py +65 -0
  194. contractforge_databricks/sql/__init__.py +4 -0
  195. contractforge_databricks/sql/identifiers.py +17 -0
  196. contractforge_databricks/sql/literals.py +36 -0
  197. contractforge_databricks/state/__init__.py +39 -0
  198. contractforge_databricks/state/ddl.py +24 -0
  199. contractforge_databricks/state/migrations.py +146 -0
  200. contractforge_databricks/state/queries.py +149 -0
  201. contractforge_databricks/state/sql.py +116 -0
  202. contractforge_databricks/state/tables.py +9 -0
  203. contractforge_databricks/state/writer.py +83 -0
  204. contractforge_databricks/templates/__init__.py +15 -0
  205. contractforge_databricks/templates/catalog.py +205 -0
  206. contractforge_databricks/templates/catalog_parity.py +85 -0
  207. contractforge_databricks/templates/core.py +83 -0
  208. contractforge_databricks/templates/enrichment.py +175 -0
  209. contractforge_databricks/transforms/__init__.py +3 -0
  210. contractforge_databricks/transforms/sql.py +118 -0
  211. contractforge_databricks/watermark/__init__.py +6 -0
  212. contractforge_databricks/watermark/sql.py +91 -0
  213. contractforge_databricks/write_modes/__init__.py +20 -0
  214. contractforge_databricks/write_modes/registry.py +44 -0
  215. contractforge_databricks/write_modes/sql.py +33 -0
  216. contractforge_databricks/write_modes/strategy.py +192 -0
  217. contractforge_databricks-0.1.0.dist-info/METADATA +34 -0
  218. contractforge_databricks-0.1.0.dist-info/RECORD +220 -0
  219. contractforge_databricks-0.1.0.dist-info/WHEEL +4 -0
  220. contractforge_databricks-0.1.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,131 @@
1
+ """Runtime payload helpers for Databricks available-now streaming."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import datetime, timezone
6
+ from typing import Any
7
+
8
+ from contractforge_core.config import CTRL_SCHEMA_VERSION, FRAMEWORK_VERSION
9
+ from contractforge_core.semantic import SemanticContract
10
+ from contractforge_databricks.rendering.names import target_full_name
11
+
12
+
13
def stream_metrics_from_batches(batch_results: list[dict[str, Any]]) -> dict[str, int]:
    """Sum the row counters reported by each child batch ingestion.

    Args:
        batch_results: One result mapping per child batch. Missing or falsy
            counters count as zero.

    Returns:
        A mapping with the number of batches and the read / written /
        quarantined row totals.
    """

    totals: dict[str, int] = {"batches_processed": len(batch_results)}
    counter_pairs = (
        ("total_rows_read", "rows_read"),
        ("total_rows_written", "rows_written"),
        ("total_rows_quarantined", "rows_quarantined"),
    )
    for total_key, batch_key in counter_pairs:
        running = 0
        for batch in batch_results:
            # Falsy values (None, "", 0) are coerced to zero before summing.
            running += int(batch.get(batch_key) or 0)
        totals[total_key] = running
    return totals
22
+
23
+
24
def prefer_child_stream_metrics(local: dict[str, int], child: dict[str, int]) -> bool:
    """Decide whether persisted child-run metrics should replace local ones.

    Child metrics are never preferred when they report no processed batch.
    Otherwise they win when the local metrics have no batches, when the child
    saw more batches, or when the child accounts for more rows in total.
    """

    child_batches = child.get("batches_processed", 0)
    if child_batches <= 0:
        return False

    row_keys = ("total_rows_read", "total_rows_written", "total_rows_quarantined")
    local_rows = sum(local.get(key, 0) for key in row_keys)
    child_rows = sum(child.get(key, 0) for key in row_keys)

    local_batches = local.get("batches_processed", 0)
    if local_batches == 0:
        return True
    if child_batches > local_batches:
        return True
    return child_rows > local_rows
32
+
33
+
34
def stream_start_payload(
    contract: SemanticContract,
    *,
    stream_run_id: str,
    status: str = "RUNNING",
    started_at_utc: datetime | None = None,
    idempotency_key: str | None = None,
    idempotency_policy: str = "always_run",
    runtime_metadata: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Build the payload describing the start of a stream run.

    Values come from the contract's raw source mapping, the contract's
    operations metadata and the keyword arguments. An explicit
    ``idempotency_key`` wins over the operations metadata, while
    ``idempotency_policy`` is only a fallback for it. Every entry of
    ``runtime_metadata`` is merged last and therefore overrides any key with
    the same name.
    """
    raw_source = contract.source.raw or {}
    ops_meta = _operations_metadata(contract)
    extra = runtime_metadata or {}

    if idempotency_key is None:
        resolved_key = ops_meta.get("idempotency_key")
    else:
        resolved_key = idempotency_key

    payload: dict[str, Any] = {
        "stream_run_id": stream_run_id,
        "idempotency_key": resolved_key,
        "idempotency_policy": ops_meta.get("idempotency_policy") or idempotency_policy,
        "target_table": target_full_name(contract),
        "target_catalog": _target_catalog(contract),
        "target_layer": contract.target.layer,
        "runtime_entrypoint": extra.get("notebook_name") or ops_meta.get("notebook_name"),
        "source_type": raw_source.get("type") or contract.source.kind,
        "source_path": (
            raw_source.get("path")
            or raw_source.get("url")
            or raw_source.get("table")
            or contract.source.location
        ),
        "trigger": raw_source.get("trigger") or "available_now",
        "checkpoint_location": raw_source.get("progress_location") or raw_source.get("checkpoint_location"),
        "status": status,
        "started_at_utc": _timestamp(started_at_utc),
    }

    # Counters start at zero; a result payload overwrites them later.
    for counter_key in (
        "batches_processed",
        "total_rows_read",
        "total_rows_written",
        "total_rows_quarantined",
    ):
        payload[counter_key] = 0

    payload["framework_version"] = FRAMEWORK_VERSION
    payload["ctrl_schema_version"] = CTRL_SCHEMA_VERSION

    # Run-lineage identifiers are copied verbatim from the operations metadata.
    for lineage_key in ("master_job_id", "master_run_id", "parent_run_id", "run_group_id"):
        payload[lineage_key] = ops_meta.get(lineage_key)

    payload.update(extra)
    return payload
73
+
74
+
75
def stream_result_payload(
    contract: SemanticContract,
    *,
    stream_run_id: str,
    status: str,
    started_at_utc: datetime,
    batch_results: list[dict[str, Any]],
    stage_durations: dict[str, float] | None = None,
    error_message: str | None = None,
    skip_reason: str | None = None,
    skipped_by_stream_run_id: str | None = None,
    stream_metrics: dict[str, int] | None = None,
    runtime_metadata: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Build the payload describing a finished stream run.

    The payload starts from ``stream_start_payload`` and is then extended with
    the end timestamp, the run duration, aggregated metrics and outcome
    details.

    Args:
        contract: Contract the stream run belongs to.
        stream_run_id: Identifier of the stream run.
        status: Final status to record.
        started_at_utc: Start time of the run. A naive value is interpreted as
            UTC, matching how ``_timestamp`` renders it.
        batch_results: Result mappings of the child batch ingestions.
        stage_durations: Optional per-stage durations; defaults to ``{}``.
        error_message: Optional error text.
        skip_reason: Optional reason the run was skipped.
        skipped_by_stream_run_id: Optional id of the run that caused the skip.
        stream_metrics: Pre-aggregated metrics; when falsy they are computed
            from ``batch_results``.
        runtime_metadata: Extra entries forwarded to ``stream_start_payload``.

    Returns:
        The result payload as a plain dictionary.
    """
    finished = datetime.now(timezone.utc)
    # Subtracting a naive datetime from an aware one raises TypeError, so a
    # naive start is normalised to UTC before the duration is computed.
    if started_at_utc.tzinfo is None:
        started_aware = started_at_utc.replace(tzinfo=timezone.utc)
    else:
        started_aware = started_at_utc
    metrics = stream_metrics or stream_metrics_from_batches(batch_results)
    return {
        **stream_start_payload(
            contract,
            stream_run_id=stream_run_id,
            status=status,
            started_at_utc=started_at_utc,
            runtime_metadata=runtime_metadata,
        ),
        "ended_at_utc": _timestamp(finished),
        "duration_seconds": (finished - started_aware).total_seconds(),
        "batches_processed": metrics["batches_processed"],
        "total_rows_read": metrics["total_rows_read"],
        "total_rows_written": metrics["total_rows_written"],
        "total_rows_quarantined": metrics["total_rows_quarantined"],
        "batch_results": batch_results,
        "stage_durations": stage_durations or {},
        "error_message": error_message,
        "skip_reason": skip_reason,
        "skipped_by_stream_run_id": skipped_by_stream_run_id,
    }
111
+
112
+
113
+ def _int_metric(payload: dict[str, Any], key: str) -> int:
114
+ return int(payload.get(key) or 0)
115
+
116
+
117
+ def _timestamp(value: datetime | None) -> str:
118
+ value = value or datetime.now(timezone.utc)
119
+ if value.tzinfo is None:
120
+ value = value.replace(tzinfo=timezone.utc)
121
+ return value.astimezone(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
122
+
123
+
124
+ def _operations_metadata(contract: SemanticContract) -> dict[str, Any]:
125
+ return dict(contract.operations.metadata or {}) if contract.operations and contract.operations.metadata else {}
126
+
127
+
128
+ def _target_catalog(contract: SemanticContract) -> str | None:
129
+ if not contract.target.namespace:
130
+ return None
131
+ return contract.target.namespace.split(".", 1)[0]
@@ -0,0 +1,104 @@
1
+ """Successful Databricks runtime finalization."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from contractforge_core.execution import ExecutionOutcome
8
+ from contractforge_core.quality import QualityRuleResult
9
+ from contractforge_core.runtime import PreparedInput, QueryOne
10
+ from contractforge_core.semantic import SemanticContract
11
+ from contractforge_databricks.evidence import EvidenceWriter
12
+ from contractforge_databricks.runtime.finalization import finalize_ingest
13
+ from contractforge_databricks.runtime.lineage import write_runtime_diagnostics
14
+ from contractforge_databricks.runtime.metrics import collect_write_metrics
15
+ from contractforge_databricks.runtime.models import DatabricksIngestOptions
16
+ from contractforge_databricks.runtime.utils import utc_now_str
17
+ from contractforge_databricks.runtime.watermark import collect_previous_watermark, collect_watermark_candidate
18
+ from contractforge_databricks.state import StateWriter
19
+
20
+
21
def finalize_success(
    *,
    evidence: EvidenceWriter,
    state: StateWriter,
    contract: SemanticContract,
    prepared: PreparedInput,
    opts: DatabricksIngestOptions,
    run_id: str,
    target: str,
    started: str,
    outcome: ExecutionOutcome | None,
    logical_rows_written: int,
    quality_status_value: str,
    schema_changes: dict[str, Any],
    governance_results: dict[str, Any],
    query_one: QueryOne | None,
    quality_results: tuple[QualityRuleResult, ...] = (),
    write_started_at: str | None = None,
    write_finished_at: str | None = None,
    stage_durations: dict[str, float] | None = None,
) -> dict[str, Any]:
    """Finalize a run that completed with status ``SUCCESS``.

    Collects write metrics and watermark values, writes runtime diagnostics
    and then delegates to ``finalize_ingest``, returning its result. A
    ``watermark_previous`` entry in the prepared input's source metadata takes
    precedence over the value read from the state table. ``outcome`` is
    accepted but not used by this function.
    """
    final_status = "SUCCESS"

    written, op_metrics = collect_write_metrics(
        contract=contract,
        target_table=target,
        rows_written=logical_rows_written,
        query_one=query_one,
    )
    wm_column, wm_current = collect_watermark_candidate(
        contract=contract,
        prepared=prepared,
        query_one=query_one,
    )
    _, stored_previous = collect_previous_watermark(
        contract=contract,
        query_one=query_one,
        catalog=opts.catalog,
        schema=opts.schema,
    )

    # A previous watermark carried by the source metadata overrides the stored one.
    metadata = prepared.source_metadata or {}
    wm_previous = metadata.get("watermark_previous") or stored_previous

    run_diagnostics = write_runtime_diagnostics(
        runner=evidence.runner,
        contract=contract,
        prepared=prepared,
        run_id=run_id,
        target=target,
        status=final_status,
        started=started,
        finished=_utc_now(),
        rows_written=written,
        operation_metrics=op_metrics,
        catalog=opts.catalog,
        schema=opts.schema,
        query_one=query_one,
        runtime_metadata=opts.runtime_metadata,
    )

    finalize_kwargs: dict[str, Any] = {
        "rows_written": written,
        "quality_status_value": quality_status_value,
        "quality_results": quality_results,
        "operation_metrics": op_metrics,
        "schema_changes": schema_changes,
        "governance_results": governance_results,
        "write_started_at": write_started_at,
        "write_finished_at": write_finished_at,
        "stage_durations": stage_durations,
        "watermark_column": wm_column,
        "watermark_previous": wm_previous,
        "watermark_current": wm_current,
        "diagnostics": run_diagnostics,
    }
    return finalize_ingest(
        evidence,
        state,
        contract,
        prepared,
        opts,
        run_id,
        target,
        final_status,
        started,
        **finalize_kwargs,
    )
101
+
102
+
103
def _utc_now() -> str:
    """Return the current time string produced by ``utc_now_str``."""
    now_text = utc_now_str()
    return now_text
@@ -0,0 +1,52 @@
1
+ """Small Databricks runtime utility helpers without Spark import requirements."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import uuid
6
+ from datetime import datetime, timezone
7
+ from typing import Any, Callable, Iterable
8
+
9
+
10
def utc_now_ts() -> datetime:
    """Return the current time as a timezone-aware UTC datetime."""
    utc = timezone.utc
    return datetime.now(tz=utc)
12
+
13
+
14
def utc_now_str() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    pattern = "%Y-%m-%d %H:%M:%S"
    current = utc_now_ts()
    return current.strftime(pattern)
16
+
17
+
18
def today_str() -> str:
    """Return the current UTC date formatted as ``YYYY-MM-DD``."""
    pattern = "%Y-%m-%d"
    current = utc_now_ts()
    return current.strftime(pattern)
20
+
21
+
22
def new_run_id() -> str:
    """Return a freshly generated random UUID (version 4) as a string."""
    generated = uuid.uuid4()
    return f"{generated}"
24
+
25
+
26
def resolve_run_id(run_id: str | None, run_id_factory: Callable[[], str] | None = None) -> str:
    """Pick the run id to use.

    A truthy ``run_id`` is returned unchanged. Otherwise the factory, when
    given, supplies the id (converted with ``str``); failing that a new
    ``run-<uuid4>`` id is generated.
    """
    if not run_id:
        if not run_id_factory:
            return "run-" + str(uuid.uuid4())
        return str(run_id_factory())
    return run_id
32
+
33
+
34
def safe_truncate(text: str | None, max_len: int = 100_000) -> str | None:
    """Cut ``text`` to ``max_len`` characters and append a truncation marker.

    ``None`` and text that already fits are returned unchanged.
    """
    if text is not None and len(text) > max_len:
        clipped = text[:max_len]
        return clipped + "\n...TRUNCATED..."
    return text
38
+
39
+
40
def as_list(value: str | Iterable[Any] | None, sep: str = "|") -> list[str]:
    """Normalise ``value`` into a list of non-empty, stripped strings.

    A string is split on ``sep``; any other iterable has each item converted
    with ``str``. Falsy input yields an empty list.
    """
    if not value:
        return []
    if isinstance(value, str):
        pieces = value.split(sep)
    else:
        pieces = (str(item) for item in value)

    cleaned: list[str] = []
    for piece in pieces:
        trimmed = piece.strip()
        if trimmed:
            cleaned.append(trimmed)
    return cleaned
46
+
47
+
48
def validate_columns(df: Any, columns: Iterable[str], context: str = "columns") -> None:
    """Raise ``ValueError`` when any of ``columns`` is absent from ``df.columns``.

    An object without a ``columns`` attribute (or with a falsy one) is treated
    as having no columns.
    """
    known = set(getattr(df, "columns", ()) or ())
    absent = []
    for name in columns:
        if name not in known:
            absent.append(name)
    if absent:
        raise ValueError(f"{context} not found: {absent}")
@@ -0,0 +1,71 @@
1
+ """Runtime watermark collection for Databricks prepared views."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from contractforge_core.runtime import PreparedInput, QueryOne
8
+ from contractforge_core.semantic import SemanticContract
9
+ from contractforge_databricks.rendering.names import target_full_name
10
+ from contractforge_databricks.state.queries import render_select_previous_watermark_sql
11
+ from contractforge_databricks.watermark import render_select_watermark_candidate_sql
12
+
13
+
14
def collect_watermark_candidate(
    *,
    contract: SemanticContract,
    prepared: PreparedInput,
    query_one: QueryOne | None,
) -> tuple[str | None, str | None]:
    """Query the prepared source view for the watermark candidate value.

    Returns:
        ``(None, None)`` when the contract configures no watermark columns or
        no ``query_one`` callable is available; otherwise the watermark
        columns joined with ``|`` and the queried value as a string (``None``
        when the query yields no value).
    """
    columns = _watermark_columns(contract)
    if query_one is None or not columns:
        return (None, None)

    candidate_sql = render_select_watermark_candidate_sql(
        table_name=prepared.source_view,
        columns=columns,
        types=prepared.source_schema,
    )
    candidate = _row_value(query_one(candidate_sql), "watermark_value")
    column_label = "|".join(columns)
    if candidate is None:
        return (column_label, None)
    return (column_label, str(candidate))
32
+
33
+
34
def collect_previous_watermark(
    *,
    contract: SemanticContract,
    query_one: QueryOne | None,
    catalog: str = "main",
    schema: str = "ops",
) -> tuple[str | None, str | None]:
    """Read the previously stored watermark for the contract's target table.

    The value is looked up in ``<catalog>.<schema>.ctrl_ingestion_state``.

    Returns:
        ``(None, None)`` when the contract configures no watermark columns or
        no ``query_one`` callable is available; otherwise the watermark
        columns joined with ``|`` and the stored value as a string (``None``
        when the query yields no value).
    """
    columns = _watermark_columns(contract)
    if query_one is None or not columns:
        return (None, None)

    state_table = f"{catalog}.{schema}.ctrl_ingestion_state"
    previous_sql = render_select_previous_watermark_sql(
        target_table=target_full_name(contract),
        state_table=state_table,
    )
    stored = _row_value(query_one(previous_sql), "watermark_value")
    column_label = "|".join(columns)
    if stored is None:
        return (column_label, None)
    return (column_label, str(stored))
52
+
53
+
54
+ def _watermark_columns(contract: SemanticContract) -> tuple[str, ...]:
55
+ metadata = contract.operations.metadata if contract.operations and contract.operations.metadata else {}
56
+ value = metadata.get("watermark_columns")
57
+ if isinstance(value, str):
58
+ return tuple(part.strip() for part in value.split(",") if part.strip())
59
+ if isinstance(value, (list, tuple)):
60
+ return tuple(str(part).strip() for part in value if str(part).strip())
61
+ return ()
62
+
63
+
64
+ def _row_value(row: Any, key: str) -> Any:
65
+ if row is None:
66
+ return None
67
+ if isinstance(row, dict):
68
+ return row.get(key)
69
+ if hasattr(row, "asDict"):
70
+ return row.asDict().get(key)
71
+ return getattr(row, key, None)
@@ -0,0 +1,184 @@
1
+ """Databricks runtime orchestration for execution windows."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import replace
6
+ from typing import Any
7
+ from uuid import uuid4
8
+
9
+ from contractforge_core.contracts import semantic_contract_from_mapping
10
+ from contractforge_core.execution import ExecutionWindow, build_time_windows, summarize_window_results
11
+ from contractforge_core.quality import QualityRuleResult
12
+ from contractforge_core.runtime import QueryOne
13
+ from contractforge_core.watermark import extract_watermark_field_value
14
+ from contractforge_databricks.execution import build_child_window_plan
15
+ from contractforge_databricks.rendering.names import target_full_name
16
+ from contractforge_databricks.runtime.models import DatabricksIngestOptions
17
+ from contractforge_databricks.runtime.orchestrator import ingest_databricks_contract
18
+ from contractforge_databricks.runtime.sources import prepare_contract_source_view
19
+ from contractforge_databricks.state.queries import render_select_previous_watermark_sql
20
+
21
+
22
def has_windowed_execution(contract_mapping: dict[str, Any]) -> bool:
    """Tell whether the contract mapping asks for windowed execution.

    True when ``execution.window`` is a mapping or ``execution.catchup`` is a
    mapping with a truthy ``enabled`` flag.
    """
    execution = contract_mapping.get("execution")
    if not isinstance(execution, dict):
        return False
    if isinstance(execution.get("window"), dict):
        return True
    catchup = execution.get("catchup")
    return isinstance(catchup, dict) and bool(catchup.get("enabled"))
28
+
29
+
30
def ingest_windowed_databricks_contract(
    contract_mapping: dict[str, Any],
    *,
    spark: Any,
    runner: Any,
    options: DatabricksIngestOptions,
    query_one: QueryOne | None = None,
    quality_results: tuple[QualityRuleResult, ...] = (),
    view_name: str | None = None,
    collect_metrics: bool = False,
) -> dict[str, Any]:
    """Run one child ingestion per execution window and summarise the results.

    For every window a child contract mapping is derived, its source view is
    prepared and the child contract is ingested under the run id
    ``<parent_run_id>:window:<index>``. Processing stops after a child result
    with status ``FAILED`` unless the window configuration sets
    ``stop_on_failure`` to a falsy value.

    Returns:
        The summary built by ``_summary`` for the windows that were processed.
    """
    config = _window_config(contract_mapping, options=options, query_one=query_one)
    planned_windows = _windows(config)
    parent_id = options.run_id or f"run-{uuid4()}"

    window_results: list[dict[str, Any]] = []
    for position, planned in enumerate(planned_windows, start=1):
        # Looked up per iteration so a missing key only fails when a window exists.
        column = str(config["column"])
        plan = build_child_window_plan(
            parent_run_id=parent_id,
            column=column,
            window=planned,
            index=position,
            existing_filter=contract_mapping.get("filter_expression"),
            base_idempotency_key=contract_mapping.get("idempotency_key"),
        )
        child_contract = semantic_contract_from_mapping(_child_contract_mapping(contract_mapping, plan))
        child_options = replace(
            options,
            run_id=f"{parent_id}:window:{position:04d}",
            idempotency_key=plan.idempotency_key or options.idempotency_key,
        )
        prepared_view = prepare_contract_source_view(
            spark,
            child_contract,
            view_name=_child_view_name(child_contract, view_name, position),
            collect_metrics=collect_metrics,
            query_one=query_one,
            evidence_catalog=child_options.catalog,
            evidence_schema=child_options.schema,
        )
        child_result = ingest_databricks_contract(
            child_contract,
            runner=runner,
            prepared=prepared_view,
            options=child_options,
            query_one=query_one,
            quality_results=quality_results,
        )
        child_result["execution_window"] = _window_payload(plan.window, column)
        window_results.append(child_result)

        failed = child_result.get("status") == "FAILED"
        if failed and config.get("stop_on_failure", True):
            break

    return _summary(parent_id, planned_windows, window_results)
83
+
84
+
85
+ def _window_config(
86
+ contract_mapping: dict[str, Any],
87
+ *,
88
+ options: DatabricksIngestOptions,
89
+ query_one: QueryOne | None,
90
+ ) -> dict[str, Any]:
91
+ execution = dict(contract_mapping.get("execution") or {})
92
+ if isinstance(execution.get("window"), dict):
93
+ return dict(execution["window"])
94
+ catchup = dict(execution.get("catchup") or {})
95
+ if not catchup.get("enabled"):
96
+ raise ValueError("windowed execution requires execution.window or enabled execution.catchup")
97
+ start = catchup.get("start") or _previous_watermark_start(contract_mapping, options, query_one, catchup)
98
+ return {
99
+ "column": catchup.get("column") or _single_watermark_column(contract_mapping),
100
+ "start": start,
101
+ "end": catchup.get("end"),
102
+ "every": catchup.get("every"),
103
+ "stop_on_failure": catchup.get("stop_on_failure", True),
104
+ }
105
+
106
+
107
+ def _windows(config: dict[str, Any]) -> tuple[ExecutionWindow, ...]:
108
+ explicit = config.get("windows")
109
+ if explicit:
110
+ return tuple(
111
+ ExecutionWindow(start=str(item["start"]), end=str(item["end"]), label=str(item.get("label") or ""))
112
+ for item in explicit
113
+ )
114
+ missing = [key for key in ("column", "start", "end", "every") if not config.get(key)]
115
+ if missing:
116
+ raise ValueError(f"execution window requires: {', '.join(missing)}")
117
+ return build_time_windows(str(config["start"]), str(config["end"]), str(config["every"]))
118
+
119
+
120
+ def _child_contract_mapping(contract_mapping: dict[str, Any], child_plan: Any) -> dict[str, Any]:
121
+ child = dict(contract_mapping)
122
+ child["filter_expression"] = child_plan.filter_expression
123
+ child["parent_run_id"] = child_plan.parent_run_id
124
+ if child_plan.idempotency_key:
125
+ child["idempotency_key"] = child_plan.idempotency_key
126
+ runtime = dict(child.get("runtime_parameters") or {})
127
+ runtime.update(child_plan.runtime_parameters)
128
+ child["runtime_parameters"] = runtime
129
+ child.pop("execution", None)
130
+ return child
131
+
132
+
133
def _summary(parent_run_id: str, windows: tuple[ExecutionWindow, ...], results: list[dict[str, Any]]) -> dict[str, Any]:
    """Combine the core window summary with parent-run bookkeeping fields.

    The parent run id is reported both as ``run_id`` and ``parent_run_id``,
    alongside the number of planned and processed windows and the raw
    per-window results.
    """
    report = dict(summarize_window_results(results))
    report["run_id"] = parent_run_id
    report["parent_run_id"] = parent_run_id
    report["windows_total"] = len(windows)
    report["windows_processed"] = len(results)
    report["window_results"] = results
    return report
145
+
146
+
147
def _previous_watermark_start(
    contract_mapping: dict[str, Any],
    options: DatabricksIngestOptions,
    query_one: QueryOne | None,
    catchup: dict[str, Any],
) -> str:
    """Derive the catch-up start from the previously stored watermark.

    The stored watermark of the contract's target table is read from
    ``<catalog>.<schema>.ctrl_ingestion_state`` and the value for the catch-up
    column is extracted from it.

    Args:
        contract_mapping: Raw contract mapping.
        options: Ingest options providing the state table's catalog and schema.
        query_one: Callable that runs a query and returns one row, or ``None``.
        catchup: The ``execution.catchup`` configuration.

    Returns:
        The start value extracted from the previous watermark.

    Raises:
        ValueError: If ``query_one`` is missing or no previous watermark value
            can be found for the column.
    """
    if query_one is None:
        raise ValueError("execution.catchup.start is required when query_one is not provided")
    contract = semantic_contract_from_mapping(contract_mapping)
    row = query_one(
        render_select_previous_watermark_sql(
            target_table=target_full_name(contract),
            state_table=f"{options.catalog}.{options.schema}.ctrl_ingestion_state",
        )
    )
    # Accept the same row shapes as the watermark collectors do: mappings,
    # row objects exposing ``asDict()`` and attribute-style rows. Reading only
    # dicts would silently discard a stored watermark returned in another shape.
    if row is None:
        raw = None
    elif isinstance(row, dict):
        raw = row.get("watermark_value")
    elif hasattr(row, "asDict"):
        raw = row.asDict().get("watermark_value")
    else:
        raw = getattr(row, "watermark_value", None)
    start = extract_watermark_field_value(raw, catchup.get("column") or _single_watermark_column(contract_mapping))
    if not start:
        raise ValueError("execution.catchup.start is required when no previous watermark exists")
    return start
167
+
168
+
169
+ def _single_watermark_column(contract_mapping: dict[str, Any]) -> str:
170
+ value = contract_mapping.get("watermark_columns")
171
+ columns = [value] if isinstance(value, str) else list(value or ())
172
+ columns = [str(item).strip() for item in columns if str(item).strip()]
173
+ if len(columns) != 1:
174
+ raise ValueError("execution.catchup.column is required unless exactly one watermark column is configured")
175
+ return columns[0]
176
+
177
+
178
+ def _child_view_name(contract: Any, view_name: str | None, index: int) -> str:
179
+ base = view_name or f"cf_source_{target_full_name(contract).replace('`', '').replace('.', '_')}"
180
+ return f"{base}_{index:04d}"
181
+
182
+
183
+ def _window_payload(window: ExecutionWindow, column: str) -> dict[str, str]:
184
+ return {"label": window.label, "column": column, "start": window.start, "end": window.end}
@@ -0,0 +1,66 @@
1
+ """Dispatch prepared Databricks views to write-mode executors."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from contractforge_core.runtime import PreparedInput, QueryOne
6
+ from contractforge_core.semantic import SemanticContract
7
+ from contractforge_databricks.execution import (
8
+ ExecutionOutcome,
9
+ SqlRunner,
10
+ execute_append,
11
+ execute_hash_diff_insert,
12
+ execute_overwrite,
13
+ execute_replace_partitions,
14
+ execute_scd1_merge,
15
+ execute_scd2_merge,
16
+ execute_snapshot_soft_delete,
17
+ )
18
+ from contractforge_databricks.write_modes.registry import execute_registered_write_mode
19
+
20
+
21
def execute_prepared_write(
    *,
    runner: SqlRunner,
    contract: SemanticContract,
    prepared: PreparedInput,
    target_partition_predicate: str | None = None,
    replace_partition_predicate: str | None = None,
    target_schema: dict[str, str] | None = None,
    query_one: QueryOne | None = None,
) -> ExecutionOutcome:
    """Run the executor that matches ``contract.write.mode`` on the prepared view.

    Every built-in executor receives ``runner``, ``contract`` and
    ``prepared.source_view``; the remaining arguments are forwarded only to
    the executors that accept them. Modes starting with ``custom:`` are
    delegated to ``execute_registered_write_mode`` with all arguments.

    Raises:
        ValueError: If the write mode is neither built in nor ``custom:``-prefixed.
    """
    common = {"runner": runner, "contract": contract, "source_view": prepared.source_view}
    mode = contract.write.mode

    if mode == "scd0_append":
        return execute_append(**common)

    if mode == "scd0_overwrite":
        return execute_overwrite(**common)

    if mode == "scd1_upsert":
        # A replace-partition predicate switches the upsert to a partition replacement.
        if not replace_partition_predicate:
            return execute_scd1_merge(
                **common,
                source_columns=prepared.source_columns,
                target_partition_predicate=target_partition_predicate,
            )
        return execute_replace_partitions(**common, predicate=replace_partition_predicate)

    if mode == "scd1_hash_diff":
        return execute_hash_diff_insert(
            **common,
            source_columns=prepared.source_columns,
            target_schema=target_schema,
            query_one=query_one,
        )

    if mode == "scd2_historical":
        return execute_scd2_merge(**common, insert_columns=prepared.source_columns)

    if mode == "snapshot_soft_delete":
        return execute_snapshot_soft_delete(**common, source_columns=prepared.source_columns)

    if mode.startswith("custom:"):
        return execute_registered_write_mode(
            mode,
            runner=runner,
            contract=contract,
            prepared=prepared,
            target_partition_predicate=target_partition_predicate,
            replace_partition_predicate=replace_partition_predicate,
            target_schema=target_schema,
            query_one=query_one,
        )

    raise ValueError(f"Unsupported Databricks write mode: {mode}")