contractforge-databricks 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. contractforge_databricks/__init__.py +172 -0
  2. contractforge_databricks/adapter.py +69 -0
  3. contractforge_databricks/annotations/__init__.py +10 -0
  4. contractforge_databricks/annotations/application.py +52 -0
  5. contractforge_databricks/annotations/audit.py +49 -0
  6. contractforge_databricks/annotations/sql.py +142 -0
  7. contractforge_databricks/api.py +65 -0
  8. contractforge_databricks/bundles/__init__.py +9 -0
  9. contractforge_databricks/bundles/assets.py +47 -0
  10. contractforge_databricks/bundles/project.py +213 -0
  11. contractforge_databricks/bundles/project_config.py +133 -0
  12. contractforge_databricks/capabilities/__init__.py +17 -0
  13. contractforge_databricks/capabilities/builders.py +43 -0
  14. contractforge_databricks/capabilities/evaluate.py +162 -0
  15. contractforge_databricks/capabilities/mapping.py +36 -0
  16. contractforge_databricks/capabilities/models.py +44 -0
  17. contractforge_databricks/capabilities/runtime.py +111 -0
  18. contractforge_databricks/capabilities/uc.py +47 -0
  19. contractforge_databricks/cli.py +196 -0
  20. contractforge_databricks/cli_deploy.py +98 -0
  21. contractforge_databricks/cli_governance.py +142 -0
  22. contractforge_databricks/cli_io.py +91 -0
  23. contractforge_databricks/cli_maintenance.py +69 -0
  24. contractforge_databricks/coercion.py +31 -0
  25. contractforge_databricks/contract_extensions.py +70 -0
  26. contractforge_databricks/cost/__init__.py +11 -0
  27. contractforge_databricks/cost/model.py +22 -0
  28. contractforge_databricks/cost/report.py +65 -0
  29. contractforge_databricks/cost/sql.py +136 -0
  30. contractforge_databricks/dashboards/__init__.py +15 -0
  31. contractforge_databricks/dashboards/control_tables.py +150 -0
  32. contractforge_databricks/diagnostics/__init__.py +7 -0
  33. contractforge_databricks/diagnostics/explain.py +40 -0
  34. contractforge_databricks/environment.py +53 -0
  35. contractforge_databricks/evidence/__init__.py +98 -0
  36. contractforge_databricks/evidence/ddl.py +35 -0
  37. contractforge_databricks/evidence/governance_log.py +175 -0
  38. contractforge_databricks/evidence/helpers.py +29 -0
  39. contractforge_databricks/evidence/ops_log.py +210 -0
  40. contractforge_databricks/evidence/records.py +27 -0
  41. contractforge_databricks/evidence/run_log.py +74 -0
  42. contractforge_databricks/evidence/schemas.py +7 -0
  43. contractforge_databricks/evidence/sql.py +144 -0
  44. contractforge_databricks/evidence/tables.py +20 -0
  45. contractforge_databricks/evidence/writer.py +118 -0
  46. contractforge_databricks/execution/__init__.py +70 -0
  47. contractforge_databricks/execution/delta_basic.py +57 -0
  48. contractforge_databricks/execution/hash_diff.py +126 -0
  49. contractforge_databricks/execution/hash_diff_latest.py +142 -0
  50. contractforge_databricks/execution/replace_partitions.py +40 -0
  51. contractforge_databricks/execution/results.py +5 -0
  52. contractforge_databricks/execution/retry.py +36 -0
  53. contractforge_databricks/execution/scd2.py +213 -0
  54. contractforge_databricks/execution/scd2_deletes.py +65 -0
  55. contractforge_databricks/execution/scd2_late.py +30 -0
  56. contractforge_databricks/execution/snapshot.py +77 -0
  57. contractforge_databricks/execution/sql_merge.py +85 -0
  58. contractforge_databricks/execution/tables.py +98 -0
  59. contractforge_databricks/execution/windows.py +58 -0
  60. contractforge_databricks/governance/__init__.py +30 -0
  61. contractforge_databricks/governance/access.py +185 -0
  62. contractforge_databricks/governance/application.py +93 -0
  63. contractforge_databricks/governance/drift.py +49 -0
  64. contractforge_databricks/governance/runtime.py +60 -0
  65. contractforge_databricks/governance/sql.py +31 -0
  66. contractforge_databricks/governance/validation.py +135 -0
  67. contractforge_databricks/lakeflow/__init__.py +21 -0
  68. contractforge_databricks/lakeflow/compatibility.py +194 -0
  69. contractforge_databricks/lakeflow/rendering.py +175 -0
  70. contractforge_databricks/lineage/__init__.py +7 -0
  71. contractforge_databricks/lineage/openlineage.py +182 -0
  72. contractforge_databricks/maintenance/__init__.py +27 -0
  73. contractforge_databricks/maintenance/retention.py +90 -0
  74. contractforge_databricks/maintenance/sql.py +68 -0
  75. contractforge_databricks/metrics/__init__.py +19 -0
  76. contractforge_databricks/metrics/history.py +21 -0
  77. contractforge_databricks/metrics/write.py +63 -0
  78. contractforge_databricks/operations/__init__.py +4 -0
  79. contractforge_databricks/operations/application.py +38 -0
  80. contractforge_databricks/operations/sql.py +95 -0
  81. contractforge_databricks/parity/__init__.py +18 -0
  82. contractforge_databricks/parity/catalog.py +59 -0
  83. contractforge_databricks/parity/models.py +7 -0
  84. contractforge_databricks/parity/scenarios.py +111 -0
  85. contractforge_databricks/partitioning/__init__.py +3 -0
  86. contractforge_databricks/partitioning/predicates.py +28 -0
  87. contractforge_databricks/preparation/__init__.py +47 -0
  88. contractforge_databricks/preparation/deduplicate.py +87 -0
  89. contractforge_databricks/preparation/encoding.py +37 -0
  90. contractforge_databricks/preparation/hashing.py +18 -0
  91. contractforge_databricks/preparation/pyspark.py +178 -0
  92. contractforge_databricks/preparation/pyspark_staging.py +70 -0
  93. contractforge_databricks/preparation/shape.py +209 -0
  94. contractforge_databricks/preparation/shape_validation.py +94 -0
  95. contractforge_databricks/preparation/staging.py +17 -0
  96. contractforge_databricks/preparation/zip_arrays.py +51 -0
  97. contractforge_databricks/presets/__init__.py +3 -0
  98. contractforge_databricks/presets/base.py +24 -0
  99. contractforge_databricks/presets/bronze.py +57 -0
  100. contractforge_databricks/presets/catalog.py +22 -0
  101. contractforge_databricks/presets/core.py +134 -0
  102. contractforge_databricks/presets/gold.py +62 -0
  103. contractforge_databricks/presets/modifiers.py +51 -0
  104. contractforge_databricks/presets/runtime.py +22 -0
  105. contractforge_databricks/presets/silver.py +101 -0
  106. contractforge_databricks/presets/write_engine.py +57 -0
  107. contractforge_databricks/quality/__init__.py +41 -0
  108. contractforge_databricks/quality/evaluation.py +178 -0
  109. contractforge_databricks/quality/persistence.py +81 -0
  110. contractforge_databricks/quality/registry.py +134 -0
  111. contractforge_databricks/quality/results.py +17 -0
  112. contractforge_databricks/quality/sql.py +113 -0
  113. contractforge_databricks/rendering/__init__.py +11 -0
  114. contractforge_databricks/rendering/bundle.py +93 -0
  115. contractforge_databricks/rendering/markdown.py +50 -0
  116. contractforge_databricks/rendering/names.py +56 -0
  117. contractforge_databricks/results.py +15 -0
  118. contractforge_databricks/runtime/__init__.py +101 -0
  119. contractforge_databricks/runtime/available_now.py +147 -0
  120. contractforge_databricks/runtime/bundles.py +211 -0
  121. contractforge_databricks/runtime/cache.py +20 -0
  122. contractforge_databricks/runtime/control_tables.py +19 -0
  123. contractforge_databricks/runtime/deploy.py +197 -0
  124. contractforge_databricks/runtime/detection.py +114 -0
  125. contractforge_databricks/runtime/dry_run.py +46 -0
  126. contractforge_databricks/runtime/errors.py +54 -0
  127. contractforge_databricks/runtime/file_selection.py +109 -0
  128. contractforge_databricks/runtime/finalization.py +168 -0
  129. contractforge_databricks/runtime/governance.py +37 -0
  130. contractforge_databricks/runtime/hooks.py +45 -0
  131. contractforge_databricks/runtime/http_file.py +37 -0
  132. contractforge_databricks/runtime/http_retry.py +15 -0
  133. contractforge_databricks/runtime/http_safety.py +9 -0
  134. contractforge_databricks/runtime/json_materialization.py +97 -0
  135. contractforge_databricks/runtime/lineage.py +164 -0
  136. contractforge_databricks/runtime/maintenance.py +43 -0
  137. contractforge_databricks/runtime/merge_validation.py +98 -0
  138. contractforge_databricks/runtime/metadata.py +21 -0
  139. contractforge_databricks/runtime/metrics.py +34 -0
  140. contractforge_databricks/runtime/models.py +32 -0
  141. contractforge_databricks/runtime/options.py +33 -0
  142. contractforge_databricks/runtime/orchestration_context.py +185 -0
  143. contractforge_databricks/runtime/orchestrator.py +147 -0
  144. contractforge_databricks/runtime/partitioning.py +93 -0
  145. contractforge_databricks/runtime/quality_quarantine.py +92 -0
  146. contractforge_databricks/runtime/rest_api.py +46 -0
  147. contractforge_databricks/runtime/rest_auth.py +21 -0
  148. contractforge_databricks/runtime/rest_pagination.py +21 -0
  149. contractforge_databricks/runtime/run_payload.py +177 -0
  150. contractforge_databricks/runtime/schema.py +106 -0
  151. contractforge_databricks/runtime/source_metadata.py +30 -0
  152. contractforge_databricks/runtime/source_registry.py +43 -0
  153. contractforge_databricks/runtime/source_schema.py +24 -0
  154. contractforge_databricks/runtime/sources.py +208 -0
  155. contractforge_databricks/runtime/spark.py +183 -0
  156. contractforge_databricks/runtime/spark_defaults.py +35 -0
  157. contractforge_databricks/runtime/storage_auth.py +132 -0
  158. contractforge_databricks/runtime/streaming.py +131 -0
  159. contractforge_databricks/runtime/success.py +104 -0
  160. contractforge_databricks/runtime/utils.py +52 -0
  161. contractforge_databricks/runtime/watermark.py +71 -0
  162. contractforge_databricks/runtime/windows.py +184 -0
  163. contractforge_databricks/runtime/write.py +66 -0
  164. contractforge_databricks/runtime/write_flow.py +146 -0
  165. contractforge_databricks/runtime/write_strategy.py +40 -0
  166. contractforge_databricks/schema/__init__.py +21 -0
  167. contractforge_databricks/schema/diff.py +11 -0
  168. contractforge_databricks/schema/policy.py +33 -0
  169. contractforge_databricks/schema/sync.py +23 -0
  170. contractforge_databricks/security/__init__.py +21 -0
  171. contractforge_databricks/security/errors.py +5 -0
  172. contractforge_databricks/security/redaction.py +5 -0
  173. contractforge_databricks/security/secrets.py +114 -0
  174. contractforge_databricks/security/source_policy.py +17 -0
  175. contractforge_databricks/shapes/__init__.py +3 -0
  176. contractforge_databricks/shapes/sql.py +123 -0
  177. contractforge_databricks/sources/__init__.py +67 -0
  178. contractforge_databricks/sources/artifacts.py +100 -0
  179. contractforge_databricks/sources/autoloader.py +48 -0
  180. contractforge_databricks/sources/bounded_streams.py +44 -0
  181. contractforge_databricks/sources/classification.py +115 -0
  182. contractforge_databricks/sources/delta_share.py +21 -0
  183. contractforge_databricks/sources/files.py +48 -0
  184. contractforge_databricks/sources/http_file.py +46 -0
  185. contractforge_databricks/sources/interpret.py +76 -0
  186. contractforge_databricks/sources/jdbc.py +32 -0
  187. contractforge_databricks/sources/metadata.py +18 -0
  188. contractforge_databricks/sources/native_passthrough.py +33 -0
  189. contractforge_databricks/sources/rds_iam.py +15 -0
  190. contractforge_databricks/sources/rds_iam_runtime.py +191 -0
  191. contractforge_databricks/sources/rest_api.py +33 -0
  192. contractforge_databricks/sources/support.py +50 -0
  193. contractforge_databricks/sources/table_refs.py +65 -0
  194. contractforge_databricks/sql/__init__.py +4 -0
  195. contractforge_databricks/sql/identifiers.py +17 -0
  196. contractforge_databricks/sql/literals.py +36 -0
  197. contractforge_databricks/state/__init__.py +39 -0
  198. contractforge_databricks/state/ddl.py +24 -0
  199. contractforge_databricks/state/migrations.py +146 -0
  200. contractforge_databricks/state/queries.py +149 -0
  201. contractforge_databricks/state/sql.py +116 -0
  202. contractforge_databricks/state/tables.py +9 -0
  203. contractforge_databricks/state/writer.py +83 -0
  204. contractforge_databricks/templates/__init__.py +15 -0
  205. contractforge_databricks/templates/catalog.py +205 -0
  206. contractforge_databricks/templates/catalog_parity.py +85 -0
  207. contractforge_databricks/templates/core.py +83 -0
  208. contractforge_databricks/templates/enrichment.py +175 -0
  209. contractforge_databricks/transforms/__init__.py +3 -0
  210. contractforge_databricks/transforms/sql.py +118 -0
  211. contractforge_databricks/watermark/__init__.py +6 -0
  212. contractforge_databricks/watermark/sql.py +91 -0
  213. contractforge_databricks/write_modes/__init__.py +20 -0
  214. contractforge_databricks/write_modes/registry.py +44 -0
  215. contractforge_databricks/write_modes/sql.py +33 -0
  216. contractforge_databricks/write_modes/strategy.py +192 -0
  217. contractforge_databricks-0.1.0.dist-info/METADATA +34 -0
  218. contractforge_databricks-0.1.0.dist-info/RECORD +220 -0
  219. contractforge_databricks-0.1.0.dist-info/WHEEL +4 -0
  220. contractforge_databricks-0.1.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,109 @@
1
+ """Databricks runtime file path selection helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import re
7
+ from typing import Any
8
+
9
+
10
def selected_file_load_path(spark: Any, source: dict[str, Any], options: dict[str, str]) -> object:
    """Return the load path for a file source, filtered by ``source.read.file_regex``.

    When ``source["path"]`` is falsy, or no ``file_regex`` is configured, the
    original path value is returned untouched. Otherwise the files under the
    path are listed and the ones whose candidate text matches the regex
    (``re.search`` semantics) are returned as a list of strings.

    Args:
        spark: Session object handed to ``_listed_files`` for the listing.
        source: Source declaration; ``path`` and the optional ``read`` mapping
            (``file_regex``, ``file_regex_scope``, ``file_regex_max_listed``,
            ``file_regex_recursive``, ``files``) are consulted.
        options: Reader options; only ``recursiveFileLookup`` is read, as the
            default for ``file_regex_recursive``.

    Raises:
        ValueError: If the regex does not compile, the scope is not
            ``filename``/``relative_path``, or no listed file matches.
    """
    path = source.get("path")
    if not path:
        return path
    read = source.get("read") if isinstance(source.get("read"), dict) else {}
    pattern_text = str(read.get("file_regex") or "").strip()
    if not pattern_text:
        return path
    try:
        pattern = re.compile(pattern_text)
    except re.error as exc:
        raise ValueError(f"source.read.file_regex is invalid: {exc}") from exc
    scope = str(read.get("file_regex_scope") or "relative_path").strip().lower()
    if scope not in {"filename", "relative_path"}:
        raise ValueError("source.read.file_regex_scope must be 'filename' or 'relative_path'")
    max_listed = _positive_int(read.get("file_regex_max_listed"), "source.read.file_regex_max_listed", 10000)
    recursive = _bool(read.get("file_regex_recursive"), _bool(options.get("recursiveFileLookup"), False))
    listed = _listed_files(spark, str(path), recursive=recursive, max_files=max_listed, declared=read.get("files"))
    root = str(path).rstrip("/")
    # Require a separator after the root so "/data/in" is not treated as the
    # parent of "/data/input/x.csv". An empty root (path was "/") keeps the
    # plain "strip leading slashes" behaviour.
    prefix = f"{root}/" if root else ""
    matched = []
    for file_path in listed:
        file_text = str(file_path)
        file_name = os.path.basename(file_text)
        if scope == "filename":
            candidate = file_name
        elif file_text.startswith(prefix):
            candidate = file_text[len(prefix) :].lstrip("/")
        else:
            # Not located under the root (or the root itself is the file):
            # the basename is the only meaningful relative form.
            candidate = file_name
        if pattern.search(candidate):
            matched.append(file_text)
    if not matched:
        raise ValueError(
            "source.read.file_regex found no matching files. "
            f"pattern={pattern_text!r}, scope={scope}, listed_files={len(listed)}"
        )
    return matched
42
+
43
+
44
+ def _listed_files(
45
+ spark: Any,
46
+ path: str,
47
+ *,
48
+ recursive: bool,
49
+ max_files: int,
50
+ declared: object,
51
+ ) -> list[str]:
52
+ if isinstance(declared, (list, tuple)):
53
+ files = [str(item) for item in declared]
54
+ if len(files) > max_files:
55
+ raise ValueError(f"source.read.file_regex exceeded source.read.file_regex_max_listed={max_files}")
56
+ return files
57
+ jvm = getattr(spark, "_jvm", None)
58
+ jsc = getattr(spark, "_jsc", None)
59
+ if jvm is None or jsc is None:
60
+ raise RuntimeError(
61
+ "source.read.file_regex requires Hadoop FileSystem access through classic PySpark. "
62
+ "In Spark Connect/serverless, use pathGlobFilter, a filtered External Location/Volume path, "
63
+ "or provide an explicit source.read.files list."
64
+ )
65
+ return _hadoop_list_files(jvm, jsc, path, recursive=recursive, max_files=max_files)
66
+
67
+
68
def _hadoop_list_files(jvm: Any, jsc: Any, path: str, *, recursive: bool, max_files: int) -> list[str]:
    """List files under ``path`` using the Hadoop FileSystem bound to the session.

    If ``path`` itself is a file, only that file is returned. For a directory,
    its direct file children are collected; sub-directories are descended
    into only when ``recursive`` is true and are skipped otherwise. Each
    collected path goes through ``_append``, which enforces ``max_files``.
    """
    start = jvm.org.apache.hadoop.fs.Path(path)
    filesystem = start.getFileSystem(jsc.hadoopConfiguration())
    collected: list[str] = []

    def walk(node: Any) -> None:
        node_status = filesystem.getFileStatus(node)
        if node_status.isFile():
            _append(collected, str(node_status.getPath().toString()), max_files)
            return
        for entry in filesystem.listStatus(node):
            if not entry.isDirectory():
                _append(collected, str(entry.getPath().toString()), max_files)
            elif recursive:
                walk(entry.getPath())

    walk(start)
    return collected
87
+
88
+
89
+ def _append(files: list[str], path: str, max_files: int) -> None:
90
+ files.append(path)
91
+ if len(files) > max_files:
92
+ raise ValueError(f"source.read.file_regex exceeded source.read.file_regex_max_listed={max_files}")
93
+
94
+
95
+ def _bool(value: object, default: bool) -> bool:
96
+ if value is None:
97
+ return default
98
+ if isinstance(value, bool):
99
+ return value
100
+ return str(value).strip().lower() in {"1", "true", "yes", "y"}
101
+
102
+
103
+ def _positive_int(value: object, field: str, default: int) -> int:
104
+ if value in (None, ""):
105
+ return default
106
+ parsed = int(value)
107
+ if parsed <= 0:
108
+ raise ValueError(f"{field} must be a positive integer")
109
+ return parsed
@@ -0,0 +1,168 @@
1
+ """Finalize Databricks runtime ingestion with evidence and state."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import datetime, timezone
6
+ from typing import Any
7
+
8
+ from contractforge_core.config import CTRL_SCHEMA_VERSION, FRAMEWORK_VERSION
9
+ from contractforge_core.quality import QualityRuleResult
10
+ from contractforge_core.runtime import PreparedInput, QuarantineReference
11
+ from contractforge_core.semantic import SemanticContract
12
+ from contractforge_databricks.evidence import EvidenceWriter, SourceMetadataEvidenceRecord
13
+ from contractforge_databricks.quality import render_quality_result_insert_sql, render_quarantine_reference_insert_sql
14
+ from contractforge_databricks.runtime.models import DatabricksIngestOptions
15
+ from contractforge_databricks.runtime.run_payload import run_payload
16
+ from contractforge_databricks.runtime.utils import utc_now_str
17
+ from contractforge_databricks.state import StateWriter
18
+
19
+
20
def finalize_ingest(
    evidence: EvidenceWriter,
    state: StateWriter,
    contract: SemanticContract,
    prepared: PreparedInput,
    opts: DatabricksIngestOptions,
    run_id: str,
    target: str,
    status: str,
    started: str,
    *,
    rows_written: int,
    quality_status_value: str,
    quality_results: tuple[QualityRuleResult, ...] = (),
    operation_metrics: dict[str, Any] | None = None,
    schema_changes: dict[str, Any] | None = None,
    governance_results: dict[str, Any] | None = None,
    write_started_at: str | None = None,
    write_finished_at: str | None = None,
    stage_durations: dict[str, float] | None = None,
    watermark_column: str | None = None,
    watermark_previous: str | None = None,
    watermark_current: str | None = None,
    diagnostics: dict[str, bool] | None = None,
    error_message: str | None = None,
    skip_reason: str | None = None,
    skipped_by_run_id: str | None = None,
) -> dict[str, Any]:
    """Build the run payload and, unless this is a dry run, persist it.

    The finish time is taken when this function is entered. The payload is
    always built through ``run_payload`` and always returned. When
    ``opts.dry_run`` is false the function additionally writes, in order:
    the run log, one row per quality result, one row per quarantine
    reference, the source metadata (only if ``prepared.source_metadata`` is
    truthy), the control metadata versions and finally the state upsert.

    ``None`` values for the optional mapping arguments are passed to
    ``run_payload`` as empty dicts. Success-only state fields
    (``success_at_utc``, ``write_completed_at_utc``) are populated only when
    ``status == "SUCCESS"``.
    """
    finished = _utc_now()
    payload = run_payload(
        contract,
        prepared,
        opts,
        run_id,
        target,
        status,
        started,
        finished,
        rows_written,
        quality_status_value,
        operation_metrics or {},
        schema_changes or {},
        governance_results or {},
        write_started_at,
        write_finished_at,
        stage_durations or {},
        watermark_column,
        watermark_previous,
        watermark_current,
        diagnostics or {},
        error_message,
        skip_reason,
        skipped_by_run_id,
    )
    if opts.dry_run:
        # Dry runs only report what would have been recorded.
        return payload

    if contract.operations and contract.operations.metadata:
        operations = contract.operations.metadata
    else:
        operations = {}

    evidence.write_run_log(payload)
    recorded_at = payload["finished_at_utc"]
    _write_quality_results(evidence, run_id, target, quality_results, recorded_at, opts)
    _write_quarantine_references(evidence, run_id, target, prepared.quarantine_records, recorded_at, opts)
    if prepared.source_metadata:
        metadata_record = SourceMetadataEvidenceRecord(
            run_id=run_id,
            target_table=target,
            source_metadata=dict(prepared.source_metadata),
            captured_at_utc=_parse_utc(finished=recorded_at),
        )
        evidence.write_source_metadata(metadata_record)

    state.record_control_metadata(
        framework_version=FRAMEWORK_VERSION,
        ctrl_schema_version=CTRL_SCHEMA_VERSION,
    )
    succeeded = status == "SUCCESS"
    state.upsert_state(
        target_table=target,
        run_id=run_id,
        status=status,
        rows_written=rows_written,
        watermark_column=watermark_column,
        watermark_value=watermark_current,
        success_at_utc=finished if succeeded else None,
        watermark_candidate=watermark_current,
        table_version=payload.get("table_version_after"),
        write_completed_at_utc=write_finished_at if succeeded else None,
        error_message=error_message,
        parent_run_id=operations.get("parent_run_id"),
        run_group_id=operations.get("run_group_id"),
        master_job_id=operations.get("master_job_id"),
        master_run_id=operations.get("master_run_id"),
    )
    return payload
110
+
111
+
112
def _utc_now() -> str:
    """Return the current timestamp string produced by ``utc_now_str``."""
    now_text = utc_now_str()
    return now_text
114
+
115
+
116
def _write_quality_results(
    evidence: EvidenceWriter,
    run_id: str,
    target: str,
    results: tuple[QualityRuleResult, ...],
    checked_at: object,
    opts: DatabricksIngestOptions,
) -> None:
    """Insert one evidence row per quality rule result.

    ``checked_at`` is normalised once through ``_parse_utc`` and shared by
    every row. Each result is rendered to an INSERT statement and executed
    immediately through ``evidence.runner.sql``, in the order given.
    """
    stamp = _parse_utc(finished=checked_at)
    for rule_result in results:
        statement = render_quality_result_insert_sql(
            run_id=run_id,
            target_table=target,
            result=rule_result,
            checked_at_utc=stamp,
            catalog=opts.catalog,
            schema=opts.schema,
        )
        evidence.runner.sql(statement)
136
+
137
+
138
def _write_quarantine_references(
    evidence: EvidenceWriter,
    run_id: str,
    target: str,
    records: tuple[QuarantineReference, ...],
    quarantined_at: object,
    opts: DatabricksIngestOptions,
) -> None:
    """Insert one evidence row per quarantined record reference.

    The stored reason is ``"<rule_name>: <reason>"`` when the record carries
    a truthy rule name and the bare reason otherwise. ``quarantined_at`` is
    normalised once through ``_parse_utc`` and shared by every row.
    """
    stamp = _parse_utc(finished=quarantined_at)
    for entry in records:
        if entry.rule_name:
            reason = f"{entry.rule_name}: {entry.reason}"
        else:
            reason = entry.reason
        statement = render_quarantine_reference_insert_sql(
            run_id=run_id,
            target_table=target,
            record_ref=entry.record_ref,
            reason=reason,
            quarantined_at_utc=stamp,
            catalog=opts.catalog,
            schema=opts.schema,
        )
        evidence.runner.sql(statement)
160
+
161
+
162
+ def _parse_utc(*, finished: object) -> datetime:
163
+ if isinstance(finished, datetime):
164
+ return finished
165
+ try:
166
+ return datetime.strptime(str(finished), "%Y-%m-%d %H:%M:%S").replace(tzinfo=timezone.utc)
167
+ except ValueError:
168
+ return datetime.now(timezone.utc)
@@ -0,0 +1,37 @@
1
+ """Runtime governance side effects for Databricks ingestion."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import asdict
6
+ from typing import Any
7
+
8
+ from contractforge_core.semantic import SemanticContract
9
+ from contractforge_databricks.annotations import apply_annotations_contract
10
+ from contractforge_databricks.environment import DatabricksEnvironment
11
+ from contractforge_databricks.execution import SqlRunner
12
+ from contractforge_databricks.operations import record_operations_contract
13
+
14
+
15
def apply_runtime_governance(
    *,
    runner: SqlRunner,
    contract: SemanticContract,
    run_id: str,
    evidence_catalog: str,
    evidence_schema: str,
) -> dict[str, Any]:
    """Record the operations contract and apply annotations for one run.

    Returns a dict with three entries: ``operations`` and ``annotations``
    (the respective result objects converted with ``dataclasses.asdict``)
    and ``access``, which only reports ``DEFERRED`` when the contract
    declares governance access rules and ``NOT_CONFIGURED`` otherwise.

    Raises:
        ValueError: If the annotations result has status ``FAILED``. This is
            raised after both side effects have already run.
    """
    environment = DatabricksEnvironment(evidence_catalog=evidence_catalog, evidence_schema=evidence_schema)
    operations = record_operations_contract(
        runner=runner,
        contract=contract,
        environment=environment,
        run_id=run_id,
    )
    annotations = apply_annotations_contract(runner=runner, contract=contract)

    operations_view = asdict(operations)
    annotations_view = asdict(annotations)
    if contract.governance and contract.governance.access:
        access_view = {"status": "DEFERRED"}
    else:
        access_view = {"status": "NOT_CONFIGURED"}
    result = {
        "operations": operations_view,
        "annotations": annotations_view,
        "access": access_view,
    }

    if annotations.status == "FAILED":
        raise ValueError(f"Databricks annotations failed: {list(annotations.errors)}")
    return result
@@ -0,0 +1,45 @@
1
+ """Programmatic hooks for Databricks runtime orchestration."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Callable, Optional
7
+
8
+ from contractforge_core.execution import ExecutionOutcome
9
+ from contractforge_core.runtime import PreparedInput
10
+ from contractforge_core.semantic import SemanticContract
11
+
12
+ PreparedHook = Callable[[SemanticContract, PreparedInput], Optional[PreparedInput]]
13
+ AfterWriteHook = Callable[[SemanticContract, PreparedInput, Optional[ExecutionOutcome]], None]
14
+ AfterFinalizeHook = Callable[[SemanticContract, dict[str, object]], None]
15
+
16
+
17
@dataclass(frozen=True)
class DatabricksIngestionHooks:
    """Optional callbacks around the Databricks prepared-view runtime boundary.

    Every hook defaults to ``None`` (not installed). Construction fails with
    ``ValueError`` if a hook is set to something that is not callable.
    """

    # Hooks typed as PreparedHook may return a replacement PreparedInput.
    after_prepare: PreparedHook | None = None
    before_write: PreparedHook | None = None
    # Notification-style hooks; their alias return type is None.
    after_write: AfterWriteHook | None = None
    after_finalize: AfterFinalizeHook | None = None

    def __post_init__(self) -> None:
        """Reject non-callable hook values, checking fields in declaration order."""
        for hook_name in ("after_prepare", "before_write", "after_write", "after_finalize"):
            candidate = getattr(self, hook_name)
            if candidate is None or callable(candidate):
                continue
            raise ValueError(f"DatabricksIngestionHooks.{hook_name} must be callable")
31
+
32
+
33
def apply_prepared_hook(
    hook: PreparedHook | None,
    contract: SemanticContract,
    prepared: PreparedInput,
) -> PreparedInput:
    """Run an optional prepared-input hook and return the input to use next.

    With no hook, or when the hook returns ``None``, ``prepared`` is returned
    unchanged. A ``PreparedInput`` returned by the hook replaces it.

    Raises:
        ValueError: If the hook returns anything other than ``None`` or a
            ``PreparedInput``.
    """
    replacement = None if hook is None else hook(contract, prepared)
    if replacement is None:
        return prepared
    if isinstance(replacement, PreparedInput):
        return replacement
    raise ValueError("Databricks prepared hooks must return PreparedInput or None")
@@ -0,0 +1,37 @@
1
+ """Databricks runtime execution for bounded HTTP file sources."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from contractforge_core.connectors import (
8
+ cleanup_http_file_downloads as cleanup_http_file_downloads,
9
+ download_http_file,
10
+ http_file_format,
11
+ http_file_reader_options,
12
+ is_http_file_source,
13
+ )
14
+ from contractforge_databricks.runtime.source_schema import apply_declared_schema
15
+
16
+
17
def resolve_http_file_dataframe(spark: Any, source: dict[str, Any]) -> Any:
    """Download a bounded HTTP file and load it with Spark's native reader.

    Reader options are applied in sorted key order, followed by the declared
    schema, before the downloaded local path is loaded. The result is then
    checked against ``source.limits.max_records``.

    Raises:
        ValueError: If ``source`` is not an HTTP file source, or the loaded
            data exceeds the configured record limit.
    """

    if not is_http_file_source(source):
        raise ValueError("HTTP file runtime resolution requires source.type http_file/http_csv/http_json/http_text")
    downloaded_path = download_http_file(source)
    reader = spark.read.format(http_file_format(source))
    reader_options = http_file_reader_options(source)
    for option_name in sorted(reader_options):
        reader = reader.option(option_name, reader_options[option_name])
    reader = apply_declared_schema(reader, source)
    frame = reader.load(downloaded_path)
    _enforce_max_records(frame, source)
    return frame
30
+
31
+ def _enforce_max_records(df: Any, source: dict[str, Any]) -> None:
32
+ max_records = source.get("limits", {}).get("max_records")
33
+ if max_records is None or not hasattr(df, "count"):
34
+ return
35
+ count = int(df.count())
36
+ if count > int(max_records):
37
+ raise ValueError(f"HTTP file response exceeds source.limits.max_records={int(max_records)}")
@@ -0,0 +1,15 @@
1
+ """Compatibility re-exports for the core HTTP retry policy."""
2
+
3
+ from contractforge_core.connectors.api.rest.retry import (
4
+ RETRYABLE_HTTP_STATUS,
5
+ is_retryable_http_error,
6
+ is_retryable_network_error,
7
+ sleep_retry_backoff,
8
+ )
9
+
10
+ __all__ = [
11
+ "RETRYABLE_HTTP_STATUS",
12
+ "is_retryable_http_error",
13
+ "is_retryable_network_error",
14
+ "sleep_retry_backoff",
15
+ ]
@@ -0,0 +1,9 @@
1
+ """Compatibility re-exports for the core HTTP target safety policy."""
2
+
3
+ from contractforge_core.connectors.api.rest.safety import (
4
+ ALLOWED_SCHEMES,
5
+ ALLOW_PRIVATE_FLAG,
6
+ validate_http_target,
7
+ )
8
+
9
+ __all__ = ["ALLOWED_SCHEMES", "ALLOW_PRIVATE_FLAG", "validate_http_target"]
@@ -0,0 +1,97 @@
1
+ """JSON record materialization helpers for Databricks runtime connectors."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ import uuid
8
+ from collections.abc import Mapping
9
+ from typing import Any
10
+
11
+
12
def materialize_json_records(
    spark: Any,
    records: list[Any],
    *,
    schema: str | None = None,
    read_options: Mapping[str, Any] | None = None,
    staging_path: str | None = None,
) -> Any:
    """Turn in-memory JSON-like records into a DataFrame.

    Non-mapping records are wrapped as ``{"value": record}``. Strategies are
    tried in this order:

    1. no records: an empty frame with ``schema`` (or ``"value string"``);
    2. session has ``sparkContext`` and ``read``: serialise to JSON lines,
       parallelise them and parse with the JSON reader;
    3. a staging directory is configured and the session has ``read``:
       write a JSON-lines file there and read it back;
    4. ``createDataFrame`` on the records; if that fails and the session has
       ``read`` a ``ValueError`` with remediation advice is raised, and
       otherwise ``createDataFrame`` is retried with lists JSON-encoded.
    """
    if not records:
        return spark.createDataFrame([], schema or "value string").limit(0)

    rows = [item if isinstance(item, Mapping) else {"value": item} for item in records]

    if hasattr(spark, "sparkContext") and hasattr(spark, "read"):
        encoded = [json.dumps(row, default=str, ensure_ascii=False) for row in rows]
        reader = _json_reader(spark, read_options, schema=schema)
        return reader.json(spark.sparkContext.parallelize(encoded))

    staging_dir = _json_staging_dir(staging_path)
    if staging_dir and hasattr(spark, "read"):
        reader = _json_reader(spark, read_options, schema=schema)
        return reader.json(_write_json_lines_file(rows, staging_dir))

    try:
        return _create_dataframe(spark, rows, schema)
    except Exception as exc:
        if not hasattr(spark, "read"):
            # No reader to fall back on: flatten lists to JSON text and retry.
            return _create_dataframe(spark, [_json_safe_record(row) for row in rows], schema)
        raise ValueError(
            "Could not materialize complex JSON records with createDataFrame. "
            "Declare source.read.staging_path or CONTRACTFORGE_SOURCE_JSON_STAGING_DIR with a local path "
            "accessible to the Python driver and Spark reader, or use source.response.mode=raw with shape.parse_json."
        ) from exc
39
+
40
+
41
+ def _create_dataframe(spark: Any, records: list[Any], schema: str | None) -> Any:
42
+ if schema is None:
43
+ return spark.createDataFrame(records)
44
+ try:
45
+ return spark.createDataFrame(records, schema=schema)
46
+ except TypeError as exc:
47
+ if "schema" not in str(exc):
48
+ raise
49
+ return spark.createDataFrame(records, schema)
50
+
51
+
52
+ def _json_reader(spark: Any, options: Mapping[str, Any] | None, *, schema: str | None = None) -> Any:
53
+ reader = spark.read
54
+ if schema:
55
+ reader = reader.schema(schema)
56
+ if options is None:
57
+ return reader
58
+ if not isinstance(options, Mapping):
59
+ raise ValueError("source.read.json_options must be an object")
60
+ for key, value in options.items():
61
+ option_key = str(key).strip()
62
+ if not option_key:
63
+ raise ValueError("source.read.json_options cannot contain an empty key")
64
+ reader = reader.option(option_key, str(value).lower() if isinstance(value, bool) else str(value))
65
+ return reader
66
+
67
+
68
+ def _json_staging_dir(staging_path: str | None) -> str | None:
69
+ raw = str(staging_path or os.environ.get("CONTRACTFORGE_SOURCE_JSON_STAGING_DIR") or "").strip()
70
+ if not raw:
71
+ return None
72
+ if "://" in raw and not raw.startswith("file:"):
73
+ raise ValueError(
74
+ "source.read.staging_path for JSON materialization must be a local filesystem path "
75
+ "accessible to the Python driver and Spark reader, for example /Volumes/... or file:/..."
76
+ )
77
+ return raw
78
+
79
+
80
+ def _write_json_lines_file(records: list[Mapping[str, Any]], staging_dir: str) -> str:
81
+ use_file_uri = staging_dir.startswith("file:")
82
+ local_dir = staging_dir[5:] if use_file_uri else staging_dir
83
+ os.makedirs(local_dir, exist_ok=True)
84
+ path = os.path.join(local_dir, f"{uuid.uuid4().hex}.jsonl")
85
+ with open(path, "w", encoding="utf-8") as handle:
86
+ for record in records:
87
+ handle.write(json.dumps(record, default=str, ensure_ascii=False))
88
+ handle.write("\n")
89
+ return f"file:{path}" if use_file_uri else path
90
+
91
+
92
+ def _json_safe_record(value: Any) -> Any:
93
+ if isinstance(value, Mapping):
94
+ return {str(key): _json_safe_record(item) for key, item in value.items()}
95
+ if isinstance(value, list):
96
+ return json.dumps(value, default=str, ensure_ascii=False)
97
+ return value
@@ -0,0 +1,164 @@
1
+ """Runtime explain and OpenLineage evidence for Databricks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import datetime
6
+ from typing import Any
7
+
8
+ from contractforge_core.diagnostics import ExplainPlanRecord
9
+ from contractforge_core.runtime import PreparedInput, QueryOne
10
+ from contractforge_core.semantic import SemanticContract
11
+ from contractforge_databricks.contract_extensions import databricks_extensions
12
+ from contractforge_databricks.diagnostics import render_explain_insert_sql
13
+ from contractforge_databricks.execution import SqlRunner
14
+ from contractforge_databricks.lineage import render_openlineage_insert_sql
15
+ from contractforge_databricks.sql import quote_table_name
16
+
17
+
18
def write_runtime_diagnostics(
    *,
    runner: SqlRunner,
    contract: SemanticContract,
    prepared: PreparedInput,
    run_id: str,
    target: str,
    status: str,
    started: str,
    finished: str,
    rows_written: int,
    operation_metrics: dict[str, Any],
    catalog: str,
    schema: str,
    query_one: QueryOne | None,
    runtime_metadata: dict[str, Any] | None = None,
) -> dict[str, bool]:
    """Write explain-plan and OpenLineage evidence for one run.

    Resolves the Databricks extensions of ``contract`` once, then calls
    ``_write_explain`` followed by ``_write_openlineage`` with the relevant
    subset of the arguments.

    Returns:
        A dict with ``explain_captured`` and ``openlineage_event_emitted``,
        each holding the boolean returned by the corresponding helper.
    """
    extensions = databricks_extensions(contract)

    # Keyword arguments that both helpers accept.
    shared: dict[str, Any] = {
        "runner": runner,
        "contract": contract,
        "prepared": prepared,
        "run_id": run_id,
        "target": target,
        "extensions": extensions,
        "catalog": catalog,
        "schema": schema,
    }

    explain_captured = _write_explain(query_one=query_one, **shared)
    event_emitted = _write_openlineage(
        status=status,
        started=started,
        finished=finished,
        rows_written=rows_written,
        operation_metrics=operation_metrics,
        runtime_metadata=runtime_metadata,
        **shared,
    )
    return {
        "explain_captured": explain_captured,
        "openlineage_event_emitted": event_emitted,
    }
64
+
65
+
66
def _write_explain(
    *,
    runner: SqlRunner,
    contract: SemanticContract,
    prepared: PreparedInput,
    run_id: str,
    target: str,
    extensions: dict[str, Any],
    catalog: str,
    schema: str,
    query_one: QueryOne | None,
) -> bool:
    """Capture the EXPLAIN output for the prepared source view and store it.

    Nothing happens unless ``extensions["explain_mode"]`` is truthy and a
    ``query_one`` callable is available. The explain format comes from
    ``extensions["explain_format"]`` and defaults to ``"formatted"``.

    Returns:
        ``True`` when a plan was read and the insert statement was handed to
        ``runner.sql``; ``False`` when capture is disabled or no plan value
        was found on the returned row.
    """
    if query_one is None or not extensions.get("explain_mode"):
        return False

    explain_format = str(extensions.get("explain_format") or "formatted")
    source_table = quote_table_name(prepared.source_view)
    row = query_one(f"EXPLAIN {explain_format.upper()} SELECT * FROM {source_table}")

    # Take the first truthy candidate column; when none is truthy the value of
    # the last candidate is kept, so only a missing value (None) aborts below.
    plan_text = None
    for column in ("plan_text", "plan", "explain"):
        plan_text = _row_value(row, column)
        if plan_text:
            break
    if plan_text is None:
        return False

    record = ExplainPlanRecord(
        run_id,
        target,
        prepared.source_name or prepared.source_view,
        contract.write.mode,
        explain_format,
        str(plan_text),
    )
    runner.sql(render_explain_insert_sql(record, catalog=catalog, schema=schema))
    return True
93
+
94
+
95
def _write_openlineage(
    *,
    runner: SqlRunner,
    contract: SemanticContract,
    prepared: PreparedInput,
    run_id: str,
    target: str,
    status: str,
    started: str,
    finished: str,
    rows_written: int,
    operation_metrics: dict[str, Any],
    extensions: dict[str, Any],
    catalog: str,
    schema: str,
    runtime_metadata: dict[str, Any] | None,
) -> bool:
    """Render and execute the OpenLineage insert statement for one run.

    Does nothing unless ``extensions["openlineage_enabled"]`` is truthy.
    The source schema of ``prepared`` is passed as both the input and the
    output schema, and the ``version`` entry of ``operation_metrics`` is
    passed as the Delta version when it converts to an int.

    Returns:
        ``True`` after the statement was handed to ``runner.sql``; ``False``
        when OpenLineage is not enabled.
    """
    if not extensions.get("openlineage_enabled"):
        return False

    operations: Any = {}
    if contract.operations and contract.operations.metadata:
        operations = contract.operations.metadata
    runtime = dict(runtime_metadata) if runtime_metadata else {}

    source_name = prepared.source_name or prepared.source_view
    started_at = _parse_ts(started)
    finished_at = _parse_ts(finished)
    source_fields = _schema_fields(prepared.source_schema)
    producer = str(extensions.get("openlineage_producer") or "contractforge-databricks")

    statement = render_openlineage_insert_sql(
        contract,
        run_id=run_id,
        source_name=source_name,
        status=status,
        started_at_utc=started_at,
        finished_at_utc=finished_at,
        rows_read=prepared.rows_read,
        rows_written=rows_written,
        input_schema=source_fields,
        output_schema=source_fields,
        delta_version_after=_int_or_none(operation_metrics.get("version")),
        operation_metrics=operation_metrics,
        namespace=extensions.get("openlineage_namespace"),
        producer=producer,
        parent_run_id=operations.get("parent_run_id"),
        spark_version=runtime.get("spark_version"),
        source_code_url=runtime.get("notebook_name"),
        catalog=catalog,
        schema=schema,
    )
    runner.sql(statement)
    return True
140
+
141
+
142
+ def _schema_fields(schema: dict[str, str] | None) -> tuple[tuple[str, str], ...]:
143
+ return tuple((name, dtype) for name, dtype in (schema or {}).items())
144
+
145
+
146
+ def _row_value(row: Any, key: str) -> Any:
147
+ if row is None:
148
+ return None
149
+ if isinstance(row, dict):
150
+ return row.get(key)
151
+ if hasattr(row, "asDict"):
152
+ return row.asDict().get(key)
153
+ return getattr(row, key, None)
154
+
155
+
156
+ def _parse_ts(value: str) -> datetime:
157
+ return datetime.strptime(value, "%Y-%m-%d %H:%M:%S")
158
+
159
+
160
+ def _int_or_none(value: object) -> int | None:
161
+ try:
162
+ return None if value is None else int(value)
163
+ except (TypeError, ValueError):
164
+ return None