contractforge-databricks 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. contractforge_databricks/__init__.py +172 -0
  2. contractforge_databricks/adapter.py +69 -0
  3. contractforge_databricks/annotations/__init__.py +10 -0
  4. contractforge_databricks/annotations/application.py +52 -0
  5. contractforge_databricks/annotations/audit.py +49 -0
  6. contractforge_databricks/annotations/sql.py +142 -0
  7. contractforge_databricks/api.py +65 -0
  8. contractforge_databricks/bundles/__init__.py +9 -0
  9. contractforge_databricks/bundles/assets.py +47 -0
  10. contractforge_databricks/bundles/project.py +213 -0
  11. contractforge_databricks/bundles/project_config.py +133 -0
  12. contractforge_databricks/capabilities/__init__.py +17 -0
  13. contractforge_databricks/capabilities/builders.py +43 -0
  14. contractforge_databricks/capabilities/evaluate.py +162 -0
  15. contractforge_databricks/capabilities/mapping.py +36 -0
  16. contractforge_databricks/capabilities/models.py +44 -0
  17. contractforge_databricks/capabilities/runtime.py +111 -0
  18. contractforge_databricks/capabilities/uc.py +47 -0
  19. contractforge_databricks/cli.py +196 -0
  20. contractforge_databricks/cli_deploy.py +98 -0
  21. contractforge_databricks/cli_governance.py +142 -0
  22. contractforge_databricks/cli_io.py +91 -0
  23. contractforge_databricks/cli_maintenance.py +69 -0
  24. contractforge_databricks/coercion.py +31 -0
  25. contractforge_databricks/contract_extensions.py +70 -0
  26. contractforge_databricks/cost/__init__.py +11 -0
  27. contractforge_databricks/cost/model.py +22 -0
  28. contractforge_databricks/cost/report.py +65 -0
  29. contractforge_databricks/cost/sql.py +136 -0
  30. contractforge_databricks/dashboards/__init__.py +15 -0
  31. contractforge_databricks/dashboards/control_tables.py +150 -0
  32. contractforge_databricks/diagnostics/__init__.py +7 -0
  33. contractforge_databricks/diagnostics/explain.py +40 -0
  34. contractforge_databricks/environment.py +53 -0
  35. contractforge_databricks/evidence/__init__.py +98 -0
  36. contractforge_databricks/evidence/ddl.py +35 -0
  37. contractforge_databricks/evidence/governance_log.py +175 -0
  38. contractforge_databricks/evidence/helpers.py +29 -0
  39. contractforge_databricks/evidence/ops_log.py +210 -0
  40. contractforge_databricks/evidence/records.py +27 -0
  41. contractforge_databricks/evidence/run_log.py +74 -0
  42. contractforge_databricks/evidence/schemas.py +7 -0
  43. contractforge_databricks/evidence/sql.py +144 -0
  44. contractforge_databricks/evidence/tables.py +20 -0
  45. contractforge_databricks/evidence/writer.py +118 -0
  46. contractforge_databricks/execution/__init__.py +70 -0
  47. contractforge_databricks/execution/delta_basic.py +57 -0
  48. contractforge_databricks/execution/hash_diff.py +126 -0
  49. contractforge_databricks/execution/hash_diff_latest.py +142 -0
  50. contractforge_databricks/execution/replace_partitions.py +40 -0
  51. contractforge_databricks/execution/results.py +5 -0
  52. contractforge_databricks/execution/retry.py +36 -0
  53. contractforge_databricks/execution/scd2.py +213 -0
  54. contractforge_databricks/execution/scd2_deletes.py +65 -0
  55. contractforge_databricks/execution/scd2_late.py +30 -0
  56. contractforge_databricks/execution/snapshot.py +77 -0
  57. contractforge_databricks/execution/sql_merge.py +85 -0
  58. contractforge_databricks/execution/tables.py +98 -0
  59. contractforge_databricks/execution/windows.py +58 -0
  60. contractforge_databricks/governance/__init__.py +30 -0
  61. contractforge_databricks/governance/access.py +185 -0
  62. contractforge_databricks/governance/application.py +93 -0
  63. contractforge_databricks/governance/drift.py +49 -0
  64. contractforge_databricks/governance/runtime.py +60 -0
  65. contractforge_databricks/governance/sql.py +31 -0
  66. contractforge_databricks/governance/validation.py +135 -0
  67. contractforge_databricks/lakeflow/__init__.py +21 -0
  68. contractforge_databricks/lakeflow/compatibility.py +194 -0
  69. contractforge_databricks/lakeflow/rendering.py +175 -0
  70. contractforge_databricks/lineage/__init__.py +7 -0
  71. contractforge_databricks/lineage/openlineage.py +182 -0
  72. contractforge_databricks/maintenance/__init__.py +27 -0
  73. contractforge_databricks/maintenance/retention.py +90 -0
  74. contractforge_databricks/maintenance/sql.py +68 -0
  75. contractforge_databricks/metrics/__init__.py +19 -0
  76. contractforge_databricks/metrics/history.py +21 -0
  77. contractforge_databricks/metrics/write.py +63 -0
  78. contractforge_databricks/operations/__init__.py +4 -0
  79. contractforge_databricks/operations/application.py +38 -0
  80. contractforge_databricks/operations/sql.py +95 -0
  81. contractforge_databricks/parity/__init__.py +18 -0
  82. contractforge_databricks/parity/catalog.py +59 -0
  83. contractforge_databricks/parity/models.py +7 -0
  84. contractforge_databricks/parity/scenarios.py +111 -0
  85. contractforge_databricks/partitioning/__init__.py +3 -0
  86. contractforge_databricks/partitioning/predicates.py +28 -0
  87. contractforge_databricks/preparation/__init__.py +47 -0
  88. contractforge_databricks/preparation/deduplicate.py +87 -0
  89. contractforge_databricks/preparation/encoding.py +37 -0
  90. contractforge_databricks/preparation/hashing.py +18 -0
  91. contractforge_databricks/preparation/pyspark.py +178 -0
  92. contractforge_databricks/preparation/pyspark_staging.py +70 -0
  93. contractforge_databricks/preparation/shape.py +209 -0
  94. contractforge_databricks/preparation/shape_validation.py +94 -0
  95. contractforge_databricks/preparation/staging.py +17 -0
  96. contractforge_databricks/preparation/zip_arrays.py +51 -0
  97. contractforge_databricks/presets/__init__.py +3 -0
  98. contractforge_databricks/presets/base.py +24 -0
  99. contractforge_databricks/presets/bronze.py +57 -0
  100. contractforge_databricks/presets/catalog.py +22 -0
  101. contractforge_databricks/presets/core.py +134 -0
  102. contractforge_databricks/presets/gold.py +62 -0
  103. contractforge_databricks/presets/modifiers.py +51 -0
  104. contractforge_databricks/presets/runtime.py +22 -0
  105. contractforge_databricks/presets/silver.py +101 -0
  106. contractforge_databricks/presets/write_engine.py +57 -0
  107. contractforge_databricks/quality/__init__.py +41 -0
  108. contractforge_databricks/quality/evaluation.py +178 -0
  109. contractforge_databricks/quality/persistence.py +81 -0
  110. contractforge_databricks/quality/registry.py +134 -0
  111. contractforge_databricks/quality/results.py +17 -0
  112. contractforge_databricks/quality/sql.py +113 -0
  113. contractforge_databricks/rendering/__init__.py +11 -0
  114. contractforge_databricks/rendering/bundle.py +93 -0
  115. contractforge_databricks/rendering/markdown.py +50 -0
  116. contractforge_databricks/rendering/names.py +56 -0
  117. contractforge_databricks/results.py +15 -0
  118. contractforge_databricks/runtime/__init__.py +101 -0
  119. contractforge_databricks/runtime/available_now.py +147 -0
  120. contractforge_databricks/runtime/bundles.py +211 -0
  121. contractforge_databricks/runtime/cache.py +20 -0
  122. contractforge_databricks/runtime/control_tables.py +19 -0
  123. contractforge_databricks/runtime/deploy.py +197 -0
  124. contractforge_databricks/runtime/detection.py +114 -0
  125. contractforge_databricks/runtime/dry_run.py +46 -0
  126. contractforge_databricks/runtime/errors.py +54 -0
  127. contractforge_databricks/runtime/file_selection.py +109 -0
  128. contractforge_databricks/runtime/finalization.py +168 -0
  129. contractforge_databricks/runtime/governance.py +37 -0
  130. contractforge_databricks/runtime/hooks.py +45 -0
  131. contractforge_databricks/runtime/http_file.py +37 -0
  132. contractforge_databricks/runtime/http_retry.py +15 -0
  133. contractforge_databricks/runtime/http_safety.py +9 -0
  134. contractforge_databricks/runtime/json_materialization.py +97 -0
  135. contractforge_databricks/runtime/lineage.py +164 -0
  136. contractforge_databricks/runtime/maintenance.py +43 -0
  137. contractforge_databricks/runtime/merge_validation.py +98 -0
  138. contractforge_databricks/runtime/metadata.py +21 -0
  139. contractforge_databricks/runtime/metrics.py +34 -0
  140. contractforge_databricks/runtime/models.py +32 -0
  141. contractforge_databricks/runtime/options.py +33 -0
  142. contractforge_databricks/runtime/orchestration_context.py +185 -0
  143. contractforge_databricks/runtime/orchestrator.py +147 -0
  144. contractforge_databricks/runtime/partitioning.py +93 -0
  145. contractforge_databricks/runtime/quality_quarantine.py +92 -0
  146. contractforge_databricks/runtime/rest_api.py +46 -0
  147. contractforge_databricks/runtime/rest_auth.py +21 -0
  148. contractforge_databricks/runtime/rest_pagination.py +21 -0
  149. contractforge_databricks/runtime/run_payload.py +177 -0
  150. contractforge_databricks/runtime/schema.py +106 -0
  151. contractforge_databricks/runtime/source_metadata.py +30 -0
  152. contractforge_databricks/runtime/source_registry.py +43 -0
  153. contractforge_databricks/runtime/source_schema.py +24 -0
  154. contractforge_databricks/runtime/sources.py +208 -0
  155. contractforge_databricks/runtime/spark.py +183 -0
  156. contractforge_databricks/runtime/spark_defaults.py +35 -0
  157. contractforge_databricks/runtime/storage_auth.py +132 -0
  158. contractforge_databricks/runtime/streaming.py +131 -0
  159. contractforge_databricks/runtime/success.py +104 -0
  160. contractforge_databricks/runtime/utils.py +52 -0
  161. contractforge_databricks/runtime/watermark.py +71 -0
  162. contractforge_databricks/runtime/windows.py +184 -0
  163. contractforge_databricks/runtime/write.py +66 -0
  164. contractforge_databricks/runtime/write_flow.py +146 -0
  165. contractforge_databricks/runtime/write_strategy.py +40 -0
  166. contractforge_databricks/schema/__init__.py +21 -0
  167. contractforge_databricks/schema/diff.py +11 -0
  168. contractforge_databricks/schema/policy.py +33 -0
  169. contractforge_databricks/schema/sync.py +23 -0
  170. contractforge_databricks/security/__init__.py +21 -0
  171. contractforge_databricks/security/errors.py +5 -0
  172. contractforge_databricks/security/redaction.py +5 -0
  173. contractforge_databricks/security/secrets.py +114 -0
  174. contractforge_databricks/security/source_policy.py +17 -0
  175. contractforge_databricks/shapes/__init__.py +3 -0
  176. contractforge_databricks/shapes/sql.py +123 -0
  177. contractforge_databricks/sources/__init__.py +67 -0
  178. contractforge_databricks/sources/artifacts.py +100 -0
  179. contractforge_databricks/sources/autoloader.py +48 -0
  180. contractforge_databricks/sources/bounded_streams.py +44 -0
  181. contractforge_databricks/sources/classification.py +115 -0
  182. contractforge_databricks/sources/delta_share.py +21 -0
  183. contractforge_databricks/sources/files.py +48 -0
  184. contractforge_databricks/sources/http_file.py +46 -0
  185. contractforge_databricks/sources/interpret.py +76 -0
  186. contractforge_databricks/sources/jdbc.py +32 -0
  187. contractforge_databricks/sources/metadata.py +18 -0
  188. contractforge_databricks/sources/native_passthrough.py +33 -0
  189. contractforge_databricks/sources/rds_iam.py +15 -0
  190. contractforge_databricks/sources/rds_iam_runtime.py +191 -0
  191. contractforge_databricks/sources/rest_api.py +33 -0
  192. contractforge_databricks/sources/support.py +50 -0
  193. contractforge_databricks/sources/table_refs.py +65 -0
  194. contractforge_databricks/sql/__init__.py +4 -0
  195. contractforge_databricks/sql/identifiers.py +17 -0
  196. contractforge_databricks/sql/literals.py +36 -0
  197. contractforge_databricks/state/__init__.py +39 -0
  198. contractforge_databricks/state/ddl.py +24 -0
  199. contractforge_databricks/state/migrations.py +146 -0
  200. contractforge_databricks/state/queries.py +149 -0
  201. contractforge_databricks/state/sql.py +116 -0
  202. contractforge_databricks/state/tables.py +9 -0
  203. contractforge_databricks/state/writer.py +83 -0
  204. contractforge_databricks/templates/__init__.py +15 -0
  205. contractforge_databricks/templates/catalog.py +205 -0
  206. contractforge_databricks/templates/catalog_parity.py +85 -0
  207. contractforge_databricks/templates/core.py +83 -0
  208. contractforge_databricks/templates/enrichment.py +175 -0
  209. contractforge_databricks/transforms/__init__.py +3 -0
  210. contractforge_databricks/transforms/sql.py +118 -0
  211. contractforge_databricks/watermark/__init__.py +6 -0
  212. contractforge_databricks/watermark/sql.py +91 -0
  213. contractforge_databricks/write_modes/__init__.py +20 -0
  214. contractforge_databricks/write_modes/registry.py +44 -0
  215. contractforge_databricks/write_modes/sql.py +33 -0
  216. contractforge_databricks/write_modes/strategy.py +192 -0
  217. contractforge_databricks-0.1.0.dist-info/METADATA +34 -0
  218. contractforge_databricks-0.1.0.dist-info/RECORD +220 -0
  219. contractforge_databricks-0.1.0.dist-info/WHEEL +4 -0
  220. contractforge_databricks-0.1.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,208 @@
1
+ """Databricks runtime source resolution.
2
+
3
+ The functions in this module intentionally receive ``spark`` as an argument so
4
+ the adapter stays importable outside Databricks and PySpark remains optional.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Any
10
+
11
+ from contractforge_core.connectors import (
12
+ catalog_source_query,
13
+ catalog_source_table_or_path,
14
+ delta_share_options,
15
+ eventhubs_bounded_options,
16
+ file_reader_options,
17
+ file_source_format,
18
+ is_available_now_stream_source,
19
+ is_bounded_stream_source,
20
+ is_catalog_source,
21
+ is_delta_share_source,
22
+ is_file_source,
23
+ is_http_file_source,
24
+ is_kafka_stream_source,
25
+ is_rest_api_connector,
26
+ jdbc_common_options,
27
+ kafka_bounded_options,
28
+ stream_source_format,
29
+ )
30
+ from contractforge_core.runtime import PreparedInput
31
+ from contractforge_core.runtime import QueryOne
32
+ from contractforge_core.semantic import SemanticContract
33
+ from contractforge_databricks.preparation import apply_contract_preparation, apply_write_staging
34
+ from contractforge_databricks.runtime.file_selection import selected_file_load_path
35
+ from contractforge_databricks.runtime.http_file import resolve_http_file_dataframe
36
+ from contractforge_databricks.runtime.rest_api import resolve_rest_api_dataframe
37
+ from contractforge_databricks.runtime.source_schema import apply_declared_schema
38
+ from contractforge_databricks.runtime.source_metadata import (
39
+ schema_types,
40
+ source_metadata,
41
+ source_metadata_with_watermark,
42
+ source_name,
43
+ )
44
+ from contractforge_databricks.runtime.source_registry import get_source_resolver
45
+ from contractforge_databricks.runtime.watermark import collect_previous_watermark
46
+ from contractforge_databricks.runtime.storage_auth import configure_object_storage_access
47
+ from contractforge_databricks.security import resolve_databricks_secret_placeholders, validate_source_security
48
+ from contractforge_databricks.sources.interpret import interpret_incremental_files_source, is_incremental_file_source
49
+ from contractforge_databricks.sources.rds_iam_runtime import materialize_rds_iam_options
50
+ from contractforge_databricks.sources.table_refs import (
51
+ contract_with_databricks_source_refs,
52
+ databricks_table_ref_resolver,
53
+ )
54
+
55
+ _JDBC_SOURCE_ALIASES = {"jdbc", "postgres", "mysql", "sqlserver", "oracle", "redshift", "db2", "mariadb", "snowflake_jdbc", "bigquery_jdbc"}
56
+
57
+
58
def resolve_source_dataframe(spark: Any, source: dict[str, Any], *, contract: SemanticContract | None = None) -> Any:
    """Resolve a core source contract into a Databricks DataFrame.

    The source is first passed through ``validate_source_security`` and
    ``resolve_databricks_secret_placeholders``. A resolver registered for the
    connector (or, failing that, the source type) takes precedence; otherwise
    the first matching built-in branch is used, in this order: catalog,
    incremental files, files, HTTP file, REST API, JDBC, bounded or
    available-now streams, Delta Sharing.

    Args:
        spark: Session object exposing ``read`` and ``readStream``.
        source: Raw source mapping taken from the contract.
        contract: Optional contract, forwarded only to catalog resolution.

    Returns:
        Whatever the selected resolver or reader returns.

    Raises:
        ValueError: If no branch recognises the source.
    """
    validate_source_security(source)
    source = resolve_databricks_secret_placeholders(source)
    source_type = source.get("type")

    # Registered resolvers are keyed by connector first, then by source type.
    registry_key = str(source.get("connector") or source_type or "")
    registered = get_source_resolver(registry_key)
    if registered is not None:
        return registered.resolve(spark, source)

    if is_catalog_source(source):
        return _resolve_catalog_source(spark, source, contract=contract)

    if is_incremental_file_source(source):
        return _resolve_autoloader_source(spark, source)

    if is_file_source(source):
        reader_options = file_reader_options(source)
        read_path, reader_options = configure_object_storage_access(spark, source, reader_options)
        if read_path is not None:
            # Storage configuration may hand back a different path to read from.
            source = {**source, "path": read_path}
        file_reader = spark.read
        file_format = file_source_format(source)
        load_path = selected_file_load_path(spark, source, reader_options)
        return _read_source_with_options(file_reader, file_format, reader_options, load_path, source)

    if is_http_file_source(source):
        return resolve_http_file_dataframe(spark, source)

    if is_rest_api_connector(source):
        return resolve_rest_api_dataframe(spark, source)

    if source_type in {"jdbc", "connector"} or source.get("connector") in _JDBC_SOURCE_ALIASES:
        base_options = jdbc_common_options(source)
        jdbc_options = materialize_rds_iam_options(base_options, auth=source.get("auth"))
        return _read_with_options(spark.read, "jdbc", jdbc_options, None)

    if is_bounded_stream_source(source) or is_available_now_stream_source(source):
        if is_kafka_stream_source(source):
            stream_options = kafka_bounded_options(source)
        else:
            stream_options = eventhubs_bounded_options(source)
        stream_format = stream_source_format(source)
        # Available-now sources go through the streaming reader; bounded ones are batch reads.
        if is_available_now_stream_source(source):
            stream_reader = spark.readStream
        else:
            stream_reader = spark.read
        return _read_with_options(stream_reader, stream_format, stream_options, None)

    if is_delta_share_source(source):
        return _read_with_options(spark.read, "deltaSharing", delta_share_options(source), None)

    raise ValueError(f"source.type={source_type!r} cannot be resolved by the Databricks runtime source resolver")
100
+
101
+
102
def prepare_source_view(
    spark: Any,
    source: dict[str, Any],
    *,
    view_name: str,
    collect_metrics: bool = False,
) -> PreparedInput:
    """Resolve a source and register it as a temporary view for ingestion.

    Args:
        spark: Session used to resolve the source.
        source: Raw source mapping.
        view_name: Name under which the DataFrame is registered as a temp view.
        collect_metrics: When true, ``df.count()`` is called to fill
            ``rows_read``; otherwise ``rows_read`` is reported as 0.

    Returns:
        A ``PreparedInput`` describing the registered view.
    """
    frame = resolve_source_dataframe(spark, source)
    frame.createOrReplaceTempView(view_name)

    # Tolerate frames without a ``columns`` attribute, or with a falsy one.
    column_names = tuple(map(str, getattr(frame, "columns", ()) or ()))

    if collect_metrics:
        row_count = int(frame.count())
    else:
        row_count = 0

    return PreparedInput(
        source_view=view_name,
        source_columns=column_names,
        source_schema=schema_types(frame),
        rows_read=row_count,
        source_name=source_name(source),
        source_metadata=source_metadata(source),
    )
123
+
124
+
125
def prepare_contract_source_view(
    spark: Any,
    contract: SemanticContract,
    *,
    view_name: str,
    collect_metrics: bool = False,
    query_one: QueryOne | None = None,
    evidence_catalog: str = "main",
    evidence_schema: str = "ops",
) -> PreparedInput:
    """Resolve, prepare and register the contract source as a temporary view.

    Args:
        spark: Session used to resolve the source.
        contract: Contract whose ``source.raw`` mapping describes the source.
        view_name: Name under which the prepared DataFrame is registered.
        collect_metrics: When true, ``df.count()`` is called to fill
            ``rows_read``; otherwise ``rows_read`` is reported as 0.
        query_one: Forwarded to ``collect_previous_watermark``.
        evidence_catalog: Forwarded to ``collect_previous_watermark`` as ``catalog``.
        evidence_schema: Forwarded to ``collect_previous_watermark`` as ``schema``.

    Returns:
        A ``PreparedInput`` describing the registered view.

    Raises:
        ValueError: If ``contract.source.raw`` is empty or missing.
    """
    if not contract.source.raw:
        raise ValueError("prepare_contract_source_view requires a structured source contract")

    # The source is resolved from the ref-rewritten contract; watermark lookup
    # and preparation below use the contract as passed in.
    runtime_contract = contract_with_databricks_source_refs(contract)
    frame = resolve_source_dataframe(
        spark,
        runtime_contract.source.raw or {},
        contract=runtime_contract,
    )

    previous = collect_previous_watermark(
        contract=contract,
        query_one=query_one,
        catalog=evidence_catalog,
        schema=evidence_schema,
    )
    watermark_column, watermark_previous = previous

    frame = apply_contract_preparation(
        frame,
        contract,
        watermark_column=watermark_column,
        watermark_previous=watermark_previous,
    )
    frame = apply_write_staging(frame, contract)
    frame.createOrReplaceTempView(view_name)

    column_names = tuple(map(str, getattr(frame, "columns", ()) or ()))
    if collect_metrics:
        row_count = int(frame.count())
    else:
        row_count = 0

    return PreparedInput(
        source_view=view_name,
        source_columns=column_names,
        source_schema=schema_types(frame),
        rows_read=row_count,
        source_name=runtime_contract.source.name,
        source_metadata=source_metadata_with_watermark(runtime_contract.source.raw or {}, watermark_previous),
    )
165
+
166
+
167
def _resolve_catalog_source(spark: Any, source: dict[str, Any], *, contract: SemanticContract | None = None) -> Any:
    """Read a catalog-style source via SQL, a path-based read, or ``spark.table``.

    SQL sources (``type`` or ``connector`` equal to ``"sql"``) run through
    ``spark.sql``. A source with a ``path`` and no ``table`` is loaded by
    format. Everything else is read with ``spark.table``.
    """
    if contract is None:
        resolver = None
    else:
        resolver = databricks_table_ref_resolver(contract)

    is_sql_source = source.get("type") == "sql" or source.get("connector") == "sql"
    if is_sql_source:
        statement = catalog_source_query(source, table_ref_resolver=resolver)
        return spark.sql(statement)

    table_or_path = catalog_source_table_or_path(source, table_ref_resolver=resolver)

    path_only = source.get("path") and not source.get("table")
    if not path_only:
        return spark.table(str(table_or_path))

    # Derive the reader format from the declared type: "<fmt>_table" -> "<fmt>".
    declared_type = str(source.get("type") or "delta")
    if declared_type == "delta_table":
        read_format = "delta"
    else:
        read_format = declared_type.replace("_table", "")
    return _read_with_options(spark.read, read_format, {}, table_or_path)
177
+
178
+
179
def _resolve_autoloader_source(spark: Any, source: dict[str, Any]) -> Any:
    """Build a ``cloudFiles`` streaming read from an incremental-files source.

    The source is interpreted by ``interpret_incremental_files_source``. Its
    ``format`` (default ``"json"``), ``options``, ``schema_tracking_location``
    and ``schema_hints`` entries are turned into string reader options.
    """
    interpreted = interpret_incremental_files_source(source)

    reader_options = {"cloudFiles.format": str(interpreted.get("format") or "json")}
    for option_key, option_value in interpreted.get("options", {}).items():
        reader_options[str(option_key)] = str(option_value)

    # These are applied after the user options, so they win on a key clash.
    optional_settings = (
        ("cloudFiles.schemaLocation", "schema_tracking_location"),
        ("cloudFiles.schemaHints", "schema_hints"),
    )
    for option_name, field_name in optional_settings:
        if interpreted.get(field_name):
            reader_options[option_name] = str(interpreted[field_name])

    return _read_with_options(spark.readStream, "cloudFiles", reader_options, interpreted.get("path"))
188
+
189
+
190
+ def _read_with_options(reader: Any, source_format: str, options: dict[str, str], path: object | None) -> Any:
191
+ builder = reader.format(source_format)
192
+ for key, value in sorted(options.items()):
193
+ builder = builder.option(key, value)
194
+ return builder.load(path if isinstance(path, list) else str(path)) if path is not None else builder.load()
195
+
196
+
197
def _read_source_with_options(
    reader: Any,
    source_format: str,
    options: dict[str, str],
    path: object | None,
    source: dict[str, Any],
) -> Any:
    """Apply format, options and the source's declared schema, then load.

    Options are applied in sorted key order, then ``apply_declared_schema`` is
    given the builder and the source. ``path`` is passed to ``load`` unchanged
    when it is a list, as ``str(path)`` for any other non-None value, and
    omitted entirely when it is None.
    """
    configured = reader.format(source_format)
    for option_name in sorted(options):
        configured = configured.option(option_name, options[option_name])
    configured = apply_declared_schema(configured, source)

    if path is None:
        return configured.load()
    if isinstance(path, list):
        return configured.load(path)
    return configured.load(str(path))
@@ -0,0 +1,183 @@
1
+ """Databricks/Spark runtime convenience helpers with lazy imports."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ import platform
8
+ from typing import Any
9
+
10
+ from contractforge_databricks.capabilities.runtime import is_serverless_conf
11
+ from contractforge_databricks.runtime.detection import _collect_spark_conf
12
+ from contractforge_databricks.sql import quote_identifier, quote_table_name
13
+
14
+ logger = logging.getLogger(__name__)
15
+ _SERVERLESS_CACHE: dict[int, bool] = {}
16
+
17
+
18
def get_active_spark() -> Any:
    """Resolve the active Databricks or PySpark session at call time.

    The ``spark`` object from ``databricks.sdk.runtime`` is tried first. If
    that import fails or yields None, the PySpark active session is used,
    then ``SparkSession._instantiatedSession``.

    Raises:
        RuntimeError: If PySpark cannot be imported or no session is found.
    """
    try:
        from databricks.sdk.runtime import spark as runtime_session  # type: ignore
    except Exception as exc:
        logger.debug("Databricks runtime spark session was not available; falling back to PySpark.", exc_info=exc)
    else:
        if runtime_session is not None:
            return runtime_session

    try:
        from pyspark.sql import SparkSession
    except Exception as exc:
        raise RuntimeError("PySpark is required to resolve an active Spark session.") from exc

    active = SparkSession.getActiveSession()
    if not active:
        # Fall back to the session PySpark keeps on the class, when present.
        active = getattr(SparkSession, "_instantiatedSession", None)
    if active is None:
        raise RuntimeError("No active SparkSession was found.")
    return active
35
+
36
+
37
def runtime_info(spark: Any | None = None) -> dict[str, str | None]:
    """Describe the current runtime.

    Returns a mapping with ``runtime_type`` (``"serverless"`` or
    ``"classic"``), ``spark_version`` (the session's ``version`` attribute, or
    None when unavailable) and ``python_version``.
    """
    session = spark or _maybe_active_spark()

    if session is None:
        spark_version = None
    else:
        spark_version = getattr(session, "version", None)

    if detect_serverless(session):
        runtime_type = "serverless"
    else:
        runtime_type = "classic"

    return {
        "runtime_type": runtime_type,
        "spark_version": spark_version,
        "python_version": platform.python_version(),
    }
45
+
46
+
47
def detect_serverless(spark: Any | None = None) -> bool:
    """Report whether the session looks like Databricks serverless compute.

    Returns False when no session can be resolved. Results are remembered in
    ``_SERVERLESS_CACHE`` keyed by ``id(session)``. A session counts as
    serverless when ``is_serverless_conf`` accepts its collected conf, or when
    ``spark.databricks.clusterUsageTags.clusterSource`` is ``"JOB_SERVERLESS"``.
    """
    session = spark or _maybe_active_spark()
    if session is None:
        return False

    session_key = id(session)
    if session_key not in _SERVERLESS_CACHE:
        conf = _collect_spark_conf(session)
        _SERVERLESS_CACHE[session_key] = (
            is_serverless_conf(conf)
            or conf.get("spark.databricks.clusterUsageTags.clusterSource") == "JOB_SERVERLESS"
        )
    return _SERVERLESS_CACHE[session_key]
58
+
59
+
60
def safe_cache(df: Any, *, enabled: bool = True, serverless: bool | None = None) -> Any:
    """Cache ``df`` when caching is enabled and the runtime is not serverless.

    Args:
        df: Object exposing ``cache()``.
        enabled: When false, ``df`` is returned untouched.
        serverless: Explicit serverless flag; None means auto-detect via
            ``detect_serverless()``.

    Returns:
        The result of ``df.cache()``, or ``df`` itself when caching is skipped
        or fails with an error recognised by ``_is_unsupported_cache_error``.
    """
    if not enabled:
        return df

    if serverless is None:
        skip_for_serverless = detect_serverless()
    else:
        skip_for_serverless = serverless
    if skip_for_serverless:
        return df

    try:
        cached = df.cache()
    except Exception as exc:
        if not _is_unsupported_cache_error(exc):
            raise
        return df
    return cached
69
+
70
+
71
def safe_unpersist(df: Any, *, enabled: bool = True, serverless: bool | None = None) -> None:
    """Unpersist ``df`` when caching is enabled and the runtime is not serverless.

    Args:
        df: Object exposing ``unpersist()``.
        enabled: When false, nothing is done.
        serverless: Explicit serverless flag; None means auto-detect via
            ``detect_serverless()``.

    Errors recognised by ``_is_unsupported_cache_error`` are swallowed; any
    other exception is re-raised.
    """
    if not enabled:
        return

    if serverless is None:
        skip_for_serverless = detect_serverless()
    else:
        skip_for_serverless = serverless
    if skip_for_serverless:
        return

    try:
        df.unpersist()
    except Exception as exc:
        if _is_unsupported_cache_error(exc):
            return
        raise
79
+
80
+
81
def safe_cache_table(spark: Any, table_name: str, *, enabled: bool = True, serverless: bool | None = None) -> bool:
    """Cache a table when caching is enabled and the runtime is not serverless.

    ``spark.catalog.cacheTable`` is used when available; otherwise a
    ``CACHE TABLE`` statement is issued.

    Args:
        spark: Session exposing ``catalog`` and/or ``sql``.
        table_name: Table to cache; may be a multi-part name.
        enabled: When false, nothing is done.
        serverless: Explicit serverless flag; None means auto-detect via
            ``detect_serverless(spark)``.

    Returns:
        True when the table was cached. False when caching was skipped or
        failed with an error recognised by ``_is_unsupported_cache_error``.
    """
    if not enabled or (detect_serverless(spark) if serverless is None else serverless):
        return False
    try:
        catalog = getattr(spark, "catalog", None)
        cache_table = getattr(catalog, "cacheTable", None)
        if callable(cache_table):
            cache_table(table_name)
        else:
            # Quote as a table name, as the other table-level SQL in this
            # module does, so multi-part names are not treated as a single
            # identifier.
            spark.sql(f"CACHE TABLE {quote_table_name(table_name)}")
        return True
    except Exception as exc:
        if _is_unsupported_cache_error(exc):
            return False
        raise
96
+
97
+
98
def safe_uncache_table(spark: Any, table_name: str, *, enabled: bool = True, serverless: bool | None = None) -> None:
    """Uncache a table when caching is enabled and the runtime is not serverless.

    ``spark.catalog.uncacheTable`` is used when available; otherwise an
    ``UNCACHE TABLE`` statement is issued.

    Args:
        spark: Session exposing ``catalog`` and/or ``sql``.
        table_name: Table to uncache; may be a multi-part name.
        enabled: When false, nothing is done.
        serverless: Explicit serverless flag; None means auto-detect via
            ``detect_serverless(spark)``.

    Errors recognised by ``_is_unsupported_cache_error`` are swallowed; any
    other exception is re-raised.
    """
    if not enabled or (detect_serverless(spark) if serverless is None else serverless):
        return
    try:
        catalog = getattr(spark, "catalog", None)
        uncache_table = getattr(catalog, "uncacheTable", None)
        if callable(uncache_table):
            uncache_table(table_name)
        else:
            # Quote as a table name, as the other table-level SQL in this
            # module does, so multi-part names are not treated as a single
            # identifier.
            spark.sql(f"UNCACHE TABLE {quote_table_name(table_name)}")
    except Exception as exc:
        if not _is_unsupported_cache_error(exc):
            raise
111
+
112
+
113
def table_exists(full_name: str, *, spark: Any | None = None) -> bool:
    """Check whether a table exists.

    ``session.catalog.tableExists`` is tried first. If it raises or reports
    false, a ``DESCRIBE TABLE`` statement is attempted. Failures of either
    probe are logged at debug level rather than raised.

    Args:
        full_name: Table name to probe.
        spark: Session to use; defaults to ``get_active_spark()``.

    Returns:
        True when either probe succeeds, False otherwise.
    """
    session = spark or get_active_spark()

    try:
        found_in_catalog = bool(session.catalog.tableExists(full_name))
    except Exception as exc:
        logger.debug("Spark catalog tableExists failed for %s; falling back to DESCRIBE TABLE.", full_name, exc_info=exc)
    else:
        if found_in_catalog:
            return True

    try:
        session.sql(f"DESCRIBE TABLE {quote_table_name(full_name)}")
    except Exception as exc:
        logger.debug("Spark DESCRIBE TABLE failed for %s.", full_name, exc_info=exc)
        return False
    return True
126
+
127
+
128
def schema_signature(df: Any) -> str:
    """Return a JSON string describing the fields of ``df.schema``.

    Each field contributes a ``[name, simpleString type, nullable]`` triple,
    in schema order. Non-ASCII characters are emitted unescaped.
    """
    triples = []
    for field in df.schema.fields:
        triples.append([field.name, field.dataType.simpleString(), field.nullable])
    return json.dumps(triples, ensure_ascii=False)
133
+
134
+
135
def fix_encoding(df: Any, *, enabled: bool, encoding: str, columns: tuple[str, ...] = ()) -> Any:
    """Re-decode string columns of ``df`` using ``encoding``.

    Args:
        df: DataFrame-like object exposing ``schema.fields`` and ``withColumn``.
        enabled: When false, ``df`` is returned untouched and PySpark is not imported.
        encoding: Charset name passed to ``functions.decode``.
        columns: Columns to process; when empty, every string column is used.
            Names that are not string columns in the schema are ignored.

    Returns:
        The DataFrame with each selected column cast to binary and decoded.
    """
    if not enabled:
        return df

    from pyspark.sql import functions as F  # type: ignore

    string_columns = tuple(
        field.name for field in df.schema.fields if field.dataType.typeName() == "string"
    )
    requested = columns or string_columns
    targets = [name for name in requested if name in string_columns]

    result = df
    for name in targets:
        raw_bytes = F.col(name).cast("binary")
        result = result.withColumn(name, F.decode(raw_bytes, encoding))
    return result
146
+
147
+
148
def sync_delta_schema(
    *,
    df: Any,
    target_table: str,
    schema_changes: dict[str, Any],
    policy: str,
    spark: Any | None = None,
) -> None:
    """Apply additive and allowed type changes to an existing target table.

    Nothing happens unless ``policy`` is ``"permissive"`` or
    ``"additive_only"`` and the target table exists.

    Args:
        df: DataFrame-like object whose schema supplies the types of added columns.
        target_table: Table to alter.
        schema_changes: Mapping read for ``added_columns`` (names) and
            ``type_changes`` (mappings with ``column``, ``source`` and
            ``allowed`` keys). Each applied type change is marked in place
            with ``applied = True``.
        policy: Schema policy name.
        spark: Session to use; defaults to ``get_active_spark()``.
    """
    session = spark or get_active_spark()
    if policy not in {"permissive", "additive_only"}:
        return
    if not table_exists(target_table, spark=session):
        return

    type_by_column = {}
    for field in df.schema.fields:
        type_by_column[field.name] = field.dataType.simpleString()

    # Only add columns that are actually present in the incoming schema.
    new_columns = [name for name in schema_changes.get("added_columns", ()) if name in type_by_column]
    if new_columns:
        definitions = [f"{quote_identifier(name)} {type_by_column[name]}" for name in new_columns]
        cols_sql = ", ".join(definitions)
        session.sql(f"ALTER TABLE {quote_table_name(target_table)} ADD COLUMNS ({cols_sql})")

    for change in schema_changes.get("type_changes", ()):
        if change.get("allowed"):
            column = str(change["column"])
            source_type = str(change["source"])
            session.sql(
                f"ALTER TABLE {quote_table_name(target_table)} ALTER COLUMN {quote_identifier(column)} TYPE {source_type}"
            )
            # Record on the caller's mapping that this change was applied.
            change["applied"] = True
171
+
172
+
173
def _maybe_active_spark() -> Any | None:
    """Return the active session from ``get_active_spark()``, or None if it raises.

    The failure is logged at debug level.
    """
    try:
        session = get_active_spark()
    except Exception as exc:
        logger.debug("No active Spark session could be resolved.", exc_info=exc)
        return None
    return session
179
+
180
+
181
+ def _is_unsupported_cache_error(exc: Exception) -> bool:
182
+ text = str(exc).upper()
183
+ return "NOT_SUPPORTED" in text or "SERVERLESS" in text
@@ -0,0 +1,35 @@
1
+ """Spark-backed runtime defaults for Databricks bundle execution."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import replace
6
+ from typing import Any
7
+
8
+ from contractforge_core.runtime import QueryOne
9
+ from contractforge_databricks.runtime.models import DatabricksIngestOptions
10
+
11
+
12
def with_spark_runtime_defaults(spark: Any, opts: DatabricksIngestOptions, target: str) -> DatabricksIngestOptions:
    """Fill ``opts.target_schema`` from the live target table when it is unset.

    ``opts`` is returned unchanged when it already has a target schema, when
    ``ensure_table`` is false, or when the target schema cannot be read.
    Otherwise a copy with ``target_schema`` set is returned.
    """
    wants_lookup = opts.target_schema is None and opts.ensure_table
    if not wants_lookup:
        return opts

    discovered = spark_target_schema(spark, target)
    if discovered is None:
        return opts
    return replace(opts, target_schema=discovered)
17
+
18
+
19
def spark_target_schema(spark: Any, target: str) -> dict[str, str] | None:
    """Return ``{column: lower-cased simpleString type}`` for the target table.

    Returns None when reading ``spark.table(target).schema`` raises.
    """
    try:
        table_schema = spark.table(target).schema
    except Exception:
        # Best effort: a table that cannot be read yields no schema.
        return None

    column_types = {}
    for field in table_schema.fields:
        column_types[str(field.name)] = str(field.dataType.simpleString()).lower()
    return column_types
25
+
26
+
27
def spark_query_one(spark: Any) -> QueryOne | None:
    """Build a single-row query function on top of ``spark.sql``.

    Returns None when ``spark`` has no callable ``sql`` attribute. The
    returned function runs a statement, limits it to one row, and returns that
    row as a dict, or None when the result is empty.
    """
    sql_method = getattr(spark, "sql", None)
    if not callable(sql_method):
        return None

    def query_one(statement: str) -> dict[str, Any] | None:
        first_rows = spark.sql(statement).limit(1).collect()
        if not first_rows:
            return None
        return first_rows[0].asDict()

    return query_one
@@ -0,0 +1,132 @@
1
+ """Databricks runtime object-storage credential helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import urllib.parse
6
+ from typing import Any
7
+
8
+ from contractforge_core.connectors import object_storage_provider
9
+
10
+
11
def configure_object_storage_access(
    spark: Any,
    source: dict[str, Any],
    options: dict[str, str],
) -> tuple[object | None, dict[str, str]]:
    """Apply provider-specific Spark storage settings for ``source``.

    The provider is resolved with ``object_storage_provider(source)``:

    * ``"s3"``: the path is returned as-is and the options are replaced by
      whatever ``_configure_s3`` returns.
    * ``"azure_blob"``: the path is replaced by the result of
      ``_configure_azure_blob`` and the options are returned as-is.
    * anything else: both path and options pass through unchanged.

    Returns:
        A ``(path, options)`` pair to use for the read.
    """
    storage_kind = object_storage_provider(source)
    read_path = source.get("path")
    read_options = options
    if storage_kind == "s3":
        read_options = _configure_s3(spark, source, options)
    elif storage_kind == "azure_blob":
        read_path = _configure_azure_blob(spark, source, read_path)
    return read_path, read_options
25
+
26
+
27
def _configure_s3(spark: Any, source: dict[str, Any], options: dict[str, str]) -> dict[str, str]:
    """Push S3A settings into the Spark conf and return the remaining options.

    Options whose key starts with ``fs.s3a.`` or ``spark.hadoop.fs.s3a.`` are
    written to the Spark configuration via ``_set_conf``; every other option
    is returned for the caller to use. When ``source["auth"]`` supplies an
    access key and secret key (under any of the accepted aliases) they are
    written as well, together with a credentials provider chosen by whether a
    session token is present.

    Raises:
        ValueError: if only one of access key / secret key is provided.
    """
    s3a_prefixes = ("fs.s3a.", "spark.hadoop.fs.s3a.")
    reader_options: dict[str, str] = {}
    for name, setting in options.items():
        if name.startswith(s3a_prefixes):
            _set_conf(spark, name, setting)
            continue
        reader_options[name] = setting

    auth = _dict(source.get("auth"))
    # Each credential accepts several key spellings; the first truthy one wins.
    access_key = (
        auth.get("access_key_id")
        or auth.get("access_key")
        or auth.get("aws_access_key_id")
    )
    secret_key = (
        auth.get("secret_access_key")
        or auth.get("secret_key")
        or auth.get("aws_secret_access_key")
    )
    session_token = (
        auth.get("session_token")
        or auth.get("token")
        or auth.get("aws_session_token")
    )

    has_access, has_secret = bool(access_key), bool(secret_key)
    if has_access != has_secret:
        raise ValueError("source.auth for connector=s3 requires access_key_id and secret_access_key together")
    if not has_access:
        # No static credentials supplied: leave the credential config alone.
        return reader_options

    _set_conf(spark, "fs.s3a.access.key", str(access_key))
    _set_conf(spark, "fs.s3a.secret.key", str(secret_key))
    credentials_provider = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"
    if session_token:
        _set_conf(spark, "fs.s3a.session.token", str(session_token))
        credentials_provider = "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider"
    _set_conf(spark, "fs.s3a.aws.credentials.provider", credentials_provider)
    return reader_options
49
+
50
+
51
def _configure_azure_blob(spark: Any, source: dict[str, Any], path: object | None) -> object | None:
    """Register an Azure Blob SAS token with Spark and return the read path.

    Nothing is configured (and ``path`` is returned unchanged) when
    ``source["auth"]`` carries neither ``sas_token`` nor ``token``.

    With a token, the account/container pair comes from either:

    * ``source["account_url"]`` and ``source["container"]`` (both required
      once either is given). In this mode a truthy ``path`` that contains no
      ``://`` is rewritten into a ``wasbs://`` URI; or
    * the ``path`` itself, parsed by ``_azure_account_container_from_uri``.

    Raises:
        ValueError: if the account/container pair cannot be determined.
    """
    auth = _dict(source.get("auth"))
    sas_token = auth.get("sas_token") or auth.get("token")
    if not sas_token:
        return path

    account_url = str(source.get("account_url") or "").strip()
    container = str(source.get("container") or "").strip()

    if not (account_url or container):
        # Neither explicit field was supplied: derive both from the path URI.
        uri_account, uri_container = _azure_account_container_from_uri(str(path or ""))
        if not (uri_account and uri_container):
            raise ValueError(
                "auth.sas_token in connector=azure_blob requires source.account_url/source.container "
                "or path wasbs://container@account.blob.core.windows.net/..."
            )
        _configure_azure_blob_sas(spark, uri_account, uri_container, str(sas_token))
        return path

    account = _azure_account_from_url(account_url)
    if not account:
        raise ValueError("source.account_url is required for connector=azure_blob when source.container is used")
    if not container:
        raise ValueError("source.container is required for connector=azure_blob when source.account_url is used")
    _configure_azure_blob_sas(spark, account, container, str(sas_token))

    is_relative = bool(path) and "://" not in str(path)
    if is_relative:
        # Scheme-less paths are anchored at the configured container.
        return f"wasbs://{container}@{account}.blob.core.windows.net/{str(path).lstrip('/')}"
    return path
76
+
77
+
78
def _configure_azure_blob_sas(spark: Any, account: str, container: str, sas_token: str) -> None:
    """Store a SAS token under the Spark conf key for one container/account.

    The token is whitespace-stripped and a single leading ``?`` is removed
    before it is written to
    ``fs.azure.sas.<container>.<account>.blob.core.windows.net``.

    Raises:
        ValueError: if nothing is left of the token after normalisation.
    """
    trimmed = sas_token.strip()
    token = trimmed[1:] if trimmed.startswith("?") else trimmed
    if token == "":
        raise ValueError("auth.sas_token cannot be empty for connector=azure_blob")
    conf_key = f"fs.azure.sas.{container}.{account}.blob.core.windows.net"
    _set_conf(spark, conf_key, token)
85
+
86
+
87
def _azure_account_from_url(account_url: str) -> str:
    """Extract the leading host label from an account URL.

    A value without ``://`` is parsed as if prefixed with ``https://``. The
    result is the text before the first ``.`` of the parsed network location
    (falling back to the parsed path when the location is empty), stripped of
    surrounding whitespace. An empty input yields an empty string.
    """
    if not account_url:
        return ""
    candidate = account_url if "://" in account_url else f"https://{account_url}"
    parts = urllib.parse.urlparse(candidate)
    host = parts.netloc or parts.path
    first_label, _, _ = host.partition(".")
    return first_label.strip()
93
+
94
+
95
def _azure_account_container_from_uri(path: str) -> tuple[str, str]:
    """Split a ``container@account...`` storage URI into ``(account, container)``.

    Only the ``wasbs``, ``wasb``, ``abfss`` and ``abfs`` schemes are accepted,
    and the network location must contain ``@``. Anything else yields
    ``("", "")``. The account is the text before the first ``.`` of the host
    part; both values are whitespace-stripped.
    """
    parts = urllib.parse.urlparse(path)
    authority = parts.netloc
    supported_scheme = parts.scheme in {"wasbs", "wasb", "abfss", "abfs"}
    if not supported_scheme or "@" not in authority:
        return "", ""
    container, _, host = authority.partition("@")
    account = host.partition(".")[0]
    return account.strip(), container.strip()
101
+
102
+
103
def _set_conf(spark: Any, key: str, value: str) -> None:
    """Write one key/value pair through ``spark.conf.set``.

    Raises:
        RuntimeError: if ``spark`` has no ``conf`` or the conf has no ``set``
            attribute. Also raised (chained to the original error) when the
            set call fails with an error recognised by
            ``_is_spark_config_blocked`` and the key is an Azure SAS key or
            an S3A key.
        Exception: any other failure from ``conf.set`` is re-raised as-is.
    """
    conf = getattr(spark, "conf", None)
    setter_available = conf is not None and hasattr(conf, "set")
    if not setter_available:
        raise RuntimeError("Object-storage source auth requires a Spark session with spark.conf.set")
    try:
        conf.set(key, value)
    except Exception as exc:
        if not _is_spark_config_blocked(exc):
            raise
        # The runtime refused the setting: point the user at the supported
        # alternative for the credential family being configured.
        if key.startswith("fs.azure.sas."):
            raise RuntimeError(
                "Databricks serverless/Spark Connect blocked Spark SAS configuration. "
                "Use a Unity Catalog External Location or Volume, or configure direct SAS only in a runtime "
                "where Hadoop config fs.azure.sas.* is allowed."
            ) from exc
        if key.startswith(("fs.s3a.", "spark.hadoop.fs.s3a.")):
            raise RuntimeError(
                "Databricks serverless/Spark Connect blocked Spark S3 credential configuration. "
                "Use a Unity Catalog External Location or Volume, or configure source.auth for S3 only in a "
                "runtime where Hadoop config fs.s3a.* is allowed."
            ) from exc
        raise
124
+
125
+
126
def _is_spark_config_blocked(exc: Exception) -> bool:
    """Tell whether an exception message matches a blocked-config marker.

    The match is case-sensitive and succeeds when the message contains
    ``CONFIG_NOT_AVAILABLE`` or ``Configuration fs.azure.sas``.
    """
    text = str(exc)
    for marker in ("CONFIG_NOT_AVAILABLE", "Configuration fs.azure.sas"):
        if marker in text:
            return True
    return False
129
+
130
+
131
def _dict(value: object) -> dict[str, Any]:
    """Return a shallow copy of ``value`` if it is a dict, else an empty dict."""
    if isinstance(value, dict):
        return dict(value)
    return {}