contractforge-databricks 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contractforge_databricks/__init__.py +172 -0
- contractforge_databricks/adapter.py +69 -0
- contractforge_databricks/annotations/__init__.py +10 -0
- contractforge_databricks/annotations/application.py +52 -0
- contractforge_databricks/annotations/audit.py +49 -0
- contractforge_databricks/annotations/sql.py +142 -0
- contractforge_databricks/api.py +65 -0
- contractforge_databricks/bundles/__init__.py +9 -0
- contractforge_databricks/bundles/assets.py +47 -0
- contractforge_databricks/bundles/project.py +213 -0
- contractforge_databricks/bundles/project_config.py +133 -0
- contractforge_databricks/capabilities/__init__.py +17 -0
- contractforge_databricks/capabilities/builders.py +43 -0
- contractforge_databricks/capabilities/evaluate.py +162 -0
- contractforge_databricks/capabilities/mapping.py +36 -0
- contractforge_databricks/capabilities/models.py +44 -0
- contractforge_databricks/capabilities/runtime.py +111 -0
- contractforge_databricks/capabilities/uc.py +47 -0
- contractforge_databricks/cli.py +196 -0
- contractforge_databricks/cli_deploy.py +98 -0
- contractforge_databricks/cli_governance.py +142 -0
- contractforge_databricks/cli_io.py +91 -0
- contractforge_databricks/cli_maintenance.py +69 -0
- contractforge_databricks/coercion.py +31 -0
- contractforge_databricks/contract_extensions.py +70 -0
- contractforge_databricks/cost/__init__.py +11 -0
- contractforge_databricks/cost/model.py +22 -0
- contractforge_databricks/cost/report.py +65 -0
- contractforge_databricks/cost/sql.py +136 -0
- contractforge_databricks/dashboards/__init__.py +15 -0
- contractforge_databricks/dashboards/control_tables.py +150 -0
- contractforge_databricks/diagnostics/__init__.py +7 -0
- contractforge_databricks/diagnostics/explain.py +40 -0
- contractforge_databricks/environment.py +53 -0
- contractforge_databricks/evidence/__init__.py +98 -0
- contractforge_databricks/evidence/ddl.py +35 -0
- contractforge_databricks/evidence/governance_log.py +175 -0
- contractforge_databricks/evidence/helpers.py +29 -0
- contractforge_databricks/evidence/ops_log.py +210 -0
- contractforge_databricks/evidence/records.py +27 -0
- contractforge_databricks/evidence/run_log.py +74 -0
- contractforge_databricks/evidence/schemas.py +7 -0
- contractforge_databricks/evidence/sql.py +144 -0
- contractforge_databricks/evidence/tables.py +20 -0
- contractforge_databricks/evidence/writer.py +118 -0
- contractforge_databricks/execution/__init__.py +70 -0
- contractforge_databricks/execution/delta_basic.py +57 -0
- contractforge_databricks/execution/hash_diff.py +126 -0
- contractforge_databricks/execution/hash_diff_latest.py +142 -0
- contractforge_databricks/execution/replace_partitions.py +40 -0
- contractforge_databricks/execution/results.py +5 -0
- contractforge_databricks/execution/retry.py +36 -0
- contractforge_databricks/execution/scd2.py +213 -0
- contractforge_databricks/execution/scd2_deletes.py +65 -0
- contractforge_databricks/execution/scd2_late.py +30 -0
- contractforge_databricks/execution/snapshot.py +77 -0
- contractforge_databricks/execution/sql_merge.py +85 -0
- contractforge_databricks/execution/tables.py +98 -0
- contractforge_databricks/execution/windows.py +58 -0
- contractforge_databricks/governance/__init__.py +30 -0
- contractforge_databricks/governance/access.py +185 -0
- contractforge_databricks/governance/application.py +93 -0
- contractforge_databricks/governance/drift.py +49 -0
- contractforge_databricks/governance/runtime.py +60 -0
- contractforge_databricks/governance/sql.py +31 -0
- contractforge_databricks/governance/validation.py +135 -0
- contractforge_databricks/lakeflow/__init__.py +21 -0
- contractforge_databricks/lakeflow/compatibility.py +194 -0
- contractforge_databricks/lakeflow/rendering.py +175 -0
- contractforge_databricks/lineage/__init__.py +7 -0
- contractforge_databricks/lineage/openlineage.py +182 -0
- contractforge_databricks/maintenance/__init__.py +27 -0
- contractforge_databricks/maintenance/retention.py +90 -0
- contractforge_databricks/maintenance/sql.py +68 -0
- contractforge_databricks/metrics/__init__.py +19 -0
- contractforge_databricks/metrics/history.py +21 -0
- contractforge_databricks/metrics/write.py +63 -0
- contractforge_databricks/operations/__init__.py +4 -0
- contractforge_databricks/operations/application.py +38 -0
- contractforge_databricks/operations/sql.py +95 -0
- contractforge_databricks/parity/__init__.py +18 -0
- contractforge_databricks/parity/catalog.py +59 -0
- contractforge_databricks/parity/models.py +7 -0
- contractforge_databricks/parity/scenarios.py +111 -0
- contractforge_databricks/partitioning/__init__.py +3 -0
- contractforge_databricks/partitioning/predicates.py +28 -0
- contractforge_databricks/preparation/__init__.py +47 -0
- contractforge_databricks/preparation/deduplicate.py +87 -0
- contractforge_databricks/preparation/encoding.py +37 -0
- contractforge_databricks/preparation/hashing.py +18 -0
- contractforge_databricks/preparation/pyspark.py +178 -0
- contractforge_databricks/preparation/pyspark_staging.py +70 -0
- contractforge_databricks/preparation/shape.py +209 -0
- contractforge_databricks/preparation/shape_validation.py +94 -0
- contractforge_databricks/preparation/staging.py +17 -0
- contractforge_databricks/preparation/zip_arrays.py +51 -0
- contractforge_databricks/presets/__init__.py +3 -0
- contractforge_databricks/presets/base.py +24 -0
- contractforge_databricks/presets/bronze.py +57 -0
- contractforge_databricks/presets/catalog.py +22 -0
- contractforge_databricks/presets/core.py +134 -0
- contractforge_databricks/presets/gold.py +62 -0
- contractforge_databricks/presets/modifiers.py +51 -0
- contractforge_databricks/presets/runtime.py +22 -0
- contractforge_databricks/presets/silver.py +101 -0
- contractforge_databricks/presets/write_engine.py +57 -0
- contractforge_databricks/quality/__init__.py +41 -0
- contractforge_databricks/quality/evaluation.py +178 -0
- contractforge_databricks/quality/persistence.py +81 -0
- contractforge_databricks/quality/registry.py +134 -0
- contractforge_databricks/quality/results.py +17 -0
- contractforge_databricks/quality/sql.py +113 -0
- contractforge_databricks/rendering/__init__.py +11 -0
- contractforge_databricks/rendering/bundle.py +93 -0
- contractforge_databricks/rendering/markdown.py +50 -0
- contractforge_databricks/rendering/names.py +56 -0
- contractforge_databricks/results.py +15 -0
- contractforge_databricks/runtime/__init__.py +101 -0
- contractforge_databricks/runtime/available_now.py +147 -0
- contractforge_databricks/runtime/bundles.py +211 -0
- contractforge_databricks/runtime/cache.py +20 -0
- contractforge_databricks/runtime/control_tables.py +19 -0
- contractforge_databricks/runtime/deploy.py +197 -0
- contractforge_databricks/runtime/detection.py +114 -0
- contractforge_databricks/runtime/dry_run.py +46 -0
- contractforge_databricks/runtime/errors.py +54 -0
- contractforge_databricks/runtime/file_selection.py +109 -0
- contractforge_databricks/runtime/finalization.py +168 -0
- contractforge_databricks/runtime/governance.py +37 -0
- contractforge_databricks/runtime/hooks.py +45 -0
- contractforge_databricks/runtime/http_file.py +37 -0
- contractforge_databricks/runtime/http_retry.py +15 -0
- contractforge_databricks/runtime/http_safety.py +9 -0
- contractforge_databricks/runtime/json_materialization.py +97 -0
- contractforge_databricks/runtime/lineage.py +164 -0
- contractforge_databricks/runtime/maintenance.py +43 -0
- contractforge_databricks/runtime/merge_validation.py +98 -0
- contractforge_databricks/runtime/metadata.py +21 -0
- contractforge_databricks/runtime/metrics.py +34 -0
- contractforge_databricks/runtime/models.py +32 -0
- contractforge_databricks/runtime/options.py +33 -0
- contractforge_databricks/runtime/orchestration_context.py +185 -0
- contractforge_databricks/runtime/orchestrator.py +147 -0
- contractforge_databricks/runtime/partitioning.py +93 -0
- contractforge_databricks/runtime/quality_quarantine.py +92 -0
- contractforge_databricks/runtime/rest_api.py +46 -0
- contractforge_databricks/runtime/rest_auth.py +21 -0
- contractforge_databricks/runtime/rest_pagination.py +21 -0
- contractforge_databricks/runtime/run_payload.py +177 -0
- contractforge_databricks/runtime/schema.py +106 -0
- contractforge_databricks/runtime/source_metadata.py +30 -0
- contractforge_databricks/runtime/source_registry.py +43 -0
- contractforge_databricks/runtime/source_schema.py +24 -0
- contractforge_databricks/runtime/sources.py +208 -0
- contractforge_databricks/runtime/spark.py +183 -0
- contractforge_databricks/runtime/spark_defaults.py +35 -0
- contractforge_databricks/runtime/storage_auth.py +132 -0
- contractforge_databricks/runtime/streaming.py +131 -0
- contractforge_databricks/runtime/success.py +104 -0
- contractforge_databricks/runtime/utils.py +52 -0
- contractforge_databricks/runtime/watermark.py +71 -0
- contractforge_databricks/runtime/windows.py +184 -0
- contractforge_databricks/runtime/write.py +66 -0
- contractforge_databricks/runtime/write_flow.py +146 -0
- contractforge_databricks/runtime/write_strategy.py +40 -0
- contractforge_databricks/schema/__init__.py +21 -0
- contractforge_databricks/schema/diff.py +11 -0
- contractforge_databricks/schema/policy.py +33 -0
- contractforge_databricks/schema/sync.py +23 -0
- contractforge_databricks/security/__init__.py +21 -0
- contractforge_databricks/security/errors.py +5 -0
- contractforge_databricks/security/redaction.py +5 -0
- contractforge_databricks/security/secrets.py +114 -0
- contractforge_databricks/security/source_policy.py +17 -0
- contractforge_databricks/shapes/__init__.py +3 -0
- contractforge_databricks/shapes/sql.py +123 -0
- contractforge_databricks/sources/__init__.py +67 -0
- contractforge_databricks/sources/artifacts.py +100 -0
- contractforge_databricks/sources/autoloader.py +48 -0
- contractforge_databricks/sources/bounded_streams.py +44 -0
- contractforge_databricks/sources/classification.py +115 -0
- contractforge_databricks/sources/delta_share.py +21 -0
- contractforge_databricks/sources/files.py +48 -0
- contractforge_databricks/sources/http_file.py +46 -0
- contractforge_databricks/sources/interpret.py +76 -0
- contractforge_databricks/sources/jdbc.py +32 -0
- contractforge_databricks/sources/metadata.py +18 -0
- contractforge_databricks/sources/native_passthrough.py +33 -0
- contractforge_databricks/sources/rds_iam.py +15 -0
- contractforge_databricks/sources/rds_iam_runtime.py +191 -0
- contractforge_databricks/sources/rest_api.py +33 -0
- contractforge_databricks/sources/support.py +50 -0
- contractforge_databricks/sources/table_refs.py +65 -0
- contractforge_databricks/sql/__init__.py +4 -0
- contractforge_databricks/sql/identifiers.py +17 -0
- contractforge_databricks/sql/literals.py +36 -0
- contractforge_databricks/state/__init__.py +39 -0
- contractforge_databricks/state/ddl.py +24 -0
- contractforge_databricks/state/migrations.py +146 -0
- contractforge_databricks/state/queries.py +149 -0
- contractforge_databricks/state/sql.py +116 -0
- contractforge_databricks/state/tables.py +9 -0
- contractforge_databricks/state/writer.py +83 -0
- contractforge_databricks/templates/__init__.py +15 -0
- contractforge_databricks/templates/catalog.py +205 -0
- contractforge_databricks/templates/catalog_parity.py +85 -0
- contractforge_databricks/templates/core.py +83 -0
- contractforge_databricks/templates/enrichment.py +175 -0
- contractforge_databricks/transforms/__init__.py +3 -0
- contractforge_databricks/transforms/sql.py +118 -0
- contractforge_databricks/watermark/__init__.py +6 -0
- contractforge_databricks/watermark/sql.py +91 -0
- contractforge_databricks/write_modes/__init__.py +20 -0
- contractforge_databricks/write_modes/registry.py +44 -0
- contractforge_databricks/write_modes/sql.py +33 -0
- contractforge_databricks/write_modes/strategy.py +192 -0
- contractforge_databricks-0.1.0.dist-info/METADATA +34 -0
- contractforge_databricks-0.1.0.dist-info/RECORD +220 -0
- contractforge_databricks-0.1.0.dist-info/WHEEL +4 -0
- contractforge_databricks-0.1.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
"""Databricks runtime source resolution.
|
|
2
|
+
|
|
3
|
+
The functions in this module intentionally receive ``spark`` as an argument so
|
|
4
|
+
the adapter stays importable outside Databricks and PySpark remains optional.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from contractforge_core.connectors import (
|
|
12
|
+
catalog_source_query,
|
|
13
|
+
catalog_source_table_or_path,
|
|
14
|
+
delta_share_options,
|
|
15
|
+
eventhubs_bounded_options,
|
|
16
|
+
file_reader_options,
|
|
17
|
+
file_source_format,
|
|
18
|
+
is_available_now_stream_source,
|
|
19
|
+
is_bounded_stream_source,
|
|
20
|
+
is_catalog_source,
|
|
21
|
+
is_delta_share_source,
|
|
22
|
+
is_file_source,
|
|
23
|
+
is_http_file_source,
|
|
24
|
+
is_kafka_stream_source,
|
|
25
|
+
is_rest_api_connector,
|
|
26
|
+
jdbc_common_options,
|
|
27
|
+
kafka_bounded_options,
|
|
28
|
+
stream_source_format,
|
|
29
|
+
)
|
|
30
|
+
from contractforge_core.runtime import PreparedInput
|
|
31
|
+
from contractforge_core.runtime import QueryOne
|
|
32
|
+
from contractforge_core.semantic import SemanticContract
|
|
33
|
+
from contractforge_databricks.preparation import apply_contract_preparation, apply_write_staging
|
|
34
|
+
from contractforge_databricks.runtime.file_selection import selected_file_load_path
|
|
35
|
+
from contractforge_databricks.runtime.http_file import resolve_http_file_dataframe
|
|
36
|
+
from contractforge_databricks.runtime.rest_api import resolve_rest_api_dataframe
|
|
37
|
+
from contractforge_databricks.runtime.source_schema import apply_declared_schema
|
|
38
|
+
from contractforge_databricks.runtime.source_metadata import (
|
|
39
|
+
schema_types,
|
|
40
|
+
source_metadata,
|
|
41
|
+
source_metadata_with_watermark,
|
|
42
|
+
source_name,
|
|
43
|
+
)
|
|
44
|
+
from contractforge_databricks.runtime.source_registry import get_source_resolver
|
|
45
|
+
from contractforge_databricks.runtime.watermark import collect_previous_watermark
|
|
46
|
+
from contractforge_databricks.runtime.storage_auth import configure_object_storage_access
|
|
47
|
+
from contractforge_databricks.security import resolve_databricks_secret_placeholders, validate_source_security
|
|
48
|
+
from contractforge_databricks.sources.interpret import interpret_incremental_files_source, is_incremental_file_source
|
|
49
|
+
from contractforge_databricks.sources.rds_iam_runtime import materialize_rds_iam_options
|
|
50
|
+
from contractforge_databricks.sources.table_refs import (
|
|
51
|
+
contract_with_databricks_source_refs,
|
|
52
|
+
databricks_table_ref_resolver,
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
# Connector names that resolve_source_dataframe routes to the Spark "jdbc" reader.
_JDBC_SOURCE_ALIASES = {"jdbc", "postgres", "mysql", "sqlserver", "oracle", "redshift", "db2", "mariadb", "snowflake_jdbc", "bigquery_jdbc"}
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def resolve_source_dataframe(spark: Any, source: dict[str, Any], *, contract: SemanticContract | None = None) -> Any:
    """Turn a core source definition into a Spark DataFrame.

    The source is security-validated and its secret placeholders are resolved
    first. Dispatch then follows a fixed priority and the first match wins:
    registered custom resolver, catalog/SQL source, incremental files,
    plain files, HTTP file, REST API connector, JDBC, bounded or
    available-now stream, Delta Sharing.

    Args:
        spark: Session-like object exposing ``read`` / ``readStream`` / ``sql``.
        source: Raw source mapping taken from the contract.
        contract: Optional contract; only forwarded to catalog resolution.

    Returns:
        Whatever the selected reader or resolver returns.

    Raises:
        ValueError: If no branch recognises the source.
    """
    validate_source_security(source)
    resolved = resolve_databricks_secret_placeholders(source)
    kind = resolved.get("type")

    # Custom resolvers are keyed by connector name, falling back to the type.
    registry_key = str(resolved.get("connector") or kind or "")
    plugin = get_source_resolver(registry_key)
    if plugin is not None:
        return plugin.resolve(spark, resolved)

    if is_catalog_source(resolved):
        return _resolve_catalog_source(spark, resolved, contract=contract)

    if is_incremental_file_source(resolved):
        return _resolve_autoloader_source(spark, resolved)

    if is_file_source(resolved):
        reader_options = file_reader_options(resolved)
        storage_path, reader_options = configure_object_storage_access(spark, resolved, reader_options)
        # Storage configuration may hand back a rewritten path; carry it forward.
        if storage_path is not None:
            resolved = {**resolved, "path": storage_path}
        file_reader = spark.read
        file_format = file_source_format(resolved)
        load_path = selected_file_load_path(spark, resolved, reader_options)
        return _read_source_with_options(file_reader, file_format, reader_options, load_path, resolved)

    if is_http_file_source(resolved):
        return resolve_http_file_dataframe(spark, resolved)

    if is_rest_api_connector(resolved):
        return resolve_rest_api_dataframe(spark, resolved)

    if kind in {"jdbc", "connector"} or resolved.get("connector") in _JDBC_SOURCE_ALIASES:
        base_options = jdbc_common_options(resolved)
        jdbc_options = materialize_rds_iam_options(base_options, auth=resolved.get("auth"))
        return _read_with_options(spark.read, "jdbc", jdbc_options, None)

    if is_bounded_stream_source(resolved) or is_available_now_stream_source(resolved):
        if is_kafka_stream_source(resolved):
            stream_options = kafka_bounded_options(resolved)
        else:
            stream_options = eventhubs_bounded_options(resolved)
        stream_format = stream_source_format(resolved)
        # Available-now sources use the streaming reader; bounded ones the batch reader.
        if is_available_now_stream_source(resolved):
            stream_reader = spark.readStream
        else:
            stream_reader = spark.read
        return _read_with_options(stream_reader, stream_format, stream_options, None)

    if is_delta_share_source(resolved):
        share_options = delta_share_options(resolved)
        return _read_with_options(spark.read, "deltaSharing", share_options, None)

    raise ValueError(f"source.type={kind!r} cannot be resolved by the Databricks runtime source resolver")
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def prepare_source_view(
    spark: Any,
    source: dict[str, Any],
    *,
    view_name: str,
    collect_metrics: bool = False,
) -> PreparedInput:
    """Resolve ``source`` and expose it as the temporary view ``view_name``.

    Args:
        spark: Session-like object passed through to source resolution.
        source: Raw source mapping.
        view_name: Name of the temporary view to create or replace.
        collect_metrics: When true, ``df.count()`` is executed to fill
            ``rows_read``; otherwise ``rows_read`` is reported as 0.

    Returns:
        A ``PreparedInput`` describing the registered view.
    """
    frame = resolve_source_dataframe(spark, source)
    frame.createOrReplaceTempView(view_name)

    # Tolerate frames without a ``columns`` attribute (or with a falsy one).
    raw_columns = getattr(frame, "columns", ()) or ()
    column_names = tuple(str(name) for name in raw_columns)

    if collect_metrics:
        row_total = int(frame.count())
    else:
        row_total = 0

    return PreparedInput(
        source_view=view_name,
        source_columns=column_names,
        source_schema=schema_types(frame),
        rows_read=row_total,
        source_name=source_name(source),
        source_metadata=source_metadata(source),
    )
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def prepare_contract_source_view(
    spark: Any,
    contract: SemanticContract,
    *,
    view_name: str,
    collect_metrics: bool = False,
    query_one: QueryOne | None = None,
    evidence_catalog: str = "main",
    evidence_schema: str = "ops",
) -> PreparedInput:
    """Resolve the contract's source, prepare it, and register a temporary view.

    Steps, in order: rewrite source references for Databricks, resolve the
    DataFrame, look up the previous watermark, apply contract preparation,
    apply write staging, then register the result as ``view_name``.

    Args:
        spark: Session-like object passed through to source resolution.
        contract: Contract whose ``source.raw`` mapping must be non-empty.
        view_name: Name of the temporary view to create or replace.
        collect_metrics: When true, ``df.count()`` is executed to fill
            ``rows_read``; otherwise ``rows_read`` is reported as 0.
        query_one: Optional single-row query callable forwarded to the
            watermark lookup.
        evidence_catalog: Catalog forwarded to the watermark lookup.
        evidence_schema: Schema forwarded to the watermark lookup.

    Returns:
        A ``PreparedInput`` describing the registered view.

    Raises:
        ValueError: If the contract has no structured source.
    """
    if not contract.source.raw:
        raise ValueError("prepare_contract_source_view requires a structured source contract")

    runtime_contract = contract_with_databricks_source_refs(contract)
    raw_source = runtime_contract.source.raw or {}
    frame = resolve_source_dataframe(spark, raw_source, contract=runtime_contract)

    # The watermark lookup and preparation use the caller's contract, not the rewritten one.
    watermark_column, watermark_previous = collect_previous_watermark(
        contract=contract,
        query_one=query_one,
        catalog=evidence_catalog,
        schema=evidence_schema,
    )
    frame = apply_contract_preparation(
        frame,
        contract,
        watermark_column=watermark_column,
        watermark_previous=watermark_previous,
    )
    frame = apply_write_staging(frame, contract)
    frame.createOrReplaceTempView(view_name)

    raw_columns = getattr(frame, "columns", ()) or ()
    column_names = tuple(str(name) for name in raw_columns)

    if collect_metrics:
        row_total = int(frame.count())
    else:
        row_total = 0

    return PreparedInput(
        source_view=view_name,
        source_columns=column_names,
        source_schema=schema_types(frame),
        rows_read=row_total,
        source_name=runtime_contract.source.name,
        source_metadata=source_metadata_with_watermark(runtime_contract.source.raw or {}, watermark_previous),
    )
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _resolve_catalog_source(spark: Any, source: dict[str, Any], *, contract: SemanticContract | None = None) -> Any:
    """Read a catalog-backed source: a SQL query, a path, or a table.

    A table-reference resolver is built only when a contract is supplied.
    SQL sources (``type`` or ``connector`` equal to ``"sql"``) run through
    ``spark.sql``; sources with a ``path`` but no ``table`` are loaded by
    format; everything else goes through ``spark.table``.
    """
    table_resolver = None if contract is None else databricks_table_ref_resolver(contract)

    if source.get("type") == "sql" or source.get("connector") == "sql":
        statement = catalog_source_query(source, table_ref_resolver=table_resolver)
        return spark.sql(statement)

    target = catalog_source_table_or_path(source, table_ref_resolver=table_resolver)

    # Anything with a table name, or without a path, is read as a table.
    if not source.get("path") or source.get("table"):
        return spark.table(str(target))

    declared_type = str(source.get("type") or "delta")
    if declared_type == "delta_table":
        reader_format = "delta"
    else:
        # The reader format is the declared type with "_table" removed.
        reader_format = declared_type.replace("_table", "")
    return _read_with_options(spark.read, reader_format, {}, target)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _resolve_autoloader_source(spark: Any, source: dict[str, Any]) -> Any:
    """Build a ``cloudFiles`` streaming read from an incremental-files source.

    The format defaults to ``"json"``. Options from the interpreted source
    are stringified and applied next, so they may override the format.
    The schema location and schema hints, when present, are applied last.
    """
    spec = interpret_incremental_files_source(source)

    reader_options = {"cloudFiles.format": str(spec.get("format") or "json")}
    reader_options.update((str(key), str(value)) for key, value in spec.get("options", {}).items())

    tracking_location = spec.get("schema_tracking_location")
    if tracking_location:
        reader_options["cloudFiles.schemaLocation"] = str(tracking_location)

    hints = spec.get("schema_hints")
    if hints:
        reader_options["cloudFiles.schemaHints"] = str(hints)

    return _read_with_options(spark.readStream, "cloudFiles", reader_options, spec.get("path"))
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _read_with_options(reader: Any, source_format: str, options: dict[str, str], path: object | None) -> Any:
|
|
191
|
+
builder = reader.format(source_format)
|
|
192
|
+
for key, value in sorted(options.items()):
|
|
193
|
+
builder = builder.option(key, value)
|
|
194
|
+
return builder.load(path if isinstance(path, list) else str(path)) if path is not None else builder.load()
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _read_source_with_options(
    reader: Any,
    source_format: str,
    options: dict[str, str],
    path: object | None,
    source: dict[str, Any],
) -> Any:
    """Configure ``reader`` with format, options and declared schema, then load.

    Options are applied in sorted key order, then ``apply_declared_schema``
    is given the builder and the source. When ``path`` is ``None`` the
    builder's ``load()`` is called with no argument; a list path is passed
    through unchanged and any other path is converted with ``str``.
    """
    configured = reader.format(source_format)
    for name in sorted(options):
        configured = configured.option(name, options[name])
    configured = apply_declared_schema(configured, source)

    if path is None:
        return configured.load()

    load_target = path if isinstance(path, list) else str(path)
    return configured.load(load_target)
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""Databricks/Spark runtime convenience helpers with lazy imports."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import platform
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from contractforge_databricks.capabilities.runtime import is_serverless_conf
|
|
11
|
+
from contractforge_databricks.runtime.detection import _collect_spark_conf
|
|
12
|
+
from contractforge_databricks.sql import quote_identifier, quote_table_name
|
|
13
|
+
|
|
14
|
+
# Module-level logger; the helpers below use it for debug-level fallback diagnostics.
logger = logging.getLogger(__name__)
|
|
15
|
+
# Memoized detect_serverless() results, keyed by id(session).
_SERVERLESS_CACHE: dict[int, bool] = {}
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def get_active_spark() -> Any:
    """Return the current Spark session, resolved lazily at call time.

    The Databricks SDK runtime session is preferred. If importing it fails
    or it is ``None``, the active PySpark session (or the class-level
    ``_instantiatedSession``) is used instead.

    Raises:
        RuntimeError: If PySpark cannot be imported or no session exists.
    """
    try:
        from databricks.sdk.runtime import spark as dbx_spark  # type: ignore
    except Exception as exc:
        logger.debug("Databricks runtime spark session was not available; falling back to PySpark.", exc_info=exc)
    else:
        if dbx_spark is not None:
            return dbx_spark

    try:
        from pyspark.sql import SparkSession
    except Exception as exc:
        raise RuntimeError("PySpark is required to resolve an active Spark session.") from exc

    active = SparkSession.getActiveSession()
    if not active:
        active = getattr(SparkSession, "_instantiatedSession", None)
    if active is None:
        raise RuntimeError("No active SparkSession was found.")
    return active
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def runtime_info(spark: Any | None = None) -> dict[str, str | None]:
    """Summarise the runtime as type, Spark version and Python version.

    Args:
        spark: Optional session; when falsy, an active session is looked up
            and ``None`` is tolerated.

    Returns:
        A dict with ``runtime_type`` (``"serverless"`` or ``"classic"``),
        ``spark_version`` (``None`` without a session) and ``python_version``.
    """
    session = spark or _maybe_active_spark()
    spark_version = None if session is None else getattr(session, "version", None)
    runtime_type = "serverless" if detect_serverless(session) else "classic"
    return {
        "runtime_type": runtime_type,
        "spark_version": spark_version,
        "python_version": platform.python_version(),
    }
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def detect_serverless(spark: Any | None = None) -> bool:
    """Report whether the session looks like serverless compute.

    Results are memoized in ``_SERVERLESS_CACHE`` under ``id(session)``.
    Without a resolvable session the answer is ``False`` and nothing is cached.
    """
    session = spark or _maybe_active_spark()
    if session is None:
        return False

    key = id(session)
    try:
        return _SERVERLESS_CACHE[key]
    except KeyError:
        pass

    conf = _collect_spark_conf(session)
    # Either the shared conf check or the JOB_SERVERLESS cluster-source tag is enough.
    flagged = is_serverless_conf(conf) or conf.get("spark.databricks.clusterUsageTags.clusterSource") == "JOB_SERVERLESS"
    _SERVERLESS_CACHE[key] = flagged
    return flagged
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def safe_cache(df: Any, *, enabled: bool = True, serverless: bool | None = None) -> Any:
    """Cache ``df`` when allowed, otherwise return it untouched.

    Args:
        df: Object exposing ``cache()``.
        enabled: When false, caching is skipped.
        serverless: Explicit serverless flag; ``None`` triggers detection.

    Returns:
        ``df.cache()`` on success, or ``df`` itself when caching is skipped
        or fails with an unsupported-cache error. Other errors propagate.
    """
    if not enabled:
        return df

    on_serverless = detect_serverless() if serverless is None else serverless
    if on_serverless:
        return df

    try:
        return df.cache()
    except Exception as exc:
        if not _is_unsupported_cache_error(exc):
            raise
        return df
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def safe_unpersist(df: Any, *, enabled: bool = True, serverless: bool | None = None) -> None:
    """Unpersist ``df`` when allowed, ignoring unsupported-cache errors.

    Args:
        df: Object exposing ``unpersist()``.
        enabled: When false, nothing is done.
        serverless: Explicit serverless flag; ``None`` triggers detection.
    """
    if not enabled:
        return

    on_serverless = detect_serverless() if serverless is None else serverless
    if on_serverless:
        return

    try:
        df.unpersist()
    except Exception as exc:
        if _is_unsupported_cache_error(exc):
            return
        raise
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def safe_cache_table(spark: Any, table_name: str, *, enabled: bool = True, serverless: bool | None = None) -> bool:
    """Cache a table when allowed and report whether it was cached.

    ``spark.catalog.cacheTable`` is used when it is callable; otherwise a
    ``CACHE TABLE`` statement is issued.

    Args:
        spark: Session-like object exposing ``catalog`` and/or ``sql``.
        table_name: Table name, possibly qualified (``catalog.schema.table``).
        enabled: When false, caching is skipped.
        serverless: Explicit serverless flag; ``None`` triggers detection
            against ``spark``.

    Returns:
        ``True`` if the table was cached; ``False`` when caching was skipped
        or failed with an unsupported-cache error. Other errors propagate.
    """
    if not enabled:
        return False

    on_serverless = detect_serverless(spark) if serverless is None else serverless
    if on_serverless:
        return False

    try:
        catalog = getattr(spark, "catalog", None)
        cache_table = getattr(catalog, "cacheTable", None)
        if callable(cache_table):
            cache_table(table_name)
        else:
            # Quote as a table name, as this module does elsewhere, rather than as a
            # single identifier, which would mis-quote a qualified name.
            spark.sql(f"CACHE TABLE {quote_table_name(table_name)}")
        return True
    except Exception as exc:
        if _is_unsupported_cache_error(exc):
            return False
        raise
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def safe_uncache_table(spark: Any, table_name: str, *, enabled: bool = True, serverless: bool | None = None) -> None:
    """Uncache a table when allowed, ignoring unsupported-cache errors.

    ``spark.catalog.uncacheTable`` is used when it is callable; otherwise an
    ``UNCACHE TABLE`` statement is issued.

    Args:
        spark: Session-like object exposing ``catalog`` and/or ``sql``.
        table_name: Table name, possibly qualified (``catalog.schema.table``).
        enabled: When false, nothing is done.
        serverless: Explicit serverless flag; ``None`` triggers detection
            against ``spark``.
    """
    if not enabled:
        return

    on_serverless = detect_serverless(spark) if serverless is None else serverless
    if on_serverless:
        return

    try:
        catalog = getattr(spark, "catalog", None)
        uncache_table = getattr(catalog, "uncacheTable", None)
        if callable(uncache_table):
            uncache_table(table_name)
        else:
            # Quote as a table name, as this module does elsewhere, rather than as a
            # single identifier, which would mis-quote a qualified name.
            spark.sql(f"UNCACHE TABLE {quote_table_name(table_name)}")
    except Exception as exc:
        if not _is_unsupported_cache_error(exc):
            raise
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def table_exists(full_name: str, *, spark: Any | None = None) -> bool:
    """Check whether ``full_name`` exists, trying the catalog API and then SQL.

    ``catalog.tableExists`` is consulted first. If it raises or returns a
    falsy value, a ``DESCRIBE TABLE`` statement is attempted. Failures of
    either probe are logged at debug level and never raised.

    Args:
        full_name: Table name to probe.
        spark: Optional session; when falsy, the active session is resolved.

    Returns:
        ``True`` if either probe succeeds, ``False`` otherwise.
    """
    session = spark or get_active_spark()

    try:
        found = session.catalog.tableExists(full_name)
    except Exception as exc:
        logger.debug("Spark catalog tableExists failed for %s; falling back to DESCRIBE TABLE.", full_name, exc_info=exc)
        found = False
    if found:
        return True

    try:
        session.sql(f"DESCRIBE TABLE {quote_table_name(full_name)}")
    except Exception as exc:
        logger.debug("Spark DESCRIBE TABLE failed for %s.", full_name, exc_info=exc)
        return False
    return True
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def schema_signature(df: Any) -> str:
    """Serialise the schema of ``df`` as a JSON list of [name, type, nullable].

    The type is each field's ``dataType.simpleString()``; non-ASCII
    characters are kept as-is in the output.
    """
    entries = []
    for field in df.schema.fields:
        entries.append((field.name, field.dataType.simpleString(), field.nullable))
    return json.dumps(entries, ensure_ascii=False)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def fix_encoding(df: Any, *, enabled: bool, encoding: str, columns: tuple[str, ...] = ()) -> Any:
    """Re-decode string columns of ``df`` using ``encoding``.

    Args:
        df: DataFrame whose schema is inspected for string-typed fields.
        enabled: When false, ``df`` is returned unchanged and PySpark is
            not imported.
        encoding: Charset name passed to Spark's ``decode``.
        columns: Columns to fix; empty means every string column. Names
            that are not string columns are ignored.

    Returns:
        The DataFrame with each targeted column cast to binary and decoded.
    """
    if not enabled:
        return df

    from pyspark.sql import functions as sql_functions  # type: ignore

    string_cols = [field.name for field in df.schema.fields if field.dataType.typeName() == "string"]
    targets = columns if columns else tuple(string_cols)
    for name in targets:
        if name not in string_cols:
            continue
        as_binary = sql_functions.col(name).cast("binary")
        df = df.withColumn(name, sql_functions.decode(as_binary, encoding))
    return df
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def sync_delta_schema(
    *,
    df: Any,
    target_table: str,
    schema_changes: dict[str, Any],
    policy: str,
    spark: Any | None = None,
) -> None:
    """Apply additive and allowed type changes from ``schema_changes`` to a table.

    Nothing happens unless ``policy`` is ``"permissive"`` or
    ``"additive_only"`` and the target table exists.

    Args:
        df: DataFrame whose schema supplies the column types for new columns.
        target_table: Table to alter.
        schema_changes: Mapping read for ``added_columns`` and
            ``type_changes``. Each applied type change has ``"applied"``
            set to ``True`` in place.
        policy: Schema policy name.
        spark: Optional session; when falsy, the active session is resolved.
    """
    session = spark or get_active_spark()
    if policy not in {"permissive", "additive_only"}:
        return
    if not table_exists(target_table, spark=session):
        return

    column_types = {field.name: field.dataType.simpleString() for field in df.schema.fields}

    # Only add columns that the incoming DataFrame actually carries.
    new_columns = [name for name in schema_changes.get("added_columns", ()) if name in column_types]
    if new_columns:
        cols_sql = ", ".join(f"{quote_identifier(name)} {column_types[name]}" for name in new_columns)
        session.sql(f"ALTER TABLE {quote_table_name(target_table)} ADD COLUMNS ({cols_sql})")

    for change in schema_changes.get("type_changes", ()):
        if change.get("allowed"):
            column = str(change["column"])
            source_type = str(change["source"])
            session.sql(f"ALTER TABLE {quote_table_name(target_table)} ALTER COLUMN {quote_identifier(column)} TYPE {source_type}")
            # Record on the caller's dict that this change was executed.
            change["applied"] = True
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _maybe_active_spark() -> Any | None:
    """Return the active Spark session, or ``None`` if resolving it fails.

    Any exception from ``get_active_spark`` is logged at debug level.
    """
    try:
        session = get_active_spark()
    except Exception as exc:
        logger.debug("No active Spark session could be resolved.", exc_info=exc)
        return None
    else:
        return session
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _is_unsupported_cache_error(exc: Exception) -> bool:
|
|
182
|
+
text = str(exc).upper()
|
|
183
|
+
return "NOT_SUPPORTED" in text or "SERVERLESS" in text
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Spark-backed runtime defaults for Databricks bundle execution."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import replace
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from contractforge_core.runtime import QueryOne
|
|
9
|
+
from contractforge_databricks.runtime.models import DatabricksIngestOptions
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def with_spark_runtime_defaults(spark: Any, opts: DatabricksIngestOptions, target: str) -> DatabricksIngestOptions:
    """Fill ``opts.target_schema`` from the schema read for ``target`` when it is unset.

    ``opts`` is returned untouched when a target schema is already present,
    when ``opts.ensure_table`` is falsy, or when ``spark_target_schema`` yields
    ``None``. Otherwise a copy of ``opts`` carrying the schema is returned.
    """
    needs_lookup = opts.target_schema is None and opts.ensure_table
    if not needs_lookup:
        return opts
    discovered = spark_target_schema(spark, target)
    if discovered is None:
        return opts
    return replace(opts, target_schema=discovered)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def spark_target_schema(spark: Any, target: str) -> dict[str, str] | None:
|
|
20
|
+
try:
|
|
21
|
+
schema = spark.table(target).schema
|
|
22
|
+
except Exception:
|
|
23
|
+
return None
|
|
24
|
+
return {str(field.name): str(field.dataType.simpleString()).lower() for field in schema.fields}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def spark_query_one(spark: Any) -> QueryOne | None:
    """Build a single-row query callable backed by ``spark.sql``.

    Returns ``None`` when ``spark`` exposes no callable ``sql`` attribute.
    Otherwise the returned callable runs a statement, collects at most one
    row, and returns it as a dict (``None`` when the result is empty).
    """
    sql_attr = getattr(spark, "sql", None)
    if not callable(sql_attr):
        return None

    def query_one(statement: str) -> dict[str, Any] | None:
        first_rows = spark.sql(statement).limit(1).collect()
        if not first_rows:
            return None
        return first_rows[0].asDict()

    return query_one
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""Databricks runtime object-storage credential helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import urllib.parse
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from contractforge_core.connectors import object_storage_provider
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def configure_object_storage_access(
    spark: Any,
    source: dict[str, Any],
    options: dict[str, str],
) -> tuple[object | None, dict[str, str]]:
    """Apply provider-specific storage credentials to Spark and return ``(path, options)``.

    For provider ``"s3"`` the options are replaced by whatever ``_configure_s3``
    returns; for ``"azure_blob"`` the path is replaced by whatever
    ``_configure_azure_blob`` returns. Any other provider gets ``source["path"]``
    and ``options`` back unchanged.
    """

    storage_kind = object_storage_provider(source)
    read_path = source.get("path")
    reader_options = options
    if storage_kind == "s3":
        reader_options = _configure_s3(spark, source, options)
    elif storage_kind == "azure_blob":
        read_path = _configure_azure_blob(spark, source, read_path)
    return read_path, reader_options
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _configure_s3(spark: Any, source: dict[str, Any], options: dict[str, str]) -> dict[str, str]:
    """Push S3A settings and credentials into the Spark conf and return the other options.

    Options whose key starts with ``fs.s3a.`` or ``spark.hadoop.fs.s3a.`` are
    written through ``_set_conf``; every other option is returned for the
    reader. Credentials are looked up in ``source["auth"]`` under several
    aliases and, when present, written as ``fs.s3a.*`` settings together with
    the matching credentials-provider class.

    Raises:
        ValueError: if only one of access key / secret key is supplied.
    """
    s3a_prefixes = ("fs.s3a.", "spark.hadoop.fs.s3a.")
    passthrough: dict[str, str] = {}
    for name, setting in options.items():
        if not name.startswith(s3a_prefixes):
            passthrough[name] = setting
            continue
        _set_conf(spark, name, setting)

    credentials = _dict(source.get("auth"))

    def pick(*aliases: str) -> Any:
        # Mirrors an ``a or b or c`` chain: first truthy value, else the last lookup.
        chosen = None
        for alias in aliases:
            chosen = credentials.get(alias)
            if chosen:
                break
        return chosen

    access_key = pick("access_key_id", "access_key", "aws_access_key_id")
    secret_key = pick("secret_access_key", "secret_key", "aws_secret_access_key")
    session_token = pick("session_token", "token", "aws_session_token")

    has_access, has_secret = bool(access_key), bool(secret_key)
    if has_access != has_secret:
        raise ValueError("source.auth for connector=s3 requires access_key_id and secret_access_key together")
    if not has_access:
        # No static credentials supplied: leave the credential settings alone.
        return passthrough

    _set_conf(spark, "fs.s3a.access.key", str(access_key))
    _set_conf(spark, "fs.s3a.secret.key", str(secret_key))
    if session_token:
        _set_conf(spark, "fs.s3a.session.token", str(session_token))
        provider_class = "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider"
    else:
        provider_class = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"
    _set_conf(spark, "fs.s3a.aws.credentials.provider", provider_class)
    return passthrough
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _configure_azure_blob(spark: Any, source: dict[str, Any], path: object | None) -> object | None:
    """Register an Azure Blob SAS token with Spark and return the path to read.

    Without a SAS token in ``source["auth"]`` (keys ``sas_token`` or ``token``)
    the path is returned unchanged and nothing is configured. With a token, the
    account and container come from ``source["account_url"]`` /
    ``source["container"]`` when either is given, otherwise they are parsed out
    of ``path``. A scheme-less, non-empty path is rewritten to a ``wasbs://``
    URI when the account and container were given explicitly.

    Raises:
        ValueError: if the account or container cannot be determined.
    """
    credentials = _dict(source.get("auth"))
    sas_token = credentials.get("sas_token") or credentials.get("token")
    if not sas_token:
        return path

    account_url = str(source.get("account_url") or "").strip()
    container = str(source.get("container") or "").strip()

    if not (account_url or container):
        # Nothing explicit in the source: fall back to the URI itself.
        account, inferred_container = _azure_account_container_from_uri(str(path or ""))
        if not (account and inferred_container):
            raise ValueError(
                "auth.sas_token in connector=azure_blob requires source.account_url/source.container "
                "or path wasbs://container@account.blob.core.windows.net/..."
            )
        _configure_azure_blob_sas(spark, account, inferred_container, str(sas_token))
        return path

    account = _azure_account_from_url(account_url)
    if not account:
        raise ValueError("source.account_url is required for connector=azure_blob when source.container is used")
    if not container:
        raise ValueError("source.container is required for connector=azure_blob when source.account_url is used")
    _configure_azure_blob_sas(spark, account, container, str(sas_token))

    if not path or "://" in str(path):
        return path
    relative = str(path).lstrip("/")
    return f"wasbs://{container}@{account}.blob.core.windows.net/{relative}"
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _configure_azure_blob_sas(spark: Any, account: str, container: str, sas_token: str) -> None:
    """Store a SAS token in the Spark conf under the container/account-specific key.

    Surrounding whitespace and a single leading ``?`` are removed from the
    token before it is written.

    Raises:
        ValueError: if the token is empty after that clean-up.
    """
    cleaned = sas_token.strip()
    cleaned = cleaned[1:] if cleaned.startswith("?") else cleaned
    if cleaned == "":
        raise ValueError("auth.sas_token cannot be empty for connector=azure_blob")
    conf_key = f"fs.azure.sas.{container}.{account}.blob.core.windows.net"
    _set_conf(spark, conf_key, cleaned)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _azure_account_from_url(account_url: str) -> str:
    """Return the first dot-separated label of the host in ``account_url``.

    A missing scheme is tolerated by parsing the value as if it were prefixed
    with ``https://``. An empty input yields an empty string.
    """
    if not account_url:
        return ""
    normalized = account_url if "://" in account_url else f"https://{account_url}"
    parts = urllib.parse.urlparse(normalized)
    host = parts.netloc or parts.path
    first_label, _, _ = host.partition(".")
    return first_label.strip()
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _azure_account_container_from_uri(path: str) -> tuple[str, str]:
    """Split a ``scheme://container@account.host/...`` URI into ``(account, container)``.

    Only the ``wasbs``, ``wasb``, ``abfss`` and ``abfs`` schemes are recognised;
    anything else, or a URI without ``@`` in its authority, yields ``("", "")``.
    """
    parts = urllib.parse.urlparse(path)
    authority = parts.netloc
    if parts.scheme in ("wasbs", "wasb", "abfss", "abfs") and "@" in authority:
        container, _, host = authority.partition("@")
        account = host.partition(".")[0]
        return account.strip(), container.strip()
    return "", ""
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _set_conf(spark: Any, key: str, value: str) -> None:
    """Write ``key = value`` through ``spark.conf.set``.

    When the write fails and ``_is_spark_config_blocked`` recognises the error,
    keys under ``fs.azure.sas.``, ``fs.s3a.`` or ``spark.hadoop.fs.s3a.`` get a
    ``RuntimeError`` with remediation advice, chained to the original error.
    Every other failure is re-raised unchanged.

    Raises:
        RuntimeError: if ``spark`` has no ``conf`` with a ``set`` attribute, or
            the runtime blocked one of the storage keys listed above.
    """
    session_conf = getattr(spark, "conf", None)
    if session_conf is None or not hasattr(session_conf, "set"):
        raise RuntimeError("Object-storage source auth requires a Spark session with spark.conf.set")
    try:
        session_conf.set(key, value)
    except Exception as error:
        if not _is_spark_config_blocked(error):
            raise
        if key.startswith("fs.azure.sas."):
            advice = (
                "Databricks serverless/Spark Connect blocked Spark SAS configuration. "
                "Use a Unity Catalog External Location or Volume, or configure direct SAS only in a runtime "
                "where Hadoop config fs.azure.sas.* is allowed."
            )
        elif key.startswith(("fs.s3a.", "spark.hadoop.fs.s3a.")):
            advice = (
                "Databricks serverless/Spark Connect blocked Spark S3 credential configuration. "
                "Use a Unity Catalog External Location or Volume, or configure source.auth for S3 only in a "
                "runtime where Hadoop config fs.s3a.* is allowed."
            )
        else:
            # Blocked, but not a key this helper has specific advice for.
            raise
        raise RuntimeError(advice) from error
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _is_spark_config_blocked(exc: Exception) -> bool:
    """Report whether the message of ``exc`` contains one of the blocked-config markers.

    The markers are matched case-sensitively against ``str(exc)``.
    """
    text = str(exc)
    for marker in ("CONFIG_NOT_AVAILABLE", "Configuration fs.azure.sas"):
        if marker in text:
            return True
    return False
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _dict(value: object) -> dict[str, Any]:
    """Return a shallow copy of ``value`` when it is a dict, otherwise an empty dict."""
    if not isinstance(value, dict):
        return {}
    return dict(value)
|