contractforge-databricks 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contractforge_databricks/__init__.py +172 -0
- contractforge_databricks/adapter.py +69 -0
- contractforge_databricks/annotations/__init__.py +10 -0
- contractforge_databricks/annotations/application.py +52 -0
- contractforge_databricks/annotations/audit.py +49 -0
- contractforge_databricks/annotations/sql.py +142 -0
- contractforge_databricks/api.py +65 -0
- contractforge_databricks/bundles/__init__.py +9 -0
- contractforge_databricks/bundles/assets.py +47 -0
- contractforge_databricks/bundles/project.py +213 -0
- contractforge_databricks/bundles/project_config.py +133 -0
- contractforge_databricks/capabilities/__init__.py +17 -0
- contractforge_databricks/capabilities/builders.py +43 -0
- contractforge_databricks/capabilities/evaluate.py +162 -0
- contractforge_databricks/capabilities/mapping.py +36 -0
- contractforge_databricks/capabilities/models.py +44 -0
- contractforge_databricks/capabilities/runtime.py +111 -0
- contractforge_databricks/capabilities/uc.py +47 -0
- contractforge_databricks/cli.py +196 -0
- contractforge_databricks/cli_deploy.py +98 -0
- contractforge_databricks/cli_governance.py +142 -0
- contractforge_databricks/cli_io.py +91 -0
- contractforge_databricks/cli_maintenance.py +69 -0
- contractforge_databricks/coercion.py +31 -0
- contractforge_databricks/contract_extensions.py +70 -0
- contractforge_databricks/cost/__init__.py +11 -0
- contractforge_databricks/cost/model.py +22 -0
- contractforge_databricks/cost/report.py +65 -0
- contractforge_databricks/cost/sql.py +136 -0
- contractforge_databricks/dashboards/__init__.py +15 -0
- contractforge_databricks/dashboards/control_tables.py +150 -0
- contractforge_databricks/diagnostics/__init__.py +7 -0
- contractforge_databricks/diagnostics/explain.py +40 -0
- contractforge_databricks/environment.py +53 -0
- contractforge_databricks/evidence/__init__.py +98 -0
- contractforge_databricks/evidence/ddl.py +35 -0
- contractforge_databricks/evidence/governance_log.py +175 -0
- contractforge_databricks/evidence/helpers.py +29 -0
- contractforge_databricks/evidence/ops_log.py +210 -0
- contractforge_databricks/evidence/records.py +27 -0
- contractforge_databricks/evidence/run_log.py +74 -0
- contractforge_databricks/evidence/schemas.py +7 -0
- contractforge_databricks/evidence/sql.py +144 -0
- contractforge_databricks/evidence/tables.py +20 -0
- contractforge_databricks/evidence/writer.py +118 -0
- contractforge_databricks/execution/__init__.py +70 -0
- contractforge_databricks/execution/delta_basic.py +57 -0
- contractforge_databricks/execution/hash_diff.py +126 -0
- contractforge_databricks/execution/hash_diff_latest.py +142 -0
- contractforge_databricks/execution/replace_partitions.py +40 -0
- contractforge_databricks/execution/results.py +5 -0
- contractforge_databricks/execution/retry.py +36 -0
- contractforge_databricks/execution/scd2.py +213 -0
- contractforge_databricks/execution/scd2_deletes.py +65 -0
- contractforge_databricks/execution/scd2_late.py +30 -0
- contractforge_databricks/execution/snapshot.py +77 -0
- contractforge_databricks/execution/sql_merge.py +85 -0
- contractforge_databricks/execution/tables.py +98 -0
- contractforge_databricks/execution/windows.py +58 -0
- contractforge_databricks/governance/__init__.py +30 -0
- contractforge_databricks/governance/access.py +185 -0
- contractforge_databricks/governance/application.py +93 -0
- contractforge_databricks/governance/drift.py +49 -0
- contractforge_databricks/governance/runtime.py +60 -0
- contractforge_databricks/governance/sql.py +31 -0
- contractforge_databricks/governance/validation.py +135 -0
- contractforge_databricks/lakeflow/__init__.py +21 -0
- contractforge_databricks/lakeflow/compatibility.py +194 -0
- contractforge_databricks/lakeflow/rendering.py +175 -0
- contractforge_databricks/lineage/__init__.py +7 -0
- contractforge_databricks/lineage/openlineage.py +182 -0
- contractforge_databricks/maintenance/__init__.py +27 -0
- contractforge_databricks/maintenance/retention.py +90 -0
- contractforge_databricks/maintenance/sql.py +68 -0
- contractforge_databricks/metrics/__init__.py +19 -0
- contractforge_databricks/metrics/history.py +21 -0
- contractforge_databricks/metrics/write.py +63 -0
- contractforge_databricks/operations/__init__.py +4 -0
- contractforge_databricks/operations/application.py +38 -0
- contractforge_databricks/operations/sql.py +95 -0
- contractforge_databricks/parity/__init__.py +18 -0
- contractforge_databricks/parity/catalog.py +59 -0
- contractforge_databricks/parity/models.py +7 -0
- contractforge_databricks/parity/scenarios.py +111 -0
- contractforge_databricks/partitioning/__init__.py +3 -0
- contractforge_databricks/partitioning/predicates.py +28 -0
- contractforge_databricks/preparation/__init__.py +47 -0
- contractforge_databricks/preparation/deduplicate.py +87 -0
- contractforge_databricks/preparation/encoding.py +37 -0
- contractforge_databricks/preparation/hashing.py +18 -0
- contractforge_databricks/preparation/pyspark.py +178 -0
- contractforge_databricks/preparation/pyspark_staging.py +70 -0
- contractforge_databricks/preparation/shape.py +209 -0
- contractforge_databricks/preparation/shape_validation.py +94 -0
- contractforge_databricks/preparation/staging.py +17 -0
- contractforge_databricks/preparation/zip_arrays.py +51 -0
- contractforge_databricks/presets/__init__.py +3 -0
- contractforge_databricks/presets/base.py +24 -0
- contractforge_databricks/presets/bronze.py +57 -0
- contractforge_databricks/presets/catalog.py +22 -0
- contractforge_databricks/presets/core.py +134 -0
- contractforge_databricks/presets/gold.py +62 -0
- contractforge_databricks/presets/modifiers.py +51 -0
- contractforge_databricks/presets/runtime.py +22 -0
- contractforge_databricks/presets/silver.py +101 -0
- contractforge_databricks/presets/write_engine.py +57 -0
- contractforge_databricks/quality/__init__.py +41 -0
- contractforge_databricks/quality/evaluation.py +178 -0
- contractforge_databricks/quality/persistence.py +81 -0
- contractforge_databricks/quality/registry.py +134 -0
- contractforge_databricks/quality/results.py +17 -0
- contractforge_databricks/quality/sql.py +113 -0
- contractforge_databricks/rendering/__init__.py +11 -0
- contractforge_databricks/rendering/bundle.py +93 -0
- contractforge_databricks/rendering/markdown.py +50 -0
- contractforge_databricks/rendering/names.py +56 -0
- contractforge_databricks/results.py +15 -0
- contractforge_databricks/runtime/__init__.py +101 -0
- contractforge_databricks/runtime/available_now.py +147 -0
- contractforge_databricks/runtime/bundles.py +211 -0
- contractforge_databricks/runtime/cache.py +20 -0
- contractforge_databricks/runtime/control_tables.py +19 -0
- contractforge_databricks/runtime/deploy.py +197 -0
- contractforge_databricks/runtime/detection.py +114 -0
- contractforge_databricks/runtime/dry_run.py +46 -0
- contractforge_databricks/runtime/errors.py +54 -0
- contractforge_databricks/runtime/file_selection.py +109 -0
- contractforge_databricks/runtime/finalization.py +168 -0
- contractforge_databricks/runtime/governance.py +37 -0
- contractforge_databricks/runtime/hooks.py +45 -0
- contractforge_databricks/runtime/http_file.py +37 -0
- contractforge_databricks/runtime/http_retry.py +15 -0
- contractforge_databricks/runtime/http_safety.py +9 -0
- contractforge_databricks/runtime/json_materialization.py +97 -0
- contractforge_databricks/runtime/lineage.py +164 -0
- contractforge_databricks/runtime/maintenance.py +43 -0
- contractforge_databricks/runtime/merge_validation.py +98 -0
- contractforge_databricks/runtime/metadata.py +21 -0
- contractforge_databricks/runtime/metrics.py +34 -0
- contractforge_databricks/runtime/models.py +32 -0
- contractforge_databricks/runtime/options.py +33 -0
- contractforge_databricks/runtime/orchestration_context.py +185 -0
- contractforge_databricks/runtime/orchestrator.py +147 -0
- contractforge_databricks/runtime/partitioning.py +93 -0
- contractforge_databricks/runtime/quality_quarantine.py +92 -0
- contractforge_databricks/runtime/rest_api.py +46 -0
- contractforge_databricks/runtime/rest_auth.py +21 -0
- contractforge_databricks/runtime/rest_pagination.py +21 -0
- contractforge_databricks/runtime/run_payload.py +177 -0
- contractforge_databricks/runtime/schema.py +106 -0
- contractforge_databricks/runtime/source_metadata.py +30 -0
- contractforge_databricks/runtime/source_registry.py +43 -0
- contractforge_databricks/runtime/source_schema.py +24 -0
- contractforge_databricks/runtime/sources.py +208 -0
- contractforge_databricks/runtime/spark.py +183 -0
- contractforge_databricks/runtime/spark_defaults.py +35 -0
- contractforge_databricks/runtime/storage_auth.py +132 -0
- contractforge_databricks/runtime/streaming.py +131 -0
- contractforge_databricks/runtime/success.py +104 -0
- contractforge_databricks/runtime/utils.py +52 -0
- contractforge_databricks/runtime/watermark.py +71 -0
- contractforge_databricks/runtime/windows.py +184 -0
- contractforge_databricks/runtime/write.py +66 -0
- contractforge_databricks/runtime/write_flow.py +146 -0
- contractforge_databricks/runtime/write_strategy.py +40 -0
- contractforge_databricks/schema/__init__.py +21 -0
- contractforge_databricks/schema/diff.py +11 -0
- contractforge_databricks/schema/policy.py +33 -0
- contractforge_databricks/schema/sync.py +23 -0
- contractforge_databricks/security/__init__.py +21 -0
- contractforge_databricks/security/errors.py +5 -0
- contractforge_databricks/security/redaction.py +5 -0
- contractforge_databricks/security/secrets.py +114 -0
- contractforge_databricks/security/source_policy.py +17 -0
- contractforge_databricks/shapes/__init__.py +3 -0
- contractforge_databricks/shapes/sql.py +123 -0
- contractforge_databricks/sources/__init__.py +67 -0
- contractforge_databricks/sources/artifacts.py +100 -0
- contractforge_databricks/sources/autoloader.py +48 -0
- contractforge_databricks/sources/bounded_streams.py +44 -0
- contractforge_databricks/sources/classification.py +115 -0
- contractforge_databricks/sources/delta_share.py +21 -0
- contractforge_databricks/sources/files.py +48 -0
- contractforge_databricks/sources/http_file.py +46 -0
- contractforge_databricks/sources/interpret.py +76 -0
- contractforge_databricks/sources/jdbc.py +32 -0
- contractforge_databricks/sources/metadata.py +18 -0
- contractforge_databricks/sources/native_passthrough.py +33 -0
- contractforge_databricks/sources/rds_iam.py +15 -0
- contractforge_databricks/sources/rds_iam_runtime.py +191 -0
- contractforge_databricks/sources/rest_api.py +33 -0
- contractforge_databricks/sources/support.py +50 -0
- contractforge_databricks/sources/table_refs.py +65 -0
- contractforge_databricks/sql/__init__.py +4 -0
- contractforge_databricks/sql/identifiers.py +17 -0
- contractforge_databricks/sql/literals.py +36 -0
- contractforge_databricks/state/__init__.py +39 -0
- contractforge_databricks/state/ddl.py +24 -0
- contractforge_databricks/state/migrations.py +146 -0
- contractforge_databricks/state/queries.py +149 -0
- contractforge_databricks/state/sql.py +116 -0
- contractforge_databricks/state/tables.py +9 -0
- contractforge_databricks/state/writer.py +83 -0
- contractforge_databricks/templates/__init__.py +15 -0
- contractforge_databricks/templates/catalog.py +205 -0
- contractforge_databricks/templates/catalog_parity.py +85 -0
- contractforge_databricks/templates/core.py +83 -0
- contractforge_databricks/templates/enrichment.py +175 -0
- contractforge_databricks/transforms/__init__.py +3 -0
- contractforge_databricks/transforms/sql.py +118 -0
- contractforge_databricks/watermark/__init__.py +6 -0
- contractforge_databricks/watermark/sql.py +91 -0
- contractforge_databricks/write_modes/__init__.py +20 -0
- contractforge_databricks/write_modes/registry.py +44 -0
- contractforge_databricks/write_modes/sql.py +33 -0
- contractforge_databricks/write_modes/strategy.py +192 -0
- contractforge_databricks-0.1.0.dist-info/METADATA +34 -0
- contractforge_databricks-0.1.0.dist-info/RECORD +220 -0
- contractforge_databricks-0.1.0.dist-info/WHEEL +4 -0
- contractforge_databricks-0.1.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""Runtime partition-scope helpers for Databricks writes."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from contractforge_core.runtime import PreparedInput, QueryOne
|
|
8
|
+
from contractforge_core.semantic import SemanticContract
|
|
9
|
+
from contractforge_databricks.contract_extensions import databricks_extensions
|
|
10
|
+
from contractforge_databricks.partitioning import render_partition_in_predicate
|
|
11
|
+
from contractforge_databricks.sql import quote_identifier, quote_table_name
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def target_partition_predicate(
    *,
    contract: SemanticContract,
    prepared: PreparedInput,
    query_one: QueryOne | None,
) -> str | None:
    """Build the ``t.``-prefixed partition filter for a ``delta_by_partition`` merge.

    Returns ``None`` unless the contract writes with ``scd1_upsert`` and the
    Databricks extensions select ``merge_strategy=delta_by_partition``.
    The partition column comes from ``merge_partition_column`` and falls back
    to ``partition_column``; the affected values are resolved by
    ``_partition_values``.

    Raises:
        ValueError: if neither column setting is present, or the chosen column
            is not listed in ``prepared.source_columns``.
    """
    ext = databricks_extensions(contract)
    if not (contract.write.mode == "scd1_upsert" and ext.get("merge_strategy") == "delta_by_partition"):
        return None

    partition_col = str(ext.get("merge_partition_column") or ext.get("partition_column") or "")
    if partition_col == "":
        raise ValueError("merge_strategy=delta_by_partition requires merge_partition_column or partition_column")
    if partition_col not in prepared.source_columns:
        raise ValueError(f"partition column {partition_col!r} is not present in prepared source columns")

    affected = _partition_values(prepared, partition_col, query_one)
    rendered = render_partition_in_predicate(partition_col, affected)
    # The "t." prefix presumably matches the merge target alias -- confirm against the merge renderer.
    return f"t.{rendered}"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def replace_partition_predicate(
    *,
    contract: SemanticContract,
    prepared: PreparedInput,
    query_one: QueryOne | None,
) -> str | None:
    """Build the partition filter for a ``replace_partitions`` merge.

    Returns ``None`` unless the contract writes with ``scd1_upsert`` and the
    Databricks extensions select ``merge_strategy=replace_partitions``.
    The contract is validated first, then the affected values of
    ``merge_partition_column`` are resolved and rendered. Unlike the
    ``delta_by_partition`` variant the predicate is returned without a prefix.

    Raises:
        ValueError: if contract validation fails or the partition column is
            not listed in ``prepared.source_columns``.
    """
    ext = databricks_extensions(contract)
    if not (contract.write.mode == "scd1_upsert" and ext.get("merge_strategy") == "replace_partitions"):
        return None

    # Validation guarantees a non-empty merge_partition_column before it is used below.
    _validate_replace_partitions_contract(contract, ext)

    partition_col = str(ext.get("merge_partition_column") or "")
    if partition_col not in prepared.source_columns:
        raise ValueError(f"merge_partition_column {partition_col!r} is not present in prepared source columns")

    affected = _partition_values(prepared, partition_col, query_one)
    return render_partition_in_predicate(partition_col, affected)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _validate_replace_partitions_contract(contract: SemanticContract, extensions: dict[str, Any]) -> None:
|
|
50
|
+
column = str(extensions.get("merge_partition_column") or "")
|
|
51
|
+
if not column:
|
|
52
|
+
raise ValueError("merge_strategy=replace_partitions requires merge_partition_column")
|
|
53
|
+
partition_column = extensions.get("partition_column")
|
|
54
|
+
if partition_column and partition_column != column:
|
|
55
|
+
raise ValueError("merge_strategy=replace_partitions requires partition_column equal to merge_partition_column")
|
|
56
|
+
if extensions.get("replace_partitions_source_complete") or _source_declares_complete(contract):
|
|
57
|
+
return
|
|
58
|
+
raise ValueError(
|
|
59
|
+
"merge_strategy=replace_partitions requires replace_partitions_source_complete=true "
|
|
60
|
+
"or source.read.source_complete=true/source.read.full_snapshot=true"
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _source_declares_complete(contract: SemanticContract) -> bool:
|
|
65
|
+
source = contract.source.raw or {}
|
|
66
|
+
read = source.get("read") if isinstance(source.get("read"), dict) else {}
|
|
67
|
+
return bool(read.get("source_complete") or read.get("full_snapshot") or source.get("source_complete"))
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _partition_values(prepared: PreparedInput, column: str, query_one: QueryOne | None) -> tuple[Any, ...]:
|
|
71
|
+
if query_one is None:
|
|
72
|
+
metadata_values = (prepared.source_metadata or {}).get("affected_partition_values")
|
|
73
|
+
if metadata_values:
|
|
74
|
+
return tuple(metadata_values)
|
|
75
|
+
raise ValueError("delta_by_partition requires query_one or prepared.source_metadata.affected_partition_values")
|
|
76
|
+
row = query_one(
|
|
77
|
+
f"SELECT collect_set({quote_identifier(column)}) AS partition_values "
|
|
78
|
+
f"FROM {quote_table_name(prepared.source_view)}"
|
|
79
|
+
)
|
|
80
|
+
values = _row_value(row, "partition_values")
|
|
81
|
+
if not values:
|
|
82
|
+
raise ValueError("delta_by_partition could not detect affected partition values")
|
|
83
|
+
return tuple(values)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _row_value(row: Any, key: str) -> Any:
|
|
87
|
+
if row is None:
|
|
88
|
+
return None
|
|
89
|
+
if isinstance(row, dict):
|
|
90
|
+
return row.get(key)
|
|
91
|
+
if hasattr(row, "asDict"):
|
|
92
|
+
return row.asDict().get(key)
|
|
93
|
+
return getattr(row, key, None)
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""Persist row-level Databricks quality quarantine evidence."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import replace
|
|
6
|
+
from importlib import import_module
|
|
7
|
+
from typing import Any
|
|
8
|
+
from uuid import uuid4
|
|
9
|
+
|
|
10
|
+
from contractforge_core.quality import QualityRuleResult
|
|
11
|
+
from contractforge_databricks.evidence.tables import evidence_table_names
|
|
12
|
+
from contractforge_databricks.quality import evaluate_quality
|
|
13
|
+
from contractforge_databricks.runtime.models import DatabricksIngestOptions
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def apply_declared_quality(
    *,
    spark: Any,
    contract: Any,
    prepared: Any,
    opts: DatabricksIngestOptions,
    run_id: str,
    target: str,
    quality_results: tuple[QualityRuleResult, ...],
) -> tuple[Any, tuple[QualityRuleResult, ...]]:
    """Evaluate the contract's quality rules against the prepared source view.

    Nothing is evaluated when results were already supplied or the contract
    declares no ``quality`` rules; the inputs are returned untouched.
    Otherwise the rules run on ``spark.table(prepared.source_view)``,
    quarantined rows are persisted (skipped on dry runs), and the source view
    is replaced by the valid rows only.

    Returns:
        ``(prepared, results)``, where ``prepared`` carries the updated
        ``rows_quarantined`` count once evaluation has taken place.
    """
    if quality_results or not getattr(contract, "quality", ()):
        return prepared, quality_results

    outcome = evaluate_quality(spark.table(prepared.source_view), contract)
    status, evaluated, valid_df, quarantined_df, quarantined_count = outcome
    if status == "NOT_CONFIGURED":
        return prepared, evaluated

    should_persist = quarantined_count > 0 and not opts.dry_run
    if should_persist:
        persist_quality_quarantine_rows(
            quarantined_df,
            run_id=run_id,
            target_table=target,
            quality_results=evaluated,
            catalog=opts.catalog,
            schema=opts.schema,
        )

    # The view is re-pointed even on dry runs so downstream steps see only valid rows.
    valid_df.createOrReplaceTempView(prepared.source_view)
    updated = replace(prepared, rows_quarantined=quarantined_count)
    return updated, evaluated
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def with_run_id(opts: DatabricksIngestOptions) -> DatabricksIngestOptions:
    """Return ``opts`` with a run id, generating ``run-<uuid4>`` when it has none.

    Options that already carry a truthy ``run_id`` are returned as-is;
    otherwise a copy with the generated id is returned.
    """
    if opts.run_id:
        return opts
    generated = f"run-{uuid4()}"
    return replace(opts, run_id=generated)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def persist_quality_quarantine_rows(
    quarantined_df: Any,
    *,
    run_id: str,
    target_table: str,
    quality_results: tuple[QualityRuleResult, ...],
    catalog: str,
    schema: str,
) -> None:
    """Append quarantined row payloads to the quarantine evidence table.

    Only results with a non-zero ``failed_count`` and severity ``"quarantine"``
    count; when there are none the function returns without touching the
    DataFrame. Every written row carries the same combined rule names and
    reason text, plus the original row serialized as JSON.
    """
    quarantine_failures = tuple(
        rule for rule in quality_results if rule.failed_count and rule.severity == "quarantine"
    )
    if not quarantine_failures:
        return

    fn = _functions()
    source_columns = getattr(quarantined_df, "columns", ()) or ()
    payload_columns = [fn.col(name) for name in source_columns]
    reason_text = _reason(quarantine_failures)
    destination = evidence_table_names(catalog, schema)["quarantine"]

    # insertInto resolves columns by position, so this order must match the table layout.
    evidence_rows = quarantined_df.select(
        fn.lit(run_id).alias("run_id"),
        fn.lit(target_table).alias("target_table"),
        fn.lit(_rule_name(quarantine_failures)).alias("rule_name"),
        fn.lit(reason_text).alias("error_reason"),
        fn.to_json(fn.struct(*payload_columns)).alias("record_payload"),
        fn.lit(None).cast("string").alias("record_ref"),
        fn.lit(reason_text).alias("reason"),
        fn.current_timestamp().alias("quarantined_at_utc"),
    )
    evidence_rows.write.mode("append").insertInto(destination)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _rule_name(results: tuple[QualityRuleResult, ...]) -> str:
|
|
83
|
+
return ", ".join(result.rule_name for result in results)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _reason(results: tuple[QualityRuleResult, ...]) -> str:
|
|
87
|
+
parts = [f"{result.rule_name}: {result.message or 'quality rule failed'}" for result in results]
|
|
88
|
+
return "; ".join(parts)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _functions() -> Any:
    """Return the ``pyspark.sql.functions`` module, imported on first use.

    The submodule is imported by its full dotted name. Reading ``.functions``
    off the ``pyspark.sql`` package only works when some other import has
    already loaded the submodule and bound it on the package; importing it
    directly does not depend on that side effect.

    Raises:
        ImportError: if PySpark is not installed.
    """
    # Resolved at call time, presumably so this module imports without PySpark -- confirm.
    return import_module("pyspark.sql.functions")
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Databricks runtime execution for bounded REST API connector sources.
|
|
2
|
+
|
|
3
|
+
The request/pagination/auth/records logic lives in the core REST client; this
|
|
4
|
+
module resolves Databricks secret placeholders, delegates the read to the core,
|
|
5
|
+
and materializes the returned records into a Spark DataFrame.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from contractforge_core.connectors import is_rest_api_connector
|
|
13
|
+
from contractforge_core.connectors.api.rest import read_rest_api_records as _core_read_rest_api_records
|
|
14
|
+
from contractforge_databricks.runtime.json_materialization import materialize_json_records
|
|
15
|
+
from contractforge_databricks.runtime.source_schema import source_declared_schema
|
|
16
|
+
from contractforge_databricks.security import resolve_databricks_secret_placeholders
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def read_rest_api_records(source: dict[str, Any]) -> list[dict[str, Any]]:
    """Read records from a bounded REST source via the core REST client.

    Works on a shallow copy of ``source``. When the copy has a non-``None``
    ``auth`` entry, its Databricks secret placeholders are resolved before the
    read is delegated to the core; the caller's dict is never modified.
    """
    request = dict(source)
    auth = request.get("auth")
    if auth is not None:
        request["auth"] = resolve_databricks_secret_placeholders(auth)
    return _core_read_rest_api_records(request)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def resolve_rest_api_dataframe(spark: Any, source: dict[str, Any]) -> Any:
    """Read a REST API source and materialize its records through Spark.

    The records are fetched with ``read_rest_api_records`` and handed to
    ``materialize_json_records`` together with the source's declared schema
    and the ``json_options`` / ``staging_path`` settings found under
    ``source["read"]``.

    Raises:
        ValueError: if ``source`` is not recognized as a REST API connector.
    """
    if not is_rest_api_connector(source):
        raise ValueError(
            "REST API runtime resolution requires source.type=rest_api or source.type=connector and connector=rest_api"
        )

    fetched = read_rest_api_records(source)
    declared_schema = source_declared_schema(source)
    read_cfg = _dict(source.get("read"))
    json_options = _dict(read_cfg.get("json_options"))
    return materialize_json_records(
        spark,
        fetched,
        schema=declared_schema,
        read_options=json_options,
        staging_path=read_cfg.get("staging_path"),
    )
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _dict(value: object) -> dict[str, Any]:
|
|
46
|
+
return dict(value) if isinstance(value, dict) else {}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Databricks REST auth headers: resolve secrets, then delegate to the core."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from contractforge_core.connectors.api.rest import rest_request_headers as _core_rest_request_headers
|
|
8
|
+
from contractforge_databricks.security import resolve_databricks_secret_placeholders
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def rest_request_headers(
    source: dict[str, Any],
    incremental: dict[str, Any] | None = None,
    watermark: str | None = None,
) -> dict[str, str]:
    """Build REST request headers via the core after resolving Databricks secrets.

    Works on a shallow copy of ``source``; a non-``None`` ``auth`` entry has
    its secret placeholders resolved before the copy, ``incremental`` and
    ``watermark`` are passed to the core header builder.
    """
    request = dict(source)
    auth = request.get("auth")
    if auth is not None:
        request["auth"] = resolve_databricks_secret_placeholders(auth)
    return _core_rest_request_headers(request, incremental, watermark)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Compatibility re-exports for the core REST pagination helpers."""
|
|
2
|
+
|
|
3
|
+
from contractforge_core.connectors.api.rest.pagination import (
|
|
4
|
+
json_path,
|
|
5
|
+
link_header_next,
|
|
6
|
+
max_pages_for_source,
|
|
7
|
+
next_url,
|
|
8
|
+
page_urls,
|
|
9
|
+
pagination_type,
|
|
10
|
+
url_with_params,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"json_path",
|
|
15
|
+
"link_header_next",
|
|
16
|
+
"max_pages_for_source",
|
|
17
|
+
"next_url",
|
|
18
|
+
"page_urls",
|
|
19
|
+
"pagination_type",
|
|
20
|
+
"url_with_params",
|
|
21
|
+
]
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""Databricks run evidence payload assembly."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from contractforge_core.config import CTRL_SCHEMA_VERSION, FRAMEWORK_VERSION
|
|
9
|
+
from contractforge_core.runtime import PreparedInput
|
|
10
|
+
from contractforge_core.semantic import SemanticContract
|
|
11
|
+
from contractforge_databricks.runtime.metadata import contract_metadata
|
|
12
|
+
from contractforge_databricks.runtime.models import DatabricksIngestOptions
|
|
13
|
+
from contractforge_databricks.runtime.write_strategy import write_strategy_evidence
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def run_payload(
    contract: SemanticContract,
    prepared: PreparedInput,
    opts: DatabricksIngestOptions,
    run_id: str,
    target: str,
    status: str,
    started: str,
    finished: str,
    rows_written: int,
    quality_status_value: str,
    operation_metrics: dict[str, Any],
    schema_changes: dict[str, Any],
    governance_results: dict[str, Any],
    write_started_at: str | None,
    write_finished_at: str | None,
    stage_durations: dict[str, float],
    watermark_column: str | None,
    watermark_previous: str | None,
    watermark_current: str | None,
    diagnostics: dict[str, bool],
    error_message: str | None,
    skip_reason: str | None,
    skipped_by_run_id: str | None,
) -> dict[str, Any]:
    """Assemble the flat run-evidence record for one Databricks ingest run.

    The record combines the caller-supplied run outcome (status, timestamps,
    row counts, watermarks, diagnostics) with values read from the contract,
    the prepared input's source metadata, the runtime metadata on ``opts``
    and the resolved write strategy.

    Several values are emitted under more than one key (for example
    ``schema_changes`` / ``schema_changes_json``); both keys reference the
    same Python object. Serialization is presumably done by the evidence
    writer -- confirm there.

    Returns:
        A new dict. Values that cannot be determined are ``None``.
    """
    # Copy the metadata mappings so later reads never touch the caller's objects.
    runtime = dict(opts.runtime_metadata or {})
    source_metadata = dict(prepared.source_metadata or {})
    operations = dict(contract.operations.metadata or {}) if contract.operations and contract.operations.metadata else {}
    annotations_result = governance_results.get("annotations") or {}
    operations_result = governance_results.get("operations") or {}
    row_metrics = _row_metrics(operation_metrics, rows_written)
    # The table version after the write is read from operation_metrics["version"].
    delta_version_after = _metric_int(operation_metrics, "version", default=None)
    write_strategy = write_strategy_evidence(contract, target, runtime)
    return {
        # --- run identity and timing ---
        "run_id": run_id,
        "run_ts_utc": started,
        # NOTE(review): run_date is the wall-clock UTC date at assembly time, not
        # derived from `started`; a run crossing midnight UTC gets the later date.
        "run_date": _date_now(),
        "started_at_utc": started,
        "finished_at_utc": finished,
        "duration_seconds": _duration_seconds(started, finished),
        "target_table": target,
        "runtime_entrypoint": runtime.get("notebook_name"),
        "layer": contract.target.layer,
        "mode": contract.write.mode,
        # --- write engine selection (flattened fields plus the full mapping) ---
        "write_engine_requested": write_strategy["write_engine_requested"],
        "write_engine_selected": write_strategy["write_engine_selected"],
        "write_engine_status": write_strategy["write_engine_status"],
        "write_engine_reason": write_strategy["write_engine_reason"],
        "write_engine_fallback_policy": write_strategy["write_engine_fallback_policy"],
        "write_engine": dict(write_strategy),
        "status": status,
        # --- source description ---
        "source_table": prepared.source_name or prepared.source_view,
        "source_type": source_metadata.get("source_type"),
        "source_connector": source_metadata.get("source_connector"),
        "source_name": source_metadata.get("source_name") or prepared.source_name,
        "source_system": source_metadata.get("source_system") or _source_system(contract),
        "source_provider": source_metadata.get("source_provider"),
        "source_format": source_metadata.get("source_format"),
        "source_path": source_metadata.get("source_path"),
        # For each of these, the "*_redacted" metadata entry wins over the raw one when present.
        "source_options_json": _metadata_value(source_metadata, "source_options", "source_options_redacted"),
        "source_read_json": _metadata_value(source_metadata, "source_read", "source_read_redacted"),
        "source_request_json": _metadata_value(source_metadata, "source_request", "source_request_redacted"),
        "source_auth_json": _metadata_value(source_metadata, "source_auth", "source_auth_redacted"),
        "source_pagination_json": _metadata_value(source_metadata, "source_pagination", "source_pagination_redacted"),
        "source_response_json": _metadata_value(source_metadata, "source_response", "source_response_redacted"),
        "source_incremental_json": _metadata_value(source_metadata, "source_incremental", "source_incremental_redacted"),
        "source_limits_json": _metadata_value(source_metadata, "source_limits", "source_limits_redacted"),
        "source_capabilities_json": source_metadata.get("source_capabilities"),
        "source_metrics_json": source_metadata.get("source_metrics"),
        "source": source_metadata or None,
        # --- row counts ---
        "rows_read": prepared.rows_read,
        "rows_effective": prepared.rows_read - prepared.rows_quarantined,
        "rows_written": rows_written,
        "rows_inserted": row_metrics["rows_inserted"],
        "rows_updated": row_metrics["rows_updated"],
        "rows_deleted": row_metrics["rows_deleted"],
        "rows_expired": row_metrics["rows_expired"],
        "rows_quarantined": prepared.rows_quarantined,
        # --- watermark, quality, schema and metrics ---
        "watermark_column": watermark_column,
        "watermark_previous": watermark_previous,
        "watermark_current": watermark_current,
        "quality_status": quality_status_value,
        "schema_changes": schema_changes,
        "schema_changes_json": schema_changes,
        "stage_durations": stage_durations,
        "stage_durations_json": stage_durations,
        "operation_metrics": operation_metrics,
        "operation_metrics_json": operation_metrics,
        "metrics_json": operation_metrics,
        "schema_policy": contract.write.schema_policy,
        "applied_presets": operations.get("applied_presets"),
        "metrics_source": operation_metrics.get("metrics_source") or "adapter_runtime",
        "table_version_after": None if delta_version_after is None else str(delta_version_after),
        "write_delta_version": delta_version_after,
        "write_committed": status == "SUCCESS" and rows_written >= 0,
        "explain_captured": bool(diagnostics.get("explain_captured")),
        "openlineage_event_emitted": bool(diagnostics.get("openlineage_event_emitted")),
        # --- runtime and framework versions ---
        "runtime_type": runtime.get("runtime_type"),
        "engine_version": runtime.get("spark_version"),
        "python_version": runtime.get("python_version"),
        "framework_version": FRAMEWORK_VERSION,
        "ctrl_schema_version": CTRL_SCHEMA_VERSION,
        # --- contract and governance metadata ---
        "contract_owner": contract.governance.owner if contract.governance else None,
        "contract_domain": contract.target.domain,
        "contract_tags_json": operations.get("tags"),
        "contract_sla": operations.get("sla"),
        "runtime_parameters_json": operations.get("runtime_parameters"),
        "contract_metadata": contract_metadata(contract, operations),
        "annotations_status": annotations_result.get("status"),
        "annotations_result_json": annotations_result or None,
        "operations_json": {"metadata": operations, "record_result": operations_result} if operations or operations_result else None,
        # --- idempotency, orchestration and outcome details ---
        "idempotency_key": opts.idempotency_key,
        "idempotency_policy": opts.idempotency_policy,
        "write_started_at_utc": write_started_at,
        "write_finished_at_utc": write_finished_at,
        "parent_run_id": operations.get("parent_run_id"),
        "run_group_id": operations.get("run_group_id"),
        "master_job_id": operations.get("master_job_id"),
        "master_run_id": operations.get("master_run_id"),
        "skip_reason": skip_reason,
        "skipped_by_run_id": skipped_by_run_id,
        "error_message": error_message,
    }
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _date_now() -> str:
|
|
142
|
+
return datetime.now(timezone.utc).date().isoformat()
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _metadata_value(source_metadata: dict[str, Any], raw_key: str, redacted_key: str) -> Any:
|
|
146
|
+
return source_metadata.get(redacted_key) if redacted_key in source_metadata else source_metadata.get(raw_key)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _source_system(contract: SemanticContract) -> str | None:
|
|
150
|
+
value = (contract.source.raw or {}).get("system")
|
|
151
|
+
return str(value) if value not in (None, "") else None
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _row_metrics(operation_metrics: dict[str, Any], rows_written: int) -> dict[str, int]:
    """Build the per-operation row counters from the writer's operation metrics.

    ``rows_inserted`` falls back to ``rows_written`` when the metric is absent;
    the remaining counters use ``_metric_int``'s own default.
    """
    counters = {
        "rows_inserted": _metric_int(operation_metrics, "rows_inserted", default=rows_written),
    }
    for metric_name in ("rows_updated", "rows_deleted", "rows_expired"):
        counters[metric_name] = _metric_int(operation_metrics, metric_name)
    return counters
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _metric_int(metrics: dict[str, Any], key: str, *, default: int | None = 0) -> int | None:
|
|
164
|
+
value = metrics.get(key, default)
|
|
165
|
+
try:
|
|
166
|
+
return int(value or 0)
|
|
167
|
+
except (TypeError, ValueError):
|
|
168
|
+
return default
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _duration_seconds(started: str, finished: str) -> float | None:
|
|
172
|
+
try:
|
|
173
|
+
started_dt = datetime.strptime(started, "%Y-%m-%d %H:%M:%S")
|
|
174
|
+
finished_dt = datetime.strptime(finished, "%Y-%m-%d %H:%M:%S")
|
|
175
|
+
except ValueError:
|
|
176
|
+
return None
|
|
177
|
+
return (finished_dt - started_dt).total_seconds()
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Runtime schema setup and schema-change evidence for Databricks."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from contractforge_core.runtime import PreparedInput
|
|
8
|
+
from contractforge_core.schema import compare_schema, validate_schema_diff
|
|
9
|
+
from contractforge_core.semantic import SemanticContract
|
|
10
|
+
from contractforge_databricks.contract_extensions import databricks_extensions
|
|
11
|
+
from contractforge_databricks.evidence import EvidenceWriter, render_schema_change_log_insert_sqls
|
|
12
|
+
from contractforge_databricks.execution import SqlRunner, execute_table_setup
|
|
13
|
+
from contractforge_databricks.rendering.names import target_full_name
|
|
14
|
+
from contractforge_databricks.schema import render_add_columns_sql, render_type_widening_sql
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def setup_and_sync_schema(
    *,
    runner: SqlRunner,
    evidence: EvidenceWriter,
    contract: SemanticContract,
    prepared: PreparedInput,
    run_id: str,
    ensure_table: bool,
    target_schema: dict[str, str] | None,
) -> dict[str, Any]:
    """Create the target table if requested, then sync its schema with the source.

    Returns ``{}`` when the prepared input carries no source schema, a
    ``new_or_unknown`` status payload when ``target_schema`` is ``None``, and
    otherwise the schema diff as a dict after the sync SQL has been run and the
    change-log rows have been written.
    """

    if not prepared.source_schema:
        return {}

    table_name = target_full_name(contract)
    if ensure_table:
        execute_table_setup(runner=runner, contract=contract, columns=prepared.source_schema)

    if target_schema is None:
        return {"status": "new_or_unknown", "source_schema": dict(prepared.source_schema)}

    widening_allowed = bool(databricks_extensions(contract).get("allow_type_widening"))
    schema_diff = compare_schema(prepared.source_schema, target_schema, allow_type_widening=widening_allowed)
    # Validate against the contract's schema policy before any DDL is issued.
    validate_schema_diff(schema_diff, contract.write.schema_policy)
    changes = schema_diff.as_dict()
    _apply_schema_sync_sql(runner, table_name, prepared.source_schema, schema_diff)
    # Flag changes as applied only after the sync SQL ran, and before logging them.
    _mark_applied_schema_changes(changes)
    _write_schema_change_logs(evidence, run_id, table_name, changes, prepared.source_schema)
    return changes
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def preview_schema_changes(
    *,
    contract: SemanticContract,
    prepared: PreparedInput,
    target_schema: dict[str, str] | None,
) -> dict[str, Any]:
    """Compute and validate the schema diff without executing any SQL.

    Returns ``{}`` when there is no source schema, a ``new_or_unknown`` status
    payload when ``target_schema`` is ``None``, and otherwise the validated
    diff as a dict.
    """

    if not prepared.source_schema:
        return {}
    if target_schema is None:
        return {"status": "new_or_unknown", "source_schema": dict(prepared.source_schema)}

    widening_allowed = bool(databricks_extensions(contract).get("allow_type_widening"))
    schema_diff = compare_schema(
        prepared.source_schema,
        target_schema,
        allow_type_widening=widening_allowed,
    )
    validate_schema_diff(schema_diff, contract.write.schema_policy)
    return schema_diff.as_dict()
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _apply_schema_sync_sql(runner: SqlRunner, target: str, source_schema: dict[str, str], diff: Any) -> None:
    """Render the add-column and type-widening SQL and run each statement.

    Both scripts are rendered before anything is executed. Each script is split
    on ``;``; blank fragments and fragments starting with ``--`` are skipped.
    """
    rendered_scripts = (
        render_add_columns_sql(target_table=target, source_schema=source_schema, diff=diff),
        render_type_widening_sql(target_table=target, diff=diff),
    )
    for script in rendered_scripts:
        for fragment in script.split(";"):
            statement = fragment.strip()
            if not statement or statement.startswith("--"):
                continue
            runner.sql(statement)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _write_schema_change_logs(
    evidence: EvidenceWriter,
    run_id: str,
    target: str,
    changes: dict[str, Any],
    source_schema: dict[str, str],
) -> None:
    """Render the schema-change log inserts and run them on the evidence runner."""
    insert_statements = render_schema_change_log_insert_sqls(
        run_id=run_id,
        target_table=target,
        schema_changes=changes,
        source_schema=source_schema,
        catalog=evidence.catalog,
        schema=evidence.schema,
    )
    for insert_sql in insert_statements:
        evidence.runner.sql(insert_sql)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _mark_applied_schema_changes(changes: dict[str, Any]) -> None:
|
|
104
|
+
for change in changes.get("type_changes") or ():
|
|
105
|
+
if change.get("allowed"):
|
|
106
|
+
change["applied"] = True
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Databricks runtime source metadata helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from contractforge_core.connectors import source_metadata_from_mapping
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def source_name(source: dict[str, Any]) -> str:
    """Return a display name for a source mapping.

    Uses the first truthy value among ``table``, ``path``, ``url`` and ``type``
    (in that order), falling back to the literal ``"source"``.
    """
    for key in ("table", "path", "url", "type"):
        candidate = source.get(key)
        if candidate:
            return str(candidate)
    return "source"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def source_metadata(source: dict[str, Any]) -> dict[str, Any]:
    """Return the metadata that the core connector helper derives from ``source``."""
    metadata = source_metadata_from_mapping(source)
    return metadata
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def schema_types(df: Any) -> dict[str, str] | None:
    """Map each field name of ``df.schema`` to its ``simpleString()`` type.

    Returns ``None`` when ``df`` has no ``schema``, the schema has no
    ``fields`` attribute, or the field list is empty.
    """
    fields = getattr(getattr(df, "schema", None), "fields", None)
    if not fields:
        return None
    types_by_name: dict[str, str] = {}
    for field in fields:
        types_by_name[str(field.name)] = str(field.dataType.simpleString())
    return types_by_name
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def source_metadata_with_watermark(source: dict[str, Any], watermark_previous: str | None) -> dict[str, Any]:
    """Return the source metadata, adding ``watermark_previous`` when it is not ``None``."""
    enriched = source_metadata(source)
    if watermark_previous is None:
        return enriched
    enriched["watermark_previous"] = watermark_previous
    return enriched
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Databricks runtime source resolver registry."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import Any, Protocol
|
|
7
|
+
|
|
8
|
+
# Accepted source-type names: an ASCII letter followed by letters, digits, "_" or "-".
_SOURCE_NAME_RE = re.compile(r"^[A-Za-z][A-Za-z0-9_-]*$")
|
|
9
|
+
# Module-level registry of custom resolvers, keyed by normalized source-type name.
_CUSTOM_RESOLVERS: dict[str, "DatabricksSourceResolver"] = {}
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class DatabricksSourceResolver(Protocol):
    """Structural interface for objects that can be registered as source resolvers."""

    def resolve(self, spark: Any, source: dict[str, Any]) -> Any:
        """Turn the ``source`` mapping into a Databricks DataFrame-like object."""
        ...
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def register_source_resolver(source_type: str, resolver: DatabricksSourceResolver, *, overwrite: bool = False) -> None:
    """Register ``resolver`` for ``source_type`` in the module-level registry.

    Args:
        source_type: Source-type name; validated and stripped before use.
        resolver: Object exposing a callable ``resolve(spark, source)``.
        overwrite: Replace an existing registration instead of raising.

    Raises:
        ValueError: If the name is invalid, the resolver has no callable
            ``resolve`` attribute, or the name is already registered and
            ``overwrite`` is false.
    """
    normalized = _normalize_source_name(source_type)
    # Require a callable, not just an attribute: a non-callable ``resolve``
    # would otherwise register fine and only fail later when the source is read.
    if not callable(getattr(resolver, "resolve", None)):
        raise ValueError("resolver must implement resolve(spark, source)")
    if normalized in _CUSTOM_RESOLVERS and not overwrite:
        raise ValueError(f"source resolver already registered: {normalized}")
    _CUSTOM_RESOLVERS[normalized] = resolver
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def unregister_source_resolver(source_type: str) -> None:
    """Remove the resolver registered for ``source_type``; do nothing if absent."""
    registry_key = _normalize_source_name(source_type)
    if registry_key in _CUSTOM_RESOLVERS:
        del _CUSTOM_RESOLVERS[registry_key]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def get_source_resolver(source_type: str) -> DatabricksSourceResolver | None:
    """Return the resolver registered for ``source_type``, or ``None`` if there is none."""
    registry_key = _normalize_source_name(source_type)
    return _CUSTOM_RESOLVERS.get(registry_key)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def list_source_resolvers() -> list[str]:
    """Return the registered source-type names in sorted order."""
    names = list(_CUSTOM_RESOLVERS)
    names.sort()
    return names
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _normalize_source_name(source_type: str) -> str:
    """Strip ``source_type`` and validate it against the source-name pattern.

    Raises:
        ValueError: If the stripped name does not match the pattern (a falsy
            input is treated as the empty string and therefore rejected).
    """
    candidate = str(source_type or "").strip()
    if _SOURCE_NAME_RE.match(candidate) is None:
        raise ValueError("source_type must start with a letter and contain only letters, numbers, '_' or '-'")
    return candidate
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Runtime source schema helpers for Databricks readers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def source_declared_schema(source: dict[str, Any]) -> str | None:
    """Return the schema declared under ``source.read`` or ``source.options``.

    Values are stringified and stripped. ``None`` is returned when neither
    section declares a schema.

    Raises:
        ValueError: If a top-level ``source.schema`` is set, if the two
            sections declare different schemas, or if the declared schema is
            blank after stripping.
    """
    if source.get("schema") not in (None, ""):
        raise ValueError("source.schema is not supported; declare source.read.schema")

    declared: list[str] = []
    for section_name in ("read", "options"):
        section = source.get(section_name)
        # A section that is not a mapping contributes no declaration.
        value = section.get("schema") if isinstance(section, dict) else None
        if value not in (None, ""):
            declared.append(str(value).strip())

    if not declared:
        return None
    if len(set(declared)) > 1:
        raise ValueError("source.read.schema conflicts with source.options.schema")
    schema = declared[0]
    if not schema:
        raise ValueError("source.read.schema cannot be empty")
    return schema
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def apply_declared_schema(reader: Any, source: dict[str, Any]) -> Any:
    """Return ``reader.schema(...)`` when the source declares a schema, else ``reader`` unchanged."""
    declared = source_declared_schema(source)
    if not declared:
        return reader
    return reader.schema(declared)
|