contractforge-databricks 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contractforge_databricks/__init__.py +172 -0
- contractforge_databricks/adapter.py +69 -0
- contractforge_databricks/annotations/__init__.py +10 -0
- contractforge_databricks/annotations/application.py +52 -0
- contractforge_databricks/annotations/audit.py +49 -0
- contractforge_databricks/annotations/sql.py +142 -0
- contractforge_databricks/api.py +65 -0
- contractforge_databricks/bundles/__init__.py +9 -0
- contractforge_databricks/bundles/assets.py +47 -0
- contractforge_databricks/bundles/project.py +213 -0
- contractforge_databricks/bundles/project_config.py +133 -0
- contractforge_databricks/capabilities/__init__.py +17 -0
- contractforge_databricks/capabilities/builders.py +43 -0
- contractforge_databricks/capabilities/evaluate.py +162 -0
- contractforge_databricks/capabilities/mapping.py +36 -0
- contractforge_databricks/capabilities/models.py +44 -0
- contractforge_databricks/capabilities/runtime.py +111 -0
- contractforge_databricks/capabilities/uc.py +47 -0
- contractforge_databricks/cli.py +196 -0
- contractforge_databricks/cli_deploy.py +98 -0
- contractforge_databricks/cli_governance.py +142 -0
- contractforge_databricks/cli_io.py +91 -0
- contractforge_databricks/cli_maintenance.py +69 -0
- contractforge_databricks/coercion.py +31 -0
- contractforge_databricks/contract_extensions.py +70 -0
- contractforge_databricks/cost/__init__.py +11 -0
- contractforge_databricks/cost/model.py +22 -0
- contractforge_databricks/cost/report.py +65 -0
- contractforge_databricks/cost/sql.py +136 -0
- contractforge_databricks/dashboards/__init__.py +15 -0
- contractforge_databricks/dashboards/control_tables.py +150 -0
- contractforge_databricks/diagnostics/__init__.py +7 -0
- contractforge_databricks/diagnostics/explain.py +40 -0
- contractforge_databricks/environment.py +53 -0
- contractforge_databricks/evidence/__init__.py +98 -0
- contractforge_databricks/evidence/ddl.py +35 -0
- contractforge_databricks/evidence/governance_log.py +175 -0
- contractforge_databricks/evidence/helpers.py +29 -0
- contractforge_databricks/evidence/ops_log.py +210 -0
- contractforge_databricks/evidence/records.py +27 -0
- contractforge_databricks/evidence/run_log.py +74 -0
- contractforge_databricks/evidence/schemas.py +7 -0
- contractforge_databricks/evidence/sql.py +144 -0
- contractforge_databricks/evidence/tables.py +20 -0
- contractforge_databricks/evidence/writer.py +118 -0
- contractforge_databricks/execution/__init__.py +70 -0
- contractforge_databricks/execution/delta_basic.py +57 -0
- contractforge_databricks/execution/hash_diff.py +126 -0
- contractforge_databricks/execution/hash_diff_latest.py +142 -0
- contractforge_databricks/execution/replace_partitions.py +40 -0
- contractforge_databricks/execution/results.py +5 -0
- contractforge_databricks/execution/retry.py +36 -0
- contractforge_databricks/execution/scd2.py +213 -0
- contractforge_databricks/execution/scd2_deletes.py +65 -0
- contractforge_databricks/execution/scd2_late.py +30 -0
- contractforge_databricks/execution/snapshot.py +77 -0
- contractforge_databricks/execution/sql_merge.py +85 -0
- contractforge_databricks/execution/tables.py +98 -0
- contractforge_databricks/execution/windows.py +58 -0
- contractforge_databricks/governance/__init__.py +30 -0
- contractforge_databricks/governance/access.py +185 -0
- contractforge_databricks/governance/application.py +93 -0
- contractforge_databricks/governance/drift.py +49 -0
- contractforge_databricks/governance/runtime.py +60 -0
- contractforge_databricks/governance/sql.py +31 -0
- contractforge_databricks/governance/validation.py +135 -0
- contractforge_databricks/lakeflow/__init__.py +21 -0
- contractforge_databricks/lakeflow/compatibility.py +194 -0
- contractforge_databricks/lakeflow/rendering.py +175 -0
- contractforge_databricks/lineage/__init__.py +7 -0
- contractforge_databricks/lineage/openlineage.py +182 -0
- contractforge_databricks/maintenance/__init__.py +27 -0
- contractforge_databricks/maintenance/retention.py +90 -0
- contractforge_databricks/maintenance/sql.py +68 -0
- contractforge_databricks/metrics/__init__.py +19 -0
- contractforge_databricks/metrics/history.py +21 -0
- contractforge_databricks/metrics/write.py +63 -0
- contractforge_databricks/operations/__init__.py +4 -0
- contractforge_databricks/operations/application.py +38 -0
- contractforge_databricks/operations/sql.py +95 -0
- contractforge_databricks/parity/__init__.py +18 -0
- contractforge_databricks/parity/catalog.py +59 -0
- contractforge_databricks/parity/models.py +7 -0
- contractforge_databricks/parity/scenarios.py +111 -0
- contractforge_databricks/partitioning/__init__.py +3 -0
- contractforge_databricks/partitioning/predicates.py +28 -0
- contractforge_databricks/preparation/__init__.py +47 -0
- contractforge_databricks/preparation/deduplicate.py +87 -0
- contractforge_databricks/preparation/encoding.py +37 -0
- contractforge_databricks/preparation/hashing.py +18 -0
- contractforge_databricks/preparation/pyspark.py +178 -0
- contractforge_databricks/preparation/pyspark_staging.py +70 -0
- contractforge_databricks/preparation/shape.py +209 -0
- contractforge_databricks/preparation/shape_validation.py +94 -0
- contractforge_databricks/preparation/staging.py +17 -0
- contractforge_databricks/preparation/zip_arrays.py +51 -0
- contractforge_databricks/presets/__init__.py +3 -0
- contractforge_databricks/presets/base.py +24 -0
- contractforge_databricks/presets/bronze.py +57 -0
- contractforge_databricks/presets/catalog.py +22 -0
- contractforge_databricks/presets/core.py +134 -0
- contractforge_databricks/presets/gold.py +62 -0
- contractforge_databricks/presets/modifiers.py +51 -0
- contractforge_databricks/presets/runtime.py +22 -0
- contractforge_databricks/presets/silver.py +101 -0
- contractforge_databricks/presets/write_engine.py +57 -0
- contractforge_databricks/quality/__init__.py +41 -0
- contractforge_databricks/quality/evaluation.py +178 -0
- contractforge_databricks/quality/persistence.py +81 -0
- contractforge_databricks/quality/registry.py +134 -0
- contractforge_databricks/quality/results.py +17 -0
- contractforge_databricks/quality/sql.py +113 -0
- contractforge_databricks/rendering/__init__.py +11 -0
- contractforge_databricks/rendering/bundle.py +93 -0
- contractforge_databricks/rendering/markdown.py +50 -0
- contractforge_databricks/rendering/names.py +56 -0
- contractforge_databricks/results.py +15 -0
- contractforge_databricks/runtime/__init__.py +101 -0
- contractforge_databricks/runtime/available_now.py +147 -0
- contractforge_databricks/runtime/bundles.py +211 -0
- contractforge_databricks/runtime/cache.py +20 -0
- contractforge_databricks/runtime/control_tables.py +19 -0
- contractforge_databricks/runtime/deploy.py +197 -0
- contractforge_databricks/runtime/detection.py +114 -0
- contractforge_databricks/runtime/dry_run.py +46 -0
- contractforge_databricks/runtime/errors.py +54 -0
- contractforge_databricks/runtime/file_selection.py +109 -0
- contractforge_databricks/runtime/finalization.py +168 -0
- contractforge_databricks/runtime/governance.py +37 -0
- contractforge_databricks/runtime/hooks.py +45 -0
- contractforge_databricks/runtime/http_file.py +37 -0
- contractforge_databricks/runtime/http_retry.py +15 -0
- contractforge_databricks/runtime/http_safety.py +9 -0
- contractforge_databricks/runtime/json_materialization.py +97 -0
- contractforge_databricks/runtime/lineage.py +164 -0
- contractforge_databricks/runtime/maintenance.py +43 -0
- contractforge_databricks/runtime/merge_validation.py +98 -0
- contractforge_databricks/runtime/metadata.py +21 -0
- contractforge_databricks/runtime/metrics.py +34 -0
- contractforge_databricks/runtime/models.py +32 -0
- contractforge_databricks/runtime/options.py +33 -0
- contractforge_databricks/runtime/orchestration_context.py +185 -0
- contractforge_databricks/runtime/orchestrator.py +147 -0
- contractforge_databricks/runtime/partitioning.py +93 -0
- contractforge_databricks/runtime/quality_quarantine.py +92 -0
- contractforge_databricks/runtime/rest_api.py +46 -0
- contractforge_databricks/runtime/rest_auth.py +21 -0
- contractforge_databricks/runtime/rest_pagination.py +21 -0
- contractforge_databricks/runtime/run_payload.py +177 -0
- contractforge_databricks/runtime/schema.py +106 -0
- contractforge_databricks/runtime/source_metadata.py +30 -0
- contractforge_databricks/runtime/source_registry.py +43 -0
- contractforge_databricks/runtime/source_schema.py +24 -0
- contractforge_databricks/runtime/sources.py +208 -0
- contractforge_databricks/runtime/spark.py +183 -0
- contractforge_databricks/runtime/spark_defaults.py +35 -0
- contractforge_databricks/runtime/storage_auth.py +132 -0
- contractforge_databricks/runtime/streaming.py +131 -0
- contractforge_databricks/runtime/success.py +104 -0
- contractforge_databricks/runtime/utils.py +52 -0
- contractforge_databricks/runtime/watermark.py +71 -0
- contractforge_databricks/runtime/windows.py +184 -0
- contractforge_databricks/runtime/write.py +66 -0
- contractforge_databricks/runtime/write_flow.py +146 -0
- contractforge_databricks/runtime/write_strategy.py +40 -0
- contractforge_databricks/schema/__init__.py +21 -0
- contractforge_databricks/schema/diff.py +11 -0
- contractforge_databricks/schema/policy.py +33 -0
- contractforge_databricks/schema/sync.py +23 -0
- contractforge_databricks/security/__init__.py +21 -0
- contractforge_databricks/security/errors.py +5 -0
- contractforge_databricks/security/redaction.py +5 -0
- contractforge_databricks/security/secrets.py +114 -0
- contractforge_databricks/security/source_policy.py +17 -0
- contractforge_databricks/shapes/__init__.py +3 -0
- contractforge_databricks/shapes/sql.py +123 -0
- contractforge_databricks/sources/__init__.py +67 -0
- contractforge_databricks/sources/artifacts.py +100 -0
- contractforge_databricks/sources/autoloader.py +48 -0
- contractforge_databricks/sources/bounded_streams.py +44 -0
- contractforge_databricks/sources/classification.py +115 -0
- contractforge_databricks/sources/delta_share.py +21 -0
- contractforge_databricks/sources/files.py +48 -0
- contractforge_databricks/sources/http_file.py +46 -0
- contractforge_databricks/sources/interpret.py +76 -0
- contractforge_databricks/sources/jdbc.py +32 -0
- contractforge_databricks/sources/metadata.py +18 -0
- contractforge_databricks/sources/native_passthrough.py +33 -0
- contractforge_databricks/sources/rds_iam.py +15 -0
- contractforge_databricks/sources/rds_iam_runtime.py +191 -0
- contractforge_databricks/sources/rest_api.py +33 -0
- contractforge_databricks/sources/support.py +50 -0
- contractforge_databricks/sources/table_refs.py +65 -0
- contractforge_databricks/sql/__init__.py +4 -0
- contractforge_databricks/sql/identifiers.py +17 -0
- contractforge_databricks/sql/literals.py +36 -0
- contractforge_databricks/state/__init__.py +39 -0
- contractforge_databricks/state/ddl.py +24 -0
- contractforge_databricks/state/migrations.py +146 -0
- contractforge_databricks/state/queries.py +149 -0
- contractforge_databricks/state/sql.py +116 -0
- contractforge_databricks/state/tables.py +9 -0
- contractforge_databricks/state/writer.py +83 -0
- contractforge_databricks/templates/__init__.py +15 -0
- contractforge_databricks/templates/catalog.py +205 -0
- contractforge_databricks/templates/catalog_parity.py +85 -0
- contractforge_databricks/templates/core.py +83 -0
- contractforge_databricks/templates/enrichment.py +175 -0
- contractforge_databricks/transforms/__init__.py +3 -0
- contractforge_databricks/transforms/sql.py +118 -0
- contractforge_databricks/watermark/__init__.py +6 -0
- contractforge_databricks/watermark/sql.py +91 -0
- contractforge_databricks/write_modes/__init__.py +20 -0
- contractforge_databricks/write_modes/registry.py +44 -0
- contractforge_databricks/write_modes/sql.py +33 -0
- contractforge_databricks/write_modes/strategy.py +192 -0
- contractforge_databricks-0.1.0.dist-info/METADATA +34 -0
- contractforge_databricks-0.1.0.dist-info/RECORD +220 -0
- contractforge_databricks-0.1.0.dist-info/WHEEL +4 -0
- contractforge_databricks-0.1.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Query helpers for the Databricks parity catalog."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from contractforge_core.parity import WriteEngineParityScenario
|
|
8
|
+
from contractforge_databricks.parity.scenarios import WRITE_ENGINE_PARITY_SCENARIOS
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def list_write_engine_parity_scenarios() -> list[str]:
    """Return the ``scenario_id`` of every catalog entry, sorted ascending."""
    scenario_ids = [entry.scenario_id for entry in WRITE_ENGINE_PARITY_SCENARIOS]
    scenario_ids.sort()
    return scenario_ids
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_write_engine_parity_scenario(scenario_id: str) -> WriteEngineParityScenario:
    """Return the catalog entry whose ``scenario_id`` equals the argument.

    The first matching entry in ``WRITE_ENGINE_PARITY_SCENARIOS`` wins.

    Raises:
        ValueError: if no entry matches; the message includes the sorted
            list of valid scenario ids.
    """
    found = next(
        (entry for entry in WRITE_ENGINE_PARITY_SCENARIOS if entry.scenario_id == scenario_id),
        None,
    )
    if found is None:
        known_ids = list_write_engine_parity_scenarios()
        raise ValueError(
            f"Write-engine parity scenario not found: {scenario_id}. "
            f"Valid scenarios: {known_ids}"
        )
    return found
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def scenarios_for_engine(engine: str) -> list[WriteEngineParityScenario]:
    """Return the catalog entries whose ``candidate_engine`` equals ``engine``.

    Catalog order is preserved; the list is empty when nothing matches.
    """
    matches = (entry for entry in WRITE_ENGINE_PARITY_SCENARIOS if entry.candidate_engine == engine)
    return list(matches)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def scenarios_for_mode(mode: str) -> list[WriteEngineParityScenario]:
    """Return the catalog entries whose ``write_mode`` equals ``mode``.

    Catalog order is preserved; the list is empty when nothing matches.
    """
    matches = (entry for entry in WRITE_ENGINE_PARITY_SCENARIOS if entry.write_mode == mode)
    return list(matches)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def build_write_engine_parity_plan(
    *,
    engine: str | None = None,
    mode: str | None = None,
    runtime: str | None = None,
) -> dict[str, Any]:
    """Summarize the catalog entries that pass the optional filters.

    Each filter is applied only when its value is truthy, so ``None`` and
    ``""`` both mean "do not filter on this field".

    Args:
        engine: keep entries whose ``candidate_engine`` equals this value.
        mode: keep entries whose ``write_mode`` equals this value.
        runtime: keep entries whose ``runtime_targets`` contains this value.

    Returns:
        A dict with the fixed ``kind`` marker, the three filter values as
        given, the number of selected entries, a per-``expectation`` count,
        and each selected entry rendered through its ``as_dict()`` method.
    """

    def _passes_filters(entry: Any) -> bool:
        if engine and entry.candidate_engine != engine:
            return False
        if mode and entry.write_mode != mode:
            return False
        if runtime and runtime not in entry.runtime_targets:
            return False
        return True

    selected = [entry for entry in WRITE_ENGINE_PARITY_SCENARIOS if _passes_filters(entry)]

    # Keys appear in order of first occurrence among the selected entries.
    tally: dict[str, int] = {}
    for entry in selected:
        tally.setdefault(entry.expectation, 0)
        tally[entry.expectation] += 1

    return {
        "kind": "write_engine_parity_plan",
        "engine": engine,
        "mode": mode,
        "runtime": runtime,
        "scenario_count": len(selected),
        "expectation_counts": tally,
        "scenarios": [entry.as_dict() for entry in selected],
    }
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"""Compatibility exports for platform-neutral parity catalog models."""
|
|
2
|
+
|
|
3
|
+
from contractforge_core.parity import ParityExpectation, ParityMetricExpectation, WriteEngineParityScenario
|
|
4
|
+
|
|
5
|
+
RuntimeTarget = str
|
|
6
|
+
|
|
7
|
+
__all__ = ["ParityExpectation", "ParityMetricExpectation", "RuntimeTarget", "WriteEngineParityScenario"]
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""Official Databricks write-engine parity scenarios."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from contractforge_core.parity import ParityMetricExpectation, WriteEngineParityScenario
|
|
6
|
+
|
|
7
|
+
# Runtime targets shared by every scenario declared below.
CLASSIC_AND_SERVERLESS = ("databricks_classic", "databricks_serverless")

# Catalog of write-engine parity scenarios. Each entry names a write mode, the
# candidate engine being compared, and the declared expectation
# ("must_match", "unsupported" or "intentional_difference" are the values
# used here).
WRITE_ENGINE_PARITY_SCENARIOS: tuple[WriteEngineParityScenario, ...] = (
    # --- scd1_upsert via databricks_sql_merge (must_match) ---
    WriteEngineParityScenario(
        scenario_id="scd1_sql_merge_insert_update",
        title="SCD1 SQL MERGE preserves insert/update semantics",
        write_mode="scd1_upsert",
        candidate_engine="databricks_sql_merge",
        expectation="must_match",
        runtime_targets=CLASSIC_AND_SERVERLESS,
        required_capabilities=("databricks_runtime", "unity_catalog_table", "sql_merge"),
        required_contract_fields=("merge_keys", "transform.deduplicate"),
        expected_semantics=("one current row per merge key", "changed rows update", "new keys insert"),
        metric_expectations=(
            ParityMetricExpectation("rows_inserted", "must match ContractForge Delta"),
            ParityMetricExpectation("rows_updated", "must match ContractForge Delta"),
        ),
    ),
    WriteEngineParityScenario(
        scenario_id="scd1_sql_merge_duplicate_keys",
        title="SCD1 SQL MERGE handles duplicate keys deterministically",
        write_mode="scd1_upsert",
        candidate_engine="databricks_sql_merge",
        expectation="must_match",
        runtime_targets=CLASSIC_AND_SERVERLESS,
        required_capabilities=("databricks_runtime", "unity_catalog_table", "sql_merge"),
        required_contract_fields=("merge_keys", "transform.deduplicate"),
        expected_semantics=("source row selected per key must match ContractForge deduplication",),
        metric_expectations=(ParityMetricExpectation("rows_affected", "must match after deduplication"),),
    ),
    WriteEngineParityScenario(
        scenario_id="scd1_sql_merge_null_keys",
        title="SCD1 SQL MERGE rejects or quarantines null merge keys consistently",
        write_mode="scd1_upsert",
        candidate_engine="databricks_sql_merge",
        expectation="must_match",
        runtime_targets=CLASSIC_AND_SERVERLESS,
        required_capabilities=("databricks_runtime", "unity_catalog_table", "sql_merge"),
        required_contract_fields=("merge_keys", "quality_rules.not_null"),
        expected_semantics=("null merge-key rows follow declared quality behavior",),
        metric_expectations=(ParityMetricExpectation("rows_quarantined", "must match when quarantine is configured"),),
    ),
    # --- scd2_historical via databricks_lakeflow_auto_cdc (must_match) ---
    WriteEngineParityScenario(
        scenario_id="scd2_auto_cdc_history_lifecycle",
        title="Lakeflow AUTO CDC SCD2 preserves history lifecycle",
        write_mode="scd2_historical",
        candidate_engine="databricks_lakeflow_auto_cdc",
        expectation="must_match",
        runtime_targets=CLASSIC_AND_SERVERLESS,
        required_capabilities=("lakeflow_auto_cdc", "unity_catalog_table"),
        required_contract_fields=("merge_keys", "sequence_by", "scd2_change_columns"),
        expected_semantics=("changed columns expire current row", "new current version is inserted"),
        metric_expectations=(ParityMetricExpectation("rows_expired", "normalized metric must match"),),
    ),
    WriteEngineParityScenario(
        scenario_id="scd2_auto_cdc_late_arriving",
        title="Lakeflow AUTO CDC SCD2 handles late-arriving changes explicitly",
        write_mode="scd2_historical",
        candidate_engine="databricks_lakeflow_auto_cdc",
        expectation="must_match",
        runtime_targets=CLASSIC_AND_SERVERLESS,
        required_capabilities=("lakeflow_auto_cdc", "unity_catalog_table"),
        required_contract_fields=("merge_keys", "sequence_by", "scd2_late_arriving_policy"),
        expected_semantics=("late-arriving records must not corrupt current row",),
        metric_expectations=(ParityMetricExpectation("rows_expired", "must match or document intentional difference"),),
    ),
    WriteEngineParityScenario(
        scenario_id="scd2_auto_cdc_delete_semantics",
        title="Lakeflow AUTO CDC SCD2 delete predicates preserve declared delete behavior",
        write_mode="scd2_historical",
        candidate_engine="databricks_lakeflow_auto_cdc",
        expectation="must_match",
        runtime_targets=CLASSIC_AND_SERVERLESS,
        required_capabilities=("lakeflow_auto_cdc", "unity_catalog_table"),
        required_contract_fields=("merge_keys", "sequence_by", "apply_as_deletes"),
        expected_semantics=("delete predicates must be explicit and reviewed",),
        metric_expectations=(ParityMetricExpectation("rows_deleted", "must match declared delete behavior"),),
    ),
    # --- Declared non-parity cases: these also set blockers_to_record ---
    WriteEngineParityScenario(
        scenario_id="hash_diff_auto_cdc_non_equivalence",
        title="Hash-diff append is not equivalent to Lakeflow SCD type 1",
        write_mode="scd1_hash_diff",
        candidate_engine="databricks_lakeflow_auto_cdc",
        expectation="unsupported",
        runtime_targets=CLASSIC_AND_SERVERLESS,
        required_capabilities=("lakeflow_auto_cdc",),
        required_contract_fields=("hash_keys",),
        expected_semantics=("hash diff appends changed versions; Lakeflow SCD1 stores current state",),
        metric_expectations=(ParityMetricExpectation("rows_inserted", "must not be compared"),),
        blockers_to_record=("scd1_hash_diff is append-only version capture, not Lakeflow SCD type 1.",),
    ),
    WriteEngineParityScenario(
        scenario_id="snapshot_soft_delete_auto_cdc_difference",
        title="Snapshot soft delete differs from AUTO CDC snapshot deletes",
        write_mode="snapshot_soft_delete",
        candidate_engine="databricks_lakeflow_auto_cdc",
        expectation="intentional_difference",
        runtime_targets=CLASSIC_AND_SERVERLESS,
        required_capabilities=("lakeflow_auto_cdc",),
        required_contract_fields=("merge_keys",),
        expected_semantics=("ContractForge marks missing keys inactive; snapshot CDC may delete missing keys",),
        metric_expectations=(ParityMetricExpectation("rows_deleted", "expected to differ"),),
        blockers_to_record=("AUTO CDC snapshot deletes are not inactive-marker soft deletes.",),
    ),
)
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Databricks partition predicate rendering."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Iterable
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from contractforge_core.partitioning import distinct_partition_values
|
|
9
|
+
from contractforge_databricks.sql import quote_identifier, sql_string
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def render_partition_in_predicate(column: str, values: Iterable[Any], *, max_values: int = 1000) -> str:
    """Render a SQL predicate matching rows whose ``column`` is one of ``values``.

    ``values`` is first passed through ``distinct_partition_values`` together
    with ``max_values``. Non-``None`` results become an ``IN (...)`` list of
    ``sql_string`` literals; a ``None`` result adds an ``IS NULL`` test.

    Returns:
        The clauses joined with ``" OR "`` and no surrounding parentheses, or
        an empty string when there are no values at all.
        NOTE(review): callers that combine the result with ``AND`` presumably
        need to parenthesize it themselves - confirm at the call sites.
    """
    candidates = distinct_partition_values(values, max_values=max_values)
    column_sql = quote_identifier(column)
    clauses: list[str] = []

    concrete = [candidate for candidate in candidates if candidate is not None]
    if concrete:
        rendered = ", ".join(map(sql_string, concrete))
        clauses.append(f"{column_sql} IN ({rendered})")

    includes_null = any(candidate is None for candidate in candidates)
    if includes_null:
        clauses.append(f"{column_sql} IS NULL")

    return " OR ".join(clauses)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def render_replace_where(column: str, value: Any) -> str:
    """Render an equality predicate for a single partition value.

    ``None`` renders as ``<column> IS NULL``; any other value renders as
    ``<column> = <literal>`` with the literal produced by ``sql_string``.
    """
    column_sql = quote_identifier(column)
    condition = "IS NULL" if value is None else f"= {sql_string(value)}"
    return f"{column_sql} {condition}"
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from contractforge_core.preparation import HashDiffStageSpec, SCD2StageSpec, SnapshotStageSpec
|
|
2
|
+
from contractforge_databricks.preparation.hashing import (
|
|
3
|
+
HASH_DELIMITER,
|
|
4
|
+
HASH_NULL_SENTINEL,
|
|
5
|
+
ROW_HASH_COLUMN,
|
|
6
|
+
render_row_hash_expression,
|
|
7
|
+
)
|
|
8
|
+
from contractforge_databricks.preparation.encoding import apply_encoding_fix
|
|
9
|
+
from contractforge_databricks.preparation.shape import apply_shape
|
|
10
|
+
from contractforge_databricks.preparation.deduplicate import apply_transform_deduplicate
|
|
11
|
+
from contractforge_databricks.preparation.pyspark import (
|
|
12
|
+
apply_transform,
|
|
13
|
+
apply_contract_preparation,
|
|
14
|
+
apply_transform_cast,
|
|
15
|
+
apply_transform_derive,
|
|
16
|
+
apply_transform_standardize,
|
|
17
|
+
)
|
|
18
|
+
from contractforge_databricks.preparation.pyspark_staging import (
|
|
19
|
+
apply_write_staging,
|
|
20
|
+
prepare_hash_diff_stage,
|
|
21
|
+
prepare_scd2_stage,
|
|
22
|
+
prepare_snapshot_stage,
|
|
23
|
+
with_row_hash,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
__all__ = [
|
|
27
|
+
"HashDiffStageSpec",
|
|
28
|
+
"HASH_DELIMITER",
|
|
29
|
+
"HASH_NULL_SENTINEL",
|
|
30
|
+
"ROW_HASH_COLUMN",
|
|
31
|
+
"SCD2StageSpec",
|
|
32
|
+
"SnapshotStageSpec",
|
|
33
|
+
"apply_encoding_fix",
|
|
34
|
+
"apply_shape",
|
|
35
|
+
"apply_contract_preparation",
|
|
36
|
+
"apply_transform",
|
|
37
|
+
"apply_write_staging",
|
|
38
|
+
"apply_transform_cast",
|
|
39
|
+
"apply_transform_deduplicate",
|
|
40
|
+
"apply_transform_derive",
|
|
41
|
+
"apply_transform_standardize",
|
|
42
|
+
"prepare_hash_diff_stage",
|
|
43
|
+
"prepare_scd2_stage",
|
|
44
|
+
"prepare_snapshot_stage",
|
|
45
|
+
"render_row_hash_expression",
|
|
46
|
+
"with_row_hash",
|
|
47
|
+
]
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""PySpark deduplication helpers for portable transform intent."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def apply_transform_deduplicate(df: Any, deduplicate: object) -> Any:
    """Keep one row per key according to a ``transform.deduplicate`` mapping.

    A non-dict ``deduplicate`` returns ``df`` untouched, before PySpark is
    imported. Otherwise rows are numbered within each ``keys`` partition in
    ``order_by`` order and only the first row of each partition is kept.

    Raises:
        ValueError: if ``keys`` is missing or empty, names a column that is
            not in ``df.columns``, or ``order_by`` yields no ordering.
    """
    if not isinstance(deduplicate, dict):
        return df
    # Imported lazily so the module can be loaded without PySpark installed.
    from pyspark.sql import Window, functions as F

    raw_keys = deduplicate.get("keys")
    if isinstance(raw_keys, str):
        # A bare string is treated as one column name, not a comma list.
        partition_columns = [str(raw_keys)]
    else:
        partition_columns = [str(item) for item in raw_keys or ()]
    if not partition_columns:
        raise ValueError("transform.deduplicate.keys is required")
    _validate_columns(df, dict.fromkeys(partition_columns, True), "transform.deduplicate.keys")

    partitioned = Window.partitionBy(*partition_columns)
    ranking = partitioned.orderBy(*_deduplicate_order_columns(deduplicate.get("order_by"), F))

    # Temporary helper column; it is dropped again before returning.
    marker = "__cf_row_number"
    ranked = df.withColumn(marker, F.row_number().over(ranking))
    return ranked.filter(F.col(marker) == 1).drop(marker)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _deduplicate_order_columns(order_by: object, functions: Any) -> list[Any]:
|
|
26
|
+
if isinstance(order_by, str):
|
|
27
|
+
return _deduplicate_order_columns_from_string(order_by, functions)
|
|
28
|
+
order_columns = []
|
|
29
|
+
for item in order_by or ():
|
|
30
|
+
if not isinstance(item, dict):
|
|
31
|
+
continue
|
|
32
|
+
order_columns.append(
|
|
33
|
+
_deduplicate_order_column(
|
|
34
|
+
functions.col(str(item["column"])),
|
|
35
|
+
direction=str(item.get("direction", "desc")).lower(),
|
|
36
|
+
nulls=str(item.get("nulls") or "").lower(),
|
|
37
|
+
)
|
|
38
|
+
)
|
|
39
|
+
if not order_columns:
|
|
40
|
+
raise ValueError("transform.deduplicate.order_by is required")
|
|
41
|
+
return order_columns
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _deduplicate_order_columns_from_string(order_by: str, functions: Any) -> list[Any]:
|
|
45
|
+
order_columns = []
|
|
46
|
+
for clause in (item.strip() for item in order_by.split(",")):
|
|
47
|
+
if not clause:
|
|
48
|
+
continue
|
|
49
|
+
parsed = re.match(
|
|
50
|
+
r"^`?(?P<column>[A-Za-z_][A-Za-z0-9_]*)`?(?:\s+(?P<direction>ASC|DESC))?(?:\s+NULLS\s+(?P<nulls>FIRST|LAST))?$",
|
|
51
|
+
clause,
|
|
52
|
+
flags=re.IGNORECASE,
|
|
53
|
+
)
|
|
54
|
+
if parsed is None:
|
|
55
|
+
order_columns.append(functions.expr(clause))
|
|
56
|
+
continue
|
|
57
|
+
order_columns.append(
|
|
58
|
+
_deduplicate_order_column(
|
|
59
|
+
functions.col(parsed.group("column")),
|
|
60
|
+
direction=(parsed.group("direction") or "desc").lower(),
|
|
61
|
+
nulls=(parsed.group("nulls") or "").lower(),
|
|
62
|
+
)
|
|
63
|
+
)
|
|
64
|
+
if not order_columns:
|
|
65
|
+
raise ValueError("transform.deduplicate.order_by is required")
|
|
66
|
+
return order_columns
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _deduplicate_order_column(column: Any, *, direction: str, nulls: str) -> Any:
|
|
70
|
+
if direction == "asc" and nulls == "first":
|
|
71
|
+
return column.asc_nulls_first()
|
|
72
|
+
if direction == "asc" and nulls == "last":
|
|
73
|
+
return column.asc_nulls_last()
|
|
74
|
+
if direction == "asc":
|
|
75
|
+
return column.asc()
|
|
76
|
+
if nulls == "first":
|
|
77
|
+
return column.desc_nulls_first()
|
|
78
|
+
if nulls == "last":
|
|
79
|
+
return column.desc_nulls_last()
|
|
80
|
+
return column.desc()
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _validate_columns(df: Any, columns: dict[str, Any], context: str) -> None:
|
|
84
|
+
available = set(getattr(df, "columns", ()) or ())
|
|
85
|
+
missing = sorted(str(column) for column in columns if str(column) not in available)
|
|
86
|
+
if missing:
|
|
87
|
+
raise ValueError(f"{context} references missing columns: {missing}")
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Databricks preparation helpers for adapter-owned encoding fixes."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import importlib
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from contractforge_databricks.contract_extensions import databricks_extensions
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def apply_encoding_fix(df: Any, contract: Any) -> Any:
    """Re-decode string columns when the contract's Databricks extensions ask for it.

    Nothing happens unless the ``fix_encoding`` extension is truthy. The
    target columns come from ``encoding_columns`` (a list or comma-separated
    string), falling back to every string-typed column of ``df``. Requested
    columns that are not string-typed in ``df`` are skipped. Each target is
    cast to binary and decoded with the ``encoding`` extension
    (default ``"utf-8"``).
    """
    settings = databricks_extensions(contract)
    if not settings.get("fix_encoding"):
        return df

    # Resolved dynamically so importing this module does not require PySpark.
    spark_functions = importlib.import_module("pyspark.sql").functions
    target_encoding = str(settings.get("encoding") or "utf-8")
    text_columns = _string_columns(df)
    requested = _string_tuple(settings.get("encoding_columns"))
    candidates = requested if requested else text_columns

    for name in candidates:
        if name not in text_columns:
            continue
        decoded = spark_functions.decode(spark_functions.col(name).cast("binary"), target_encoding)
        df = df.withColumn(name, decoded)
    return df
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _string_columns(df: Any) -> tuple[str, ...]:
|
|
25
|
+
return tuple(
|
|
26
|
+
field.name
|
|
27
|
+
for field in getattr(getattr(df, "schema", None), "fields", ()) or ()
|
|
28
|
+
if field.dataType.typeName() == "string"
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _string_tuple(value: object) -> tuple[str, ...]:
|
|
33
|
+
if value is None:
|
|
34
|
+
return ()
|
|
35
|
+
if isinstance(value, str):
|
|
36
|
+
return tuple(part.strip() for part in value.split(",") if part.strip())
|
|
37
|
+
return tuple(str(part) for part in value or ())
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Hash expression helpers for Databricks staging."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from contractforge_core.preparation import HASH_DELIMITER, HASH_NULL_SENTINEL
|
|
6
|
+
from contractforge_databricks.sql import quote_identifier
|
|
7
|
+
|
|
8
|
+
ROW_HASH_COLUMN = "row_hash"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def render_row_hash_expression(columns: tuple[str, ...], *, exclude: tuple[str, ...] = ()) -> str:
    """Render a SQL expression that hashes the given columns into one value.

    Each included column is cast to string, with NULL replaced by
    ``HASH_NULL_SENTINEL``. The values are joined with ``HASH_DELIMITER``
    via ``concat_ws`` and hashed with ``sha2(..., 256)``.

    Args:
        columns: candidate column names, hashed in the given order.
        exclude: column names to leave out of the hash.

    Raises:
        ValueError: if no column remains after applying ``exclude``.
    """
    # Built once instead of once per column inside the filter.
    excluded = frozenset(exclude)
    included = tuple(column for column in columns if column not in excluded)
    if not included:
        raise ValueError("row hash requires at least one included column")
    payload = ", ".join(
        f"coalesce(cast({quote_identifier(column)} as string), '{HASH_NULL_SENTINEL}')" for column in included
    )
    return f"sha2(concat_ws('{HASH_DELIMITER}', {payload}), 256)"
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
"""Optional PySpark staging helpers.
|
|
2
|
+
|
|
3
|
+
Imports stay inside functions so the package can be imported without PySpark.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from contractforge_core.config import CONTROL_COLUMNS
|
|
11
|
+
from contractforge_core.semantic import SemanticContract
|
|
12
|
+
from contractforge_databricks.preparation.deduplicate import apply_transform_deduplicate
|
|
13
|
+
from contractforge_databricks.preparation.encoding import apply_encoding_fix
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def create_or_replace_temp_view(df: Any, name: str) -> str:
    """Register ``df`` as a temporary view called ``name`` and return that name.

    Raises:
        ValueError: if ``name`` is empty or consists only of whitespace.
    """
    has_content = bool(name and name.strip())
    if not has_content:
        raise ValueError("temp view name must not be empty")
    df.createOrReplaceTempView(name)
    return name
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def apply_transform(df: Any, transform: dict[str, Any] | None) -> Any:
    """Apply portable transform intent with PySpark DataFrame operations.

    The sections of ``transform`` are applied in a fixed order: ``cast``,
    ``standardize``, ``derive``, ``composite_keys`` and finally ``deduplicate``.
    A ``None`` or empty ``transform`` returns ``df`` unchanged.
    """
    if not transform:
        return df
    # Each of these steps takes its section coerced to a dict.
    dict_steps = (
        (apply_transform_cast, "cast"),
        (apply_transform_standardize, "standardize"),
        (apply_transform_derive, "derive"),
        (apply_transform_composite_keys, "composite_keys"),
    )
    for step, section in dict_steps:
        df = step(df, _dict(transform.get(section)))
    # Deduplication receives its section as-is, without dict coercion.
    return apply_transform_deduplicate(df, transform.get("deduplicate"))
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def apply_contract_preparation(
    df: Any,
    contract: SemanticContract,
    *,
    watermark_column: str | None = None,
    watermark_previous: str | None = None,
) -> Any:
    """Apply portable pre-write preparation declared in the core contract.

    Steps run in this order:

    1. project to ``select_columns`` from the operations metadata (validated first);
    2. rename columns per ``column_mapping`` from the operations metadata (validated first);
    3. apply ``contract.shape`` when present;
    4. transform ``cast``, ``standardize`` and ``derive``;
    5. filter rows with the metadata ``filter_expression`` when present;
    6. transform ``composite_keys``;
    7. apply the watermark filter built from ``watermark_column`` / ``watermark_previous``;
    8. transform ``deduplicate``;
    9. apply the encoding fix.
    """
    meta = _contract_metadata(contract)

    projection = _string_list(meta.get("select_columns"))
    if projection:
        _validate_columns(df, dict.fromkeys(projection, True), "select_columns")
        df = df.select(*projection)

    renames = _dict(meta.get("column_mapping"))
    if renames:
        _validate_column_mapping(df, renames)
        for old_name, new_name in renames.items():
            df = df.withColumnRenamed(str(old_name), str(new_name))

    if contract.shape:
        # Imported lazily, only when the contract declares a shape.
        from contractforge_databricks.preparation.shape import apply_shape

        df = apply_shape(df, contract.shape.raw, layer=contract.target.layer)

    transform = contract.transform.raw if contract.transform else {}
    df = apply_transform_cast(df, _dict(transform.get("cast")))
    df = apply_transform_standardize(df, _dict(transform.get("standardize")))
    df = apply_transform_derive(df, _dict(transform.get("derive")))

    row_filter = meta.get("filter_expression")
    if row_filter:
        # PySpark is imported lazily so the module stays importable without it.
        from pyspark.sql import functions as F

        predicate = F.expr(str(row_filter))
        df = df.where(predicate)

    df = apply_transform_composite_keys(df, _dict(transform.get("composite_keys")))
    df = _apply_watermark_filter(df, watermark_column, watermark_previous)
    df = apply_transform_deduplicate(df, transform.get("deduplicate"))
    return apply_encoding_fix(df, contract)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def apply_transform_cast(df: Any, casts: dict[str, Any]) -> Any:
    """Cast each column named in ``casts`` to the type string it maps to.

    An empty ``casts`` returns ``df`` unchanged without importing PySpark. Otherwise
    the referenced columns are checked with ``_validate_columns`` (context
    ``"transform.cast"``) before any cast is applied.
    """
    if not casts:
        return df
    from pyspark.sql import functions as F

    _validate_columns(df, casts, "transform.cast")
    result = df
    for raw_name, raw_type in casts.items():
        target = str(raw_name)
        result = result.withColumn(target, F.col(target).cast(str(raw_type)))
    return result
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def apply_transform_derive(df: Any, expressions: dict[str, Any]) -> Any:
    """Add or replace one column per entry of ``expressions``.

    Each key is the output column name and each value is passed, as a string, to
    ``F.expr``. An empty ``expressions`` returns ``df`` unchanged without importing
    PySpark. No column validation is performed here.
    """
    if not expressions:
        return df
    from pyspark.sql import functions as F

    result = df
    for raw_name, raw_expression in expressions.items():
        derived = F.expr(str(raw_expression))
        result = result.withColumn(str(raw_name), derived)
    return result
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def apply_transform_composite_keys(df: Any, composite_keys: dict[str, Any]) -> Any:
    """Build one concatenated key column per entry of ``composite_keys``.

    Each value is either a single column name or an iterable of column names. The
    source columns are validated, cast to string with nulls replaced by ``""``, and
    joined with ``"|"`` into a column named after the entry's key. An empty
    ``composite_keys`` returns ``df`` unchanged without importing PySpark.
    """
    if not composite_keys:
        return df
    from pyspark.sql import functions as F

    for key_name, source_columns in composite_keys.items():
        if isinstance(source_columns, str):
            raw_columns = [source_columns]
        else:
            raw_columns = list(source_columns or ())
        names = [str(raw_column) for raw_column in raw_columns]
        _validate_columns(df, dict.fromkeys(names, True), f"transform.composite_keys.{key_name}")
        pieces = []
        for name in names:
            pieces.append(F.coalesce(F.col(name).cast("string"), F.lit("")))
        df = df.withColumn(str(key_name), F.concat_ws("|", *pieces))
    return df
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def apply_transform_standardize(df: Any, standardize: dict[str, Any]) -> Any:
    """Apply per-column text clean-up options.

    ``standardize`` maps a column name to an options mapping. The truthy options are
    applied in this order: ``normalize_whitespace`` (runs of whitespace become one
    space), ``trim``, ``lower``, ``upper``, ``empty_as_null`` (``""`` becomes null).
    The columns are validated with ``_validate_columns`` first. An empty
    ``standardize`` returns ``df`` unchanged without importing PySpark.
    """
    if not standardize:
        return df
    from pyspark.sql import functions as F

    _validate_columns(df, standardize, "transform.standardize")
    # Flag-driven unary functions, in the order they must be applied.
    simple_steps = (("trim", F.trim), ("lower", F.lower), ("upper", F.upper))
    for raw_name, options in standardize.items():
        name = str(raw_name)
        value = F.col(name)
        if options.get("normalize_whitespace"):
            value = F.regexp_replace(value, r"\s+", " ")
        for flag, function in simple_steps:
            if options.get(flag):
                value = function(value)
        if options.get("empty_as_null"):
            value = F.when(value == "", F.lit(None)).otherwise(value)
        df = df.withColumn(name, value)
    return df
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _validate_columns(df: Any, columns: dict[str, Any], context: str) -> None:
    """Ensure every key of ``columns`` names a column present in ``df.columns``.

    Only the keys of ``columns`` are used; they are compared as strings. A ``df``
    without a ``columns`` attribute is treated as having no columns.

    Raises:
        ValueError: listing the missing names in sorted order, prefixed by ``context``.
    """
    present = set(getattr(df, "columns", ()) or ())
    requested = map(str, columns)
    missing = sorted(name for name in requested if name not in present)
    if not missing:
        return
    raise ValueError(f"{context} references missing columns: {missing}")
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _validate_column_mapping(df: Any, mapping: dict[str, Any]) -> None:
    """Validate a source-to-target rename mapping against ``df`` before it is applied.

    Checks run in this order, each raising ``ValueError`` on failure:

    1. every source column exists in ``df`` (delegated to ``_validate_columns``);
    2. no target name is produced by more than one source;
    3. no target name is a member of ``CONTROL_COLUMNS``;
    4. no target name equals an existing column of ``df`` other than its own source.

    Source and target names are compared as strings.
    """
    _validate_columns(df, mapping, "column_mapping")
    existing = set(getattr(df, "columns", ()) or ())
    pairs = [(str(source), str(target)) for source, target in mapping.items()]
    targets = [target for _, target in pairs]

    # Single-pass duplicate detection; avoids calling list.count() once per target.
    seen: set[str] = set()
    repeated: set[str] = set()
    for target in targets:
        if target in seen:
            repeated.add(target)
        seen.add(target)
    duplicates = sorted(repeated)
    if duplicates:
        raise ValueError(f"column_mapping has duplicate targets: {duplicates}")

    reserved_targets = sorted(set(targets) & CONTROL_COLUMNS)
    if reserved_targets:
        raise ValueError(f"column_mapping cannot produce reserved control columns: {reserved_targets}")

    # Renaming a column to its own name is harmless, so identity pairs are not collisions.
    collisions = sorted(target for source, target in pairs if target in existing and target != source)
    if collisions:
        raise ValueError(f"column_mapping would collide with existing columns: {collisions}")
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _apply_watermark_filter(df: Any, watermark_column: str | None, watermark_value: str | None) -> Any:
    """Filter ``df`` with the predicate rendered for the given watermark.

    ``watermark_column`` may hold several column names separated by ``"|"``; empty
    segments are ignored. When either argument is missing or empty, ``df`` is
    returned unchanged. The named columns are validated with ``_validate_columns``
    (context ``"watermark_columns"``) before the filter is applied.
    """
    if not (watermark_column and watermark_value):
        return df
    from contractforge_databricks.watermark import render_watermark_filter_predicate

    names = tuple(filter(None, watermark_column.split("|")))
    _validate_columns(df, dict.fromkeys(names, True), "watermark_columns")
    predicate = render_watermark_filter_predicate(columns=names, watermark_value=watermark_value)
    return df.where(predicate)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _dict(value: object) -> dict[str, Any]:
    """Return a shallow copy of ``value`` when it is a ``dict``; otherwise an empty dict."""
    if not isinstance(value, dict):
        return {}
    return dict(value)
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _contract_metadata(contract: SemanticContract) -> dict[str, Any]:
    """Return a shallow copy of ``contract.operations.metadata``.

    An empty dict is returned when ``contract.operations`` is falsy or its
    ``metadata`` is falsy.
    """
    operations = contract.operations
    if not operations:
        return {}
    metadata = operations.metadata
    if not metadata:
        return {}
    return dict(metadata)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _string_list(value: object) -> list[str]:
    """Coerce ``value`` into a list of strings.

    A string is split on commas, with each piece stripped and blank pieces dropped.
    ``None`` and any other falsy value give an empty list. Every other value is
    iterated and each item is passed through ``str``.
    """
    if isinstance(value, str):
        stripped = (piece.strip() for piece in value.split(","))
        return [piece for piece in stripped if piece]
    if not value:
        return []
    return list(map(str, value))
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""PySpark write-mode staging helpers with lazy imports."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from contractforge_core.preparation import SCD2StageSpec, SnapshotStageSpec, resolved_hash_exclude_columns
|
|
8
|
+
from contractforge_core.preparation import scd2_stage_spec_from_contract, snapshot_stage_spec_from_contract
|
|
9
|
+
from contractforge_core.semantic import SemanticContract
|
|
10
|
+
from contractforge_databricks.preparation.hashing import HASH_DELIMITER, HASH_NULL_SENTINEL, ROW_HASH_COLUMN
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def with_row_hash(df: Any, columns: tuple[str, ...], *, exclude: tuple[str, ...] = ()) -> Any:
    """Return ``df`` with a SHA-256 hash column named ``ROW_HASH_COLUMN`` added.

    The hash covers ``columns`` minus ``exclude``, in the given order. Each value is
    cast to string, nulls are replaced with ``HASH_NULL_SENTINEL``, and the pieces
    are joined with ``HASH_DELIMITER`` before hashing.

    Args:
        df: PySpark DataFrame (or compatible object exposing ``withColumn``).
        columns: Candidate column names to hash.
        exclude: Column names to leave out of the hash.

    Raises:
        ValueError: if no column remains after applying ``exclude``.
    """
    # Build the lookup set once instead of once per candidate column.
    excluded = set(exclude)
    included = tuple(column for column in columns if column not in excluded)
    if not included:
        raise ValueError("row hash requires at least one included column")

    # Imported only after validation so bad arguments are reported even without PySpark.
    from pyspark.sql import functions as F

    payload = [F.coalesce(F.col(column).cast("string"), F.lit(HASH_NULL_SENTINEL)) for column in included]
    return df.withColumn(ROW_HASH_COLUMN, F.sha2(F.concat_ws(HASH_DELIMITER, *payload), 256))
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def prepare_snapshot_stage(df: Any, spec: SnapshotStageSpec) -> Any:
    """Stage ``df`` for a snapshot write with soft-delete control columns.

    Adds the row hash over the non-control source columns, then sets
    ``spec.is_active_column`` to ``True`` and ``spec.deleted_at_column`` to a null
    timestamp.

    Control columns are kept out of the hash. Besides the default names
    (``is_active``, ``deleted_at``, ``row_hash``), the names configured on ``spec``
    are excluded too: this function overwrites those columns after hashing, so any
    incoming values in them must not influence the hash.
    """
    from pyspark.sql import functions as F

    control_columns = {
        "is_active",
        "deleted_at",
        "row_hash",
        ROW_HASH_COLUMN,
        spec.is_active_column,
        spec.deleted_at_column,
    }
    source_columns = tuple(column for column in spec.source_columns if column not in control_columns)
    staged = with_row_hash(df, source_columns)
    return staged.withColumn(spec.is_active_column, F.lit(True)).withColumn(
        spec.deleted_at_column,
        F.lit(None).cast("timestamp"),
    )
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def prepare_scd2_stage(df: Any, spec: SCD2StageSpec) -> Any:
    """Stage ``df`` for an SCD2 write.

    Adds the row hash over ``spec.change_columns`` and these columns:

    - ``valid_from``: ``spec.effective_from_column`` cast to timestamp when set,
      otherwise the current timestamp;
    - ``valid_to``: null timestamp;
    - ``is_current``: ``True``;
    - ``changed_columns``: null string;
    - ``__merge_key_<key>``: a null literal for every key in ``spec.merge_keys``.
    """
    from pyspark.sql import functions as F

    hashed = with_row_hash(df, spec.change_columns)
    if spec.effective_from_column:
        valid_from = F.col(spec.effective_from_column).cast("timestamp")
    else:
        valid_from = F.current_timestamp()
    staged = (
        hashed.withColumn("valid_from", valid_from)
        .withColumn("valid_to", F.lit(None).cast("timestamp"))
        .withColumn("is_current", F.lit(True))
        .withColumn("changed_columns", F.lit(None).cast("string"))
    )
    for merge_key in spec.merge_keys:
        staged = staged.withColumn(f"__merge_key_{merge_key}", F.lit(None))
    return staged
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def prepare_hash_diff_stage(df: Any, contract: SemanticContract) -> Any:
    """Add the row hash needed by the ``scd1_hash_diff`` write mode.

    With ``hash_strategy == "all_columns_except"`` the hash candidates are all of
    ``df.columns``; otherwise they are ``contract.write.hash_keys``. In both cases
    the columns returned by ``resolved_hash_exclude_columns(contract)`` are excluded.

    Raises:
        ValueError: if ``contract.write.mode`` is not ``"scd1_hash_diff"``.
    """
    write = contract.write
    if write.mode != "scd1_hash_diff":
        raise ValueError("Hash-diff staging requires mode=scd1_hash_diff")
    frame_columns = tuple(map(str, getattr(df, "columns", ()) or ()))
    if write.hash_strategy == "all_columns_except":
        hash_columns = frame_columns
    else:
        hash_columns = write.hash_keys
    excluded = resolved_hash_exclude_columns(contract)
    return with_row_hash(df, hash_columns, exclude=excluded)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def apply_write_staging(df: Any, contract: SemanticContract) -> Any:
    """Dispatch ``df`` to the staging helper that matches ``contract.write.mode``.

    - ``scd1_hash_diff`` goes to ``prepare_hash_diff_stage``;
    - ``scd2_historical`` goes to ``prepare_scd2_stage``;
    - ``snapshot_soft_delete`` goes to ``prepare_snapshot_stage``.

    Any other mode returns ``df`` unchanged. The stage specs are built from the
    contract with the stringified ``df.columns`` as ``source_columns``.
    """
    source_columns = tuple(map(str, getattr(df, "columns", ()) or ()))
    mode = contract.write.mode
    if mode == "scd1_hash_diff":
        return prepare_hash_diff_stage(df, contract)
    if mode == "scd2_historical":
        scd2_spec = scd2_stage_spec_from_contract(contract, source_columns=source_columns)
        return prepare_scd2_stage(df, scd2_spec)
    if mode == "snapshot_soft_delete":
        snapshot_spec = snapshot_stage_spec_from_contract(contract, source_columns=source_columns)
        return prepare_snapshot_stage(df, snapshot_spec)
    return df
|