contractforge-databricks 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. contractforge_databricks/__init__.py +172 -0
  2. contractforge_databricks/adapter.py +69 -0
  3. contractforge_databricks/annotations/__init__.py +10 -0
  4. contractforge_databricks/annotations/application.py +52 -0
  5. contractforge_databricks/annotations/audit.py +49 -0
  6. contractforge_databricks/annotations/sql.py +142 -0
  7. contractforge_databricks/api.py +65 -0
  8. contractforge_databricks/bundles/__init__.py +9 -0
  9. contractforge_databricks/bundles/assets.py +47 -0
  10. contractforge_databricks/bundles/project.py +213 -0
  11. contractforge_databricks/bundles/project_config.py +133 -0
  12. contractforge_databricks/capabilities/__init__.py +17 -0
  13. contractforge_databricks/capabilities/builders.py +43 -0
  14. contractforge_databricks/capabilities/evaluate.py +162 -0
  15. contractforge_databricks/capabilities/mapping.py +36 -0
  16. contractforge_databricks/capabilities/models.py +44 -0
  17. contractforge_databricks/capabilities/runtime.py +111 -0
  18. contractforge_databricks/capabilities/uc.py +47 -0
  19. contractforge_databricks/cli.py +196 -0
  20. contractforge_databricks/cli_deploy.py +98 -0
  21. contractforge_databricks/cli_governance.py +142 -0
  22. contractforge_databricks/cli_io.py +91 -0
  23. contractforge_databricks/cli_maintenance.py +69 -0
  24. contractforge_databricks/coercion.py +31 -0
  25. contractforge_databricks/contract_extensions.py +70 -0
  26. contractforge_databricks/cost/__init__.py +11 -0
  27. contractforge_databricks/cost/model.py +22 -0
  28. contractforge_databricks/cost/report.py +65 -0
  29. contractforge_databricks/cost/sql.py +136 -0
  30. contractforge_databricks/dashboards/__init__.py +15 -0
  31. contractforge_databricks/dashboards/control_tables.py +150 -0
  32. contractforge_databricks/diagnostics/__init__.py +7 -0
  33. contractforge_databricks/diagnostics/explain.py +40 -0
  34. contractforge_databricks/environment.py +53 -0
  35. contractforge_databricks/evidence/__init__.py +98 -0
  36. contractforge_databricks/evidence/ddl.py +35 -0
  37. contractforge_databricks/evidence/governance_log.py +175 -0
  38. contractforge_databricks/evidence/helpers.py +29 -0
  39. contractforge_databricks/evidence/ops_log.py +210 -0
  40. contractforge_databricks/evidence/records.py +27 -0
  41. contractforge_databricks/evidence/run_log.py +74 -0
  42. contractforge_databricks/evidence/schemas.py +7 -0
  43. contractforge_databricks/evidence/sql.py +144 -0
  44. contractforge_databricks/evidence/tables.py +20 -0
  45. contractforge_databricks/evidence/writer.py +118 -0
  46. contractforge_databricks/execution/__init__.py +70 -0
  47. contractforge_databricks/execution/delta_basic.py +57 -0
  48. contractforge_databricks/execution/hash_diff.py +126 -0
  49. contractforge_databricks/execution/hash_diff_latest.py +142 -0
  50. contractforge_databricks/execution/replace_partitions.py +40 -0
  51. contractforge_databricks/execution/results.py +5 -0
  52. contractforge_databricks/execution/retry.py +36 -0
  53. contractforge_databricks/execution/scd2.py +213 -0
  54. contractforge_databricks/execution/scd2_deletes.py +65 -0
  55. contractforge_databricks/execution/scd2_late.py +30 -0
  56. contractforge_databricks/execution/snapshot.py +77 -0
  57. contractforge_databricks/execution/sql_merge.py +85 -0
  58. contractforge_databricks/execution/tables.py +98 -0
  59. contractforge_databricks/execution/windows.py +58 -0
  60. contractforge_databricks/governance/__init__.py +30 -0
  61. contractforge_databricks/governance/access.py +185 -0
  62. contractforge_databricks/governance/application.py +93 -0
  63. contractforge_databricks/governance/drift.py +49 -0
  64. contractforge_databricks/governance/runtime.py +60 -0
  65. contractforge_databricks/governance/sql.py +31 -0
  66. contractforge_databricks/governance/validation.py +135 -0
  67. contractforge_databricks/lakeflow/__init__.py +21 -0
  68. contractforge_databricks/lakeflow/compatibility.py +194 -0
  69. contractforge_databricks/lakeflow/rendering.py +175 -0
  70. contractforge_databricks/lineage/__init__.py +7 -0
  71. contractforge_databricks/lineage/openlineage.py +182 -0
  72. contractforge_databricks/maintenance/__init__.py +27 -0
  73. contractforge_databricks/maintenance/retention.py +90 -0
  74. contractforge_databricks/maintenance/sql.py +68 -0
  75. contractforge_databricks/metrics/__init__.py +19 -0
  76. contractforge_databricks/metrics/history.py +21 -0
  77. contractforge_databricks/metrics/write.py +63 -0
  78. contractforge_databricks/operations/__init__.py +4 -0
  79. contractforge_databricks/operations/application.py +38 -0
  80. contractforge_databricks/operations/sql.py +95 -0
  81. contractforge_databricks/parity/__init__.py +18 -0
  82. contractforge_databricks/parity/catalog.py +59 -0
  83. contractforge_databricks/parity/models.py +7 -0
  84. contractforge_databricks/parity/scenarios.py +111 -0
  85. contractforge_databricks/partitioning/__init__.py +3 -0
  86. contractforge_databricks/partitioning/predicates.py +28 -0
  87. contractforge_databricks/preparation/__init__.py +47 -0
  88. contractforge_databricks/preparation/deduplicate.py +87 -0
  89. contractforge_databricks/preparation/encoding.py +37 -0
  90. contractforge_databricks/preparation/hashing.py +18 -0
  91. contractforge_databricks/preparation/pyspark.py +178 -0
  92. contractforge_databricks/preparation/pyspark_staging.py +70 -0
  93. contractforge_databricks/preparation/shape.py +209 -0
  94. contractforge_databricks/preparation/shape_validation.py +94 -0
  95. contractforge_databricks/preparation/staging.py +17 -0
  96. contractforge_databricks/preparation/zip_arrays.py +51 -0
  97. contractforge_databricks/presets/__init__.py +3 -0
  98. contractforge_databricks/presets/base.py +24 -0
  99. contractforge_databricks/presets/bronze.py +57 -0
  100. contractforge_databricks/presets/catalog.py +22 -0
  101. contractforge_databricks/presets/core.py +134 -0
  102. contractforge_databricks/presets/gold.py +62 -0
  103. contractforge_databricks/presets/modifiers.py +51 -0
  104. contractforge_databricks/presets/runtime.py +22 -0
  105. contractforge_databricks/presets/silver.py +101 -0
  106. contractforge_databricks/presets/write_engine.py +57 -0
  107. contractforge_databricks/quality/__init__.py +41 -0
  108. contractforge_databricks/quality/evaluation.py +178 -0
  109. contractforge_databricks/quality/persistence.py +81 -0
  110. contractforge_databricks/quality/registry.py +134 -0
  111. contractforge_databricks/quality/results.py +17 -0
  112. contractforge_databricks/quality/sql.py +113 -0
  113. contractforge_databricks/rendering/__init__.py +11 -0
  114. contractforge_databricks/rendering/bundle.py +93 -0
  115. contractforge_databricks/rendering/markdown.py +50 -0
  116. contractforge_databricks/rendering/names.py +56 -0
  117. contractforge_databricks/results.py +15 -0
  118. contractforge_databricks/runtime/__init__.py +101 -0
  119. contractforge_databricks/runtime/available_now.py +147 -0
  120. contractforge_databricks/runtime/bundles.py +211 -0
  121. contractforge_databricks/runtime/cache.py +20 -0
  122. contractforge_databricks/runtime/control_tables.py +19 -0
  123. contractforge_databricks/runtime/deploy.py +197 -0
  124. contractforge_databricks/runtime/detection.py +114 -0
  125. contractforge_databricks/runtime/dry_run.py +46 -0
  126. contractforge_databricks/runtime/errors.py +54 -0
  127. contractforge_databricks/runtime/file_selection.py +109 -0
  128. contractforge_databricks/runtime/finalization.py +168 -0
  129. contractforge_databricks/runtime/governance.py +37 -0
  130. contractforge_databricks/runtime/hooks.py +45 -0
  131. contractforge_databricks/runtime/http_file.py +37 -0
  132. contractforge_databricks/runtime/http_retry.py +15 -0
  133. contractforge_databricks/runtime/http_safety.py +9 -0
  134. contractforge_databricks/runtime/json_materialization.py +97 -0
  135. contractforge_databricks/runtime/lineage.py +164 -0
  136. contractforge_databricks/runtime/maintenance.py +43 -0
  137. contractforge_databricks/runtime/merge_validation.py +98 -0
  138. contractforge_databricks/runtime/metadata.py +21 -0
  139. contractforge_databricks/runtime/metrics.py +34 -0
  140. contractforge_databricks/runtime/models.py +32 -0
  141. contractforge_databricks/runtime/options.py +33 -0
  142. contractforge_databricks/runtime/orchestration_context.py +185 -0
  143. contractforge_databricks/runtime/orchestrator.py +147 -0
  144. contractforge_databricks/runtime/partitioning.py +93 -0
  145. contractforge_databricks/runtime/quality_quarantine.py +92 -0
  146. contractforge_databricks/runtime/rest_api.py +46 -0
  147. contractforge_databricks/runtime/rest_auth.py +21 -0
  148. contractforge_databricks/runtime/rest_pagination.py +21 -0
  149. contractforge_databricks/runtime/run_payload.py +177 -0
  150. contractforge_databricks/runtime/schema.py +106 -0
  151. contractforge_databricks/runtime/source_metadata.py +30 -0
  152. contractforge_databricks/runtime/source_registry.py +43 -0
  153. contractforge_databricks/runtime/source_schema.py +24 -0
  154. contractforge_databricks/runtime/sources.py +208 -0
  155. contractforge_databricks/runtime/spark.py +183 -0
  156. contractforge_databricks/runtime/spark_defaults.py +35 -0
  157. contractforge_databricks/runtime/storage_auth.py +132 -0
  158. contractforge_databricks/runtime/streaming.py +131 -0
  159. contractforge_databricks/runtime/success.py +104 -0
  160. contractforge_databricks/runtime/utils.py +52 -0
  161. contractforge_databricks/runtime/watermark.py +71 -0
  162. contractforge_databricks/runtime/windows.py +184 -0
  163. contractforge_databricks/runtime/write.py +66 -0
  164. contractforge_databricks/runtime/write_flow.py +146 -0
  165. contractforge_databricks/runtime/write_strategy.py +40 -0
  166. contractforge_databricks/schema/__init__.py +21 -0
  167. contractforge_databricks/schema/diff.py +11 -0
  168. contractforge_databricks/schema/policy.py +33 -0
  169. contractforge_databricks/schema/sync.py +23 -0
  170. contractforge_databricks/security/__init__.py +21 -0
  171. contractforge_databricks/security/errors.py +5 -0
  172. contractforge_databricks/security/redaction.py +5 -0
  173. contractforge_databricks/security/secrets.py +114 -0
  174. contractforge_databricks/security/source_policy.py +17 -0
  175. contractforge_databricks/shapes/__init__.py +3 -0
  176. contractforge_databricks/shapes/sql.py +123 -0
  177. contractforge_databricks/sources/__init__.py +67 -0
  178. contractforge_databricks/sources/artifacts.py +100 -0
  179. contractforge_databricks/sources/autoloader.py +48 -0
  180. contractforge_databricks/sources/bounded_streams.py +44 -0
  181. contractforge_databricks/sources/classification.py +115 -0
  182. contractforge_databricks/sources/delta_share.py +21 -0
  183. contractforge_databricks/sources/files.py +48 -0
  184. contractforge_databricks/sources/http_file.py +46 -0
  185. contractforge_databricks/sources/interpret.py +76 -0
  186. contractforge_databricks/sources/jdbc.py +32 -0
  187. contractforge_databricks/sources/metadata.py +18 -0
  188. contractforge_databricks/sources/native_passthrough.py +33 -0
  189. contractforge_databricks/sources/rds_iam.py +15 -0
  190. contractforge_databricks/sources/rds_iam_runtime.py +191 -0
  191. contractforge_databricks/sources/rest_api.py +33 -0
  192. contractforge_databricks/sources/support.py +50 -0
  193. contractforge_databricks/sources/table_refs.py +65 -0
  194. contractforge_databricks/sql/__init__.py +4 -0
  195. contractforge_databricks/sql/identifiers.py +17 -0
  196. contractforge_databricks/sql/literals.py +36 -0
  197. contractforge_databricks/state/__init__.py +39 -0
  198. contractforge_databricks/state/ddl.py +24 -0
  199. contractforge_databricks/state/migrations.py +146 -0
  200. contractforge_databricks/state/queries.py +149 -0
  201. contractforge_databricks/state/sql.py +116 -0
  202. contractforge_databricks/state/tables.py +9 -0
  203. contractforge_databricks/state/writer.py +83 -0
  204. contractforge_databricks/templates/__init__.py +15 -0
  205. contractforge_databricks/templates/catalog.py +205 -0
  206. contractforge_databricks/templates/catalog_parity.py +85 -0
  207. contractforge_databricks/templates/core.py +83 -0
  208. contractforge_databricks/templates/enrichment.py +175 -0
  209. contractforge_databricks/transforms/__init__.py +3 -0
  210. contractforge_databricks/transforms/sql.py +118 -0
  211. contractforge_databricks/watermark/__init__.py +6 -0
  212. contractforge_databricks/watermark/sql.py +91 -0
  213. contractforge_databricks/write_modes/__init__.py +20 -0
  214. contractforge_databricks/write_modes/registry.py +44 -0
  215. contractforge_databricks/write_modes/sql.py +33 -0
  216. contractforge_databricks/write_modes/strategy.py +192 -0
  217. contractforge_databricks-0.1.0.dist-info/METADATA +34 -0
  218. contractforge_databricks-0.1.0.dist-info/RECORD +220 -0
  219. contractforge_databricks-0.1.0.dist-info/WHEEL +4 -0
  220. contractforge_databricks-0.1.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,59 @@
1
+ """Query helpers for the Databricks parity catalog."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from contractforge_core.parity import WriteEngineParityScenario
8
+ from contractforge_databricks.parity.scenarios import WRITE_ENGINE_PARITY_SCENARIOS
9
+
10
+
11
def list_write_engine_parity_scenarios() -> list[str]:
    """Return the ids of every registered write-engine parity scenario, sorted ascending."""
    scenario_ids = [entry.scenario_id for entry in WRITE_ENGINE_PARITY_SCENARIOS]
    scenario_ids.sort()
    return scenario_ids
13
+
14
+
15
def get_write_engine_parity_scenario(scenario_id: str) -> WriteEngineParityScenario:
    """Look up one parity scenario by its ``scenario_id``.

    The first catalog entry whose id equals ``scenario_id`` is returned.

    Raises:
        ValueError: when no entry has that id; the message lists the valid ids.
    """
    found = next(
        (entry for entry in WRITE_ENGINE_PARITY_SCENARIOS if entry.scenario_id == scenario_id),
        None,
    )
    if found is not None:
        return found
    raise ValueError(
        f"Write-engine parity scenario not found: {scenario_id}. "
        f"Valid scenarios: {list_write_engine_parity_scenarios()}"
    )
23
+
24
+
25
def scenarios_for_engine(engine: str) -> list[WriteEngineParityScenario]:
    """Return the catalog scenarios whose ``candidate_engine`` equals ``engine``, in catalog order."""
    matching = []
    for entry in WRITE_ENGINE_PARITY_SCENARIOS:
        if entry.candidate_engine == engine:
            matching.append(entry)
    return matching
27
+
28
+
29
def scenarios_for_mode(mode: str) -> list[WriteEngineParityScenario]:
    """Return the catalog scenarios whose ``write_mode`` equals ``mode``, in catalog order."""
    matching = []
    for entry in WRITE_ENGINE_PARITY_SCENARIOS:
        if entry.write_mode == mode:
            matching.append(entry)
    return matching
31
+
32
+
33
def build_write_engine_parity_plan(
    *,
    engine: str | None = None,
    mode: str | None = None,
    runtime: str | None = None,
) -> dict[str, Any]:
    """Build a plain-dict parity plan from the scenario catalog.

    Each filter is applied only when its argument is truthy, so ``None`` and
    ``""`` both mean "do not filter on this dimension".

    Args:
        engine: keep scenarios whose ``candidate_engine`` equals this value.
        mode: keep scenarios whose ``write_mode`` equals this value.
        runtime: keep scenarios whose ``runtime_targets`` contains this value.

    Returns:
        A dict echoing the filters, the number of selected scenarios, a count
        per ``expectation`` value, and each selected scenario's ``as_dict()``.
    """
    selected = []
    for entry in WRITE_ENGINE_PARITY_SCENARIOS:
        if engine and entry.candidate_engine != engine:
            continue
        if mode and entry.write_mode != mode:
            continue
        if runtime and runtime not in entry.runtime_targets:
            continue
        selected.append(entry)

    # Tally how many selected scenarios carry each expectation label.
    tally: dict[str, int] = {}
    for entry in selected:
        tally.setdefault(entry.expectation, 0)
        tally[entry.expectation] += 1

    plan: dict[str, Any] = {"kind": "write_engine_parity_plan"}
    plan["engine"] = engine
    plan["mode"] = mode
    plan["runtime"] = runtime
    plan["scenario_count"] = len(selected)
    plan["expectation_counts"] = tally
    plan["scenarios"] = [entry.as_dict() for entry in selected]
    return plan
@@ -0,0 +1,7 @@
1
+ """Compatibility exports for platform-neutral parity catalog models."""
2
+
3
+ from contractforge_core.parity import ParityExpectation, ParityMetricExpectation, WriteEngineParityScenario
4
+
5
+ RuntimeTarget = str
6
+
7
+ __all__ = ["ParityExpectation", "ParityMetricExpectation", "RuntimeTarget", "WriteEngineParityScenario"]
@@ -0,0 +1,111 @@
1
+ """Official Databricks write-engine parity scenarios."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from contractforge_core.parity import ParityMetricExpectation, WriteEngineParityScenario
6
+
7
+ CLASSIC_AND_SERVERLESS = ("databricks_classic", "databricks_serverless")
8
+
9
+ WRITE_ENGINE_PARITY_SCENARIOS: tuple[WriteEngineParityScenario, ...] = (
10
+ WriteEngineParityScenario(
11
+ scenario_id="scd1_sql_merge_insert_update",
12
+ title="SCD1 SQL MERGE preserves insert/update semantics",
13
+ write_mode="scd1_upsert",
14
+ candidate_engine="databricks_sql_merge",
15
+ expectation="must_match",
16
+ runtime_targets=CLASSIC_AND_SERVERLESS,
17
+ required_capabilities=("databricks_runtime", "unity_catalog_table", "sql_merge"),
18
+ required_contract_fields=("merge_keys", "transform.deduplicate"),
19
+ expected_semantics=("one current row per merge key", "changed rows update", "new keys insert"),
20
+ metric_expectations=(
21
+ ParityMetricExpectation("rows_inserted", "must match ContractForge Delta"),
22
+ ParityMetricExpectation("rows_updated", "must match ContractForge Delta"),
23
+ ),
24
+ ),
25
+ WriteEngineParityScenario(
26
+ scenario_id="scd1_sql_merge_duplicate_keys",
27
+ title="SCD1 SQL MERGE handles duplicate keys deterministically",
28
+ write_mode="scd1_upsert",
29
+ candidate_engine="databricks_sql_merge",
30
+ expectation="must_match",
31
+ runtime_targets=CLASSIC_AND_SERVERLESS,
32
+ required_capabilities=("databricks_runtime", "unity_catalog_table", "sql_merge"),
33
+ required_contract_fields=("merge_keys", "transform.deduplicate"),
34
+ expected_semantics=("source row selected per key must match ContractForge deduplication",),
35
+ metric_expectations=(ParityMetricExpectation("rows_affected", "must match after deduplication"),),
36
+ ),
37
+ WriteEngineParityScenario(
38
+ scenario_id="scd1_sql_merge_null_keys",
39
+ title="SCD1 SQL MERGE rejects or quarantines null merge keys consistently",
40
+ write_mode="scd1_upsert",
41
+ candidate_engine="databricks_sql_merge",
42
+ expectation="must_match",
43
+ runtime_targets=CLASSIC_AND_SERVERLESS,
44
+ required_capabilities=("databricks_runtime", "unity_catalog_table", "sql_merge"),
45
+ required_contract_fields=("merge_keys", "quality_rules.not_null"),
46
+ expected_semantics=("null merge-key rows follow declared quality behavior",),
47
+ metric_expectations=(ParityMetricExpectation("rows_quarantined", "must match when quarantine is configured"),),
48
+ ),
49
+ WriteEngineParityScenario(
50
+ scenario_id="scd2_auto_cdc_history_lifecycle",
51
+ title="Lakeflow AUTO CDC SCD2 preserves history lifecycle",
52
+ write_mode="scd2_historical",
53
+ candidate_engine="databricks_lakeflow_auto_cdc",
54
+ expectation="must_match",
55
+ runtime_targets=CLASSIC_AND_SERVERLESS,
56
+ required_capabilities=("lakeflow_auto_cdc", "unity_catalog_table"),
57
+ required_contract_fields=("merge_keys", "sequence_by", "scd2_change_columns"),
58
+ expected_semantics=("changed columns expire current row", "new current version is inserted"),
59
+ metric_expectations=(ParityMetricExpectation("rows_expired", "normalized metric must match"),),
60
+ ),
61
+ WriteEngineParityScenario(
62
+ scenario_id="scd2_auto_cdc_late_arriving",
63
+ title="Lakeflow AUTO CDC SCD2 handles late-arriving changes explicitly",
64
+ write_mode="scd2_historical",
65
+ candidate_engine="databricks_lakeflow_auto_cdc",
66
+ expectation="must_match",
67
+ runtime_targets=CLASSIC_AND_SERVERLESS,
68
+ required_capabilities=("lakeflow_auto_cdc", "unity_catalog_table"),
69
+ required_contract_fields=("merge_keys", "sequence_by", "scd2_late_arriving_policy"),
70
+ expected_semantics=("late-arriving records must not corrupt current row",),
71
+ metric_expectations=(ParityMetricExpectation("rows_expired", "must match or document intentional difference"),),
72
+ ),
73
+ WriteEngineParityScenario(
74
+ scenario_id="scd2_auto_cdc_delete_semantics",
75
+ title="Lakeflow AUTO CDC SCD2 delete predicates preserve declared delete behavior",
76
+ write_mode="scd2_historical",
77
+ candidate_engine="databricks_lakeflow_auto_cdc",
78
+ expectation="must_match",
79
+ runtime_targets=CLASSIC_AND_SERVERLESS,
80
+ required_capabilities=("lakeflow_auto_cdc", "unity_catalog_table"),
81
+ required_contract_fields=("merge_keys", "sequence_by", "apply_as_deletes"),
82
+ expected_semantics=("delete predicates must be explicit and reviewed",),
83
+ metric_expectations=(ParityMetricExpectation("rows_deleted", "must match declared delete behavior"),),
84
+ ),
85
+ WriteEngineParityScenario(
86
+ scenario_id="hash_diff_auto_cdc_non_equivalence",
87
+ title="Hash-diff append is not equivalent to Lakeflow SCD type 1",
88
+ write_mode="scd1_hash_diff",
89
+ candidate_engine="databricks_lakeflow_auto_cdc",
90
+ expectation="unsupported",
91
+ runtime_targets=CLASSIC_AND_SERVERLESS,
92
+ required_capabilities=("lakeflow_auto_cdc",),
93
+ required_contract_fields=("hash_keys",),
94
+ expected_semantics=("hash diff appends changed versions; Lakeflow SCD1 stores current state",),
95
+ metric_expectations=(ParityMetricExpectation("rows_inserted", "must not be compared"),),
96
+ blockers_to_record=("scd1_hash_diff is append-only version capture, not Lakeflow SCD type 1.",),
97
+ ),
98
+ WriteEngineParityScenario(
99
+ scenario_id="snapshot_soft_delete_auto_cdc_difference",
100
+ title="Snapshot soft delete differs from AUTO CDC snapshot deletes",
101
+ write_mode="snapshot_soft_delete",
102
+ candidate_engine="databricks_lakeflow_auto_cdc",
103
+ expectation="intentional_difference",
104
+ runtime_targets=CLASSIC_AND_SERVERLESS,
105
+ required_capabilities=("lakeflow_auto_cdc",),
106
+ required_contract_fields=("merge_keys",),
107
+ expected_semantics=("ContractForge marks missing keys inactive; snapshot CDC may delete missing keys",),
108
+ metric_expectations=(ParityMetricExpectation("rows_deleted", "expected to differ"),),
109
+ blockers_to_record=("AUTO CDC snapshot deletes are not inactive-marker soft deletes.",),
110
+ ),
111
+ )
@@ -0,0 +1,3 @@
1
+ from contractforge_databricks.partitioning.predicates import render_partition_in_predicate, render_replace_where
2
+
3
+ __all__ = ["render_partition_in_predicate", "render_replace_where"]
@@ -0,0 +1,28 @@
1
+ """Databricks partition predicate rendering."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Iterable
6
+ from typing import Any
7
+
8
+ from contractforge_core.partitioning import distinct_partition_values
9
+ from contractforge_databricks.sql import quote_identifier, sql_string
10
+
11
+
12
def render_partition_in_predicate(column: str, values: Iterable[Any], *, max_values: int = 1000) -> str:
    """Render a SQL predicate selecting rows whose ``column`` is one of ``values``.

    Non-null values become a single ``IN (...)`` list of ``sql_string`` literals;
    a ``None`` among the distinct values adds an ``IS NULL`` test joined with ``OR``.

    Args:
        column: partition column name; quoted with ``quote_identifier``.
        values: candidate partition values, deduplicated by
            ``distinct_partition_values`` (which also receives ``max_values``).
        max_values: forwarded unchanged to ``distinct_partition_values``.

    Returns:
        The predicate text. It is the empty string when there are no distinct
        values, and the ``IN ... OR ... IS NULL`` form is not parenthesised, so
        callers combining it with ``AND`` must wrap it themselves.
    """
    # Materialise once: the values are inspected twice (literals and NULL check),
    # which would silently drop the NULL branch if the helper ever returned a
    # one-shot iterator.
    distinct = tuple(distinct_partition_values(values, max_values=max_values))
    quoted = quote_identifier(column)
    non_null = [value for value in distinct if value is not None]
    predicates = []
    if non_null:
        literals = ", ".join(sql_string(value) for value in non_null)
        predicates.append(f"{quoted} IN ({literals})")
    # Any element filtered out above was None, so a length gap means NULL is present.
    if len(non_null) != len(distinct):
        predicates.append(f"{quoted} IS NULL")
    return " OR ".join(predicates)
23
+
24
+
25
def render_replace_where(column: str, value: Any) -> str:
    """Render a single-value predicate for ``column``.

    ``None`` renders as ``<column> IS NULL``; any other value renders as
    ``<column> = <literal>`` using ``sql_string`` for the literal.
    """
    quoted = quote_identifier(column)
    comparison = "IS NULL" if value is None else f"= {sql_string(value)}"
    return f"{quoted} {comparison}"
@@ -0,0 +1,47 @@
1
+ from contractforge_core.preparation import HashDiffStageSpec, SCD2StageSpec, SnapshotStageSpec
2
+ from contractforge_databricks.preparation.hashing import (
3
+ HASH_DELIMITER,
4
+ HASH_NULL_SENTINEL,
5
+ ROW_HASH_COLUMN,
6
+ render_row_hash_expression,
7
+ )
8
+ from contractforge_databricks.preparation.encoding import apply_encoding_fix
9
+ from contractforge_databricks.preparation.shape import apply_shape
10
+ from contractforge_databricks.preparation.deduplicate import apply_transform_deduplicate
11
+ from contractforge_databricks.preparation.pyspark import (
12
+ apply_transform,
13
+ apply_contract_preparation,
14
+ apply_transform_cast,
15
+ apply_transform_derive,
16
+ apply_transform_standardize,
17
+ )
18
+ from contractforge_databricks.preparation.pyspark_staging import (
19
+ apply_write_staging,
20
+ prepare_hash_diff_stage,
21
+ prepare_scd2_stage,
22
+ prepare_snapshot_stage,
23
+ with_row_hash,
24
+ )
25
+
26
+ __all__ = [
27
+ "HashDiffStageSpec",
28
+ "HASH_DELIMITER",
29
+ "HASH_NULL_SENTINEL",
30
+ "ROW_HASH_COLUMN",
31
+ "SCD2StageSpec",
32
+ "SnapshotStageSpec",
33
+ "apply_encoding_fix",
34
+ "apply_shape",
35
+ "apply_contract_preparation",
36
+ "apply_transform",
37
+ "apply_write_staging",
38
+ "apply_transform_cast",
39
+ "apply_transform_deduplicate",
40
+ "apply_transform_derive",
41
+ "apply_transform_standardize",
42
+ "prepare_hash_diff_stage",
43
+ "prepare_scd2_stage",
44
+ "prepare_snapshot_stage",
45
+ "render_row_hash_expression",
46
+ "with_row_hash",
47
+ ]
@@ -0,0 +1,87 @@
1
+ """PySpark deduplication helpers for portable transform intent."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from typing import Any
7
+
8
+
9
def apply_transform_deduplicate(df: Any, deduplicate: object) -> Any:
    """Keep one row per key according to a ``transform.deduplicate`` mapping.

    Anything that is not a dict is treated as "no deduplication requested" and
    ``df`` is returned untouched. Otherwise rows are numbered inside a window
    partitioned by ``keys`` and ordered by ``order_by``; only row number 1 of
    each partition survives and the helper column is dropped again.

    Raises:
        ValueError: when ``keys`` is empty, references columns missing from
            ``df``, or ``order_by`` yields no ordering columns.
    """
    if not isinstance(deduplicate, dict):
        return df
    # Imported lazily so the module can be imported without PySpark installed.
    from pyspark.sql import Window, functions as F

    raw_keys = deduplicate.get("keys")
    if isinstance(raw_keys, str):
        key_columns = [str(raw_keys)]
    else:
        key_columns = [str(key) for key in raw_keys or ()]
    if not key_columns:
        raise ValueError("transform.deduplicate.keys is required")
    _validate_columns(df, dict.fromkeys(key_columns, True), "transform.deduplicate.keys")

    partitioned = Window.partitionBy(*key_columns)
    ordering = _deduplicate_order_columns(deduplicate.get("order_by"), F)
    window = partitioned.orderBy(*ordering)

    numbered = df.withColumn("__cf_row_number", F.row_number().over(window))
    first_rows = numbered.filter(F.col("__cf_row_number") == 1)
    return first_rows.drop("__cf_row_number")
23
+
24
+
25
+ def _deduplicate_order_columns(order_by: object, functions: Any) -> list[Any]:
26
+ if isinstance(order_by, str):
27
+ return _deduplicate_order_columns_from_string(order_by, functions)
28
+ order_columns = []
29
+ for item in order_by or ():
30
+ if not isinstance(item, dict):
31
+ continue
32
+ order_columns.append(
33
+ _deduplicate_order_column(
34
+ functions.col(str(item["column"])),
35
+ direction=str(item.get("direction", "desc")).lower(),
36
+ nulls=str(item.get("nulls") or "").lower(),
37
+ )
38
+ )
39
+ if not order_columns:
40
+ raise ValueError("transform.deduplicate.order_by is required")
41
+ return order_columns
42
+
43
+
44
+ def _deduplicate_order_columns_from_string(order_by: str, functions: Any) -> list[Any]:
45
+ order_columns = []
46
+ for clause in (item.strip() for item in order_by.split(",")):
47
+ if not clause:
48
+ continue
49
+ parsed = re.match(
50
+ r"^`?(?P<column>[A-Za-z_][A-Za-z0-9_]*)`?(?:\s+(?P<direction>ASC|DESC))?(?:\s+NULLS\s+(?P<nulls>FIRST|LAST))?$",
51
+ clause,
52
+ flags=re.IGNORECASE,
53
+ )
54
+ if parsed is None:
55
+ order_columns.append(functions.expr(clause))
56
+ continue
57
+ order_columns.append(
58
+ _deduplicate_order_column(
59
+ functions.col(parsed.group("column")),
60
+ direction=(parsed.group("direction") or "desc").lower(),
61
+ nulls=(parsed.group("nulls") or "").lower(),
62
+ )
63
+ )
64
+ if not order_columns:
65
+ raise ValueError("transform.deduplicate.order_by is required")
66
+ return order_columns
67
+
68
+
69
+ def _deduplicate_order_column(column: Any, *, direction: str, nulls: str) -> Any:
70
+ if direction == "asc" and nulls == "first":
71
+ return column.asc_nulls_first()
72
+ if direction == "asc" and nulls == "last":
73
+ return column.asc_nulls_last()
74
+ if direction == "asc":
75
+ return column.asc()
76
+ if nulls == "first":
77
+ return column.desc_nulls_first()
78
+ if nulls == "last":
79
+ return column.desc_nulls_last()
80
+ return column.desc()
81
+
82
+
83
+ def _validate_columns(df: Any, columns: dict[str, Any], context: str) -> None:
84
+ available = set(getattr(df, "columns", ()) or ())
85
+ missing = sorted(str(column) for column in columns if str(column) not in available)
86
+ if missing:
87
+ raise ValueError(f"{context} references missing columns: {missing}")
@@ -0,0 +1,37 @@
1
+ """Databricks preparation helpers for adapter-owned encoding fixes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import importlib
6
+ from typing import Any
7
+
8
+ from contractforge_databricks.contract_extensions import databricks_extensions
9
+
10
+
11
def apply_encoding_fix(df: Any, contract: Any) -> Any:
    """Re-decode string columns when the contract's Databricks extensions ask for it.

    Nothing happens unless the ``fix_encoding`` extension is truthy. When it is,
    each target column is cast to binary and decoded with the ``encoding``
    extension (``utf-8`` when unset). Targets are the ``encoding_columns``
    extension when it yields any names, otherwise every string column of ``df``;
    requested names that are not string columns of ``df`` are skipped.
    """
    extensions = databricks_extensions(contract)
    if not extensions.get("fix_encoding"):
        return df

    functions = importlib.import_module("pyspark.sql").functions
    encoding = str(extensions.get("encoding") or "utf-8")
    string_columns = _string_columns(df)
    requested = _string_tuple(extensions.get("encoding_columns"))
    targets = requested if requested else string_columns
    for column in targets:
        if column not in string_columns:
            continue
        as_binary = functions.col(column).cast("binary")
        df = df.withColumn(column, functions.decode(as_binary, encoding))
    return df
22
+
23
+
24
+ def _string_columns(df: Any) -> tuple[str, ...]:
25
+ return tuple(
26
+ field.name
27
+ for field in getattr(getattr(df, "schema", None), "fields", ()) or ()
28
+ if field.dataType.typeName() == "string"
29
+ )
30
+
31
+
32
+ def _string_tuple(value: object) -> tuple[str, ...]:
33
+ if value is None:
34
+ return ()
35
+ if isinstance(value, str):
36
+ return tuple(part.strip() for part in value.split(",") if part.strip())
37
+ return tuple(str(part) for part in value or ())
@@ -0,0 +1,18 @@
1
+ """Hash expression helpers for Databricks staging."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from contractforge_core.preparation import HASH_DELIMITER, HASH_NULL_SENTINEL
6
+ from contractforge_databricks.sql import quote_identifier
7
+
8
+ ROW_HASH_COLUMN = "row_hash"
9
+
10
+
11
def render_row_hash_expression(columns: tuple[str, ...], *, exclude: tuple[str, ...] = ()) -> str:
    """Render a SQL expression hashing the given columns into one SHA-256 digest.

    Each included column is cast to string, with ``HASH_NULL_SENTINEL``
    substituted for NULL; the pieces are joined with ``HASH_DELIMITER`` via
    ``concat_ws`` and passed to ``sha2(..., 256)``.

    Args:
        columns: candidate column names, hashed in the order given.
        exclude: column names to leave out of the hash.

    Raises:
        ValueError: when no column remains after applying ``exclude``.
    """
    # Build the lookup once instead of once per column.
    excluded = frozenset(exclude)
    included = tuple(column for column in columns if column not in excluded)
    if not included:
        raise ValueError("row hash requires at least one included column")
    payload = ", ".join(
        f"coalesce(cast({quote_identifier(column)} as string), '{HASH_NULL_SENTINEL}')" for column in included
    )
    return f"sha2(concat_ws('{HASH_DELIMITER}', {payload}), 256)"
@@ -0,0 +1,178 @@
1
+ """Optional PySpark staging helpers.
2
+
3
+ Imports stay inside functions so the package can be imported without PySpark.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import Any
9
+
10
+ from contractforge_core.config import CONTROL_COLUMNS
11
+ from contractforge_core.semantic import SemanticContract
12
+ from contractforge_databricks.preparation.deduplicate import apply_transform_deduplicate
13
+ from contractforge_databricks.preparation.encoding import apply_encoding_fix
14
+
15
+
16
def create_or_replace_temp_view(df: Any, name: str) -> str:
    """Register ``df`` as a temp view called ``name`` and return ``name``.

    Raises:
        ValueError: If ``name`` is empty or contains only whitespace.
    """
    is_blank = not (name and name.strip())
    if is_blank:
        raise ValueError("temp view name must not be empty")
    df.createOrReplaceTempView(name)
    return name
21
+
22
+
23
def apply_transform(df: Any, transform: dict[str, Any] | None) -> Any:
    """Run the transform sections of ``transform`` against ``df`` in a fixed order.

    The order is cast, standardize, derive, composite keys, deduplicate. A
    missing or empty ``transform`` returns ``df`` untouched.
    """
    if not transform:
        return df
    stages = (
        ("cast", apply_transform_cast),
        ("standardize", apply_transform_standardize),
        ("derive", apply_transform_derive),
        ("composite_keys", apply_transform_composite_keys),
    )
    for section, stage in stages:
        # Non-dict section values are treated as "nothing configured".
        df = stage(df, _dict(transform.get(section)))
    return apply_transform_deduplicate(df, transform.get("deduplicate"))
33
+
34
+
35
def apply_contract_preparation(
    df: Any,
    contract: SemanticContract,
    *,
    watermark_column: str | None = None,
    watermark_previous: str | None = None,
) -> Any:
    """Run the contract's pre-write preparation steps against ``df``.

    Steps run in this order: column selection, column renames, shape,
    cast / standardize / derive, row filter, composite keys, watermark
    filter, deduplication, encoding fix.

    Args:
        df: Input DataFrame.
        contract: Contract supplying operations metadata, shape and transform.
        watermark_column: Watermark column name(s), ``|``-separated; the
            watermark filter is skipped when this or ``watermark_previous``
            is empty.
        watermark_previous: Previous watermark value used by the filter.

    Returns:
        The prepared DataFrame.

    Raises:
        ValueError: From the column validators when a referenced column is
            missing or a rename is invalid.
    """
    metadata = _contract_metadata(contract)

    # Projection: keep only the declared columns.
    projected = _string_list(metadata.get("select_columns"))
    if projected:
        _validate_columns(df, dict.fromkeys(projected, True), "select_columns")
        df = df.select(*projected)

    # Renames: validated up front, then applied one at a time.
    renames = _dict(metadata.get("column_mapping"))
    if renames:
        _validate_column_mapping(df, renames)
        for old_name, new_name in renames.items():
            df = df.withColumnRenamed(str(old_name), str(new_name))

    if contract.shape:
        from contractforge_databricks.preparation.shape import apply_shape

        df = apply_shape(df, contract.shape.raw, layer=contract.target.layer)

    transform = contract.transform.raw if contract.transform else {}
    column_stages = (
        ("cast", apply_transform_cast),
        ("standardize", apply_transform_standardize),
        ("derive", apply_transform_derive),
    )
    for section, stage in column_stages:
        df = stage(df, _dict(transform.get(section)))

    # The row filter runs after derive, so it sees derived columns.
    row_filter = metadata.get("filter_expression")
    if row_filter:
        from pyspark.sql import functions as F

        df = df.where(F.expr(str(row_filter)))

    df = apply_transform_composite_keys(df, _dict(transform.get("composite_keys")))
    df = _apply_watermark_filter(df, watermark_column, watermark_previous)
    df = apply_transform_deduplicate(df, transform.get("deduplicate"))
    return apply_encoding_fix(df, contract)
71
+
72
+
73
def apply_transform_cast(df: Any, casts: dict[str, Any]) -> Any:
    """Cast each column named in ``casts`` to its mapped data type.

    Returns ``df`` untouched when ``casts`` is empty.

    Raises:
        ValueError: If ``casts`` names a column that ``df`` does not have.
    """
    if casts:
        from pyspark.sql import functions as F

        _validate_columns(df, casts, "transform.cast")
        for name, target_type in casts.items():
            column_name = str(name)
            df = df.withColumn(column_name, F.col(column_name).cast(str(target_type)))
    return df
82
+
83
+
84
def apply_transform_derive(df: Any, expressions: dict[str, Any]) -> Any:
    """Add or replace one column per entry of ``expressions``.

    Each value is evaluated as a Spark SQL expression via ``F.expr``.
    Returns ``df`` untouched when ``expressions`` is empty.
    """
    if expressions:
        from pyspark.sql import functions as F

        for target, sql_text in expressions.items():
            df = df.withColumn(str(target), F.expr(str(sql_text)))
    return df
92
+
93
+
94
def apply_transform_composite_keys(df: Any, composite_keys: dict[str, Any]) -> Any:
    """Add one ``|``-joined key column per entry of ``composite_keys``.

    Each value is a single column name or an iterable of column names. The
    source columns are cast to string, nulls become ``""``, and the parts are
    joined with ``concat_ws("|", ...)``. Returns ``df`` untouched when
    ``composite_keys`` is empty.

    Raises:
        ValueError: If a source column is missing from ``df``.
    """
    if not composite_keys:
        return df
    from pyspark.sql import functions as F

    for key_name, source_columns in composite_keys.items():
        if isinstance(source_columns, str):
            members = [source_columns]
        else:
            members = list(source_columns or ())
        required = {str(member): True for member in members}
        _validate_columns(df, required, f"transform.composite_keys.{key_name}")
        parts = []
        for member in members:
            parts.append(F.coalesce(F.col(str(member)).cast("string"), F.lit("")))
        df = df.withColumn(str(key_name), F.concat_ws("|", *parts))
    return df
105
+
106
+
107
def apply_transform_standardize(df: Any, standardize: dict[str, Any]) -> Any:
    """Apply per-column text clean-up flags from ``standardize``.

    Each entry maps a column name to a config whose truthy flags are applied
    in this order: ``normalize_whitespace`` (collapse whitespace runs to one
    space), ``trim``, ``lower``, ``upper``, ``empty_as_null``. Returns ``df``
    untouched when ``standardize`` is empty.

    Raises:
        ValueError: If a configured column is missing from ``df``.
    """
    if not standardize:
        return df
    from pyspark.sql import functions as F

    _validate_columns(df, standardize, "transform.standardize")
    # Tuple order is the application order; it must not be rearranged.
    steps = (
        ("normalize_whitespace", lambda expr: F.regexp_replace(expr, r"\s+", " ")),
        ("trim", F.trim),
        ("lower", F.lower),
        ("upper", F.upper),
        ("empty_as_null", lambda expr: F.when(expr == "", F.lit(None)).otherwise(expr)),
    )
    for column_name, config in standardize.items():
        expression = F.col(str(column_name))
        for flag, step in steps:
            if config.get(flag):
                expression = step(expression)
        df = df.withColumn(str(column_name), expression)
    return df
127
+
128
+
129
def _validate_columns(df: Any, columns: dict[str, Any], context: str) -> None:
    """Raise if any key of ``columns`` is not a column of ``df``.

    Keys are compared by their ``str`` form against ``df.columns`` (treated as
    empty when absent).

    Raises:
        ValueError: Listing the missing names in sorted order, prefixed by
            ``context``.
    """
    known = set(getattr(df, "columns", ()) or ())
    absent = sorted(filter(lambda name: name not in known, map(str, columns)))
    if absent:
        raise ValueError(f"{context} references missing columns: {absent}")
134
+
135
+
136
def _validate_column_mapping(df: Any, mapping: dict[str, Any]) -> None:
    """Validate a source-to-target rename mapping against ``df``.

    Checks, in order: every source column exists, no target name is produced
    more than once, no target is a reserved control column, and no target
    collides with an existing column other than its own source.

    Args:
        df: DataFrame-like object exposing ``columns``.
        mapping: Source column name to target column name.

    Raises:
        ValueError: On the first failed check, listing the offending names in
            sorted order.
    """
    from collections import Counter

    _validate_columns(df, mapping, "column_mapping")
    existing = set(getattr(df, "columns", ()) or ())
    pairs = [(str(source), str(target)) for source, target in mapping.items()]

    # Count targets in a single pass instead of calling list.count per target.
    target_counts = Counter(target for _, target in pairs)
    duplicates = sorted(target for target, count in target_counts.items() if count > 1)
    if duplicates:
        raise ValueError(f"column_mapping has duplicate targets: {duplicates}")

    # intersection() accepts any iterable, so the container type of
    # CONTROL_COLUMNS does not matter here.
    reserved_targets = sorted(set(target_counts).intersection(CONTROL_COLUMNS))
    if reserved_targets:
        raise ValueError(f"column_mapping cannot produce reserved control columns: {reserved_targets}")

    # Renaming a column to its own name is allowed and is not a collision.
    collisions = sorted(target for source, target in pairs if target in existing and target != source)
    if collisions:
        raise ValueError(f"column_mapping would collide with existing columns: {collisions}")
153
+
154
+
155
def _apply_watermark_filter(df: Any, watermark_column: str | None, watermark_value: str | None) -> Any:
    """Filter ``df`` with the rendered watermark predicate when both inputs are set.

    ``watermark_column`` may name several columns separated by ``|``; empty
    segments are ignored. Returns ``df`` untouched when either argument is
    empty.

    Raises:
        ValueError: If a watermark column is missing from ``df``.
    """
    if watermark_column and watermark_value:
        from contractforge_databricks.watermark import render_watermark_filter_predicate

        columns = tuple(filter(None, watermark_column.split("|")))
        _validate_columns(df, dict.fromkeys(columns, True), "watermark_columns")
        return df.where(render_watermark_filter_predicate(columns=columns, watermark_value=watermark_value))
    return df
163
+
164
+
165
def _dict(value: object) -> dict[str, Any]:
    """Return a shallow copy of ``value`` when it is a dict, else an empty dict."""
    if not isinstance(value, dict):
        return {}
    return dict(value)
167
+
168
+
169
def _contract_metadata(contract: SemanticContract) -> dict[str, Any]:
    """Return a copy of ``contract.operations.metadata``.

    An empty dict is returned when the contract has no operations or the
    metadata is unset or empty.
    """
    operations = contract.operations
    if not operations:
        return {}
    metadata = operations.metadata
    if not metadata:
        return {}
    return dict(metadata)
171
+
172
+
173
def _string_list(value: object) -> list[str]:
    """Normalize ``value`` into a list of strings.

    ``None`` gives an empty list. A string is split on commas; each piece is
    stripped and empty pieces are dropped. Any other value is iterated (a
    falsy value counts as empty) and every item is converted with ``str``.
    """
    if isinstance(value, str):
        pieces = (piece.strip() for piece in value.split(","))
        return [piece for piece in pieces if piece]
    if value is None:
        return []
    return list(map(str, value or ()))
@@ -0,0 +1,70 @@
1
+ """PySpark write-mode staging helpers with lazy imports."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from contractforge_core.preparation import SCD2StageSpec, SnapshotStageSpec, resolved_hash_exclude_columns
8
+ from contractforge_core.preparation import scd2_stage_spec_from_contract, snapshot_stage_spec_from_contract
9
+ from contractforge_core.semantic import SemanticContract
10
+ from contractforge_databricks.preparation.hashing import HASH_DELIMITER, HASH_NULL_SENTINEL, ROW_HASH_COLUMN
11
+
12
+
13
def with_row_hash(df: Any, columns: tuple[str, ...], *, exclude: tuple[str, ...] = ()) -> Any:
    """Add the ``ROW_HASH_COLUMN`` column holding a SHA-256 hash of ``columns``.

    Each included column is cast to string, nulls are replaced by
    ``HASH_NULL_SENTINEL``, and the values are joined with ``HASH_DELIMITER``
    before hashing.

    Args:
        df: Input DataFrame.
        columns: Candidate column names, in hashing order.
        exclude: Column names to leave out of the hash.

    Returns:
        ``df`` with the row-hash column added or replaced.

    Raises:
        ValueError: If no column remains after applying ``exclude``.
    """
    # Build the exclusion lookup once instead of once per candidate column.
    excluded = frozenset(exclude)
    included = tuple(column for column in columns if column not in excluded)
    if not included:
        raise ValueError("row hash requires at least one included column")

    # Imported after validation so invalid input is reported as ValueError
    # even where PySpark is not installed.
    from pyspark.sql import functions as F

    payload = [F.coalesce(F.col(column).cast("string"), F.lit(HASH_NULL_SENTINEL)) for column in included]
    return df.withColumn(ROW_HASH_COLUMN, F.sha2(F.concat_ws(HASH_DELIMITER, *payload), 256))
21
+
22
+
23
def prepare_snapshot_stage(df: Any, spec: SnapshotStageSpec) -> Any:
    """Stage ``df`` for a snapshot write with soft-delete control columns.

    Adds the row hash over the source columns, then sets
    ``spec.is_active_column`` to ``True`` and ``spec.deleted_at_column`` to a
    null timestamp.

    Args:
        df: Input DataFrame.
        spec: Snapshot stage spec providing ``source_columns`` and the control
            column names.

    Returns:
        The staged DataFrame.

    Raises:
        ValueError: From ``with_row_hash`` when no source column remains.
    """
    from pyspark.sql import functions as F

    # Keep the control columns out of the hash. The configured names are
    # excluded as well as the literal defaults, so a spec with custom control
    # column names does not hash its own bookkeeping columns.
    # NOTE(review): assumes custom names can reach spec.source_columns — confirm.
    control_columns = {
        "is_active",
        "deleted_at",
        "row_hash",
        spec.is_active_column,
        spec.deleted_at_column,
        ROW_HASH_COLUMN,
    }
    source_columns = tuple(column for column in spec.source_columns if column not in control_columns)
    staged = with_row_hash(df, source_columns)
    return staged.withColumn(spec.is_active_column, F.lit(True)).withColumn(
        spec.deleted_at_column,
        F.lit(None).cast("timestamp"),
    )
32
+
33
+
34
def prepare_scd2_stage(df: Any, spec: SCD2StageSpec) -> Any:
    """Stage ``df`` for an SCD2 write.

    Adds the row hash over ``spec.change_columns``, then the tracking columns
    ``valid_from``, ``valid_to``, ``is_current`` and ``changed_columns``, and
    finally one null ``__merge_key_<key>`` column per entry of
    ``spec.merge_keys``.

    ``valid_from`` comes from ``spec.effective_from_column`` cast to timestamp
    when that is set, otherwise from the current timestamp.
    """
    from pyspark.sql import functions as F

    staged = with_row_hash(df, spec.change_columns)
    valid_from = (
        F.col(spec.effective_from_column).cast("timestamp")
        if spec.effective_from_column
        else F.current_timestamp()
    )
    tracking = (
        ("valid_from", valid_from),
        ("valid_to", F.lit(None).cast("timestamp")),
        ("is_current", F.lit(True)),
        ("changed_columns", F.lit(None).cast("string")),
    )
    for name, expression in tracking:
        staged = staged.withColumn(name, expression)
    for key in spec.merge_keys:
        staged = staged.withColumn(f"__merge_key_{key}", F.lit(None))
    return staged
48
+
49
+
50
def prepare_hash_diff_stage(df: Any, contract: SemanticContract) -> Any:
    """Add the row hash used by the ``scd1_hash_diff`` write mode.

    With ``hash_strategy == "all_columns_except"`` every column of ``df`` is a
    hash candidate; otherwise ``contract.write.hash_keys`` is used. Columns
    from ``resolved_hash_exclude_columns(contract)`` are left out either way.

    Raises:
        ValueError: If the contract's write mode is not ``scd1_hash_diff``.
    """
    write = contract.write
    if write.mode != "scd1_hash_diff":
        raise ValueError("Hash-diff staging requires mode=scd1_hash_diff")
    frame_columns = tuple(map(str, getattr(df, "columns", ()) or ()))
    use_all_columns = write.hash_strategy == "all_columns_except"
    hash_columns = frame_columns if use_all_columns else write.hash_keys
    return with_row_hash(df, hash_columns, exclude=resolved_hash_exclude_columns(contract))
60
+
61
+
62
def apply_write_staging(df: Any, contract: SemanticContract) -> Any:
    """Apply the staging step that matches ``contract.write.mode``.

    ``scd1_hash_diff``, ``scd2_historical`` and ``snapshot_soft_delete`` each
    go to their own staging helper. Any other mode returns ``df`` unchanged.
    """
    source_columns = tuple(map(str, getattr(df, "columns", ()) or ()))
    mode = contract.write.mode
    if mode == "scd1_hash_diff":
        return prepare_hash_diff_stage(df, contract)
    if mode == "scd2_historical":
        scd2_spec = scd2_stage_spec_from_contract(contract, source_columns=source_columns)
        return prepare_scd2_stage(df, scd2_spec)
    if mode == "snapshot_soft_delete":
        snapshot_spec = snapshot_stage_spec_from_contract(contract, source_columns=source_columns)
        return prepare_snapshot_stage(df, snapshot_spec)
    return df