contractforge-databricks 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. contractforge_databricks/__init__.py +172 -0
  2. contractforge_databricks/adapter.py +69 -0
  3. contractforge_databricks/annotations/__init__.py +10 -0
  4. contractforge_databricks/annotations/application.py +52 -0
  5. contractforge_databricks/annotations/audit.py +49 -0
  6. contractforge_databricks/annotations/sql.py +142 -0
  7. contractforge_databricks/api.py +65 -0
  8. contractforge_databricks/bundles/__init__.py +9 -0
  9. contractforge_databricks/bundles/assets.py +47 -0
  10. contractforge_databricks/bundles/project.py +213 -0
  11. contractforge_databricks/bundles/project_config.py +133 -0
  12. contractforge_databricks/capabilities/__init__.py +17 -0
  13. contractforge_databricks/capabilities/builders.py +43 -0
  14. contractforge_databricks/capabilities/evaluate.py +162 -0
  15. contractforge_databricks/capabilities/mapping.py +36 -0
  16. contractforge_databricks/capabilities/models.py +44 -0
  17. contractforge_databricks/capabilities/runtime.py +111 -0
  18. contractforge_databricks/capabilities/uc.py +47 -0
  19. contractforge_databricks/cli.py +196 -0
  20. contractforge_databricks/cli_deploy.py +98 -0
  21. contractforge_databricks/cli_governance.py +142 -0
  22. contractforge_databricks/cli_io.py +91 -0
  23. contractforge_databricks/cli_maintenance.py +69 -0
  24. contractforge_databricks/coercion.py +31 -0
  25. contractforge_databricks/contract_extensions.py +70 -0
  26. contractforge_databricks/cost/__init__.py +11 -0
  27. contractforge_databricks/cost/model.py +22 -0
  28. contractforge_databricks/cost/report.py +65 -0
  29. contractforge_databricks/cost/sql.py +136 -0
  30. contractforge_databricks/dashboards/__init__.py +15 -0
  31. contractforge_databricks/dashboards/control_tables.py +150 -0
  32. contractforge_databricks/diagnostics/__init__.py +7 -0
  33. contractforge_databricks/diagnostics/explain.py +40 -0
  34. contractforge_databricks/environment.py +53 -0
  35. contractforge_databricks/evidence/__init__.py +98 -0
  36. contractforge_databricks/evidence/ddl.py +35 -0
  37. contractforge_databricks/evidence/governance_log.py +175 -0
  38. contractforge_databricks/evidence/helpers.py +29 -0
  39. contractforge_databricks/evidence/ops_log.py +210 -0
  40. contractforge_databricks/evidence/records.py +27 -0
  41. contractforge_databricks/evidence/run_log.py +74 -0
  42. contractforge_databricks/evidence/schemas.py +7 -0
  43. contractforge_databricks/evidence/sql.py +144 -0
  44. contractforge_databricks/evidence/tables.py +20 -0
  45. contractforge_databricks/evidence/writer.py +118 -0
  46. contractforge_databricks/execution/__init__.py +70 -0
  47. contractforge_databricks/execution/delta_basic.py +57 -0
  48. contractforge_databricks/execution/hash_diff.py +126 -0
  49. contractforge_databricks/execution/hash_diff_latest.py +142 -0
  50. contractforge_databricks/execution/replace_partitions.py +40 -0
  51. contractforge_databricks/execution/results.py +5 -0
  52. contractforge_databricks/execution/retry.py +36 -0
  53. contractforge_databricks/execution/scd2.py +213 -0
  54. contractforge_databricks/execution/scd2_deletes.py +65 -0
  55. contractforge_databricks/execution/scd2_late.py +30 -0
  56. contractforge_databricks/execution/snapshot.py +77 -0
  57. contractforge_databricks/execution/sql_merge.py +85 -0
  58. contractforge_databricks/execution/tables.py +98 -0
  59. contractforge_databricks/execution/windows.py +58 -0
  60. contractforge_databricks/governance/__init__.py +30 -0
  61. contractforge_databricks/governance/access.py +185 -0
  62. contractforge_databricks/governance/application.py +93 -0
  63. contractforge_databricks/governance/drift.py +49 -0
  64. contractforge_databricks/governance/runtime.py +60 -0
  65. contractforge_databricks/governance/sql.py +31 -0
  66. contractforge_databricks/governance/validation.py +135 -0
  67. contractforge_databricks/lakeflow/__init__.py +21 -0
  68. contractforge_databricks/lakeflow/compatibility.py +194 -0
  69. contractforge_databricks/lakeflow/rendering.py +175 -0
  70. contractforge_databricks/lineage/__init__.py +7 -0
  71. contractforge_databricks/lineage/openlineage.py +182 -0
  72. contractforge_databricks/maintenance/__init__.py +27 -0
  73. contractforge_databricks/maintenance/retention.py +90 -0
  74. contractforge_databricks/maintenance/sql.py +68 -0
  75. contractforge_databricks/metrics/__init__.py +19 -0
  76. contractforge_databricks/metrics/history.py +21 -0
  77. contractforge_databricks/metrics/write.py +63 -0
  78. contractforge_databricks/operations/__init__.py +4 -0
  79. contractforge_databricks/operations/application.py +38 -0
  80. contractforge_databricks/operations/sql.py +95 -0
  81. contractforge_databricks/parity/__init__.py +18 -0
  82. contractforge_databricks/parity/catalog.py +59 -0
  83. contractforge_databricks/parity/models.py +7 -0
  84. contractforge_databricks/parity/scenarios.py +111 -0
  85. contractforge_databricks/partitioning/__init__.py +3 -0
  86. contractforge_databricks/partitioning/predicates.py +28 -0
  87. contractforge_databricks/preparation/__init__.py +47 -0
  88. contractforge_databricks/preparation/deduplicate.py +87 -0
  89. contractforge_databricks/preparation/encoding.py +37 -0
  90. contractforge_databricks/preparation/hashing.py +18 -0
  91. contractforge_databricks/preparation/pyspark.py +178 -0
  92. contractforge_databricks/preparation/pyspark_staging.py +70 -0
  93. contractforge_databricks/preparation/shape.py +209 -0
  94. contractforge_databricks/preparation/shape_validation.py +94 -0
  95. contractforge_databricks/preparation/staging.py +17 -0
  96. contractforge_databricks/preparation/zip_arrays.py +51 -0
  97. contractforge_databricks/presets/__init__.py +3 -0
  98. contractforge_databricks/presets/base.py +24 -0
  99. contractforge_databricks/presets/bronze.py +57 -0
  100. contractforge_databricks/presets/catalog.py +22 -0
  101. contractforge_databricks/presets/core.py +134 -0
  102. contractforge_databricks/presets/gold.py +62 -0
  103. contractforge_databricks/presets/modifiers.py +51 -0
  104. contractforge_databricks/presets/runtime.py +22 -0
  105. contractforge_databricks/presets/silver.py +101 -0
  106. contractforge_databricks/presets/write_engine.py +57 -0
  107. contractforge_databricks/quality/__init__.py +41 -0
  108. contractforge_databricks/quality/evaluation.py +178 -0
  109. contractforge_databricks/quality/persistence.py +81 -0
  110. contractforge_databricks/quality/registry.py +134 -0
  111. contractforge_databricks/quality/results.py +17 -0
  112. contractforge_databricks/quality/sql.py +113 -0
  113. contractforge_databricks/rendering/__init__.py +11 -0
  114. contractforge_databricks/rendering/bundle.py +93 -0
  115. contractforge_databricks/rendering/markdown.py +50 -0
  116. contractforge_databricks/rendering/names.py +56 -0
  117. contractforge_databricks/results.py +15 -0
  118. contractforge_databricks/runtime/__init__.py +101 -0
  119. contractforge_databricks/runtime/available_now.py +147 -0
  120. contractforge_databricks/runtime/bundles.py +211 -0
  121. contractforge_databricks/runtime/cache.py +20 -0
  122. contractforge_databricks/runtime/control_tables.py +19 -0
  123. contractforge_databricks/runtime/deploy.py +197 -0
  124. contractforge_databricks/runtime/detection.py +114 -0
  125. contractforge_databricks/runtime/dry_run.py +46 -0
  126. contractforge_databricks/runtime/errors.py +54 -0
  127. contractforge_databricks/runtime/file_selection.py +109 -0
  128. contractforge_databricks/runtime/finalization.py +168 -0
  129. contractforge_databricks/runtime/governance.py +37 -0
  130. contractforge_databricks/runtime/hooks.py +45 -0
  131. contractforge_databricks/runtime/http_file.py +37 -0
  132. contractforge_databricks/runtime/http_retry.py +15 -0
  133. contractforge_databricks/runtime/http_safety.py +9 -0
  134. contractforge_databricks/runtime/json_materialization.py +97 -0
  135. contractforge_databricks/runtime/lineage.py +164 -0
  136. contractforge_databricks/runtime/maintenance.py +43 -0
  137. contractforge_databricks/runtime/merge_validation.py +98 -0
  138. contractforge_databricks/runtime/metadata.py +21 -0
  139. contractforge_databricks/runtime/metrics.py +34 -0
  140. contractforge_databricks/runtime/models.py +32 -0
  141. contractforge_databricks/runtime/options.py +33 -0
  142. contractforge_databricks/runtime/orchestration_context.py +185 -0
  143. contractforge_databricks/runtime/orchestrator.py +147 -0
  144. contractforge_databricks/runtime/partitioning.py +93 -0
  145. contractforge_databricks/runtime/quality_quarantine.py +92 -0
  146. contractforge_databricks/runtime/rest_api.py +46 -0
  147. contractforge_databricks/runtime/rest_auth.py +21 -0
  148. contractforge_databricks/runtime/rest_pagination.py +21 -0
  149. contractforge_databricks/runtime/run_payload.py +177 -0
  150. contractforge_databricks/runtime/schema.py +106 -0
  151. contractforge_databricks/runtime/source_metadata.py +30 -0
  152. contractforge_databricks/runtime/source_registry.py +43 -0
  153. contractforge_databricks/runtime/source_schema.py +24 -0
  154. contractforge_databricks/runtime/sources.py +208 -0
  155. contractforge_databricks/runtime/spark.py +183 -0
  156. contractforge_databricks/runtime/spark_defaults.py +35 -0
  157. contractforge_databricks/runtime/storage_auth.py +132 -0
  158. contractforge_databricks/runtime/streaming.py +131 -0
  159. contractforge_databricks/runtime/success.py +104 -0
  160. contractforge_databricks/runtime/utils.py +52 -0
  161. contractforge_databricks/runtime/watermark.py +71 -0
  162. contractforge_databricks/runtime/windows.py +184 -0
  163. contractforge_databricks/runtime/write.py +66 -0
  164. contractforge_databricks/runtime/write_flow.py +146 -0
  165. contractforge_databricks/runtime/write_strategy.py +40 -0
  166. contractforge_databricks/schema/__init__.py +21 -0
  167. contractforge_databricks/schema/diff.py +11 -0
  168. contractforge_databricks/schema/policy.py +33 -0
  169. contractforge_databricks/schema/sync.py +23 -0
  170. contractforge_databricks/security/__init__.py +21 -0
  171. contractforge_databricks/security/errors.py +5 -0
  172. contractforge_databricks/security/redaction.py +5 -0
  173. contractforge_databricks/security/secrets.py +114 -0
  174. contractforge_databricks/security/source_policy.py +17 -0
  175. contractforge_databricks/shapes/__init__.py +3 -0
  176. contractforge_databricks/shapes/sql.py +123 -0
  177. contractforge_databricks/sources/__init__.py +67 -0
  178. contractforge_databricks/sources/artifacts.py +100 -0
  179. contractforge_databricks/sources/autoloader.py +48 -0
  180. contractforge_databricks/sources/bounded_streams.py +44 -0
  181. contractforge_databricks/sources/classification.py +115 -0
  182. contractforge_databricks/sources/delta_share.py +21 -0
  183. contractforge_databricks/sources/files.py +48 -0
  184. contractforge_databricks/sources/http_file.py +46 -0
  185. contractforge_databricks/sources/interpret.py +76 -0
  186. contractforge_databricks/sources/jdbc.py +32 -0
  187. contractforge_databricks/sources/metadata.py +18 -0
  188. contractforge_databricks/sources/native_passthrough.py +33 -0
  189. contractforge_databricks/sources/rds_iam.py +15 -0
  190. contractforge_databricks/sources/rds_iam_runtime.py +191 -0
  191. contractforge_databricks/sources/rest_api.py +33 -0
  192. contractforge_databricks/sources/support.py +50 -0
  193. contractforge_databricks/sources/table_refs.py +65 -0
  194. contractforge_databricks/sql/__init__.py +4 -0
  195. contractforge_databricks/sql/identifiers.py +17 -0
  196. contractforge_databricks/sql/literals.py +36 -0
  197. contractforge_databricks/state/__init__.py +39 -0
  198. contractforge_databricks/state/ddl.py +24 -0
  199. contractforge_databricks/state/migrations.py +146 -0
  200. contractforge_databricks/state/queries.py +149 -0
  201. contractforge_databricks/state/sql.py +116 -0
  202. contractforge_databricks/state/tables.py +9 -0
  203. contractforge_databricks/state/writer.py +83 -0
  204. contractforge_databricks/templates/__init__.py +15 -0
  205. contractforge_databricks/templates/catalog.py +205 -0
  206. contractforge_databricks/templates/catalog_parity.py +85 -0
  207. contractforge_databricks/templates/core.py +83 -0
  208. contractforge_databricks/templates/enrichment.py +175 -0
  209. contractforge_databricks/transforms/__init__.py +3 -0
  210. contractforge_databricks/transforms/sql.py +118 -0
  211. contractforge_databricks/watermark/__init__.py +6 -0
  212. contractforge_databricks/watermark/sql.py +91 -0
  213. contractforge_databricks/write_modes/__init__.py +20 -0
  214. contractforge_databricks/write_modes/registry.py +44 -0
  215. contractforge_databricks/write_modes/sql.py +33 -0
  216. contractforge_databricks/write_modes/strategy.py +192 -0
  217. contractforge_databricks-0.1.0.dist-info/METADATA +34 -0
  218. contractforge_databricks-0.1.0.dist-info/RECORD +220 -0
  219. contractforge_databricks-0.1.0.dist-info/WHEEL +4 -0
  220. contractforge_databricks-0.1.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,209 @@
1
+ """Optional PySpark execution for declarative shape intent."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from contractforge_databricks.preparation.shape_validation import (
8
+ CARDINALITY_CHANGING_MODES,
9
+ as_dict,
10
+ as_list,
11
+ data_type_at_path,
12
+ flatten_config,
13
+ is_excluded,
14
+ path_col,
15
+ validate_cardinality_policy,
16
+ validate_cartesian_arrays,
17
+ validate_columns,
18
+ )
19
+ from contractforge_databricks.preparation.zip_arrays import apply_zip_arrays
20
+
21
+
22
+ def apply_shape(df: Any, shape: dict[str, Any] | None, *, layer: str = "silver") -> Any:
23
+ """Apply portable shape intent with PySpark DataFrame operations."""
24
+
25
+ if not shape:
26
+ return df
27
+ validate_cardinality_policy(shape, layer)
28
+ validate_cartesian_arrays(shape)
29
+ df = _apply_parse_json(df, as_list(shape.get("parse_json")))
30
+ df = apply_zip_arrays(df, as_list(shape.get("zip_arrays")))
31
+ df = _apply_arrays(df, as_list(shape.get("arrays")))
32
+ df = _drop_shape_intermediates(df, shape)
33
+ df = _apply_columns(df, as_dict(shape.get("columns")))
34
+ return _apply_flatten(df, shape.get("flatten"))
35
+
36
+
37
def _apply_parse_json(df: Any, configs: list[dict[str, Any]]) -> Any:
    """Parse JSON columns with ``from_json`` as described by each config.

    Each config names a ``column`` and a ``schema``; it may also carry
    ``cast_input`` (only ``STRING`` is accepted), an ``alias`` for the parsed
    column and ``drop_source`` to remove the input column when the alias
    differs from it.

    Raises:
        ValueError: on an unsupported ``cast_input``, a source column whose
            known type is not string (without a cast), or a missing schema.
    """

    from pyspark.sql import functions as F
    from pyspark.sql.types import StringType

    result = df
    for entry in configs:
        column = str(entry["column"])
        validate_columns(result, {column: True}, "shape.parse_json")
        requested_cast = str(entry.get("cast_input") or "").strip().upper()
        json_source = path_col(F, column)
        if requested_cast == "STRING":
            json_source = json_source.cast("string")
        elif requested_cast:
            raise ValueError(f"shape.parse_json.{column}.cast_input={requested_cast!r} is not supported")
        else:
            # Without an explicit cast the column must already be a string
            # whenever its type can be resolved from the schema.
            declared_type = data_type_at_path(getattr(result, "schema", None), column)
            type_is_known = declared_type is not None
            if type_is_known and not isinstance(declared_type, StringType):
                raise ValueError(
                    f"shape.parse_json.{column} must be string;"
                    " declare cast_input: STRING to coerce a binary/non-string source column"
                )
        json_schema = entry.get("schema")
        if not json_schema:
            raise ValueError("shape.parse_json requires schema for runtime execution")
        target = str(entry.get("alias") or column)
        result = result.withColumn(target, F.from_json(json_source, str(json_schema)))
        if target != column and entry.get("drop_source"):
            result = result.drop(column)
    return result
65
+
66
+
67
def _apply_arrays(df: Any, arrays: list[dict[str, Any]]) -> Any:
    """Apply every non-``keep`` array config, deferring those not yet resolvable.

    A config is applied once the top-level segment of its path is a column of
    the current frame. Configs whose root column does not exist yet are
    retried on the next pass, because an earlier config may create it.

    Raises:
        ValueError: when a full pass applies nothing and configs remain.
    """

    from pyspark.sql import functions as F
    from pyspark.sql.types import ArrayType

    queue = [entry for entry in arrays if str(entry.get("mode", "keep")) != "keep"]
    while queue:
        deferred = []
        applied_any = False
        for entry in queue:
            root = str(entry["path"]).partition(".")[0]
            # Re-read the columns each time: the frame changes as configs apply.
            visible = set(getattr(df, "columns", ()) or ())
            if root in visible:
                df = _apply_array_config(df, entry, F, ArrayType)
                applied_any = True
            else:
                deferred.append(entry)
        if not applied_any:
            unresolved = [str(entry["path"]) for entry in deferred]
            raise ValueError(f"shape.arrays contains unresolved paths: {unresolved}")
        queue = deferred
    return df
87
+
88
+
89
def _apply_array_config(df: Any, config: dict[str, Any], functions: Any, array_type: Any) -> Any:
    """Apply a single ``shape.arrays`` entry and return the updated frame.

    Args:
        df: Frame to transform.
        config: Array config with ``path``, optional ``mode`` and ``alias``.
        functions: The ``pyspark.sql.functions`` module (injected by the caller).
        array_type: The ``ArrayType`` class used to verify the column type.

    Returns:
        ``df`` with the computed expression stored under the alias, which
        defaults to the path with dots replaced by underscores.

    Raises:
        ValueError: if the column's known type is not an array or the mode
            is not one of the supported modes.
    """

    mode = str(config.get("mode", "keep"))
    path = str(config["path"])
    data_type = data_type_at_path(getattr(df, "schema", None), path)
    if data_type is not None and not isinstance(data_type, array_type):
        raise ValueError(f"shape.arrays.{path} must be array")
    alias = str(config.get("alias") or path.replace(".", "_"))
    column = path_col(functions, path)
    if mode == "to_json":
        expr = functions.to_json(column)
    elif mode == "size":
        expr = functions.size(column)
    elif mode == "first":
        # element_at raises for an out-of-range index when ANSI mode is
        # enabled, so an empty array would fail the job. Guarding on the size
        # yields NULL for empty (and NULL) arrays regardless of the ANSI
        # setting, which matches the non-ANSI result.
        expr = functions.when(functions.size(column) > 0, functions.element_at(column, 1))
    elif mode == "explode":
        expr = functions.explode(column)
    elif mode == "explode_outer":
        expr = functions.explode_outer(column)
    else:
        raise ValueError(f"shape.arrays mode {mode!r} is not supported")
    return df.withColumn(alias, expr)
110
+
111
+
112
def _apply_columns(df: Any, columns: dict[str, Any]) -> Any:
    """Project ``df`` down to the columns declared in ``shape.columns``.

    Args:
        df: Frame to project.
        columns: Mapping of source path to either an alias string or an
            options mapping with optional ``alias``, ``expression`` and
            ``cast`` keys. A value that is neither (for example ``None`` from
            a bare YAML key) is treated as "no options": the path is selected
            under its default alias.

    Returns:
        ``df`` unchanged when ``columns`` is empty, otherwise the result of
        selecting one aliased expression per entry.
    """

    if not columns:
        return df
    from pyspark.sql import functions as F

    projected = []
    for path, config in columns.items():
        source_path = str(path)
        if isinstance(config, str):
            # Shorthand form: the value is the output alias.
            projected.append(path_col(F, source_path).alias(config))
            continue
        # Anything without a mapping-style ``get`` carries no options; this
        # avoids an AttributeError on ``None`` and similar values.
        options = config if hasattr(config, "get") else {}
        alias = str(options.get("alias") or source_path.replace(".", "_"))
        expression = options.get("expression")
        expr = F.expr(str(expression)) if expression else path_col(F, source_path)
        cast = options.get("cast")
        if cast:
            expr = expr.cast(str(cast))
        projected.append(expr.alias(alias))
    return df.select(*projected)
129
+
130
+
131
def _apply_flatten(df: Any, flatten: object) -> Any:
    """Flatten struct columns into aliased leaf columns when enabled.

    Fields outside a non-empty ``include`` list, and excluded fields, pass
    through untouched. Struct fields are expanded into their leaf paths;
    leaves that are excluded or whose alias matches an existing top-level
    column are skipped.

    Raises:
        ValueError: if two projections would share the same output name.
    """

    config = flatten_config(flatten)
    if not config["enabled"]:
        return df
    from pyspark.sql.types import StructType
    from pyspark.sql import functions as F

    projections = []
    aliases = set()
    top_level = set(getattr(df, "columns", ()) or ())
    separator = str(config["separator"])
    max_depth = int(config["max_depth"])
    include = set(config["include"])
    exclude = set(config["exclude"])

    def keep_as_is(name: str) -> None:
        # Select the column under its own name and remember the name as taken.
        projections.append(path_col(F, name).alias(name))
        aliases.add(name)

    schema_fields = getattr(getattr(df, "schema", None), "fields", ())
    for field in schema_fields:
        name = field.name
        not_selected = bool(include) and name not in include
        if not_selected or is_excluded(name, exclude):
            keep_as_is(name)
            continue
        if not isinstance(field.dataType, StructType):
            if name in aliases:
                raise ValueError(f"shape.flatten would create duplicate column: {name}")
            keep_as_is(name)
            continue
        for path, alias in _struct_leaf_paths(field.dataType, name, separator, max_depth=max_depth):
            if is_excluded(path, exclude) or alias in top_level:
                continue
            if alias in aliases:
                raise ValueError(f"shape.flatten would create duplicate column: {alias}")
            projections.append(path_col(F, path).alias(alias))
            aliases.add(alias)
    if not projections:
        return df
    return df.select(*projections)
170
+
171
+
172
def _drop_shape_intermediates(df: Any, shape: dict[str, Any]) -> Any:
    """Drop zipped or exploded columns that only feed a later array step.

    Nothing is dropped when ``shape.columns`` is declared. Otherwise a
    zip alias, or the alias of an exploding array config, is dropped when
    another array path equals it or is nested under it and the column exists
    on ``df``.
    """

    if shape.get("columns"):
        return df
    arrays = as_list(shape.get("arrays"))
    array_paths = [str(item["path"]) for item in arrays]

    zip_aliases = set()
    for config in as_list(shape.get("zip_arrays")):
        for path in array_paths:
            alias = str(config["alias"])
            if path == alias or path.startswith(f"{alias}."):
                zip_aliases.add(alias)
                break

    exploded_aliases = set()
    for item in arrays:
        if item.get("mode") not in CARDINALITY_CHANGING_MODES:
            continue
        own_path = str(item["path"])
        alias = str(item.get("alias") or own_path.replace(".", "_"))
        nested_prefix = f"{alias}."
        # The config's own path does not count as a consumer of its alias.
        consumed = any(
            path != own_path and (path == alias or path.startswith(nested_prefix))
            for path in array_paths
        )
        if consumed:
            exploded_aliases.add(alias)

    present = set(getattr(df, "columns", ()) or ())
    to_drop = sorted((zip_aliases | exploded_aliases) & present)
    if not to_drop:
        return df
    return df.drop(*to_drop)
197
+
198
+
199
def _struct_leaf_paths(struct: Any, prefix: str, separator: str, *, max_depth: int, depth: int = 1) -> list[tuple[str, str]]:
    """Return ``(dotted_path, alias)`` pairs for the leaves under ``struct``.

    Nested structs are descended into while ``depth`` is below ``max_depth``;
    beyond that a struct is reported as a leaf itself. The alias is the
    dotted path with every dot replaced by ``separator``.
    """

    from pyspark.sql.types import StructType

    may_descend = depth < max_depth
    collected = []
    for child in struct.fields:
        child_path = f"{prefix}.{child.name}"
        if may_descend and isinstance(child.dataType, StructType):
            collected += _struct_leaf_paths(
                child.dataType, child_path, separator, max_depth=max_depth, depth=depth + 1
            )
            continue
        collected.append((child_path, child_path.replace(".", separator)))
    return collected
@@ -0,0 +1,94 @@
1
+ """Validation helpers for Databricks shape execution."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ CARDINALITY_CHANGING_MODES = frozenset({"explode", "explode_outer"})
8
+
9
+
10
def validate_cardinality_policy(shape: dict[str, Any], layer: str) -> None:
    """Reject exploding array modes on the bronze layer unless explicitly allowed.

    Raises:
        ValueError: if ``layer`` is ``"bronze"``, the shape does not set
            ``allow_cardinality_change_on_bronze`` and at least one array
            config uses a cardinality-changing mode.
    """

    if layer != "bronze":
        return
    if shape.get("allow_cardinality_change_on_bronze"):
        return
    changing = []
    for item in as_list(shape.get("arrays")):
        if item.get("mode") in CARDINALITY_CHANGING_MODES:
            changing.append(item["path"])
    if changing:
        raise ValueError(f"shape cardinality change is blocked in bronze by default: {changing}")
16
+
17
+
18
def validate_cartesian_arrays(shape: dict[str, Any]) -> None:
    """Reject sibling explodes that have not opted in to a cartesian product.

    Exploding array configs are grouped by the parent of their path. A group
    with more than one member is a conflict when any member lacks
    ``allow_cartesian``; only the members lacking it are reported.

    Raises:
        ValueError: listing the offending paths per parent.
    """

    siblings: dict[str, list[dict[str, Any]]] = {}
    for item in as_list(shape.get("arrays")):
        if item.get("mode") in CARDINALITY_CHANGING_MODES:
            siblings.setdefault(parent_path(str(item["path"])), []).append(item)

    conflicts = {}
    for parent, items in siblings.items():
        if len(items) < 2:
            continue
        unapproved = [str(item["path"]) for item in items if not item.get("allow_cartesian")]
        if unapproved:
            conflicts[parent] = unapproved
    if conflicts:
        raise ValueError(f"shape contains sibling explodes that may create a cartesian product: {conflicts}")
31
+
32
+
33
def validate_columns(df: Any, columns: dict[str, Any], context: str) -> None:
    """Ensure the top-level segment of every referenced path is a column of ``df``.

    Raises:
        ValueError: naming ``context`` and the sorted list of missing paths.
    """

    present = set(getattr(df, "columns", ()) or ())
    absent = []
    for column in columns:
        name = str(column)
        # Only the root of a dotted path has to exist as a frame column.
        if name.split(".", 1)[0] not in present:
            absent.append(name)
    absent.sort()
    if absent:
        raise ValueError(f"{context} references missing columns: {absent}")
38
+
39
+
40
def data_type_at_path(schema: Any, path: str) -> Any | None:
    """Resolve the data type found at a dotted ``path`` inside ``schema``.

    Arrays met along the way are looked through to their element type.
    Returns ``None`` when a segment cannot be resolved, either because the
    current node is not a struct or because no field has that name.
    """

    from pyspark.sql.types import ArrayType, StructType

    node = schema
    for segment in path.split("."):
        if isinstance(node, ArrayType):
            node = node.elementType
        if not isinstance(node, StructType):
            return None
        for candidate in node.fields:
            if candidate.name == segment:
                node = candidate.dataType
                break
        else:
            # No field with this name at the current level.
            return None
    return node
54
+
55
+
56
def flatten_config(flatten: object) -> dict[str, Any]:
    """Normalise the ``shape.flatten`` setting into a complete options dict.

    A boolean toggles flattening with the defaults (separator ``"_"``, depth
    10, no include/exclude lists). Any other value is read as a mapping;
    flattening stays disabled unless it sets ``enabled``.
    """

    if isinstance(flatten, bool):
        return {"enabled": flatten, "separator": "_", "max_depth": 10, "include": [], "exclude": []}
    options = as_dict(flatten)
    resolved: dict[str, Any] = {}
    resolved["enabled"] = bool(options.get("enabled", False))
    resolved["separator"] = options.get("separator", "_")
    resolved["max_depth"] = options.get("max_depth", 10)
    resolved["include"] = string_list(options.get("include"))
    resolved["exclude"] = string_list(options.get("exclude"))
    return resolved
67
+
68
+
69
def path_col(functions: Any, path: str) -> Any:
    """Build a column reference for a dotted ``path`` with every segment quoted.

    Args:
        functions: Object exposing ``col`` (the ``pyspark.sql.functions`` module).
        path: Dotted path such as ``"payload.items"``.

    Returns:
        Whatever ``functions.col`` returns for the backtick-quoted path.
    """

    # A backtick inside a segment is doubled so it cannot end the quoted
    # identifier early and produce a malformed column name.
    quoted = ("`" + part.replace("`", "``") + "`" for part in path.split("."))
    return functions.col(".".join(quoted))
71
+
72
+
73
def parent_path(path: str) -> str:
    """Return ``path`` without its last dotted segment (empty if there is none)."""

    parent, _, _ = path.rpartition(".")
    return parent
75
+
76
+
77
def is_excluded(path: str, exclude: set[str]) -> bool:
    """Tell whether ``path`` is listed in ``exclude`` or nested under a listed path."""

    if path in exclude:
        return True
    for item in exclude:
        # The trailing dot keeps "ab" from matching an excluded "a".
        if path.startswith(f"{item}."):
            return True
    return False
79
+
80
+
81
def as_dict(value: object) -> dict[str, Any]:
    """Return a shallow copy of ``value`` if it is a dict, otherwise an empty dict."""

    if not isinstance(value, dict):
        return {}
    return dict(value)
83
+
84
+
85
def as_list(value: object) -> list[dict[str, Any]]:
    """Return shallow copies of the dict items in ``value``, skipping everything else.

    A falsy ``value`` (``None``, empty list, ...) gives an empty list.
    """

    copies = []
    for item in value or ():
        if isinstance(item, dict):
            copies.append(dict(item))
    return copies
87
+
88
+
89
def string_list(value: object) -> list[str]:
    """Coerce ``value`` into a list of strings.

    ``None`` gives an empty list, a single string gives a one-element list,
    and any other value is iterated with each item converted via ``str``.
    """

    if isinstance(value, str):
        return [value]
    if value is None:
        return []
    converted = []
    for item in value:  # type: ignore[union-attr]
        converted.append(str(item))
    return converted
@@ -0,0 +1,17 @@
1
+ """Compatibility exports for platform-neutral staging specifications."""
2
+
3
+ from contractforge_core.preparation import (
4
+ HashDiffStageSpec,
5
+ SCD2StageSpec,
6
+ SnapshotStageSpec,
7
+ scd2_stage_spec_from_contract,
8
+ snapshot_stage_spec_from_contract,
9
+ )
10
+
11
+ __all__ = [
12
+ "HashDiffStageSpec",
13
+ "SCD2StageSpec",
14
+ "SnapshotStageSpec",
15
+ "scd2_stage_spec_from_contract",
16
+ "snapshot_stage_spec_from_contract",
17
+ ]
@@ -0,0 +1,51 @@
1
+ """PySpark execution for shape.zip_arrays."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from contractforge_databricks.preparation.shape_validation import as_dict, data_type_at_path, path_col, validate_columns
8
+
9
+
10
def apply_zip_arrays(df: Any, configs: list[dict[str, Any]]) -> Any:
    """Zip parallel arrays and rename struct fields according to the contract.

    Args:
        df: Frame holding the source array columns.
        configs: One entry per zipped output, each with an ``alias`` and a
            ``columns`` mapping of source array path to output field name.

    Returns:
        ``df`` with one array-of-struct column added per config.

    Raises:
        ValueError: if an alias collides with an existing or earlier zipped
            column, ``columns`` is empty, or a source path has a known type
            that is not an array.
    """

    from pyspark.sql import functions as F
    from pyspark.sql.types import ArrayType

    aliases = set(getattr(df, "columns", ()) or ())
    for config_idx, config in enumerate(configs):
        alias = str(config["alias"])
        if alias in aliases:
            raise ValueError(f"shape.zip_arrays would collide with existing column: {alias}")
        columns = as_dict(config.get("columns"))
        if not columns:
            raise ValueError("shape.zip_arrays.columns is required")
        validate_columns(df, {path: True for path in columns}, "shape.zip_arrays")
        temp_columns = []
        for raw_path, field_alias in columns.items():
            # Normalise the key once so the type lookup and the column
            # reference agree; a non-string key would otherwise fail inside
            # the type lookup, which splits the path on ".".
            path = str(raw_path)
            data_type = data_type_at_path(getattr(df, "schema", None), path)
            if data_type is not None and not isinstance(data_type, ArrayType):
                raise ValueError(f"shape.zip_arrays.{path} must be array")
            # Each source array is copied to a uniquely named temporary
            # column, which is what the zipped struct fields are read by below.
            temp = _unique_temp_column(getattr(df, "columns", ()) or (), f"__cf_shape_zip_{config_idx}_{len(temp_columns)}")
            df = df.withColumn(temp, path_col(F, path))
            temp_columns.append((temp, str(field_alias)))

        zipped = F.arrays_zip(*[F.col(temp) for temp, _ in temp_columns])
        renamed = F.transform(
            zipped,
            lambda item: F.struct(*[item.getField(temp).alias(field_alias) for temp, field_alias in temp_columns]),
        )
        df = df.withColumn(alias, renamed).drop(*[temp for temp, _ in temp_columns])
        aliases.add(alias)
    return df
42
+
43
+
44
def _unique_temp_column(columns: object, prefix: str) -> str:
    """Return ``prefix``, or ``prefix_<n>`` with the smallest free ``n``, not in ``columns``."""

    taken = set(columns or ())
    if prefix not in taken:
        return prefix
    suffix = 1
    while f"{prefix}_{suffix}" in taken:
        suffix += 1
    return f"{prefix}_{suffix}"
@@ -0,0 +1,3 @@
1
+ from contractforge_databricks.presets.core import apply_preset, get_preset, list_presets, preset_details, register_preset
2
+
3
+ __all__ = ["apply_preset", "get_preset", "list_presets", "preset_details", "register_preset"]
@@ -0,0 +1,24 @@
1
+ """Shared Databricks preset metadata helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ Preset = dict[str, Any]
8
+ PRESET_META_KEY = "_preset"
9
+
10
+
11
+ def meta(
12
+ name: str,
13
+ category: str,
14
+ kind: str,
15
+ description: str,
16
+ required_fields: list[str] | None = None,
17
+ ) -> dict[str, Any]:
18
+ return {
19
+ "name": name,
20
+ "description": description,
21
+ "category": category,
22
+ "kind": kind,
23
+ "required_fields": list(required_fields or []),
24
+ }
@@ -0,0 +1,57 @@
1
+ """Bronze Databricks presets ported from ContractForge."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from contractforge_databricks.presets.base import PRESET_META_KEY, Preset, meta
6
+
7
def _bronze_defaults(mode: str) -> Preset:
    """Return a fresh dict of the settings every bronze preset shares."""

    return {
        "layer": "bronze",
        "mode": mode,
        "schema_policy": "additive_only",
        "on_quality_fail": "fail",
    }


# Built-in bronze presets keyed by name. Each entry carries its metadata under
# PRESET_META_KEY followed by the contract fields the preset sets.
BRONZE_PRESETS: dict[str, Preset] = {
    "bronze_file_append": {
        PRESET_META_KEY: meta("bronze_file_append", "bronze", "ingestion", "Bronze append for batch files."),
        **_bronze_defaults("scd0_append"),
    },
    "bronze_table_append": {
        PRESET_META_KEY: meta("bronze_table_append", "bronze", "ingestion", "Bronze append for table replication."),
        **_bronze_defaults("scd0_append"),
    },
    "bronze_autoloader_append": {
        PRESET_META_KEY: meta(
            "bronze_autoloader_append",
            "bronze",
            "ingestion",
            "Bronze available-now Auto Loader append.",
            ["source.path", "source.progress_location", "source.schema_tracking_location", "target_table"],
        ),
        "source": {"type": "incremental_files", "trigger": "available_now", "format": "parquet"},
        **_bronze_defaults("scd0_append"),
        "idempotency_policy": "skip_if_success",
    },
    "bronze_full_overwrite": {
        PRESET_META_KEY: meta("bronze_full_overwrite", "bronze", "ingestion", "Bronze full snapshot overwrite."),
        **_bronze_defaults("scd0_overwrite"),
    },
    "bronze_partition_overwrite": {
        PRESET_META_KEY: meta(
            "bronze_partition_overwrite",
            "bronze",
            "ingestion",
            "Bronze overwrite for one controlled partition.",
            ["extensions.databricks.partition_column", "extensions.databricks.partition_value"],
        ),
        **_bronze_defaults("scd0_overwrite"),
    },
}
@@ -0,0 +1,22 @@
1
+ """Databricks-owned preset catalog ported from ContractForge."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from contractforge_databricks.presets.base import PRESET_META_KEY as PRESET_META_KEY, Preset
6
+ from contractforge_databricks.presets.bronze import BRONZE_PRESETS
7
+ from contractforge_databricks.presets.gold import GOLD_PRESETS
8
+ from contractforge_databricks.presets.modifiers import DELTA_PRESETS, GOVERNANCE_PRESETS, QUALITY_PRESETS
9
+ from contractforge_databricks.presets.runtime import RUNTIME_PRESETS
10
+ from contractforge_databricks.presets.silver import SILVER_PRESETS
11
+ from contractforge_databricks.presets.write_engine import WRITE_ENGINE_PRESETS
12
+
13
# All built-in presets merged into one catalog. Groups are merged in the order
# listed, so a name defined in a later group replaces the same name from an
# earlier one.
BUILTIN_PRESETS: dict[str, Preset] = {
    preset_name: preset
    for group in (
        BRONZE_PRESETS,
        SILVER_PRESETS,
        GOLD_PRESETS,
        QUALITY_PRESETS,
        DELTA_PRESETS,
        GOVERNANCE_PRESETS,
        RUNTIME_PRESETS,
        WRITE_ENGINE_PRESETS,
    )
    for preset_name, preset in group.items()
}
@@ -0,0 +1,134 @@
1
+ """Preset helpers for Databricks adapter examples and templates."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from copy import deepcopy
6
+ from typing import Any
7
+
8
+ from contractforge_databricks.presets.catalog import BUILTIN_PRESETS, PRESET_META_KEY, Preset
9
+
10
# Mutable working registry used by the helpers in this module. It starts as a
# deep copy of the built-in catalog, so changes made here never touch
# BUILTIN_PRESETS itself.
PRESETS: dict[str, Preset] = deepcopy(BUILTIN_PRESETS)
11
+
12
+
13
def list_presets() -> list[str]:
    """Return the names of all registered presets in ascending order."""
    names = list(PRESETS)
    names.sort()
    return names
15
+
16
+
17
def get_preset(name: str) -> Preset:
    """Return an independent deep copy of the preset registered as ``name``.

    A copy is handed out so callers can mutate the result without affecting
    the registry.

    Raises:
        ValueError: if no preset is registered under ``name``; the message
            lists the valid preset names.
    """
    if name in PRESETS:
        return deepcopy(PRESETS[name])
    raise ValueError(f"Preset not found: {name}. valid presets: {list_presets()}")
21
+
22
+
23
def register_preset(name: str, preset: Preset, *, override: bool = False) -> None:
    """Store ``preset`` in the module registry under a whitespace-stripped name.

    The preset is deep-copied before it is stored, and its metadata entry is
    completed with defaults for any missing field.

    Args:
        name: Registry key; surrounding whitespace is removed.
        preset: Preset payload; must be a ``dict``.
        override: When true, an existing preset with the same name is replaced.

    Raises:
        ValueError: if the name is empty, ``preset`` is not a dict, or the
            name is already registered and ``override`` is false.
    """
    key = str(name or "").strip()
    if key == "":
        raise ValueError("preset name cannot be empty")
    if not isinstance(preset, dict):
        raise ValueError("preset must be a dict")
    if not override and key in PRESETS:
        raise ValueError(f"Preset already registered: {key}")

    entry = deepcopy(preset)
    info = dict(entry.get(PRESET_META_KEY) or {})
    # Only fill gaps: metadata supplied by the caller is kept as-is.
    fallbacks = {
        "name": key,
        "description": "",
        "category": "custom",
        "kind": "modifier",
        "required_fields": [],
    }
    for field_name, fallback in fallbacks.items():
        info.setdefault(field_name, fallback)
    entry[PRESET_META_KEY] = info
    PRESETS[key] = entry
40
+
41
+
42
def preset_details(name: str) -> dict[str, Any]:
    """Describe a registered preset: its metadata plus the keys it sets.

    Returns:
        A dict with ``name``, ``description``, ``category``, ``kind``,
        ``required_fields`` and ``sets`` (sorted dotted paths of every
        non-metadata leaf key in the preset).

    Raises:
        ValueError: if ``name`` is not a registered preset.
    """
    body = get_preset(name)
    # Detach the metadata so only contract fields remain in ``body``.
    info = dict(body.pop(PRESET_META_KEY, {}))
    required = info.get("required_fields") or []
    assigned = _flatten_keys(body)
    assigned.sort()

    details: dict[str, Any] = {"name": name}
    details["description"] = info.get("description", "")
    details["category"] = info.get("category", "custom")
    details["kind"] = info.get("kind", "modifier")
    details["required_fields"] = list(required)
    details["sets"] = assigned
    return details
53
+
54
+
55
def apply_preset(contract: dict[str, Any]) -> dict[str, Any]:
    """Expand the presets named by ``contract`` into a full contract dict.

    Presets are merged in the order they are listed, then the contract's own
    fields are merged on top so explicit values win. The ``preset`` and
    ``presets`` keys are dropped and replaced by ``applied_presets``.

    Raises:
        ValueError: if a preset name is unknown or empty, if more than one
            preset of an exclusive kind is requested, or if a field required
            by a preset has no value in the merged contract.
    """
    names = _preset_names(contract)

    merged: dict[str, Any] = {}
    collected_meta = []
    for preset_name in names:
        body = get_preset(preset_name)
        collected_meta.append(dict(body.pop(PRESET_META_KEY, {})))
        merged = _deep_merge(merged, body)

    # Work on a copy so the caller's contract is left untouched.
    overrides = _copy(contract)
    for selector_key in ("preset", "presets"):
        overrides.pop(selector_key, None)

    merged = _deep_merge(merged, overrides)
    merged["applied_presets"] = names

    _validate_exclusive(collected_meta)
    _validate_required(merged, collected_meta)
    return merged
71
+
72
+
73
+ def _preset_names(contract: dict[str, Any]) -> list[str]:
74
+ raw = contract.get("preset", contract.get("presets", []))
75
+ if raw is None:
76
+ return []
77
+ values = raw if isinstance(raw, list) else [raw]
78
+ names = [str(item).strip() for item in values]
79
+ if any(not name for name in names):
80
+ raise ValueError("preset cannot contain empty values")
81
+ return names
82
+
83
+
84
def _deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
    """Return a new dict holding ``base`` with ``override`` merged on top.

    When both sides hold a dict under the same key the two are merged
    recursively; in every other case the override value replaces the base
    value. Neither input is modified: the result is built from deep copies.
    """
    merged = deepcopy(base)
    for key, incoming in override.items():
        existing = merged.get(key)
        both_mappings = isinstance(existing, dict) and isinstance(incoming, dict)
        merged[key] = _deep_merge(existing, incoming) if both_mappings else _copy(incoming)
    return merged
92
+
93
+
94
+ def _copy(value: Any) -> Any:
95
+ return deepcopy(value)
96
+
97
+
98
+ def _validate_exclusive(metas: list[dict[str, Any]]) -> None:
99
+ kinds: dict[str, list[str]] = {}
100
+ for meta in metas:
101
+ kinds.setdefault(str(meta.get("kind") or "modifier"), []).append(str(meta.get("name") or "unknown"))
102
+ for kind in ("ingestion", "runtime"):
103
+ if len(kinds.get(kind, [])) > 1:
104
+ raise ValueError(f"Presets of kind {kind} are exclusive; received: {kinds[kind]}")
105
+
106
+
107
def _validate_required(contract: dict[str, Any], metas: list[dict[str, Any]]) -> None:
    """Check that every field required by the applied presets has a value.

    All gaps are collected first so that a single error reports them all, each
    as ``<preset name>:<dotted field path>``.

    Raises:
        ValueError: if at least one required field has no value in ``contract``.
    """
    absent = [
        f"{entry.get('name')}:{required}"
        for entry in metas
        for required in entry.get("required_fields") or []
        if not _has_value(contract, str(required))
    ]
    if absent:
        raise ValueError(f"Missing required fields for presets: {absent}")
115
+
116
+
117
+ def _has_value(contract: dict[str, Any], field_path: str) -> bool:
118
+ current: Any = contract
119
+ for part in field_path.split("."):
120
+ if not isinstance(current, dict) or part not in current:
121
+ return False
122
+ current = current[part]
123
+ return current is not None and (not isinstance(current, (str, list, tuple, dict)) or bool(current))
124
+
125
+
126
+ def _flatten_keys(payload: dict[str, Any], prefix: str = "") -> list[str]:
127
+ keys = []
128
+ for key, value in payload.items():
129
+ path = f"{prefix}.{key}" if prefix else str(key)
130
+ if isinstance(value, dict):
131
+ keys.extend(_flatten_keys(value, path))
132
+ else:
133
+ keys.append(path)
134
+ return keys
@@ -0,0 +1,62 @@
1
+ """Gold Databricks presets ported from ContractForge."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from contractforge_databricks.presets.base import PRESET_META_KEY, Preset, meta
6
+
7
# Gold-layer presets, keyed by preset name. Every entry sets layer "gold",
# schema_policy "strict" and on_quality_fail "fail"; they differ in write mode
# and in the Databricks extension settings they pre-fill.
# NOTE(review): the positional arguments to meta() look like
# (name, category, kind, description, required_fields) -- confirm against
# contractforge_databricks.presets.base.
GOLD_PRESETS: dict[str, Preset] = {
    # Whole-table refresh using mode "scd0_overwrite".
    "gold_full_refresh": {
        PRESET_META_KEY: meta("gold_full_refresh", "gold", "ingestion", "Gold full refresh."),
        "layer": "gold",
        "mode": "scd0_overwrite",
        "schema_policy": "strict",
        "on_quality_fail": "fail",
    },
    # Same write mode as gold_full_refresh, but the metadata additionally lists
    # the Databricks partition column and partition value fields.
    "gold_partition_refresh": {
        PRESET_META_KEY: meta(
            "gold_partition_refresh",
            "gold",
            "ingestion",
            "Gold recalculated by partition.",
            ["extensions.databricks.partition_column", "extensions.databricks.partition_value"],
        ),
        "layer": "gold",
        "mode": "scd0_overwrite",
        "schema_policy": "strict",
        "on_quality_fail": "fail",
    },
    # Mode "scd1_upsert" with the Databricks merge strategy preset to
    # "replace_partitions" and the source flagged as complete.
    "gold_replace_partitions": {
        PRESET_META_KEY: meta(
            "gold_replace_partitions",
            "gold",
            "ingestion",
            "Gold declarative replacement of complete partitions.",
            ["extensions.databricks.merge_partition_column"],
        ),
        "layer": "gold",
        "mode": "scd1_upsert",
        "extensions": {
            "databricks": {
                "merge_strategy": "replace_partitions",
                "replace_partitions_source_complete": True,
            }
        },
        "schema_policy": "strict",
        "on_quality_fail": "fail",
    },
    # Mode "snapshot_soft_delete"; the metadata lists "merge_keys".
    "gold_snapshot_serving": {
        PRESET_META_KEY: meta("gold_snapshot_serving", "gold", "ingestion", "Gold snapshot serving.", ["merge_keys"]),
        "layer": "gold",
        "mode": "snapshot_soft_delete",
        "schema_policy": "strict",
        "on_quality_fail": "fail",
    },
    # Mode "scd1_upsert" with the Databricks merge strategy preset to "delta";
    # the metadata lists "merge_keys".
    "gold_scd1_serving": {
        PRESET_META_KEY: meta("gold_scd1_serving", "gold", "ingestion", "Gold SCD1 serving.", ["merge_keys"]),
        "layer": "gold",
        "mode": "scd1_upsert",
        "extensions": {"databricks": {"merge_strategy": "delta"}},
        "schema_policy": "strict",
        "on_quality_fail": "fail",
    },
}