contractforge-databricks 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. contractforge_databricks/__init__.py +172 -0
  2. contractforge_databricks/adapter.py +69 -0
  3. contractforge_databricks/annotations/__init__.py +10 -0
  4. contractforge_databricks/annotations/application.py +52 -0
  5. contractforge_databricks/annotations/audit.py +49 -0
  6. contractforge_databricks/annotations/sql.py +142 -0
  7. contractforge_databricks/api.py +65 -0
  8. contractforge_databricks/bundles/__init__.py +9 -0
  9. contractforge_databricks/bundles/assets.py +47 -0
  10. contractforge_databricks/bundles/project.py +213 -0
  11. contractforge_databricks/bundles/project_config.py +133 -0
  12. contractforge_databricks/capabilities/__init__.py +17 -0
  13. contractforge_databricks/capabilities/builders.py +43 -0
  14. contractforge_databricks/capabilities/evaluate.py +162 -0
  15. contractforge_databricks/capabilities/mapping.py +36 -0
  16. contractforge_databricks/capabilities/models.py +44 -0
  17. contractforge_databricks/capabilities/runtime.py +111 -0
  18. contractforge_databricks/capabilities/uc.py +47 -0
  19. contractforge_databricks/cli.py +196 -0
  20. contractforge_databricks/cli_deploy.py +98 -0
  21. contractforge_databricks/cli_governance.py +142 -0
  22. contractforge_databricks/cli_io.py +91 -0
  23. contractforge_databricks/cli_maintenance.py +69 -0
  24. contractforge_databricks/coercion.py +31 -0
  25. contractforge_databricks/contract_extensions.py +70 -0
  26. contractforge_databricks/cost/__init__.py +11 -0
  27. contractforge_databricks/cost/model.py +22 -0
  28. contractforge_databricks/cost/report.py +65 -0
  29. contractforge_databricks/cost/sql.py +136 -0
  30. contractforge_databricks/dashboards/__init__.py +15 -0
  31. contractforge_databricks/dashboards/control_tables.py +150 -0
  32. contractforge_databricks/diagnostics/__init__.py +7 -0
  33. contractforge_databricks/diagnostics/explain.py +40 -0
  34. contractforge_databricks/environment.py +53 -0
  35. contractforge_databricks/evidence/__init__.py +98 -0
  36. contractforge_databricks/evidence/ddl.py +35 -0
  37. contractforge_databricks/evidence/governance_log.py +175 -0
  38. contractforge_databricks/evidence/helpers.py +29 -0
  39. contractforge_databricks/evidence/ops_log.py +210 -0
  40. contractforge_databricks/evidence/records.py +27 -0
  41. contractforge_databricks/evidence/run_log.py +74 -0
  42. contractforge_databricks/evidence/schemas.py +7 -0
  43. contractforge_databricks/evidence/sql.py +144 -0
  44. contractforge_databricks/evidence/tables.py +20 -0
  45. contractforge_databricks/evidence/writer.py +118 -0
  46. contractforge_databricks/execution/__init__.py +70 -0
  47. contractforge_databricks/execution/delta_basic.py +57 -0
  48. contractforge_databricks/execution/hash_diff.py +126 -0
  49. contractforge_databricks/execution/hash_diff_latest.py +142 -0
  50. contractforge_databricks/execution/replace_partitions.py +40 -0
  51. contractforge_databricks/execution/results.py +5 -0
  52. contractforge_databricks/execution/retry.py +36 -0
  53. contractforge_databricks/execution/scd2.py +213 -0
  54. contractforge_databricks/execution/scd2_deletes.py +65 -0
  55. contractforge_databricks/execution/scd2_late.py +30 -0
  56. contractforge_databricks/execution/snapshot.py +77 -0
  57. contractforge_databricks/execution/sql_merge.py +85 -0
  58. contractforge_databricks/execution/tables.py +98 -0
  59. contractforge_databricks/execution/windows.py +58 -0
  60. contractforge_databricks/governance/__init__.py +30 -0
  61. contractforge_databricks/governance/access.py +185 -0
  62. contractforge_databricks/governance/application.py +93 -0
  63. contractforge_databricks/governance/drift.py +49 -0
  64. contractforge_databricks/governance/runtime.py +60 -0
  65. contractforge_databricks/governance/sql.py +31 -0
  66. contractforge_databricks/governance/validation.py +135 -0
  67. contractforge_databricks/lakeflow/__init__.py +21 -0
  68. contractforge_databricks/lakeflow/compatibility.py +194 -0
  69. contractforge_databricks/lakeflow/rendering.py +175 -0
  70. contractforge_databricks/lineage/__init__.py +7 -0
  71. contractforge_databricks/lineage/openlineage.py +182 -0
  72. contractforge_databricks/maintenance/__init__.py +27 -0
  73. contractforge_databricks/maintenance/retention.py +90 -0
  74. contractforge_databricks/maintenance/sql.py +68 -0
  75. contractforge_databricks/metrics/__init__.py +19 -0
  76. contractforge_databricks/metrics/history.py +21 -0
  77. contractforge_databricks/metrics/write.py +63 -0
  78. contractforge_databricks/operations/__init__.py +4 -0
  79. contractforge_databricks/operations/application.py +38 -0
  80. contractforge_databricks/operations/sql.py +95 -0
  81. contractforge_databricks/parity/__init__.py +18 -0
  82. contractforge_databricks/parity/catalog.py +59 -0
  83. contractforge_databricks/parity/models.py +7 -0
  84. contractforge_databricks/parity/scenarios.py +111 -0
  85. contractforge_databricks/partitioning/__init__.py +3 -0
  86. contractforge_databricks/partitioning/predicates.py +28 -0
  87. contractforge_databricks/preparation/__init__.py +47 -0
  88. contractforge_databricks/preparation/deduplicate.py +87 -0
  89. contractforge_databricks/preparation/encoding.py +37 -0
  90. contractforge_databricks/preparation/hashing.py +18 -0
  91. contractforge_databricks/preparation/pyspark.py +178 -0
  92. contractforge_databricks/preparation/pyspark_staging.py +70 -0
  93. contractforge_databricks/preparation/shape.py +209 -0
  94. contractforge_databricks/preparation/shape_validation.py +94 -0
  95. contractforge_databricks/preparation/staging.py +17 -0
  96. contractforge_databricks/preparation/zip_arrays.py +51 -0
  97. contractforge_databricks/presets/__init__.py +3 -0
  98. contractforge_databricks/presets/base.py +24 -0
  99. contractforge_databricks/presets/bronze.py +57 -0
  100. contractforge_databricks/presets/catalog.py +22 -0
  101. contractforge_databricks/presets/core.py +134 -0
  102. contractforge_databricks/presets/gold.py +62 -0
  103. contractforge_databricks/presets/modifiers.py +51 -0
  104. contractforge_databricks/presets/runtime.py +22 -0
  105. contractforge_databricks/presets/silver.py +101 -0
  106. contractforge_databricks/presets/write_engine.py +57 -0
  107. contractforge_databricks/quality/__init__.py +41 -0
  108. contractforge_databricks/quality/evaluation.py +178 -0
  109. contractforge_databricks/quality/persistence.py +81 -0
  110. contractforge_databricks/quality/registry.py +134 -0
  111. contractforge_databricks/quality/results.py +17 -0
  112. contractforge_databricks/quality/sql.py +113 -0
  113. contractforge_databricks/rendering/__init__.py +11 -0
  114. contractforge_databricks/rendering/bundle.py +93 -0
  115. contractforge_databricks/rendering/markdown.py +50 -0
  116. contractforge_databricks/rendering/names.py +56 -0
  117. contractforge_databricks/results.py +15 -0
  118. contractforge_databricks/runtime/__init__.py +101 -0
  119. contractforge_databricks/runtime/available_now.py +147 -0
  120. contractforge_databricks/runtime/bundles.py +211 -0
  121. contractforge_databricks/runtime/cache.py +20 -0
  122. contractforge_databricks/runtime/control_tables.py +19 -0
  123. contractforge_databricks/runtime/deploy.py +197 -0
  124. contractforge_databricks/runtime/detection.py +114 -0
  125. contractforge_databricks/runtime/dry_run.py +46 -0
  126. contractforge_databricks/runtime/errors.py +54 -0
  127. contractforge_databricks/runtime/file_selection.py +109 -0
  128. contractforge_databricks/runtime/finalization.py +168 -0
  129. contractforge_databricks/runtime/governance.py +37 -0
  130. contractforge_databricks/runtime/hooks.py +45 -0
  131. contractforge_databricks/runtime/http_file.py +37 -0
  132. contractforge_databricks/runtime/http_retry.py +15 -0
  133. contractforge_databricks/runtime/http_safety.py +9 -0
  134. contractforge_databricks/runtime/json_materialization.py +97 -0
  135. contractforge_databricks/runtime/lineage.py +164 -0
  136. contractforge_databricks/runtime/maintenance.py +43 -0
  137. contractforge_databricks/runtime/merge_validation.py +98 -0
  138. contractforge_databricks/runtime/metadata.py +21 -0
  139. contractforge_databricks/runtime/metrics.py +34 -0
  140. contractforge_databricks/runtime/models.py +32 -0
  141. contractforge_databricks/runtime/options.py +33 -0
  142. contractforge_databricks/runtime/orchestration_context.py +185 -0
  143. contractforge_databricks/runtime/orchestrator.py +147 -0
  144. contractforge_databricks/runtime/partitioning.py +93 -0
  145. contractforge_databricks/runtime/quality_quarantine.py +92 -0
  146. contractforge_databricks/runtime/rest_api.py +46 -0
  147. contractforge_databricks/runtime/rest_auth.py +21 -0
  148. contractforge_databricks/runtime/rest_pagination.py +21 -0
  149. contractforge_databricks/runtime/run_payload.py +177 -0
  150. contractforge_databricks/runtime/schema.py +106 -0
  151. contractforge_databricks/runtime/source_metadata.py +30 -0
  152. contractforge_databricks/runtime/source_registry.py +43 -0
  153. contractforge_databricks/runtime/source_schema.py +24 -0
  154. contractforge_databricks/runtime/sources.py +208 -0
  155. contractforge_databricks/runtime/spark.py +183 -0
  156. contractforge_databricks/runtime/spark_defaults.py +35 -0
  157. contractforge_databricks/runtime/storage_auth.py +132 -0
  158. contractforge_databricks/runtime/streaming.py +131 -0
  159. contractforge_databricks/runtime/success.py +104 -0
  160. contractforge_databricks/runtime/utils.py +52 -0
  161. contractforge_databricks/runtime/watermark.py +71 -0
  162. contractforge_databricks/runtime/windows.py +184 -0
  163. contractforge_databricks/runtime/write.py +66 -0
  164. contractforge_databricks/runtime/write_flow.py +146 -0
  165. contractforge_databricks/runtime/write_strategy.py +40 -0
  166. contractforge_databricks/schema/__init__.py +21 -0
  167. contractforge_databricks/schema/diff.py +11 -0
  168. contractforge_databricks/schema/policy.py +33 -0
  169. contractforge_databricks/schema/sync.py +23 -0
  170. contractforge_databricks/security/__init__.py +21 -0
  171. contractforge_databricks/security/errors.py +5 -0
  172. contractforge_databricks/security/redaction.py +5 -0
  173. contractforge_databricks/security/secrets.py +114 -0
  174. contractforge_databricks/security/source_policy.py +17 -0
  175. contractforge_databricks/shapes/__init__.py +3 -0
  176. contractforge_databricks/shapes/sql.py +123 -0
  177. contractforge_databricks/sources/__init__.py +67 -0
  178. contractforge_databricks/sources/artifacts.py +100 -0
  179. contractforge_databricks/sources/autoloader.py +48 -0
  180. contractforge_databricks/sources/bounded_streams.py +44 -0
  181. contractforge_databricks/sources/classification.py +115 -0
  182. contractforge_databricks/sources/delta_share.py +21 -0
  183. contractforge_databricks/sources/files.py +48 -0
  184. contractforge_databricks/sources/http_file.py +46 -0
  185. contractforge_databricks/sources/interpret.py +76 -0
  186. contractforge_databricks/sources/jdbc.py +32 -0
  187. contractforge_databricks/sources/metadata.py +18 -0
  188. contractforge_databricks/sources/native_passthrough.py +33 -0
  189. contractforge_databricks/sources/rds_iam.py +15 -0
  190. contractforge_databricks/sources/rds_iam_runtime.py +191 -0
  191. contractforge_databricks/sources/rest_api.py +33 -0
  192. contractforge_databricks/sources/support.py +50 -0
  193. contractforge_databricks/sources/table_refs.py +65 -0
  194. contractforge_databricks/sql/__init__.py +4 -0
  195. contractforge_databricks/sql/identifiers.py +17 -0
  196. contractforge_databricks/sql/literals.py +36 -0
  197. contractforge_databricks/state/__init__.py +39 -0
  198. contractforge_databricks/state/ddl.py +24 -0
  199. contractforge_databricks/state/migrations.py +146 -0
  200. contractforge_databricks/state/queries.py +149 -0
  201. contractforge_databricks/state/sql.py +116 -0
  202. contractforge_databricks/state/tables.py +9 -0
  203. contractforge_databricks/state/writer.py +83 -0
  204. contractforge_databricks/templates/__init__.py +15 -0
  205. contractforge_databricks/templates/catalog.py +205 -0
  206. contractforge_databricks/templates/catalog_parity.py +85 -0
  207. contractforge_databricks/templates/core.py +83 -0
  208. contractforge_databricks/templates/enrichment.py +175 -0
  209. contractforge_databricks/transforms/__init__.py +3 -0
  210. contractforge_databricks/transforms/sql.py +118 -0
  211. contractforge_databricks/watermark/__init__.py +6 -0
  212. contractforge_databricks/watermark/sql.py +91 -0
  213. contractforge_databricks/write_modes/__init__.py +20 -0
  214. contractforge_databricks/write_modes/registry.py +44 -0
  215. contractforge_databricks/write_modes/sql.py +33 -0
  216. contractforge_databricks/write_modes/strategy.py +192 -0
  217. contractforge_databricks-0.1.0.dist-info/METADATA +34 -0
  218. contractforge_databricks-0.1.0.dist-info/RECORD +220 -0
  219. contractforge_databricks-0.1.0.dist-info/WHEEL +4 -0
  220. contractforge_databricks-0.1.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,51 @@
1
+ """Databricks modifier presets ported from ContractForge."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from contractforge_databricks.presets.base import PRESET_META_KEY, Preset, meta
6
+
7
# Modifier presets for Delta table behaviour, keyed by preset name.
# Each entry carries its metadata under PRESET_META_KEY; where present, the
# "extensions" -> "databricks" mapping holds the settings the preset contributes.
DELTA_PRESETS: dict[str, Preset] = {
    # Sets the delta.enableChangeDataFeed table property to "true".
    "delta_cdf_enabled": {
        PRESET_META_KEY: meta("delta_cdf_enabled", "delta", "modifier", "Enable Delta Change Data Feed."),
        "extensions": {"databricks": {"delta_properties": {"delta.enableChangeDataFeed": "true"}}},
    },
    # Sets both delta.autoOptimize.* table properties to "true".
    "delta_optimized_writes": {
        PRESET_META_KEY: meta("delta_optimized_writes", "delta", "modifier", "Optimized Delta write properties."),
        "extensions": {
            "databricks": {
                "delta_properties": {
                    "delta.autoOptimize.optimizeWrite": "true",
                    "delta.autoOptimize.autoCompact": "true",
                }
            }
        },
    },
    # Metadata-only preset: it contributes no settings of its own.
    # NOTE(review): the trailing list passed to meta() presumably names fields the
    # contract must supply (here extensions.databricks.cluster_columns) -- confirm
    # against meta() in presets.base.
    "delta_liquid_clustering": {
        PRESET_META_KEY: meta(
            "delta_liquid_clustering",
            "delta",
            "modifier",
            "Databricks Delta liquid clustering.",
            ["extensions.databricks.cluster_columns"],
        )
    },
}
33
+
34
# Modifier presets that only choose the "on_quality_fail" policy value.
QUALITY_PRESETS: dict[str, Preset] = {
    # on_quality_fail = "fail"
    "quality_strict": {
        PRESET_META_KEY: meta("quality_strict", "quality", "modifier", "Abortive quality policy."),
        "on_quality_fail": "fail",
    },
    # on_quality_fail = "quarantine"
    "quality_quarantine": {
        PRESET_META_KEY: meta("quality_quarantine", "quality", "modifier", "Quality quarantine policy."),
        "on_quality_fail": "quarantine",
    },
}
44
+
45
# Governance modifier presets, keyed by preset name.
GOVERNANCE_PRESETS: dict[str, Preset] = {
    # Sets the annotations policy to "warn" and the access policy to
    # mode "validate_only" with on_drift "warn".
    "governance_uc_basic": {
        PRESET_META_KEY: meta("governance_uc_basic", "governance", "modifier", "Basic Unity Catalog governance."),
        "annotations": {"policy": "warn"},
        "access": {"access_policy": {"mode": "validate_only", "on_drift": "warn"}},
    }
}
@@ -0,0 +1,22 @@
1
+ """Databricks runtime presets ported from ContractForge."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from contractforge_databricks.presets.base import PRESET_META_KEY, Preset, meta
6
+
7
# Runtime presets, keyed by preset name. Both entries only set flags under
# extensions.databricks.
RUNTIME_PRESETS: dict[str, Preset] = {
    # Turns off cache_source and optimize_after_write.
    "runtime_databricks_serverless": {
        PRESET_META_KEY: meta("runtime_databricks_serverless", "runtime", "runtime", "Databricks Serverless defaults."),
        "extensions": {"databricks": {"cache_source": False, "optimize_after_write": False}},
    },
    # Same two flags as the serverless preset, plus lock_enabled = False.
    "runtime_spark_delta_local": {
        PRESET_META_KEY: meta("runtime_spark_delta_local", "runtime", "runtime", "Local PySpark + Delta defaults."),
        "extensions": {
            "databricks": {
                "cache_source": False,
                "optimize_after_write": False,
                "lock_enabled": False,
            }
        },
    },
}
@@ -0,0 +1,101 @@
1
+ """Silver Databricks presets ported from ContractForge."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from contractforge_databricks.presets.base import PRESET_META_KEY, Preset, meta
6
+
7
# Silver-layer ingestion presets, keyed by preset name.
# Every entry sets layer "silver" and schema_policy "additive_only"; they differ in
# "mode", the Databricks merge strategy, and the quality-failure policy.
# NOTE(review): the trailing list passed to meta() presumably names fields the
# contract must supply -- confirm against meta() in presets.base.
SILVER_PRESETS: dict[str, Preset] = {
    # scd1_upsert with merge_strategy "delta".
    "silver_scd1_upsert": {
        PRESET_META_KEY: meta("silver_scd1_upsert", "silver", "ingestion", "Silver SCD1 Delta MERGE.", ["merge_keys"]),
        "layer": "silver",
        "mode": "scd1_upsert",
        "extensions": {"databricks": {"merge_strategy": "delta"}},
        "schema_policy": "additive_only",
        "on_quality_fail": "fail",
    },
    # scd1_upsert with merge_strategy "delta_by_partition".
    "silver_scd1_partition_upsert": {
        PRESET_META_KEY: meta(
            "silver_scd1_partition_upsert",
            "silver",
            "ingestion",
            "Silver SCD1 MERGE pruned by partition.",
            ["merge_keys", "extensions.databricks.merge_partition_column"],
        ),
        "layer": "silver",
        "mode": "scd1_upsert",
        "extensions": {"databricks": {"merge_strategy": "delta_by_partition"}},
        "schema_policy": "additive_only",
        "on_quality_fail": "fail",
    },
    # Still mode "scd1_upsert"; the replace behaviour is selected through
    # merge_strategy "replace_partitions" with replace_partitions_source_complete = True.
    "silver_replace_partitions": {
        PRESET_META_KEY: meta(
            "silver_replace_partitions",
            "silver",
            "ingestion",
            "Silver replacement of complete partitions.",
            ["extensions.databricks.merge_partition_column"],
        ),
        "layer": "silver",
        "mode": "scd1_upsert",
        "extensions": {
            "databricks": {
                "merge_strategy": "replace_partitions",
                "replace_partitions_source_complete": True,
            }
        },
        "schema_policy": "additive_only",
        "on_quality_fail": "fail",
    },
    # Same settings as silver_scd1_upsert; only the metadata differs
    # (description and the additional "watermark_columns" entry).
    "silver_incremental_watermark_upsert": {
        PRESET_META_KEY: meta(
            "silver_incremental_watermark_upsert",
            "silver",
            "ingestion",
            "Silver SCD1 incremental watermark upsert.",
            ["merge_keys", "watermark_columns"],
        ),
        "layer": "silver",
        "mode": "scd1_upsert",
        "extensions": {"databricks": {"merge_strategy": "delta"}},
        "schema_policy": "additive_only",
        "on_quality_fail": "fail",
    },
    # scd1_hash_diff; no Databricks extension block. Lists two columns under
    # hash_exclude_columns.
    "silver_hash_diff_append": {
        PRESET_META_KEY: meta("silver_hash_diff_append", "silver", "ingestion", "Silver hash-diff append.", ["hash_keys"]),
        "layer": "silver",
        "mode": "scd1_hash_diff",
        "schema_policy": "additive_only",
        "on_quality_fail": "fail",
        "hash_exclude_columns": ["ingestion_ts_utc", "__run_id"],
    },
    # Like silver_scd1_upsert but with on_quality_fail = "quarantine".
    "silver_quarantine_ingestion": {
        PRESET_META_KEY: meta(
            "silver_quarantine_ingestion",
            "silver",
            "ingestion",
            "Silver SCD1 with quarantine for row-level rules.",
            ["merge_keys"],
        ),
        "layer": "silver",
        "mode": "scd1_upsert",
        "extensions": {"databricks": {"merge_strategy": "delta"}},
        "schema_policy": "additive_only",
        "on_quality_fail": "quarantine",
    },
    # snapshot_soft_delete; no Databricks extension block.
    "silver_snapshot_soft_delete": {
        PRESET_META_KEY: meta(
            "silver_snapshot_soft_delete", "silver", "ingestion", "Silver snapshot soft delete.", ["merge_keys"]
        ),
        "layer": "silver",
        "mode": "snapshot_soft_delete",
        "schema_policy": "additive_only",
        "on_quality_fail": "fail",
    },
    # scd2_historical; no Databricks extension block.
    "silver_scd2_historical": {
        PRESET_META_KEY: meta("silver_scd2_historical", "silver", "ingestion", "Silver SCD2 history.", ["merge_keys"]),
        "layer": "silver",
        "mode": "scd2_historical",
        "schema_policy": "additive_only",
        "on_quality_fail": "fail",
    },
}
@@ -0,0 +1,57 @@
1
+ """Databricks write-engine preview presets ported from ContractForge."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from contractforge_databricks.presets.base import PRESET_META_KEY, Preset, meta
6
+
7
# Write-engine preview presets, keyed by preset name.
# All three set extensions.databricks.write_engine with fallback_policy
# "preview_only" and explain_selection = True; only "requested" differs.
WRITE_ENGINE_PRESETS: dict[str, Preset] = {
    # requested = "auto"
    "write_engine_native_auto_preview": {
        PRESET_META_KEY: meta(
            "write_engine_native_auto_preview",
            "write_engine",
            "modifier",
            "Record Databricks native engine selection evidence without changing execution.",
        ),
        "extensions": {
            "databricks": {
                "write_engine": {"requested": "auto", "fallback_policy": "preview_only", "explain_selection": True}
            }
        },
    },
    # requested = "databricks_sql_merge"
    "write_engine_databricks_sql_merge_preview": {
        PRESET_META_KEY: meta(
            "write_engine_databricks_sql_merge_preview",
            "write_engine",
            "modifier",
            "Preview Databricks SQL MERGE eligibility while executing the Delta baseline.",
            ["merge_keys"],
        ),
        "extensions": {
            "databricks": {
                "write_engine": {
                    "requested": "databricks_sql_merge",
                    "fallback_policy": "preview_only",
                    "explain_selection": True,
                }
            }
        },
    },
    # requested = "lakeflow_auto_cdc"
    "write_engine_lakeflow_auto_cdc_preview": {
        PRESET_META_KEY: meta(
            "write_engine_lakeflow_auto_cdc_preview",
            "write_engine",
            "modifier",
            "Preview Lakeflow AUTO CDC eligibility while executing the Delta baseline.",
            ["merge_keys"],
        ),
        "extensions": {
            "databricks": {
                "write_engine": {
                    "requested": "lakeflow_auto_cdc",
                    "fallback_policy": "preview_only",
                    "explain_selection": True,
                }
            }
        },
    },
}
@@ -0,0 +1,41 @@
1
+ from contractforge_databricks.quality.persistence import (
2
+ render_quality_result_insert_sql,
3
+ render_quality_results_insert_sql,
4
+ render_quarantine_reference_insert_sql,
5
+ )
6
+ from contractforge_databricks.quality.evaluation import evaluate_quality
7
+ from contractforge_databricks.quality.registry import (
8
+ clear_quality_rule_registry,
9
+ evaluate_custom_quality_rules,
10
+ evaluate_custom_quality_runtime,
11
+ get_quality_rule,
12
+ is_abort_only_failure,
13
+ list_quality_rules,
14
+ register_quality_rule,
15
+ unregister_quality_rule,
16
+ )
17
+ from contractforge_core.quality import (
18
+ QualityRuleResult,
19
+ quality_status,
20
+ quarantinable_results,
21
+ )
22
+ from contractforge_databricks.quality.sql import render_quality_check_sql
23
+
24
# Public names re-exported by this package: the result type and status helpers
# from contractforge_core.quality, the runtime evaluator, the custom-rule
# registry functions, and the SQL renderers.
__all__ = [
    "QualityRuleResult",
    "clear_quality_rule_registry",
    "evaluate_quality",
    "evaluate_custom_quality_rules",
    "evaluate_custom_quality_runtime",
    "get_quality_rule",
    "is_abort_only_failure",
    "list_quality_rules",
    "quality_status",
    "quarantinable_results",
    "register_quality_rule",
    "render_quality_check_sql",
    "render_quality_result_insert_sql",
    "render_quality_results_insert_sql",
    "render_quarantine_reference_insert_sql",
    "unregister_quality_rule",
]
@@ -0,0 +1,178 @@
1
+ """Databricks runtime evaluation for portable quality intents."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from importlib import import_module
6
+ from typing import Any
7
+
8
+ from contractforge_core.config import MAX_INLINE_ACCEPTED_VALUES
9
+ from contractforge_core.quality import QualityRuleResult, quality_status
10
+ from contractforge_core.semantic import QualityIntent, SemanticContract
11
+ from contractforge_databricks.quality.registry import evaluate_custom_quality_runtime
12
+
13
+
14
def evaluate_quality(
    df: Any,
    contract_or_quality: SemanticContract | tuple[QualityIntent, ...],
) -> tuple[str, tuple[QualityRuleResult, ...], Any, Any, int]:
    """Evaluate built-in and custom quality rules against ``df``.

    Args:
        df: DataFrame-like object; ``limit``, ``where`` and ``count`` are called on it
            here, and the per-rule helpers aggregate over it.
        contract_or_quality: Either a ``SemanticContract`` (its ``quality`` intents
            plus any custom rules found by ``_custom_quality_rules`` are used) or a
            bare tuple of intents (no custom rules in that case).

    Returns:
        ``(status, results, valid_df, quarantined_df, quarantined_count)``.
        ``status`` is ``"NOT_CONFIGURED"`` when there is nothing to evaluate,
        otherwise whatever ``quality_status`` derives from the results.
        ``valid_df`` is ``df`` itself unless at least one row was quarantined.

    Raises:
        ValueError: From ``_validate_columns`` when a rule other than
            ``required_columns`` references a column missing from ``df``, or from
            ``_accepted_values`` when the value list is too large.
    """
    if isinstance(contract_or_quality, SemanticContract):
        intents = contract_or_quality.quality
    else:
        intents = contract_or_quality
    custom_rules = _custom_quality_rules(contract_or_quality)
    if not (intents or custom_rules):
        return "NOT_CONFIGURED", (), df, df.limit(0), 0

    fn = _functions()
    outcomes: list[QualityRuleResult] = []
    quarantine_condition = fn.lit(False)
    has_quarantine_condition = False
    # Counted at most once and shared by every rule that needs the total.
    cached_row_count: int | None = None

    for intent in intents:
        rule = intent.rule
        # required_columns reports missing columns as a result instead of raising.
        if rule != "required_columns":
            _validate_columns(df, intent.columns, f"quality.{rule}")

        # Row filter this intent contributes to the quarantine set, if any.
        row_filter = None
        if rule == "required_columns":
            outcome = _required_columns(df, intent)
        elif rule == "not_null":
            outcome = _not_null(df, intent, fn)
            if outcome.failed_count:
                row_filter = fn.col(intent.columns[0]).isNull()
        elif rule == "accepted_values":
            outcome = _accepted_values(df, intent, fn)
            if outcome.failed_count:
                checked = fn.col(intent.columns[0])
                allowed = _values(intent.value)
                row_filter = (~checked.isin(allowed)) & checked.isNotNull()
        elif rule == "max_null_ratio":
            if cached_row_count is None:
                cached_row_count = _row_count(df)
            outcome = _max_null_ratio(df, intent, fn, cached_row_count)
            if outcome.failed_count:
                row_filter = fn.col(intent.columns[0]).isNull()
        elif rule == "unique_key":
            outcome = _unique_key(df, intent, fn)
        elif rule == "row_count_minimum":
            if cached_row_count is None:
                cached_row_count = _row_count(df)
            outcome = _row_count_minimum(intent, cached_row_count)
        elif rule == "expression":
            outcome, invalid_rows = _expression(df, intent, fn)
            # Expression failures only quarantine rows at "quarantine" severity.
            if outcome.failed_count and outcome.severity == "quarantine":
                row_filter = invalid_rows
        else:
            outcome = QualityRuleResult(intent.name, "FAILED", 1, "abort", f"Unsupported quality rule: {intent.rule}")

        if row_filter is not None:
            quarantine_condition = quarantine_condition | row_filter
            has_quarantine_condition = True
        outcomes.append(outcome)

    custom_results, custom_condition = evaluate_custom_quality_runtime(df, custom_rules)
    outcomes.extend(custom_results)
    if custom_condition is not None:
        if has_quarantine_condition:
            quarantine_condition = quarantine_condition | custom_condition
        else:
            # Nothing accumulated yet: use the custom condition as-is.
            quarantine_condition = custom_condition
        has_quarantine_condition = True

    failures = [outcome for outcome in outcomes if outcome.failed_count > 0]
    if failures and has_quarantine_condition:
        quarantined_df = df.where(quarantine_condition)
        quarantined_count = int(quarantined_df.count())
    else:
        quarantined_df = df.limit(0)
        quarantined_count = 0

    valid_df = df.where(~quarantine_condition) if quarantined_count > 0 else df
    frozen = tuple(outcomes)
    return quality_status(frozen), frozen, valid_df, quarantined_df, quarantined_count
80
+
81
+
82
def _required_columns(df: Any, intent: QualityIntent) -> QualityRuleResult:
    """Report which of ``intent.columns`` are absent from ``df``.

    Unlike ``_validate_columns`` this never raises: missing columns are returned
    as a FAILED result with severity ``"abort"`` and ``failed_count`` equal to the
    number of missing columns.

    Args:
        df: Object whose ``columns`` attribute lists the available column names; a
            missing or empty attribute is treated as "no columns".
        intent: Quality intent naming the required columns.

    Returns:
        The rule result; ``details["missing"]`` lists the absent columns in the
        order they were requested.
    """
    # Read df.columns once instead of once per requested column, and not at all
    # when nothing is requested.
    available = (getattr(df, "columns", ()) or ()) if intent.columns else ()
    missing = [column for column in intent.columns if column not in available]
    return QualityRuleResult(
        intent.name,
        "FAILED" if missing else "PASSED",
        len(missing),
        "abort",
        "Required columns are missing." if missing else None,
        {"missing": missing},
    )
92
+
93
+
94
def _not_null(df: Any, intent: QualityIntent, functions: Any) -> QualityRuleResult:
    """Count rows where the first column of ``intent`` is NULL.

    Only ``intent.columns[0]`` is inspected. The result is FAILED when at least
    one NULL is found and carries that count as ``failed_count``.
    """
    target = intent.columns[0]
    null_flag = functions.col(target).isNull().cast("long")
    null_rows = _agg_int(df, functions.sum(null_flag).alias("failed_rows"), "failed_rows")
    outcome = "FAILED" if null_rows else "PASSED"
    return QualityRuleResult(intent.name, outcome, null_rows, _severity(intent), intent.message, {"column": target})
98
+
99
+
100
def _accepted_values(df: Any, intent: QualityIntent, functions: Any) -> QualityRuleResult:
    """Count non-NULL values of the first column that are outside the accepted set.

    NULLs are not counted as violations. The accepted set comes from
    ``intent.value`` via ``_values``.

    Raises:
        ValueError: If the accepted set has more than ``MAX_INLINE_ACCEPTED_VALUES``
            entries.
    """
    name = intent.columns[0]
    checked = functions.col(name)
    allowed = _values(intent.value)
    total_allowed = len(allowed)
    if total_allowed > MAX_INLINE_ACCEPTED_VALUES:
        raise ValueError(
            f"quality.accepted_values.{name} has {total_allowed} values. "
            "Use a reference table or custom quality evaluator for large value sets."
        )
    violation = (~checked.isin(allowed)) & checked.isNotNull()
    violations = _agg_int(df, functions.sum(violation.cast("long")).alias("failed_rows"), "failed_rows")
    outcome = "FAILED" if violations else "PASSED"
    details = {"column": name, "values": allowed}
    return QualityRuleResult(intent.name, outcome, violations, _severity(intent), intent.message, details)
112
+
113
+
114
def _max_null_ratio(df: Any, intent: QualityIntent, functions: Any, row_count: int) -> QualityRuleResult:
    """Check that the NULL share of the first column does not exceed ``intent.value``.

    The ratio is ``null_count / row_count`` (``0.0`` for an empty frame). The rule
    fails only when the ratio is strictly greater than the limit; in that case
    ``failed_count`` is the number of NULL rows, otherwise it is 0.
    """
    target = intent.columns[0]
    null_flag = functions.col(target).isNull().cast("long")
    null_rows = _agg_int(df, functions.sum(null_flag).alias("failed_rows"), "failed_rows")
    if row_count == 0:
        observed = 0.0
    else:
        observed = null_rows / row_count
    exceeded = observed > float(intent.value)
    if exceeded:
        outcome, reported = "FAILED", null_rows
    else:
        outcome, reported = "PASSED", 0
    details = {"column": target, "ratio": observed, "max_ratio": intent.value}
    return QualityRuleResult(intent.name, outcome, reported, _severity(intent), intent.message, details)
120
+
121
+
122
def _unique_key(df: Any, intent: QualityIntent, functions: Any) -> QualityRuleResult:
    """Count key combinations of ``intent.columns`` that occur more than once.

    ``failed_count`` is the number of duplicated key groups, not the number of
    duplicated rows. Severity is always ``"abort"``.
    """
    per_key = df.groupBy(*intent.columns).count()
    repeated = per_key.where(functions.col("count") > 1)
    duplicate_groups = int(repeated.count() or 0)
    outcome = "FAILED" if duplicate_groups else "PASSED"
    details = {"columns": list(intent.columns)}
    return QualityRuleResult(intent.name, outcome, duplicate_groups, "abort", intent.message, details)
126
+
127
+
128
def _row_count_minimum(intent: QualityIntent, row_count: int) -> QualityRuleResult:
    """Check that ``row_count`` reaches the minimum given by ``intent.value``.

    ``failed_count`` is the shortfall (how many rows are missing to reach the
    minimum), or 0 when the minimum is met. Severity is always ``"abort"``.
    """
    required = int(intent.value)
    shortfall = required - row_count
    if shortfall < 0:
        shortfall = 0
    outcome = "FAILED" if shortfall else "PASSED"
    details = {"min_rows": required, "actual": row_count}
    return QualityRuleResult(intent.name, outcome, shortfall, "abort", intent.message, details)
132
+
133
+
134
def _expression(df: Any, intent: QualityIntent, functions: Any) -> tuple[QualityRuleResult, Any]:
    """Evaluate the SQL expression in ``intent.value`` as a row-level check.

    A row is invalid when the expression is NULL or equals False.

    Returns:
        ``(result, invalid_condition)``. The status is ``"WARNED"`` for failures at
        severity ``"warn"``, ``"FAILED"`` for other failures, else ``"PASSED"``.
        ``invalid_condition`` is the column expression that selects invalid rows.
    """
    check = functions.expr(str(intent.value))
    invalid_rows = check.isNull() | (check == functions.lit(False))
    violations = _agg_int(df, functions.sum(invalid_rows.cast("long")).alias("failed_rows"), "failed_rows")
    level = _severity(intent)
    if not violations:
        outcome = "PASSED"
    elif level == "warn":
        outcome = "WARNED"
    else:
        outcome = "FAILED"
    result = QualityRuleResult(intent.name, outcome, violations, level, intent.message, {"expression": intent.value})
    return result, invalid_rows
141
+
142
+
143
def _agg_int(df: Any, expression: Any, field: str) -> int:
    """Run one aggregate over ``df`` and return ``field`` of the first row as an int.

    A ``None`` row or a falsy/``None`` field value yields 0.
    """
    first_row = df.agg(expression).collect()[0]
    if first_row is None:
        return 0
    return int(first_row[field] or 0)
146
+
147
+
148
def _row_count(df: Any) -> int:
    """Return ``df.count()`` as an int, mapping a falsy/``None`` result to 0."""
    total = df.count()
    return int(total or 0)
150
+
151
+
152
def _validate_columns(df: Any, columns: tuple[str, ...], context: str) -> None:
    """Raise if any of ``columns`` is absent from ``df``.

    Args:
        df: Object whose ``columns`` attribute lists the available column names; a
            missing or empty attribute is treated as "no columns".
        columns: Column names that must exist.
        context: Prefix for the error message (e.g. ``"quality.not_null"``).

    Raises:
        ValueError: ``"<context> not found: [<missing...>]"`` when at least one
            column is missing.
    """
    if not columns:
        # Nothing to check, so df.columns is never touched (same as before).
        return
    # Read df.columns once instead of once per requested column.
    available = getattr(df, "columns", ()) or ()
    missing = [column for column in columns if column not in available]
    if missing:
        raise ValueError(f"{context} not found: {missing}")
156
+
157
+
158
def _values(value: object) -> list[Any]:
    """Normalise an accepted-values setting to a list.

    Lists, tuples, sets and frozensets are expanded element by element; any other
    value (including a string or ``None``) becomes a one-element list.
    """
    # frozenset is expanded like set; previously it was wrapped as a single value.
    if isinstance(value, (list, tuple, set, frozenset)):
        return list(value)
    return [value]
162
+
163
+
164
def _severity(intent: QualityIntent) -> str:
    """Return the intent's severity as a string, defaulting to ``"quarantine"``.

    Any falsy ``intent.severity`` (``None``, empty string) selects the default.
    """
    configured = intent.severity
    if not configured:
        configured = "quarantine"
    return str(configured)  # type: ignore[return-value]
166
+
167
+
168
def _functions() -> Any:
    """Return the ``pyspark.sql.functions`` module, imported at call time.

    The import happens here rather than at module import; presumably so this
    module can be loaded where PySpark is not installed (confirm with callers).

    Raises:
        ImportError: If PySpark cannot be imported.
    """
    # Import the submodule itself: reading `.functions` off the `pyspark.sql`
    # package only works when something else has already loaded that submodule.
    return import_module("pyspark.sql.functions")
170
+
171
+
172
def _custom_quality_rules(contract_or_quality: SemanticContract | tuple[QualityIntent, ...]) -> dict[str, dict[str, Any]]:
    """Extract custom quality rules from ``extensions["quality"]["custom"]``.

    Returns a shallow copy of that mapping, or an empty dict when the argument is
    not a ``SemanticContract`` or any level of the path is missing or not a dict.
    """
    if not isinstance(contract_or_quality, SemanticContract):
        return {}
    extensions = contract_or_quality.extensions or {}
    if not isinstance(extensions, dict):
        return {}
    quality_section = extensions.get("quality")
    if not isinstance(quality_section, dict):
        return {}
    custom_section = quality_section.get("custom")
    if not isinstance(custom_section, dict):
        return {}
    return dict(custom_section)
@@ -0,0 +1,81 @@
1
+ """Databricks SQL rendering for quality and quarantine persistence."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from datetime import datetime
7
+
8
+ from contractforge_core.quality import QualityRuleResult
9
+ from contractforge_databricks.evidence import QuarantineEvidenceRecord
10
+ from contractforge_databricks.evidence.sql import render_quarantine_insert_sql
11
+ from contractforge_databricks.evidence.tables import evidence_table_names
12
+ from contractforge_databricks.sql import quote_table_name, sql_int, sql_string
13
+
14
+
15
def render_quality_result_insert_sql(
    *,
    run_id: str,
    target_table: str,
    result: QualityRuleResult,
    checked_at_utc: datetime,
    catalog: str = "main",
    schema: str = "ops",
) -> str:
    """Render one INSERT statement persisting a quality rule result.

    The target is the ``"quality"`` entry of
    ``evidence_table_names(catalog, schema)``. The timestamp is formatted to
    second precision with ``strftime`` (no timezone conversion is applied).
    ``observed_value`` receives the JSON of ``result.as_dict()`` and
    ``details_json`` the JSON of ``result.details`` (``{}`` when falsy).

    Returns:
        The INSERT statement without a trailing semicolon.
    """
    quality_table = evidence_table_names(catalog, schema)["quality"]
    timestamp_text = checked_at_utc.strftime("%Y-%m-%d %H:%M:%S")
    quoted_table = quote_table_name(quality_table)
    column_names = (
        "run_id",
        "target_table",
        "rule_name",
        "status",
        "severity",
        "failed_count",
        "observed_value",
        "checked_at_utc",
        "message",
        "details_json",
    )
    # Same order as column_names; each entry is already a SQL literal.
    value_literals = (
        sql_string(run_id),
        sql_string(target_table),
        sql_string(result.rule_name),
        sql_string(result.status),
        sql_string(result.severity),
        sql_int(result.failed_count),
        _json(result.as_dict()),
        f"TIMESTAMP {sql_string(timestamp_text)}",
        sql_string(result.message),
        _json(result.details or {}),
    )
    column_clause = ", ".join(column_names)
    value_clause = ", ".join(f"{literal}" for literal in value_literals)
    return f"INSERT INTO {quoted_table} ({column_clause}) VALUES ({value_clause})"
35
+
36
+
37
def render_quality_results_insert_sql(
    *,
    run_id: str,
    target_table: str,
    results: tuple[QualityRuleResult, ...],
    checked_at_utc: datetime,
    catalog: str = "main",
    schema: str = "ops",
) -> str:
    """Render a SQL script with one INSERT statement per quality result.

    Statements are separated by ``";\\n"`` and the script ends with ``";\\n"``.
    When there are no results, a single SQL comment line is returned instead.
    """
    rendered: list[str] = []
    for item in results:
        statement = render_quality_result_insert_sql(
            run_id=run_id,
            target_table=target_table,
            result=item,
            checked_at_utc=checked_at_utc,
            catalog=catalog,
            schema=schema,
        )
        rendered.append(statement)
    if not rendered:
        return "-- No quality results to persist.\n"
    return ";\n".join(rendered) + ";\n"
58
+
59
+
60
def render_quarantine_reference_insert_sql(
    *,
    run_id: str,
    target_table: str,
    record_ref: str,
    reason: str,
    quarantined_at_utc: datetime,
    catalog: str = "main",
    schema: str = "ops",
) -> str:
    """Render the INSERT statement for one quarantine reference.

    Wraps the arguments in a ``QuarantineEvidenceRecord`` and delegates the
    SQL rendering to ``render_quarantine_insert_sql``.
    """
    record_fields = {
        "run_id": run_id,
        "target_table": target_table,
        "record_ref": record_ref,
        "reason": reason,
        "quarantined_at_utc": quarantined_at_utc,
    }
    evidence = QuarantineEvidenceRecord(**record_fields)
    return render_quarantine_insert_sql(evidence, catalog=catalog, schema=schema)
78
+
79
+
80
def _json(value: object) -> str:
    """Serialize ``value`` to compact, key-sorted JSON and wrap it with ``sql_string``."""
    compact = json.dumps(value, sort_keys=True, separators=(",", ":"))
    return sql_string(compact)
@@ -0,0 +1,134 @@
1
+ """Databricks runtime registry for custom quality evaluators."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Callable
6
+ from importlib import import_module
7
+ from typing import Any
8
+
9
+ from contractforge_core.config import VALID_QUALITY_RULE_SEVERITIES
10
+ from contractforge_core.quality import QualityRuleResult, is_abort_only_failure as is_abort_only_failure
11
+
12
# Signature of a custom quality evaluator. The evaluate_* functions in this
# module call it as ``evaluator(df, rule_name, config_copy)`` and read the keys
# "failed_count", "severity", "message", "details" and (for quarantine rules)
# "condition" from the returned dict.
QualityRuleEvaluator = Callable[[Any, str, dict[str, Any]], dict[str, Any]]
# Module-level registry mapping a rule type name to its evaluator.
QUALITY_RULE_REGISTRY: dict[str, QualityRuleEvaluator] = {}
14
+
15
+
16
def register_quality_rule(rule_type: str, evaluator: QualityRuleEvaluator, *, overwrite: bool = False) -> None:
    """Register ``evaluator`` under the normalized ``rule_type``.

    Raises:
        ValueError: If the rule type is empty, the evaluator is not callable,
            or the type is already registered and ``overwrite`` is false.
    """
    key = _normalize_rule_type(rule_type)
    if not callable(evaluator):
        raise ValueError("quality rule evaluator must be callable")
    already_registered = key in QUALITY_RULE_REGISTRY
    if already_registered and not overwrite:
        raise ValueError(f"quality rule already registered: {key}")
    QUALITY_RULE_REGISTRY[key] = evaluator
23
+
24
+
25
def unregister_quality_rule(rule_type: str) -> None:
    """Remove the evaluator for ``rule_type``; unknown types are ignored.

    Raises:
        ValueError: If ``rule_type`` is empty after normalization.
    """
    key = _normalize_rule_type(rule_type)
    QUALITY_RULE_REGISTRY.pop(key, None)
27
+
28
+
29
def get_quality_rule(rule_type: str) -> QualityRuleEvaluator | None:
    """Return the evaluator registered for ``rule_type``, or ``None`` if absent.

    Raises:
        ValueError: If ``rule_type`` is empty after normalization.
    """
    key = _normalize_rule_type(rule_type)
    return QUALITY_RULE_REGISTRY.get(key)
31
+
32
+
33
def list_quality_rules() -> tuple[str, ...]:
    """Return the registered rule type names as a sorted tuple."""
    names = sorted(QUALITY_RULE_REGISTRY)
    return tuple(names)
35
+
36
+
37
def clear_quality_rule_registry() -> None:
    """Remove every registered evaluator from ``QUALITY_RULE_REGISTRY``."""
    QUALITY_RULE_REGISTRY.clear()
39
+
40
+
41
def evaluate_custom_quality_rules(df: Any, custom_rules: dict[str, dict[str, Any]] | None) -> tuple[QualityRuleResult, ...]:
    """Run every configured custom rule and return the results.

    Each rule's ``type`` is looked up in ``QUALITY_RULE_REGISTRY``. The
    evaluator is called with ``df``, the rule name and a copy of the rule
    config, and its payload is turned into a ``QualityRuleResult`` by
    ``_custom_result``, the same helper ``evaluate_custom_quality_runtime``
    uses, so both entry points share one validation and naming path.

    Args:
        df: Data object passed through to the evaluators unchanged.
        custom_rules: Mapping of rule name to rule config; ``None`` or empty
            yields an empty tuple.

    Returns:
        One result per rule, in the mapping's iteration order.

    Raises:
        ValueError: If a rule uses an unregistered type, or resolves to a
            severity outside ``VALID_QUALITY_RULE_SEVERITIES``.
    """
    if not custom_rules:
        return ()
    results: list[QualityRuleResult] = []
    for rule_name, config in custom_rules.items():
        rule_type = str(config.get("type") or "").strip()
        evaluator = QUALITY_RULE_REGISTRY.get(rule_type)
        if evaluator is None:
            raise ValueError(f"quality_rules.custom.{rule_name} uses unregistered type: {rule_type}")
        payload = evaluator(df, str(rule_name), dict(config))
        # Delegate to the shared helper rather than repeating the severity
        # validation and result construction inline.
        results.append(_custom_result(str(rule_name), rule_type, config, payload))
    return tuple(results)
70
+
71
+
72
def evaluate_custom_quality_runtime(
    df: Any,
    custom_rules: dict[str, dict[str, Any]] | None,
) -> tuple[tuple[QualityRuleResult, ...], Any | None]:
    """Run custom rules and combine the quarantine conditions of failing ones.

    For every rule that has failures and severity ``"quarantine"``, the
    evaluator payload must carry a ``"condition"``; these are OR-ed together
    starting from a literal ``False``.

    Returns:
        ``(results, condition)`` where ``condition`` is the combined
        expression, or ``None`` when no rule contributed one. ``None`` or
        empty ``custom_rules`` returns ``((), None)`` without importing
        pyspark.

    Raises:
        ValueError: If a rule uses an unregistered type, or a failing
            quarantine rule returns no condition.
    """
    if not custom_rules:
        return (), None
    functions = import_module("pyspark.sql").functions
    combined_condition = functions.lit(False)
    any_quarantine = False
    collected: list[QualityRuleResult] = []
    for rule_name, config in custom_rules.items():
        rule_type = str(config.get("type") or "").strip()
        evaluator = QUALITY_RULE_REGISTRY.get(rule_type)
        if evaluator is None:
            raise ValueError(f"quality_rules.custom.{rule_name} uses unregistered type: {rule_type}")
        payload = evaluator(df, str(rule_name), dict(config))
        outcome = _custom_result(str(rule_name), rule_type, config, payload)
        collected.append(outcome)
        # Only failing quarantine rules contribute a row-level condition.
        if not (outcome.failed_count and outcome.severity == "quarantine"):
            continue
        condition = payload.get("condition")
        if condition is None:
            raise ValueError(f"quality_rules.custom.{rule_name} with severity=quarantine must return condition")
        combined_condition = combined_condition | condition
        any_quarantine = True
    if any_quarantine:
        return tuple(collected), combined_condition
    return tuple(collected), None
97
+
98
+
99
def _custom_result(
    rule_name: str,
    rule_type: str,
    config: dict[str, Any],
    payload: dict[str, Any],
) -> QualityRuleResult:
    """Build a ``QualityRuleResult`` from a custom evaluator payload.

    Severity and message come from the payload first, then the rule config;
    severity falls back to ``"abort"``. The result name is prefixed with
    ``custom:`` and payload ``details`` are merged over the name/type entries.

    Raises:
        ValueError: If the resolved severity is not in
            ``VALID_QUALITY_RULE_SEVERITIES``.
    """
    failures = int(payload.get("failed_count", 0) or 0)
    raw_severity = payload.get("severity") or config.get("severity") or "abort"
    severity = str(raw_severity).strip()
    if severity not in VALID_QUALITY_RULE_SEVERITIES:
        raise ValueError(
            f"quality_rules.custom.{rule_name}.severity={severity!r} is not supported. "
            f"Valid values: {sorted(VALID_QUALITY_RULE_SEVERITIES)}"
        )
    # Payload details are applied last so they can override name/type.
    merged_details: dict[str, Any] = {"name": rule_name, "type": rule_type}
    merged_details.update(dict(payload.get("details") or {}))
    message = payload.get("message") or config.get("message")
    return QualityRuleResult(
        rule_name=f"custom:{rule_name}",
        status=_status(failures, severity),
        failed_count=failures,
        severity=severity,  # type: ignore[arg-type]
        message=message,
        details=merged_details,
    )
120
+
121
+
122
+ def _status(failed_count: int, severity: str) -> str:
123
+ if failed_count <= 0:
124
+ return "PASSED"
125
+ if severity == "warn":
126
+ return "WARNED"
127
+ return "FAILED"
128
+
129
+
130
+ def _normalize_rule_type(rule_type: str) -> str:
131
+ normalized = str(rule_type or "").strip()
132
+ if not normalized:
133
+ raise ValueError("quality rule type cannot be empty")
134
+ return normalized