contractforge-databricks 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contractforge_databricks/__init__.py +172 -0
- contractforge_databricks/adapter.py +69 -0
- contractforge_databricks/annotations/__init__.py +10 -0
- contractforge_databricks/annotations/application.py +52 -0
- contractforge_databricks/annotations/audit.py +49 -0
- contractforge_databricks/annotations/sql.py +142 -0
- contractforge_databricks/api.py +65 -0
- contractforge_databricks/bundles/__init__.py +9 -0
- contractforge_databricks/bundles/assets.py +47 -0
- contractforge_databricks/bundles/project.py +213 -0
- contractforge_databricks/bundles/project_config.py +133 -0
- contractforge_databricks/capabilities/__init__.py +17 -0
- contractforge_databricks/capabilities/builders.py +43 -0
- contractforge_databricks/capabilities/evaluate.py +162 -0
- contractforge_databricks/capabilities/mapping.py +36 -0
- contractforge_databricks/capabilities/models.py +44 -0
- contractforge_databricks/capabilities/runtime.py +111 -0
- contractforge_databricks/capabilities/uc.py +47 -0
- contractforge_databricks/cli.py +196 -0
- contractforge_databricks/cli_deploy.py +98 -0
- contractforge_databricks/cli_governance.py +142 -0
- contractforge_databricks/cli_io.py +91 -0
- contractforge_databricks/cli_maintenance.py +69 -0
- contractforge_databricks/coercion.py +31 -0
- contractforge_databricks/contract_extensions.py +70 -0
- contractforge_databricks/cost/__init__.py +11 -0
- contractforge_databricks/cost/model.py +22 -0
- contractforge_databricks/cost/report.py +65 -0
- contractforge_databricks/cost/sql.py +136 -0
- contractforge_databricks/dashboards/__init__.py +15 -0
- contractforge_databricks/dashboards/control_tables.py +150 -0
- contractforge_databricks/diagnostics/__init__.py +7 -0
- contractforge_databricks/diagnostics/explain.py +40 -0
- contractforge_databricks/environment.py +53 -0
- contractforge_databricks/evidence/__init__.py +98 -0
- contractforge_databricks/evidence/ddl.py +35 -0
- contractforge_databricks/evidence/governance_log.py +175 -0
- contractforge_databricks/evidence/helpers.py +29 -0
- contractforge_databricks/evidence/ops_log.py +210 -0
- contractforge_databricks/evidence/records.py +27 -0
- contractforge_databricks/evidence/run_log.py +74 -0
- contractforge_databricks/evidence/schemas.py +7 -0
- contractforge_databricks/evidence/sql.py +144 -0
- contractforge_databricks/evidence/tables.py +20 -0
- contractforge_databricks/evidence/writer.py +118 -0
- contractforge_databricks/execution/__init__.py +70 -0
- contractforge_databricks/execution/delta_basic.py +57 -0
- contractforge_databricks/execution/hash_diff.py +126 -0
- contractforge_databricks/execution/hash_diff_latest.py +142 -0
- contractforge_databricks/execution/replace_partitions.py +40 -0
- contractforge_databricks/execution/results.py +5 -0
- contractforge_databricks/execution/retry.py +36 -0
- contractforge_databricks/execution/scd2.py +213 -0
- contractforge_databricks/execution/scd2_deletes.py +65 -0
- contractforge_databricks/execution/scd2_late.py +30 -0
- contractforge_databricks/execution/snapshot.py +77 -0
- contractforge_databricks/execution/sql_merge.py +85 -0
- contractforge_databricks/execution/tables.py +98 -0
- contractforge_databricks/execution/windows.py +58 -0
- contractforge_databricks/governance/__init__.py +30 -0
- contractforge_databricks/governance/access.py +185 -0
- contractforge_databricks/governance/application.py +93 -0
- contractforge_databricks/governance/drift.py +49 -0
- contractforge_databricks/governance/runtime.py +60 -0
- contractforge_databricks/governance/sql.py +31 -0
- contractforge_databricks/governance/validation.py +135 -0
- contractforge_databricks/lakeflow/__init__.py +21 -0
- contractforge_databricks/lakeflow/compatibility.py +194 -0
- contractforge_databricks/lakeflow/rendering.py +175 -0
- contractforge_databricks/lineage/__init__.py +7 -0
- contractforge_databricks/lineage/openlineage.py +182 -0
- contractforge_databricks/maintenance/__init__.py +27 -0
- contractforge_databricks/maintenance/retention.py +90 -0
- contractforge_databricks/maintenance/sql.py +68 -0
- contractforge_databricks/metrics/__init__.py +19 -0
- contractforge_databricks/metrics/history.py +21 -0
- contractforge_databricks/metrics/write.py +63 -0
- contractforge_databricks/operations/__init__.py +4 -0
- contractforge_databricks/operations/application.py +38 -0
- contractforge_databricks/operations/sql.py +95 -0
- contractforge_databricks/parity/__init__.py +18 -0
- contractforge_databricks/parity/catalog.py +59 -0
- contractforge_databricks/parity/models.py +7 -0
- contractforge_databricks/parity/scenarios.py +111 -0
- contractforge_databricks/partitioning/__init__.py +3 -0
- contractforge_databricks/partitioning/predicates.py +28 -0
- contractforge_databricks/preparation/__init__.py +47 -0
- contractforge_databricks/preparation/deduplicate.py +87 -0
- contractforge_databricks/preparation/encoding.py +37 -0
- contractforge_databricks/preparation/hashing.py +18 -0
- contractforge_databricks/preparation/pyspark.py +178 -0
- contractforge_databricks/preparation/pyspark_staging.py +70 -0
- contractforge_databricks/preparation/shape.py +209 -0
- contractforge_databricks/preparation/shape_validation.py +94 -0
- contractforge_databricks/preparation/staging.py +17 -0
- contractforge_databricks/preparation/zip_arrays.py +51 -0
- contractforge_databricks/presets/__init__.py +3 -0
- contractforge_databricks/presets/base.py +24 -0
- contractforge_databricks/presets/bronze.py +57 -0
- contractforge_databricks/presets/catalog.py +22 -0
- contractforge_databricks/presets/core.py +134 -0
- contractforge_databricks/presets/gold.py +62 -0
- contractforge_databricks/presets/modifiers.py +51 -0
- contractforge_databricks/presets/runtime.py +22 -0
- contractforge_databricks/presets/silver.py +101 -0
- contractforge_databricks/presets/write_engine.py +57 -0
- contractforge_databricks/quality/__init__.py +41 -0
- contractforge_databricks/quality/evaluation.py +178 -0
- contractforge_databricks/quality/persistence.py +81 -0
- contractforge_databricks/quality/registry.py +134 -0
- contractforge_databricks/quality/results.py +17 -0
- contractforge_databricks/quality/sql.py +113 -0
- contractforge_databricks/rendering/__init__.py +11 -0
- contractforge_databricks/rendering/bundle.py +93 -0
- contractforge_databricks/rendering/markdown.py +50 -0
- contractforge_databricks/rendering/names.py +56 -0
- contractforge_databricks/results.py +15 -0
- contractforge_databricks/runtime/__init__.py +101 -0
- contractforge_databricks/runtime/available_now.py +147 -0
- contractforge_databricks/runtime/bundles.py +211 -0
- contractforge_databricks/runtime/cache.py +20 -0
- contractforge_databricks/runtime/control_tables.py +19 -0
- contractforge_databricks/runtime/deploy.py +197 -0
- contractforge_databricks/runtime/detection.py +114 -0
- contractforge_databricks/runtime/dry_run.py +46 -0
- contractforge_databricks/runtime/errors.py +54 -0
- contractforge_databricks/runtime/file_selection.py +109 -0
- contractforge_databricks/runtime/finalization.py +168 -0
- contractforge_databricks/runtime/governance.py +37 -0
- contractforge_databricks/runtime/hooks.py +45 -0
- contractforge_databricks/runtime/http_file.py +37 -0
- contractforge_databricks/runtime/http_retry.py +15 -0
- contractforge_databricks/runtime/http_safety.py +9 -0
- contractforge_databricks/runtime/json_materialization.py +97 -0
- contractforge_databricks/runtime/lineage.py +164 -0
- contractforge_databricks/runtime/maintenance.py +43 -0
- contractforge_databricks/runtime/merge_validation.py +98 -0
- contractforge_databricks/runtime/metadata.py +21 -0
- contractforge_databricks/runtime/metrics.py +34 -0
- contractforge_databricks/runtime/models.py +32 -0
- contractforge_databricks/runtime/options.py +33 -0
- contractforge_databricks/runtime/orchestration_context.py +185 -0
- contractforge_databricks/runtime/orchestrator.py +147 -0
- contractforge_databricks/runtime/partitioning.py +93 -0
- contractforge_databricks/runtime/quality_quarantine.py +92 -0
- contractforge_databricks/runtime/rest_api.py +46 -0
- contractforge_databricks/runtime/rest_auth.py +21 -0
- contractforge_databricks/runtime/rest_pagination.py +21 -0
- contractforge_databricks/runtime/run_payload.py +177 -0
- contractforge_databricks/runtime/schema.py +106 -0
- contractforge_databricks/runtime/source_metadata.py +30 -0
- contractforge_databricks/runtime/source_registry.py +43 -0
- contractforge_databricks/runtime/source_schema.py +24 -0
- contractforge_databricks/runtime/sources.py +208 -0
- contractforge_databricks/runtime/spark.py +183 -0
- contractforge_databricks/runtime/spark_defaults.py +35 -0
- contractforge_databricks/runtime/storage_auth.py +132 -0
- contractforge_databricks/runtime/streaming.py +131 -0
- contractforge_databricks/runtime/success.py +104 -0
- contractforge_databricks/runtime/utils.py +52 -0
- contractforge_databricks/runtime/watermark.py +71 -0
- contractforge_databricks/runtime/windows.py +184 -0
- contractforge_databricks/runtime/write.py +66 -0
- contractforge_databricks/runtime/write_flow.py +146 -0
- contractforge_databricks/runtime/write_strategy.py +40 -0
- contractforge_databricks/schema/__init__.py +21 -0
- contractforge_databricks/schema/diff.py +11 -0
- contractforge_databricks/schema/policy.py +33 -0
- contractforge_databricks/schema/sync.py +23 -0
- contractforge_databricks/security/__init__.py +21 -0
- contractforge_databricks/security/errors.py +5 -0
- contractforge_databricks/security/redaction.py +5 -0
- contractforge_databricks/security/secrets.py +114 -0
- contractforge_databricks/security/source_policy.py +17 -0
- contractforge_databricks/shapes/__init__.py +3 -0
- contractforge_databricks/shapes/sql.py +123 -0
- contractforge_databricks/sources/__init__.py +67 -0
- contractforge_databricks/sources/artifacts.py +100 -0
- contractforge_databricks/sources/autoloader.py +48 -0
- contractforge_databricks/sources/bounded_streams.py +44 -0
- contractforge_databricks/sources/classification.py +115 -0
- contractforge_databricks/sources/delta_share.py +21 -0
- contractforge_databricks/sources/files.py +48 -0
- contractforge_databricks/sources/http_file.py +46 -0
- contractforge_databricks/sources/interpret.py +76 -0
- contractforge_databricks/sources/jdbc.py +32 -0
- contractforge_databricks/sources/metadata.py +18 -0
- contractforge_databricks/sources/native_passthrough.py +33 -0
- contractforge_databricks/sources/rds_iam.py +15 -0
- contractforge_databricks/sources/rds_iam_runtime.py +191 -0
- contractforge_databricks/sources/rest_api.py +33 -0
- contractforge_databricks/sources/support.py +50 -0
- contractforge_databricks/sources/table_refs.py +65 -0
- contractforge_databricks/sql/__init__.py +4 -0
- contractforge_databricks/sql/identifiers.py +17 -0
- contractforge_databricks/sql/literals.py +36 -0
- contractforge_databricks/state/__init__.py +39 -0
- contractforge_databricks/state/ddl.py +24 -0
- contractforge_databricks/state/migrations.py +146 -0
- contractforge_databricks/state/queries.py +149 -0
- contractforge_databricks/state/sql.py +116 -0
- contractforge_databricks/state/tables.py +9 -0
- contractforge_databricks/state/writer.py +83 -0
- contractforge_databricks/templates/__init__.py +15 -0
- contractforge_databricks/templates/catalog.py +205 -0
- contractforge_databricks/templates/catalog_parity.py +85 -0
- contractforge_databricks/templates/core.py +83 -0
- contractforge_databricks/templates/enrichment.py +175 -0
- contractforge_databricks/transforms/__init__.py +3 -0
- contractforge_databricks/transforms/sql.py +118 -0
- contractforge_databricks/watermark/__init__.py +6 -0
- contractforge_databricks/watermark/sql.py +91 -0
- contractforge_databricks/write_modes/__init__.py +20 -0
- contractforge_databricks/write_modes/registry.py +44 -0
- contractforge_databricks/write_modes/sql.py +33 -0
- contractforge_databricks/write_modes/strategy.py +192 -0
- contractforge_databricks-0.1.0.dist-info/METADATA +34 -0
- contractforge_databricks-0.1.0.dist-info/RECORD +220 -0
- contractforge_databricks-0.1.0.dist-info/WHEEL +4 -0
- contractforge_databricks-0.1.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""Databricks modifier presets ported from ContractForge."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from contractforge_databricks.presets.base import PRESET_META_KEY, Preset, meta
|
|
6
|
+
|
|
7
|
+
# Delta-related modifier presets, keyed by preset name. Every entry stores its
# descriptor under PRESET_META_KEY; any other keys are the values the preset
# carries. The positional arguments to meta() appear to be
# (name, group, kind, description[, field paths]) — confirm against
# contractforge_databricks.presets.base.meta.
DELTA_PRESETS: dict[str, Preset] = {
    # Sets the delta.enableChangeDataFeed table property to "true".
    "delta_cdf_enabled": {
        PRESET_META_KEY: meta("delta_cdf_enabled", "delta", "modifier", "Enable Delta Change Data Feed."),
        "extensions": {"databricks": {"delta_properties": {"delta.enableChangeDataFeed": "true"}}},
    },
    # Sets both delta.autoOptimize.* table properties to "true".
    "delta_optimized_writes": {
        PRESET_META_KEY: meta("delta_optimized_writes", "delta", "modifier", "Optimized Delta write properties."),
        "extensions": {
            "databricks": {
                "delta_properties": {
                    "delta.autoOptimize.optimizeWrite": "true",
                    "delta.autoOptimize.autoCompact": "true",
                }
            }
        },
    },
    # Carries metadata only. The trailing list names
    # extensions.databricks.cluster_columns — presumably a field the contract
    # itself must provide (TODO confirm how meta() interprets this list).
    "delta_liquid_clustering": {
        PRESET_META_KEY: meta(
            "delta_liquid_clustering",
            "delta",
            "modifier",
            "Databricks Delta liquid clustering.",
            ["extensions.databricks.cluster_columns"],
        )
    },
}
|
|
33
|
+
|
|
34
|
+
# Quality modifier presets, keyed by preset name. Each one only sets
# "on_quality_fail" next to its descriptor stored under PRESET_META_KEY.
QUALITY_PRESETS: dict[str, Preset] = {
    # on_quality_fail = "fail"
    "quality_strict": {
        PRESET_META_KEY: meta("quality_strict", "quality", "modifier", "Abortive quality policy."),
        "on_quality_fail": "fail",
    },
    # on_quality_fail = "quarantine"
    "quality_quarantine": {
        PRESET_META_KEY: meta("quality_quarantine", "quality", "modifier", "Quality quarantine policy."),
        "on_quality_fail": "quarantine",
    },
}
|
|
44
|
+
|
|
45
|
+
# Governance modifier presets, keyed by preset name.
GOVERNANCE_PRESETS: dict[str, Preset] = {
    # Non-enforcing defaults: the annotations policy is "warn" and the access
    # policy is set to mode "validate_only" with on_drift "warn".
    "governance_uc_basic": {
        PRESET_META_KEY: meta("governance_uc_basic", "governance", "modifier", "Basic Unity Catalog governance."),
        "annotations": {"policy": "warn"},
        "access": {"access_policy": {"mode": "validate_only", "on_drift": "warn"}},
    }
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Databricks runtime presets ported from ContractForge."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from contractforge_databricks.presets.base import PRESET_META_KEY, Preset, meta
|
|
6
|
+
|
|
7
|
+
# Runtime presets, keyed by preset name. Both entries place their values under
# extensions.databricks and store their descriptor under PRESET_META_KEY.
RUNTIME_PRESETS: dict[str, Preset] = {
    # Switches off cache_source and optimize_after_write.
    "runtime_databricks_serverless": {
        PRESET_META_KEY: meta("runtime_databricks_serverless", "runtime", "runtime", "Databricks Serverless defaults."),
        "extensions": {"databricks": {"cache_source": False, "optimize_after_write": False}},
    },
    # Same two switches as the serverless preset, plus lock_enabled = False.
    "runtime_spark_delta_local": {
        PRESET_META_KEY: meta("runtime_spark_delta_local", "runtime", "runtime", "Local PySpark + Delta defaults."),
        "extensions": {
            "databricks": {
                "cache_source": False,
                "optimize_after_write": False,
                "lock_enabled": False,
            }
        },
    },
}
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""Silver Databricks presets ported from ContractForge."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from contractforge_databricks.presets.base import PRESET_META_KEY, Preset, meta
|
|
6
|
+
|
|
7
|
+
# Silver-layer ingestion presets, keyed by preset name. Every entry sets
# layer="silver" and schema_policy="additive_only"; they differ in write mode,
# Databricks merge strategy and quality-failure handling. The list passed as
# the last meta() argument names contract fields (e.g. merge_keys) —
# presumably the fields a contract must supply to use the preset (TODO confirm
# against contractforge_databricks.presets.base.meta).
SILVER_PRESETS: dict[str, Preset] = {
    # scd1_upsert mode with merge_strategy "delta".
    "silver_scd1_upsert": {
        PRESET_META_KEY: meta("silver_scd1_upsert", "silver", "ingestion", "Silver SCD1 Delta MERGE.", ["merge_keys"]),
        "layer": "silver",
        "mode": "scd1_upsert",
        "extensions": {"databricks": {"merge_strategy": "delta"}},
        "schema_policy": "additive_only",
        "on_quality_fail": "fail",
    },
    # scd1_upsert mode with merge_strategy "delta_by_partition".
    "silver_scd1_partition_upsert": {
        PRESET_META_KEY: meta(
            "silver_scd1_partition_upsert",
            "silver",
            "ingestion",
            "Silver SCD1 MERGE pruned by partition.",
            ["merge_keys", "extensions.databricks.merge_partition_column"],
        ),
        "layer": "silver",
        "mode": "scd1_upsert",
        "extensions": {"databricks": {"merge_strategy": "delta_by_partition"}},
        "schema_policy": "additive_only",
        "on_quality_fail": "fail",
    },
    # scd1_upsert mode with merge_strategy "replace_partitions"; also sets the
    # replace_partitions_source_complete flag to True.
    "silver_replace_partitions": {
        PRESET_META_KEY: meta(
            "silver_replace_partitions",
            "silver",
            "ingestion",
            "Silver replacement of complete partitions.",
            ["extensions.databricks.merge_partition_column"],
        ),
        "layer": "silver",
        "mode": "scd1_upsert",
        "extensions": {
            "databricks": {
                "merge_strategy": "replace_partitions",
                "replace_partitions_source_complete": True,
            }
        },
        "schema_policy": "additive_only",
        "on_quality_fail": "fail",
    },
    # Same values as silver_scd1_upsert; only the metadata differs, which
    # additionally lists watermark_columns.
    "silver_incremental_watermark_upsert": {
        PRESET_META_KEY: meta(
            "silver_incremental_watermark_upsert",
            "silver",
            "ingestion",
            "Silver SCD1 incremental watermark upsert.",
            ["merge_keys", "watermark_columns"],
        ),
        "layer": "silver",
        "mode": "scd1_upsert",
        "extensions": {"databricks": {"merge_strategy": "delta"}},
        "schema_policy": "additive_only",
        "on_quality_fail": "fail",
    },
    # scd1_hash_diff mode; ingestion_ts_utc and __run_id are listed in
    # hash_exclude_columns.
    "silver_hash_diff_append": {
        PRESET_META_KEY: meta("silver_hash_diff_append", "silver", "ingestion", "Silver hash-diff append.", ["hash_keys"]),
        "layer": "silver",
        "mode": "scd1_hash_diff",
        "schema_policy": "additive_only",
        "on_quality_fail": "fail",
        "hash_exclude_columns": ["ingestion_ts_utc", "__run_id"],
    },
    # The only silver preset whose on_quality_fail is "quarantine" rather than "fail".
    "silver_quarantine_ingestion": {
        PRESET_META_KEY: meta(
            "silver_quarantine_ingestion",
            "silver",
            "ingestion",
            "Silver SCD1 with quarantine for row-level rules.",
            ["merge_keys"],
        ),
        "layer": "silver",
        "mode": "scd1_upsert",
        "extensions": {"databricks": {"merge_strategy": "delta"}},
        "schema_policy": "additive_only",
        "on_quality_fail": "quarantine",
    },
    # snapshot_soft_delete mode; no Databricks extension values.
    "silver_snapshot_soft_delete": {
        PRESET_META_KEY: meta(
            "silver_snapshot_soft_delete", "silver", "ingestion", "Silver snapshot soft delete.", ["merge_keys"]
        ),
        "layer": "silver",
        "mode": "snapshot_soft_delete",
        "schema_policy": "additive_only",
        "on_quality_fail": "fail",
    },
    # scd2_historical mode; no Databricks extension values.
    "silver_scd2_historical": {
        PRESET_META_KEY: meta("silver_scd2_historical", "silver", "ingestion", "Silver SCD2 history.", ["merge_keys"]),
        "layer": "silver",
        "mode": "scd2_historical",
        "schema_policy": "additive_only",
        "on_quality_fail": "fail",
    },
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Databricks write-engine preview presets ported from ContractForge."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from contractforge_databricks.presets.base import PRESET_META_KEY, Preset, meta
|
|
6
|
+
|
|
7
|
+
# Write-engine preview presets, keyed by preset name. All three populate
# extensions.databricks.write_engine with fallback_policy "preview_only" and
# explain_selection True; they differ only in the "requested" engine.
WRITE_ENGINE_PRESETS: dict[str, Preset] = {
    # requested = "auto"; the metadata lists no contract fields.
    "write_engine_native_auto_preview": {
        PRESET_META_KEY: meta(
            "write_engine_native_auto_preview",
            "write_engine",
            "modifier",
            "Record Databricks native engine selection evidence without changing execution.",
        ),
        "extensions": {
            "databricks": {
                "write_engine": {"requested": "auto", "fallback_policy": "preview_only", "explain_selection": True}
            }
        },
    },
    # requested = "databricks_sql_merge"; the metadata lists merge_keys.
    "write_engine_databricks_sql_merge_preview": {
        PRESET_META_KEY: meta(
            "write_engine_databricks_sql_merge_preview",
            "write_engine",
            "modifier",
            "Preview Databricks SQL MERGE eligibility while executing the Delta baseline.",
            ["merge_keys"],
        ),
        "extensions": {
            "databricks": {
                "write_engine": {
                    "requested": "databricks_sql_merge",
                    "fallback_policy": "preview_only",
                    "explain_selection": True,
                }
            }
        },
    },
    # requested = "lakeflow_auto_cdc"; the metadata lists merge_keys.
    "write_engine_lakeflow_auto_cdc_preview": {
        PRESET_META_KEY: meta(
            "write_engine_lakeflow_auto_cdc_preview",
            "write_engine",
            "modifier",
            "Preview Lakeflow AUTO CDC eligibility while executing the Delta baseline.",
            ["merge_keys"],
        ),
        "extensions": {
            "databricks": {
                "write_engine": {
                    "requested": "lakeflow_auto_cdc",
                    "fallback_policy": "preview_only",
                    "explain_selection": True,
                }
            }
        },
    },
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from contractforge_databricks.quality.persistence import (
|
|
2
|
+
render_quality_result_insert_sql,
|
|
3
|
+
render_quality_results_insert_sql,
|
|
4
|
+
render_quarantine_reference_insert_sql,
|
|
5
|
+
)
|
|
6
|
+
from contractforge_databricks.quality.evaluation import evaluate_quality
|
|
7
|
+
from contractforge_databricks.quality.registry import (
|
|
8
|
+
clear_quality_rule_registry,
|
|
9
|
+
evaluate_custom_quality_rules,
|
|
10
|
+
evaluate_custom_quality_runtime,
|
|
11
|
+
get_quality_rule,
|
|
12
|
+
is_abort_only_failure,
|
|
13
|
+
list_quality_rules,
|
|
14
|
+
register_quality_rule,
|
|
15
|
+
unregister_quality_rule,
|
|
16
|
+
)
|
|
17
|
+
from contractforge_core.quality import (
|
|
18
|
+
QualityRuleResult,
|
|
19
|
+
quality_status,
|
|
20
|
+
quarantinable_results,
|
|
21
|
+
)
|
|
22
|
+
from contractforge_databricks.quality.sql import render_quality_check_sql
|
|
23
|
+
|
|
24
|
+
# Names exported by the quality package: the result/status helpers re-exported
# from contractforge_core.quality plus the functions imported above from the
# local evaluation, persistence, registry and sql modules.
__all__ = [
    "QualityRuleResult",
    "clear_quality_rule_registry",
    "evaluate_quality",
    "evaluate_custom_quality_rules",
    "evaluate_custom_quality_runtime",
    "get_quality_rule",
    "is_abort_only_failure",
    "list_quality_rules",
    "quality_status",
    "quarantinable_results",
    "register_quality_rule",
    "render_quality_check_sql",
    "render_quality_result_insert_sql",
    "render_quality_results_insert_sql",
    "render_quarantine_reference_insert_sql",
    "unregister_quality_rule",
]
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
"""Databricks runtime evaluation for portable quality intents."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from importlib import import_module
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from contractforge_core.config import MAX_INLINE_ACCEPTED_VALUES
|
|
9
|
+
from contractforge_core.quality import QualityRuleResult, quality_status
|
|
10
|
+
from contractforge_core.semantic import QualityIntent, SemanticContract
|
|
11
|
+
from contractforge_databricks.quality.registry import evaluate_custom_quality_runtime
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def evaluate_quality(
    df: Any,
    contract_or_quality: SemanticContract | tuple[QualityIntent, ...],
) -> tuple[str, tuple[QualityRuleResult, ...], Any, Any, int]:
    """Evaluate quality intents (and custom rules) against ``df``.

    Args:
        df: DataFrame-like object. This function calls ``limit``, ``where``
            and ``count`` on it; the per-rule helpers call further methods.
        contract_or_quality: Either a ``SemanticContract`` (its ``quality``
            intents are used and custom rules are read from its extensions)
            or a plain tuple of ``QualityIntent`` (no custom rules).

    Returns:
        A 5-tuple ``(status, results, valid_df, quarantined_df,
        quarantined_count)``. When neither intents nor custom rules are
        present the result is ``("NOT_CONFIGURED", (), df, df.limit(0), 0)``.
        Otherwise ``status`` comes from ``quality_status`` applied to all rule
        results.

    Raises:
        ValueError: Propagated from ``_validate_columns`` when a rule other
            than ``required_columns`` names a column missing from ``df``, and
            from ``_accepted_values`` when the accepted-value list is too long.
    """
    quality = contract_or_quality.quality if isinstance(contract_or_quality, SemanticContract) else contract_or_quality
    custom_rules = _custom_quality_rules(contract_or_quality)
    if not quality and not custom_rules:
        return "NOT_CONFIGURED", (), df, df.limit(0), 0
    functions = _functions()
    results: list[QualityRuleResult] = []
    # Row-level predicate selecting rows to quarantine; starts as a constant
    # False and is OR-ed with each failing rule's condition.
    quarantine_condition = functions.lit(False)
    has_quarantine_condition = False
    # Computed at most once, and only if a rule needs the total row count.
    row_count: int | None = None

    for intent in quality:
        # required_columns reports missing columns as a rule result; every
        # other rule raises if a referenced column is absent.
        if intent.rule != "required_columns":
            _validate_columns(df, intent.columns, f"quality.{intent.rule}")
        if intent.rule == "required_columns":
            result = _required_columns(df, intent)
        elif intent.rule == "not_null":
            result = _not_null(df, intent, functions)
            # NOTE(review): not_null, accepted_values and max_null_ratio add
            # their rows to the quarantine predicate on any failure, while
            # "expression" does so only when severity == "quarantine" —
            # confirm this difference is intended.
            if result.failed_count:
                quarantine_condition = quarantine_condition | functions.col(intent.columns[0]).isNull()
                has_quarantine_condition = True
        elif intent.rule == "accepted_values":
            result = _accepted_values(df, intent, functions)
            if result.failed_count:
                column = functions.col(intent.columns[0])
                values = _values(intent.value)
                # Same predicate as in _accepted_values: non-null values outside the list.
                quarantine_condition = quarantine_condition | ((~column.isin(values)) & column.isNotNull())
                has_quarantine_condition = True
        elif intent.rule == "max_null_ratio":
            row_count = _row_count(df) if row_count is None else row_count
            result = _max_null_ratio(df, intent, functions, row_count)
            # When the ratio is exceeded, every row that is null in the column is quarantined.
            if result.failed_count:
                quarantine_condition = quarantine_condition | functions.col(intent.columns[0]).isNull()
                has_quarantine_condition = True
        elif intent.rule == "unique_key":
            # Dataset-level rule: contributes no row-level quarantine predicate.
            result = _unique_key(df, intent, functions)
        elif intent.rule == "row_count_minimum":
            row_count = _row_count(df) if row_count is None else row_count
            result = _row_count_minimum(intent, row_count)
        elif intent.rule == "expression":
            result, condition = _expression(df, intent, functions)
            if result.failed_count and result.severity == "quarantine":
                quarantine_condition = quarantine_condition | condition
                has_quarantine_condition = True
        else:
            # Unknown rule names are recorded as a failed "abort" result instead of raising.
            result = QualityRuleResult(intent.name, "FAILED", 1, "abort", f"Unsupported quality rule: {intent.rule}")
        results.append(result)

    custom_results, custom_quarantine_condition = evaluate_custom_quality_runtime(df, custom_rules)
    results.extend(custom_results)
    if custom_quarantine_condition is not None:
        # With no built-in predicate yet, the custom one replaces the
        # constant-False placeholder instead of being OR-ed with it.
        quarantine_condition = (
            quarantine_condition | custom_quarantine_condition
            if has_quarantine_condition
            else custom_quarantine_condition
        )
        has_quarantine_condition = True

    failed = tuple(result for result in results if result.failed_count > 0)
    # Rows are split only when some rule failed and a row-level predicate exists.
    quarantined_df = df.where(quarantine_condition) if failed and has_quarantine_condition else df.limit(0)
    quarantined_count = int(quarantined_df.count()) if failed and has_quarantine_condition else 0
    # If nothing matched the predicate, the input is returned unfiltered.
    valid_df = df.where(~quarantine_condition) if quarantined_count > 0 else df
    return quality_status(tuple(results)), tuple(results), valid_df, quarantined_df, quarantined_count
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _required_columns(df: Any, intent: QualityIntent) -> QualityRuleResult:
    """Report which of the columns named by ``intent`` are absent from ``df``.

    Args:
        df: Object whose optional ``columns`` attribute lists the available
            column names; a missing or empty attribute means "no columns".
        intent: Quality intent whose ``columns`` are the required names.

    Returns:
        A result with severity ``"abort"``. It is ``"FAILED"`` with
        ``failed_count`` equal to the number of missing columns, or
        ``"PASSED"`` with a count of 0. The missing names are included in the
        details under ``"missing"``.
    """
    # Resolve the available columns once rather than once per required column.
    available = set(getattr(df, "columns", ()) or ())
    missing = [column for column in intent.columns if column not in available]
    return QualityRuleResult(
        intent.name,
        "FAILED" if missing else "PASSED",
        len(missing),
        "abort",
        "Required columns are missing." if missing else None,
        {"missing": missing},
    )
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _not_null(df: Any, intent: QualityIntent, functions: Any) -> QualityRuleResult:
    """Count rows in which the first column of ``intent`` is null.

    The result is ``"FAILED"`` when at least one such row exists, otherwise
    ``"PASSED"``; ``failed_count`` is the number of null rows.
    """
    target = intent.columns[0]
    # Cast the null flag to long so it can be summed into a row count.
    null_flag = functions.col(target).isNull().cast("long")
    failed_rows = _agg_int(df, functions.sum(null_flag).alias("failed_rows"), "failed_rows")
    if failed_rows:
        status = "FAILED"
    else:
        status = "PASSED"
    details = {"column": target}
    return QualityRuleResult(intent.name, status, failed_rows, _severity(intent), intent.message, details)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _accepted_values(df: Any, intent: QualityIntent, functions: Any) -> QualityRuleResult:
    """Count non-null rows whose value is outside the accepted list.

    Null values are not counted as violations. The accepted list is taken
    from ``intent.value`` via ``_values``.

    Raises:
        ValueError: If the accepted list is longer than
            ``MAX_INLINE_ACCEPTED_VALUES``.
    """
    target = intent.columns[0]
    candidate = functions.col(target)
    allowed = _values(intent.value)
    allowed_size = len(allowed)
    if allowed_size > MAX_INLINE_ACCEPTED_VALUES:
        raise ValueError(
            f"quality.accepted_values.{target} has {allowed_size} values. "
            "Use a reference table or custom quality evaluator for large value sets."
        )
    outside_list = ~candidate.isin(allowed)
    violation = outside_list & candidate.isNotNull()
    failed_rows = _agg_int(df, functions.sum(violation.cast("long")).alias("failed_rows"), "failed_rows")
    status = "FAILED" if failed_rows else "PASSED"
    details = {"column": target, "values": allowed}
    return QualityRuleResult(intent.name, status, failed_rows, _severity(intent), intent.message, details)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _max_null_ratio(df: Any, intent: QualityIntent, functions: Any, row_count: int) -> QualityRuleResult:
    """Compare the null ratio of the first intent column with ``intent.value``.

    The ratio is ``null rows / row_count`` (0.0 for an empty input). The rule
    fails only when the ratio is strictly greater than the threshold; in that
    case ``failed_count`` is the number of null rows, otherwise 0.
    """
    target = intent.columns[0]
    null_flag = functions.col(target).isNull().cast("long")
    null_rows = _agg_int(df, functions.sum(null_flag).alias("failed_rows"), "failed_rows")
    if row_count == 0:
        ratio = 0.0
    else:
        ratio = null_rows / row_count
    if ratio > float(intent.value):
        status, reported = "FAILED", null_rows
    else:
        status, reported = "PASSED", 0
    details = {"column": target, "ratio": ratio, "max_ratio": intent.value}
    return QualityRuleResult(intent.name, status, reported, _severity(intent), intent.message, details)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _unique_key(df: Any, intent: QualityIntent, functions: Any) -> QualityRuleResult:
    """Count key combinations that occur more than once in ``df``.

    ``failed_count`` is the number of duplicated key groups (not the number
    of duplicate rows). The severity is always ``"abort"``.
    """
    group_sizes = df.groupBy(*intent.columns).count()
    repeated_groups = group_sizes.where(functions.col("count") > 1)
    duplicate_groups = int(repeated_groups.count() or 0)
    status = "FAILED" if duplicate_groups else "PASSED"
    details = {"columns": list(intent.columns)}
    return QualityRuleResult(intent.name, status, duplicate_groups, "abort", intent.message, details)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _row_count_minimum(intent: QualityIntent, row_count: int) -> QualityRuleResult:
    """Check that ``row_count`` reaches the minimum given by ``intent.value``.

    ``failed_count`` is the shortfall (minimum minus actual, never negative).
    The severity is always ``"abort"``.
    """
    required = int(intent.value)
    shortfall = required - row_count
    if shortfall < 0:
        shortfall = 0
    status = "FAILED" if shortfall else "PASSED"
    details = {"min_rows": required, "actual": row_count}
    return QualityRuleResult(intent.name, status, shortfall, "abort", intent.message, details)
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _expression(df: Any, intent: QualityIntent, functions: Any) -> tuple[QualityRuleResult, Any]:
    """Evaluate the SQL expression in ``intent.value`` as a row-level check.

    A row violates the rule when the expression is null or false.

    Returns:
        The rule result together with the violation predicate. The status is
        ``"PASSED"`` with no violations, ``"WARNED"`` when violations exist
        and the severity is ``"warn"``, and ``"FAILED"`` otherwise.
    """
    predicate = functions.expr(str(intent.value))
    violation = predicate.isNull() | (predicate == functions.lit(False))
    failed_rows = _agg_int(df, functions.sum(violation.cast("long")).alias("failed_rows"), "failed_rows")
    level = _severity(intent)
    if not failed_rows:
        status = "PASSED"
    elif level == "warn":
        status = "WARNED"
    else:
        status = "FAILED"
    outcome = QualityRuleResult(intent.name, status, failed_rows, level, intent.message, {"expression": intent.value})
    return outcome, violation
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _agg_int(df: Any, expression: Any, field: str) -> int:
|
|
144
|
+
row = df.agg(expression).collect()[0]
|
|
145
|
+
return int((row[field] if row is not None else 0) or 0)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _row_count(df: Any) -> int:
|
|
149
|
+
return int(df.count() or 0)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _validate_columns(df: Any, columns: tuple[str, ...], context: str) -> None:
    """Ensure every name in ``columns`` is listed in ``df.columns``.

    A ``df`` without a ``columns`` attribute (or with a falsy one) is treated
    as having no columns at all.

    Raises:
        ValueError: if any column is missing; the message starts with
            ``context`` and lists the missing names in their given order.
    """
    available = getattr(df, "columns", ()) or ()
    missing = [name for name in columns if name not in available]
    if not missing:
        return
    raise ValueError(f"{context} not found: {missing}")
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _values(value: object) -> list[Any]:
    """Normalize ``value`` to a list.

    Lists, tuples and sets are copied into a new list; anything else
    (including strings and ``None``) becomes a one-element list.
    """
    is_collection = isinstance(value, (list, tuple, set))
    return list(value) if is_collection else [value]
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _severity(intent: QualityIntent) -> str:
    """Return the intent's severity as a string, defaulting to ``"quarantine"``.

    The default applies whenever ``intent.severity`` is falsy (``None`` or an
    empty string).
    """
    declared = intent.severity
    if not declared:
        return "quarantine"
    return str(declared)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _functions() -> Any:
    """Import ``pyspark.sql`` at call time and return its ``functions`` attribute.

    Importing inside the function keeps pyspark out of module import time.
    """
    spark_sql = import_module("pyspark.sql")
    return spark_sql.functions
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _custom_quality_rules(contract_or_quality: SemanticContract | tuple[QualityIntent, ...]) -> dict[str, dict[str, Any]]:
    """Extract the custom quality rule mapping from a contract's extensions.

    Reads ``extensions["quality"]["custom"]`` and returns a shallow copy of
    it. An empty dict is returned when the argument is not a
    ``SemanticContract`` or when any level of that path is missing or is not
    a dict.
    """
    if not isinstance(contract_or_quality, SemanticContract):
        return {}

    # Walk extensions -> "quality" -> "custom", bailing out on the first
    # level that is not a dict.
    node = contract_or_quality.extensions or {}
    for key in ("quality", "custom"):
        if not isinstance(node, dict):
            return {}
        node = node.get(key)

    if isinstance(node, dict):
        return dict(node)
    return {}
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""Databricks SQL rendering for quality and quarantine persistence."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
|
|
8
|
+
from contractforge_core.quality import QualityRuleResult
|
|
9
|
+
from contractforge_databricks.evidence import QuarantineEvidenceRecord
|
|
10
|
+
from contractforge_databricks.evidence.sql import render_quarantine_insert_sql
|
|
11
|
+
from contractforge_databricks.evidence.tables import evidence_table_names
|
|
12
|
+
from contractforge_databricks.sql import quote_table_name, sql_int, sql_string
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def render_quality_result_insert_sql(
    *,
    run_id: str,
    target_table: str,
    result: QualityRuleResult,
    checked_at_utc: datetime,
    catalog: str = "main",
    schema: str = "ops",
) -> str:
    """Render one INSERT statement persisting a quality rule result.

    The destination is the ``"quality"`` entry of
    ``evidence_table_names(catalog, schema)``. ``observed_value`` receives the
    JSON form of ``result.as_dict()`` and ``details_json`` the JSON form of
    ``result.details`` (``{}`` when falsy). The timestamp is rendered with
    second precision as ``%Y-%m-%d %H:%M:%S``.

    The returned statement carries no trailing semicolon.
    """
    quality_table = evidence_table_names(catalog, schema)["quality"]
    checked_at = checked_at_utc.strftime("%Y-%m-%d %H:%M:%S")
    insert_head = f"INSERT INTO {quote_table_name(quality_table)} "
    column_list = (
        "(run_id, target_table, rule_name, status, severity, failed_count, observed_value, "
        "checked_at_utc, message, details_json) VALUES "
    )
    # One entry per column, in the same order as ``column_list``.
    rendered_values = [
        sql_string(run_id),
        sql_string(target_table),
        sql_string(result.rule_name),
        sql_string(result.status),
        sql_string(result.severity),
        sql_int(result.failed_count),
        _json(result.as_dict()),
        f"TIMESTAMP {sql_string(checked_at)}",
        sql_string(result.message),
        _json(result.details or {}),
    ]
    # str() mirrors how f-string interpolation would render each helper result.
    values_sql = ", ".join(map(str, rendered_values))
    return f"{insert_head}{column_list}({values_sql})"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def render_quality_results_insert_sql(
    *,
    run_id: str,
    target_table: str,
    results: tuple[QualityRuleResult, ...],
    checked_at_utc: datetime,
    catalog: str = "main",
    schema: str = "ops",
) -> str:
    """Render a SQL script persisting every result in ``results``.

    Each result becomes one INSERT statement; statements are separated by
    ``";\\n"`` and the script ends with ``";\\n"``. When there is nothing to
    persist, a single SQL comment line is returned instead.
    """
    shared_arguments = {
        "run_id": run_id,
        "target_table": target_table,
        "checked_at_utc": checked_at_utc,
        "catalog": catalog,
        "schema": schema,
    }
    rendered = [
        render_quality_result_insert_sql(result=item, **shared_arguments)
        for item in results
    ]
    if not rendered:
        return "-- No quality results to persist.\n"
    return ";\n".join(rendered) + ";\n"
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def render_quarantine_reference_insert_sql(
    *,
    run_id: str,
    target_table: str,
    record_ref: str,
    reason: str,
    quarantined_at_utc: datetime,
    catalog: str = "main",
    schema: str = "ops",
) -> str:
    """Render the INSERT statement for one quarantine evidence reference.

    The arguments are packed into a ``QuarantineEvidenceRecord`` and the SQL
    is produced by ``render_quarantine_insert_sql`` for the given
    ``catalog`` and ``schema``.
    """
    return render_quarantine_insert_sql(
        QuarantineEvidenceRecord(
            run_id=run_id,
            target_table=target_table,
            record_ref=record_ref,
            reason=reason,
            quarantined_at_utc=quarantined_at_utc,
        ),
        catalog=catalog,
        schema=schema,
    )
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _json(value: object) -> str:
    """Serialize ``value`` to compact, key-sorted JSON and wrap it via ``sql_string``."""
    encoded = json.dumps(value, sort_keys=True, separators=(",", ":"))
    return sql_string(encoded)
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""Databricks runtime registry for custom quality evaluators."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Callable
|
|
6
|
+
from importlib import import_module
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from contractforge_core.config import VALID_QUALITY_RULE_SEVERITIES
|
|
10
|
+
from contractforge_core.quality import QualityRuleResult, is_abort_only_failure as is_abort_only_failure
|
|
11
|
+
|
|
12
|
+
# Signature of a custom quality evaluator. This module calls it as
# ``evaluator(df, rule_name, config)`` and reads the optional keys
# "failed_count", "severity", "message", "details" and "condition" from the
# returned dict.
QualityRuleEvaluator = Callable[[Any, str, dict[str, Any]], dict[str, Any]]
|
|
13
|
+
# Module-level registry mapping a rule type to its evaluator. It is mutated in
# place by the register/unregister/clear helpers in this module, which strip
# surrounding whitespace from the rule type before using it as a key.
QUALITY_RULE_REGISTRY: dict[str, QualityRuleEvaluator] = {}
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def register_quality_rule(rule_type: str, evaluator: QualityRuleEvaluator, *, overwrite: bool = False) -> None:
    """Register ``evaluator`` under the normalized ``rule_type``.

    Args:
        rule_type: Name of the rule type; surrounding whitespace is stripped.
        evaluator: Callable invoked for rules of this type.
        overwrite: Allow replacing an evaluator that is already registered.

    Raises:
        ValueError: if ``rule_type`` is empty, ``evaluator`` is not callable,
            or the type is already registered and ``overwrite`` is false.
    """
    key = _normalize_rule_type(rule_type)
    if not callable(evaluator):
        raise ValueError("quality rule evaluator must be callable")
    already_registered = key in QUALITY_RULE_REGISTRY
    if already_registered and not overwrite:
        raise ValueError(f"quality rule already registered: {key}")
    QUALITY_RULE_REGISTRY[key] = evaluator
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def unregister_quality_rule(rule_type: str) -> None:
    """Remove the evaluator registered for ``rule_type``, if there is one.

    Unknown rule types are ignored; an empty ``rule_type`` still raises
    ``ValueError`` during normalization.
    """
    key = _normalize_rule_type(rule_type)
    QUALITY_RULE_REGISTRY.pop(key, None)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def get_quality_rule(rule_type: str) -> QualityRuleEvaluator | None:
    """Return the evaluator registered for ``rule_type``, or ``None``.

    Raises:
        ValueError: if ``rule_type`` is empty after stripping whitespace.
    """
    key = _normalize_rule_type(rule_type)
    return QUALITY_RULE_REGISTRY.get(key)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def list_quality_rules() -> tuple[str, ...]:
    """Return the registered rule types as a sorted tuple."""
    registered_types = sorted(QUALITY_RULE_REGISTRY)
    return tuple(registered_types)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def clear_quality_rule_registry() -> None:
    """Remove every registered evaluator.

    The registry dict is emptied in place, so existing references to
    ``QUALITY_RULE_REGISTRY`` observe the change.
    """
    QUALITY_RULE_REGISTRY.clear()
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def evaluate_custom_quality_rules(df: Any, custom_rules: dict[str, dict[str, Any]] | None) -> tuple[QualityRuleResult, ...]:
    """Run every configured custom rule against ``df`` and collect the results.

    For each ``rule_name -> config`` entry the evaluator registered under
    ``config["type"]`` is called with ``(df, rule_name, copy_of_config)`` and
    its payload is turned into a ``QualityRuleResult``.

    Returns:
        One result per rule, in the mapping's iteration order; an empty tuple
        when ``custom_rules`` is empty or ``None``.

    Raises:
        ValueError: if a rule names an unregistered type or resolves to an
            unsupported severity.
    """
    if not custom_rules:
        return ()

    collected: list[QualityRuleResult] = []
    for rule_name, config in custom_rules.items():
        rule_type = str(config.get("type") or "").strip()
        evaluator = QUALITY_RULE_REGISTRY.get(rule_type)
        if evaluator is None:
            raise ValueError(f"quality_rules.custom.{rule_name} uses unregistered type: {rule_type}")
        # The evaluator receives a copy so it cannot mutate the caller's config.
        payload = evaluator(df, str(rule_name), dict(config))
        # Result construction (severity validation, status, details) lives in
        # the shared helper; the raw key is passed so details keep it as-is.
        collected.append(_custom_result(rule_name, rule_type, config, payload))
    return tuple(collected)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def evaluate_custom_quality_runtime(
    df: Any,
    custom_rules: dict[str, dict[str, Any]] | None,
) -> tuple[tuple[QualityRuleResult, ...], Any | None]:
    """Evaluate custom rules and build the combined quarantine condition.

    Every rule is evaluated like in ``evaluate_custom_quality_rules``. In
    addition, each rule that reports failures with severity ``"quarantine"``
    must return a ``"condition"`` in its payload; those conditions are OR-ed
    together, starting from a literal ``False``.

    Returns:
        ``(results, condition)`` where ``condition`` is ``None`` when no rule
        contributed a quarantine condition. ``((), None)`` is returned
        without importing pyspark when ``custom_rules`` is empty or ``None``.

    Raises:
        ValueError: if a rule names an unregistered type, resolves to an
            unsupported severity, or quarantines rows without a condition.
    """
    if not custom_rules:
        return (), None

    functions = import_module("pyspark.sql").functions
    combined_condition = functions.lit(False)
    any_quarantine = False
    collected: list[QualityRuleResult] = []

    for rule_name, config in custom_rules.items():
        rule_type = str(config.get("type") or "").strip()
        evaluator = QUALITY_RULE_REGISTRY.get(rule_type)
        if evaluator is None:
            raise ValueError(f"quality_rules.custom.{rule_name} uses unregistered type: {rule_type}")
        payload = evaluator(df, str(rule_name), dict(config))
        outcome = _custom_result(str(rule_name), rule_type, config, payload)
        collected.append(outcome)

        # Only failing quarantine rules contribute a row-level condition.
        if not outcome.failed_count or outcome.severity != "quarantine":
            continue
        rule_condition = payload.get("condition")
        if rule_condition is None:
            raise ValueError(f"quality_rules.custom.{rule_name} with severity=quarantine must return condition")
        combined_condition = combined_condition | rule_condition
        any_quarantine = True

    if any_quarantine:
        return tuple(collected), combined_condition
    return tuple(collected), None
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _custom_result(
    rule_name: str,
    rule_type: str,
    config: dict[str, Any],
    payload: dict[str, Any],
) -> QualityRuleResult:
    """Build the ``QualityRuleResult`` for one evaluated custom rule.

    Severity and message come from ``payload`` first and fall back to
    ``config``; the severity defaults to ``"abort"``. The result details
    always carry the rule name and type, followed by any ``payload["details"]``
    entries (which may override those two keys).

    Raises:
        ValueError: if the resolved severity is not in
            ``VALID_QUALITY_RULE_SEVERITIES``.
    """
    failures = int(payload.get("failed_count", 0) or 0)

    raw_severity = payload.get("severity") or config.get("severity") or "abort"
    severity = str(raw_severity).strip()
    if severity not in VALID_QUALITY_RULE_SEVERITIES:
        raise ValueError(
            f"quality_rules.custom.{rule_name}.severity={severity!r} is not supported. "
            f"Valid values: {sorted(VALID_QUALITY_RULE_SEVERITIES)}"
        )

    message = payload.get("message") or config.get("message")
    details = {"name": rule_name, "type": rule_type}
    details.update(dict(payload.get("details") or {}))

    return QualityRuleResult(
        rule_name=f"custom:{rule_name}",
        status=_status(failures, severity),
        failed_count=failures,
        severity=severity,  # type: ignore[arg-type]
        message=message,
        details=details,
    )
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _status(failed_count: int, severity: str) -> str:
    """Map a failure count and severity to a result status.

    Returns ``"PASSED"`` when nothing failed, ``"WARNED"`` for failures under
    the ``"warn"`` severity and ``"FAILED"`` for failures under any other
    severity.
    """
    if failed_count <= 0:
        return "PASSED"
    return "WARNED" if severity == "warn" else "FAILED"
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _normalize_rule_type(rule_type: str) -> str:
    """Return ``rule_type`` as a string with surrounding whitespace stripped.

    Raises:
        ValueError: if ``rule_type`` is falsy or blank after stripping.
    """
    cleaned = str(rule_type).strip() if rule_type else ""
    if cleaned:
        return cleaned
    raise ValueError("quality rule type cannot be empty")
|