contractforge-databricks 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contractforge_databricks/__init__.py +172 -0
- contractforge_databricks/adapter.py +69 -0
- contractforge_databricks/annotations/__init__.py +10 -0
- contractforge_databricks/annotations/application.py +52 -0
- contractforge_databricks/annotations/audit.py +49 -0
- contractforge_databricks/annotations/sql.py +142 -0
- contractforge_databricks/api.py +65 -0
- contractforge_databricks/bundles/__init__.py +9 -0
- contractforge_databricks/bundles/assets.py +47 -0
- contractforge_databricks/bundles/project.py +213 -0
- contractforge_databricks/bundles/project_config.py +133 -0
- contractforge_databricks/capabilities/__init__.py +17 -0
- contractforge_databricks/capabilities/builders.py +43 -0
- contractforge_databricks/capabilities/evaluate.py +162 -0
- contractforge_databricks/capabilities/mapping.py +36 -0
- contractforge_databricks/capabilities/models.py +44 -0
- contractforge_databricks/capabilities/runtime.py +111 -0
- contractforge_databricks/capabilities/uc.py +47 -0
- contractforge_databricks/cli.py +196 -0
- contractforge_databricks/cli_deploy.py +98 -0
- contractforge_databricks/cli_governance.py +142 -0
- contractforge_databricks/cli_io.py +91 -0
- contractforge_databricks/cli_maintenance.py +69 -0
- contractforge_databricks/coercion.py +31 -0
- contractforge_databricks/contract_extensions.py +70 -0
- contractforge_databricks/cost/__init__.py +11 -0
- contractforge_databricks/cost/model.py +22 -0
- contractforge_databricks/cost/report.py +65 -0
- contractforge_databricks/cost/sql.py +136 -0
- contractforge_databricks/dashboards/__init__.py +15 -0
- contractforge_databricks/dashboards/control_tables.py +150 -0
- contractforge_databricks/diagnostics/__init__.py +7 -0
- contractforge_databricks/diagnostics/explain.py +40 -0
- contractforge_databricks/environment.py +53 -0
- contractforge_databricks/evidence/__init__.py +98 -0
- contractforge_databricks/evidence/ddl.py +35 -0
- contractforge_databricks/evidence/governance_log.py +175 -0
- contractforge_databricks/evidence/helpers.py +29 -0
- contractforge_databricks/evidence/ops_log.py +210 -0
- contractforge_databricks/evidence/records.py +27 -0
- contractforge_databricks/evidence/run_log.py +74 -0
- contractforge_databricks/evidence/schemas.py +7 -0
- contractforge_databricks/evidence/sql.py +144 -0
- contractforge_databricks/evidence/tables.py +20 -0
- contractforge_databricks/evidence/writer.py +118 -0
- contractforge_databricks/execution/__init__.py +70 -0
- contractforge_databricks/execution/delta_basic.py +57 -0
- contractforge_databricks/execution/hash_diff.py +126 -0
- contractforge_databricks/execution/hash_diff_latest.py +142 -0
- contractforge_databricks/execution/replace_partitions.py +40 -0
- contractforge_databricks/execution/results.py +5 -0
- contractforge_databricks/execution/retry.py +36 -0
- contractforge_databricks/execution/scd2.py +213 -0
- contractforge_databricks/execution/scd2_deletes.py +65 -0
- contractforge_databricks/execution/scd2_late.py +30 -0
- contractforge_databricks/execution/snapshot.py +77 -0
- contractforge_databricks/execution/sql_merge.py +85 -0
- contractforge_databricks/execution/tables.py +98 -0
- contractforge_databricks/execution/windows.py +58 -0
- contractforge_databricks/governance/__init__.py +30 -0
- contractforge_databricks/governance/access.py +185 -0
- contractforge_databricks/governance/application.py +93 -0
- contractforge_databricks/governance/drift.py +49 -0
- contractforge_databricks/governance/runtime.py +60 -0
- contractforge_databricks/governance/sql.py +31 -0
- contractforge_databricks/governance/validation.py +135 -0
- contractforge_databricks/lakeflow/__init__.py +21 -0
- contractforge_databricks/lakeflow/compatibility.py +194 -0
- contractforge_databricks/lakeflow/rendering.py +175 -0
- contractforge_databricks/lineage/__init__.py +7 -0
- contractforge_databricks/lineage/openlineage.py +182 -0
- contractforge_databricks/maintenance/__init__.py +27 -0
- contractforge_databricks/maintenance/retention.py +90 -0
- contractforge_databricks/maintenance/sql.py +68 -0
- contractforge_databricks/metrics/__init__.py +19 -0
- contractforge_databricks/metrics/history.py +21 -0
- contractforge_databricks/metrics/write.py +63 -0
- contractforge_databricks/operations/__init__.py +4 -0
- contractforge_databricks/operations/application.py +38 -0
- contractforge_databricks/operations/sql.py +95 -0
- contractforge_databricks/parity/__init__.py +18 -0
- contractforge_databricks/parity/catalog.py +59 -0
- contractforge_databricks/parity/models.py +7 -0
- contractforge_databricks/parity/scenarios.py +111 -0
- contractforge_databricks/partitioning/__init__.py +3 -0
- contractforge_databricks/partitioning/predicates.py +28 -0
- contractforge_databricks/preparation/__init__.py +47 -0
- contractforge_databricks/preparation/deduplicate.py +87 -0
- contractforge_databricks/preparation/encoding.py +37 -0
- contractforge_databricks/preparation/hashing.py +18 -0
- contractforge_databricks/preparation/pyspark.py +178 -0
- contractforge_databricks/preparation/pyspark_staging.py +70 -0
- contractforge_databricks/preparation/shape.py +209 -0
- contractforge_databricks/preparation/shape_validation.py +94 -0
- contractforge_databricks/preparation/staging.py +17 -0
- contractforge_databricks/preparation/zip_arrays.py +51 -0
- contractforge_databricks/presets/__init__.py +3 -0
- contractforge_databricks/presets/base.py +24 -0
- contractforge_databricks/presets/bronze.py +57 -0
- contractforge_databricks/presets/catalog.py +22 -0
- contractforge_databricks/presets/core.py +134 -0
- contractforge_databricks/presets/gold.py +62 -0
- contractforge_databricks/presets/modifiers.py +51 -0
- contractforge_databricks/presets/runtime.py +22 -0
- contractforge_databricks/presets/silver.py +101 -0
- contractforge_databricks/presets/write_engine.py +57 -0
- contractforge_databricks/quality/__init__.py +41 -0
- contractforge_databricks/quality/evaluation.py +178 -0
- contractforge_databricks/quality/persistence.py +81 -0
- contractforge_databricks/quality/registry.py +134 -0
- contractforge_databricks/quality/results.py +17 -0
- contractforge_databricks/quality/sql.py +113 -0
- contractforge_databricks/rendering/__init__.py +11 -0
- contractforge_databricks/rendering/bundle.py +93 -0
- contractforge_databricks/rendering/markdown.py +50 -0
- contractforge_databricks/rendering/names.py +56 -0
- contractforge_databricks/results.py +15 -0
- contractforge_databricks/runtime/__init__.py +101 -0
- contractforge_databricks/runtime/available_now.py +147 -0
- contractforge_databricks/runtime/bundles.py +211 -0
- contractforge_databricks/runtime/cache.py +20 -0
- contractforge_databricks/runtime/control_tables.py +19 -0
- contractforge_databricks/runtime/deploy.py +197 -0
- contractforge_databricks/runtime/detection.py +114 -0
- contractforge_databricks/runtime/dry_run.py +46 -0
- contractforge_databricks/runtime/errors.py +54 -0
- contractforge_databricks/runtime/file_selection.py +109 -0
- contractforge_databricks/runtime/finalization.py +168 -0
- contractforge_databricks/runtime/governance.py +37 -0
- contractforge_databricks/runtime/hooks.py +45 -0
- contractforge_databricks/runtime/http_file.py +37 -0
- contractforge_databricks/runtime/http_retry.py +15 -0
- contractforge_databricks/runtime/http_safety.py +9 -0
- contractforge_databricks/runtime/json_materialization.py +97 -0
- contractforge_databricks/runtime/lineage.py +164 -0
- contractforge_databricks/runtime/maintenance.py +43 -0
- contractforge_databricks/runtime/merge_validation.py +98 -0
- contractforge_databricks/runtime/metadata.py +21 -0
- contractforge_databricks/runtime/metrics.py +34 -0
- contractforge_databricks/runtime/models.py +32 -0
- contractforge_databricks/runtime/options.py +33 -0
- contractforge_databricks/runtime/orchestration_context.py +185 -0
- contractforge_databricks/runtime/orchestrator.py +147 -0
- contractforge_databricks/runtime/partitioning.py +93 -0
- contractforge_databricks/runtime/quality_quarantine.py +92 -0
- contractforge_databricks/runtime/rest_api.py +46 -0
- contractforge_databricks/runtime/rest_auth.py +21 -0
- contractforge_databricks/runtime/rest_pagination.py +21 -0
- contractforge_databricks/runtime/run_payload.py +177 -0
- contractforge_databricks/runtime/schema.py +106 -0
- contractforge_databricks/runtime/source_metadata.py +30 -0
- contractforge_databricks/runtime/source_registry.py +43 -0
- contractforge_databricks/runtime/source_schema.py +24 -0
- contractforge_databricks/runtime/sources.py +208 -0
- contractforge_databricks/runtime/spark.py +183 -0
- contractforge_databricks/runtime/spark_defaults.py +35 -0
- contractforge_databricks/runtime/storage_auth.py +132 -0
- contractforge_databricks/runtime/streaming.py +131 -0
- contractforge_databricks/runtime/success.py +104 -0
- contractforge_databricks/runtime/utils.py +52 -0
- contractforge_databricks/runtime/watermark.py +71 -0
- contractforge_databricks/runtime/windows.py +184 -0
- contractforge_databricks/runtime/write.py +66 -0
- contractforge_databricks/runtime/write_flow.py +146 -0
- contractforge_databricks/runtime/write_strategy.py +40 -0
- contractforge_databricks/schema/__init__.py +21 -0
- contractforge_databricks/schema/diff.py +11 -0
- contractforge_databricks/schema/policy.py +33 -0
- contractforge_databricks/schema/sync.py +23 -0
- contractforge_databricks/security/__init__.py +21 -0
- contractforge_databricks/security/errors.py +5 -0
- contractforge_databricks/security/redaction.py +5 -0
- contractforge_databricks/security/secrets.py +114 -0
- contractforge_databricks/security/source_policy.py +17 -0
- contractforge_databricks/shapes/__init__.py +3 -0
- contractforge_databricks/shapes/sql.py +123 -0
- contractforge_databricks/sources/__init__.py +67 -0
- contractforge_databricks/sources/artifacts.py +100 -0
- contractforge_databricks/sources/autoloader.py +48 -0
- contractforge_databricks/sources/bounded_streams.py +44 -0
- contractforge_databricks/sources/classification.py +115 -0
- contractforge_databricks/sources/delta_share.py +21 -0
- contractforge_databricks/sources/files.py +48 -0
- contractforge_databricks/sources/http_file.py +46 -0
- contractforge_databricks/sources/interpret.py +76 -0
- contractforge_databricks/sources/jdbc.py +32 -0
- contractforge_databricks/sources/metadata.py +18 -0
- contractforge_databricks/sources/native_passthrough.py +33 -0
- contractforge_databricks/sources/rds_iam.py +15 -0
- contractforge_databricks/sources/rds_iam_runtime.py +191 -0
- contractforge_databricks/sources/rest_api.py +33 -0
- contractforge_databricks/sources/support.py +50 -0
- contractforge_databricks/sources/table_refs.py +65 -0
- contractforge_databricks/sql/__init__.py +4 -0
- contractforge_databricks/sql/identifiers.py +17 -0
- contractforge_databricks/sql/literals.py +36 -0
- contractforge_databricks/state/__init__.py +39 -0
- contractforge_databricks/state/ddl.py +24 -0
- contractforge_databricks/state/migrations.py +146 -0
- contractforge_databricks/state/queries.py +149 -0
- contractforge_databricks/state/sql.py +116 -0
- contractforge_databricks/state/tables.py +9 -0
- contractforge_databricks/state/writer.py +83 -0
- contractforge_databricks/templates/__init__.py +15 -0
- contractforge_databricks/templates/catalog.py +205 -0
- contractforge_databricks/templates/catalog_parity.py +85 -0
- contractforge_databricks/templates/core.py +83 -0
- contractforge_databricks/templates/enrichment.py +175 -0
- contractforge_databricks/transforms/__init__.py +3 -0
- contractforge_databricks/transforms/sql.py +118 -0
- contractforge_databricks/watermark/__init__.py +6 -0
- contractforge_databricks/watermark/sql.py +91 -0
- contractforge_databricks/write_modes/__init__.py +20 -0
- contractforge_databricks/write_modes/registry.py +44 -0
- contractforge_databricks/write_modes/sql.py +33 -0
- contractforge_databricks/write_modes/strategy.py +192 -0
- contractforge_databricks-0.1.0.dist-info/METADATA +34 -0
- contractforge_databricks-0.1.0.dist-info/RECORD +220 -0
- contractforge_databricks-0.1.0.dist-info/WHEEL +4 -0
- contractforge_databricks-0.1.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""Template helpers for Databricks adapter contract examples."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from copy import deepcopy
|
|
6
|
+
import json
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from contractforge_databricks.templates.catalog import BUILTIN_CONTRACT_TEMPLATES, TEMPLATE_META_KEY, ContractTemplate
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def list_contract_templates() -> list[str]:
    """Return the names of the built-in contract templates in sorted order."""
    names = list(BUILTIN_CONTRACT_TEMPLATES)
    names.sort()
    return names
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def get_contract_template(name: str) -> ContractTemplate:
    """Return a deep copy of the built-in template registered under ``name``.

    Raises:
        ValueError: if ``name`` is not a key of ``BUILTIN_CONTRACT_TEMPLATES``;
            the message lists the valid template names.
    """
    if name in BUILTIN_CONTRACT_TEMPLATES:
        # Hand out a copy so callers can mutate the result freely.
        return deepcopy(BUILTIN_CONTRACT_TEMPLATES[name])
    valid_names = list_contract_templates()
    raise ValueError(f"Template not found: {name}. Valid templates: {valid_names}")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def contract_template_files(name: str) -> dict[str, dict[str, Any]]:
    """Return the contract sections of template ``name`` keyed by section name.

    Only the sections that exist in the template are included, visited in the
    order ingestion, annotations, operations, access. Each value is a deep
    copy. Errors raised by ``get_contract_template`` propagate unchanged.
    """
    template = get_contract_template(name)
    files: dict[str, dict[str, Any]] = {}
    for section in ("ingestion", "annotations", "operations", "access"):
        if section in template:
            files[section] = deepcopy(template[section])
    return files
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def contract_template_details(name: str) -> dict[str, Any]:
    """Summarise template ``name`` as a flat dictionary.

    The summary combines the template's metadata entry (stored under
    ``TEMPLATE_META_KEY``) with selected fields of its ingestion section.
    Missing metadata falls back to an empty description, the ``"custom"``
    category and a recommendation priority of 100. Errors raised by
    ``get_contract_template`` propagate unchanged.
    """
    template = get_contract_template(name)
    meta = dict(template.get(TEMPLATE_META_KEY) or {})
    ingestion = template.get("ingestion") or {}
    present_sections = [
        section
        for section in ("ingestion", "annotations", "operations", "access")
        if section in template
    ]

    details: dict[str, Any] = {"name": name}
    details["description"] = meta.get("description", "")
    details["category"] = meta.get("category", "custom")
    details["files"] = present_sections
    details["target"] = ingestion.get("target")
    # The summary key is plural while the ingestion key is singular.
    details["presets"] = ingestion.get("preset")
    details["source"] = _source_kind(template)
    details["mode"] = ingestion.get("mode")
    details["recommendation_priority"] = meta.get("recommendation_priority", 100)
    return details
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def recommend_contract_templates(
    *,
    layer: str | None = None,
    source: str | None = None,
    mode: str | None = None,
    pattern: str | None = None,
    limit: int | None = None,
) -> list[dict[str, Any]]:
    """Rank the built-in templates against optional free-text criteria.

    Each criterion is normalised with ``_norm`` and searched as a substring of
    the normalised JSON dump of the template (name, details and full body).
    A hit adds the criterion's weight (layer 4, source 3, mode 3, pattern 2)
    to the template's score and records the criterion name under ``matched``.

    When at least one criterion is supplied, templates that match none are
    dropped. When no criterion is supplied, every template is returned with a
    score of 0.

    Results are ordered by descending score, then ascending
    ``recommendation_priority``, then name. ``limit`` truncates the list;
    negative values are treated as 0 and ``None`` means no truncation.

    Returns:
        One dictionary per template: the output of
        ``contract_template_details`` plus ``score`` and ``matched``.
    """
    weights = (("layer", 4), ("source", 3), ("mode", 3), ("pattern", 2))
    criteria = {"layer": _norm(layer), "source": _norm(source), "mode": _norm(mode), "pattern": _norm(pattern)}
    # Only non-empty criteria can ever score, so resolve them once up front.
    active = [(key, criteria[key], weight) for key, weight in weights if criteria[key]]

    matches = []
    for name in list_contract_templates():
        details = contract_template_details(name)
        score = 0
        matched: list[str] = []
        if active:
            # The haystack costs a second deep copy plus a JSON dump per
            # template, so it is only built when something will be searched.
            haystack = _norm(json.dumps({"name": name, "details": details, "template": get_contract_template(name)}))
            for key, needle, weight in active:
                if needle in haystack:
                    score += weight
                    matched.append(key)
            if score == 0:
                continue
        matches.append({**details, "score": score, "matched": matched})

    matches.sort(key=lambda item: (-int(item["score"]), int(item["recommendation_priority"]), str(item["name"])))
    return matches[: max(0, int(limit))] if limit is not None else matches
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _source_kind(template: ContractTemplate) -> str:
|
|
76
|
+
source = (template.get("ingestion") or {}).get("source")
|
|
77
|
+
if isinstance(source, dict):
|
|
78
|
+
return str(source.get("type") or source.get("connector") or "connector")
|
|
79
|
+
return str(source or "unknown")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _norm(value: Any) -> str:
|
|
83
|
+
return str(value or "").strip().lower().replace("-", "_")
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
"""Parity enrichments for Databricks contract templates."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
# Alias for one contract template: a dictionary with string keys and arbitrary values.
ContractTemplate = dict[str, Any]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def enrich_contractforge_parity(templates: dict[str, ContractTemplate]) -> None:
    """Apply every per-template parity enrichment to ``templates`` in place.

    The enrichers run in a fixed order and each one mutates the entry it is
    named after. Nothing is returned.
    """
    enrichers = (
        _bronze_rest_api_incremental,
        _bronze_http_file_csv_snapshot,
        _bronze_autoloader_governed_delta,
        _silver_jdbc_scd1_upsert,
        _silver_jdbc_rds_iam_hash_diff,
        _silver_scd2_history,
        _silver_snapshot_soft_delete,
    )
    for enrich in enrichers:
        enrich(templates)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _bronze_rest_api_incremental(templates: dict[str, ContractTemplate]) -> None:
|
|
22
|
+
ingestion = templates["bronze_rest_api_incremental"]["ingestion"]
|
|
23
|
+
ingestion.update(
|
|
24
|
+
{
|
|
25
|
+
"source": {
|
|
26
|
+
"type": "rest_api",
|
|
27
|
+
"name": "orders_api",
|
|
28
|
+
"request": {"url": "https://api.example.com/orders", "params": {"status": "open"}},
|
|
29
|
+
"auth": {"type": "bearer_token", "token": "{{ secret:orders_api/token }}"},
|
|
30
|
+
"pagination": {"type": "cursor", "cursor_param": "cursor", "next_cursor_path": "$.next"},
|
|
31
|
+
"response": {"records_path": "$.data"},
|
|
32
|
+
"incremental": {
|
|
33
|
+
"watermark_param": "updated_after",
|
|
34
|
+
"watermark_header": "X-Watermark",
|
|
35
|
+
"initial_value": "1970-01-01T00:00:00Z",
|
|
36
|
+
},
|
|
37
|
+
"limits": {"max_pages": 100, "timeout_seconds": 60, "retry_attempts": 3},
|
|
38
|
+
},
|
|
39
|
+
"mode": "scd0_append",
|
|
40
|
+
"watermark_columns": ["updated_at"],
|
|
41
|
+
"schema_policy": "additive_only",
|
|
42
|
+
"quality_rules": {
|
|
43
|
+
"not_null": ["id"],
|
|
44
|
+
"expressions": [
|
|
45
|
+
{
|
|
46
|
+
"name": "valid_updated_at",
|
|
47
|
+
"expression": "updated_at IS NOT NULL",
|
|
48
|
+
"severity": "warn",
|
|
49
|
+
"message": "updated_at is missing from the API payload.",
|
|
50
|
+
}
|
|
51
|
+
],
|
|
52
|
+
},
|
|
53
|
+
}
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _bronze_http_file_csv_snapshot(templates: dict[str, ContractTemplate]) -> None:
|
|
58
|
+
ingestion = templates["bronze_http_file_csv_snapshot"]["ingestion"]
|
|
59
|
+
ingestion.update(
|
|
60
|
+
{
|
|
61
|
+
"source": {
|
|
62
|
+
"type": "http_csv",
|
|
63
|
+
"url": "https://example.com/public/orders.csv",
|
|
64
|
+
"format": "csv",
|
|
65
|
+
"options": {"header": True, "multiLine": False},
|
|
66
|
+
"read": {
|
|
67
|
+
"source_complete": True,
|
|
68
|
+
"schema": "order_id STRING, order_date DATE, customer_id STRING, amount DOUBLE, updated_at TIMESTAMP",
|
|
69
|
+
"timeout_seconds": 120,
|
|
70
|
+
},
|
|
71
|
+
},
|
|
72
|
+
"schema_policy": "additive_only",
|
|
73
|
+
"quality_rules": {
|
|
74
|
+
"not_null": ["order_id"],
|
|
75
|
+
"expressions": [
|
|
76
|
+
{
|
|
77
|
+
"name": "valid_amount",
|
|
78
|
+
"expression": "amount IS NULL OR amount >= 0",
|
|
79
|
+
"severity": "warn",
|
|
80
|
+
"message": "Negative amount in HTTP CSV.",
|
|
81
|
+
}
|
|
82
|
+
],
|
|
83
|
+
},
|
|
84
|
+
}
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _bronze_autoloader_governed_delta(templates: dict[str, ContractTemplate]) -> None:
|
|
89
|
+
ingestion = templates["bronze_autoloader_governed_delta"]["ingestion"]
|
|
90
|
+
ingestion.update(
|
|
91
|
+
{
|
|
92
|
+
"preset": [
|
|
93
|
+
"bronze_autoloader_append",
|
|
94
|
+
"runtime_databricks_serverless",
|
|
95
|
+
"delta_cdf_enabled",
|
|
96
|
+
"delta_liquid_clustering",
|
|
97
|
+
"write_engine_native_auto_preview",
|
|
98
|
+
"governance_uc_basic",
|
|
99
|
+
],
|
|
100
|
+
"source": {
|
|
101
|
+
"type": "incremental_files",
|
|
102
|
+
"format": "json",
|
|
103
|
+
"path": "/Volumes/main/landing/orders_json",
|
|
104
|
+
"progress_location": "/Volumes/main/ops/checkpoints/orders_json",
|
|
105
|
+
"schema_tracking_location": "/Volumes/main/ops/autoloader_schemas/orders_json",
|
|
106
|
+
"read": {"max_files_per_trigger": 50, "include_existing_files": True},
|
|
107
|
+
"schema_hints": "order_id STRING, event_time TIMESTAMP, customer_id STRING",
|
|
108
|
+
"trigger": "available_now",
|
|
109
|
+
},
|
|
110
|
+
"extensions": {"databricks": {"cluster_columns": ["order_id"]}},
|
|
111
|
+
"idempotency_key": "b_orders_events_available_now",
|
|
112
|
+
}
|
|
113
|
+
)
|
|
114
|
+
templates["bronze_autoloader_governed_delta"]["access"]["grants"] = [
|
|
115
|
+
{"principal": "data-engineering", "privileges": ["SELECT", "MODIFY"]},
|
|
116
|
+
{"principal": "sales-analytics", "privileges": ["SELECT"]},
|
|
117
|
+
]
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _silver_jdbc_scd1_upsert(templates: dict[str, ContractTemplate]) -> None:
|
|
121
|
+
ingestion = templates["silver_jdbc_scd1_upsert"]["ingestion"]
|
|
122
|
+
ingestion.update(
|
|
123
|
+
{
|
|
124
|
+
"source": {
|
|
125
|
+
"type": "postgres",
|
|
126
|
+
"url": "{{ secret:erp/postgres_url }}",
|
|
127
|
+
"table": "public.orders",
|
|
128
|
+
"auth": {"type": "basic", "username": "{{ secret:erp/user }}", "password": "{{ secret:erp/password }}"},
|
|
129
|
+
"incremental": {"watermark_column": "updated_at", "initial_value": "1970-01-01 00:00:00"},
|
|
130
|
+
"read": {"fetchsize": 10000, "partition_column": "id", "lower_bound": 1, "upper_bound": 10000000, "num_partitions": 16},
|
|
131
|
+
},
|
|
132
|
+
"transform": {"deduplicate": {"keys": ["order_id"], "order_by": "updated_at DESC NULLS LAST"}},
|
|
133
|
+
"column_mapping": {"id": "order_id"},
|
|
134
|
+
"quality_rules": {
|
|
135
|
+
"not_null": ["order_id", "updated_at"],
|
|
136
|
+
"unique_key": ["order_id"],
|
|
137
|
+
"expressions": [{"name": "positive_amount", "expression": "amount >= 0", "severity": "quarantine", "message": "Negative amount."}],
|
|
138
|
+
},
|
|
139
|
+
}
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _silver_jdbc_rds_iam_hash_diff(templates: dict[str, ContractTemplate]) -> None:
|
|
144
|
+
ingestion = templates["silver_jdbc_rds_iam_hash_diff"]["ingestion"]
|
|
145
|
+
source = dict(ingestion.get("source") or {})
|
|
146
|
+
source.update(
|
|
147
|
+
{
|
|
148
|
+
"type": "postgres",
|
|
149
|
+
"url": "jdbc:postgresql://orders.cluster-abcdefghijkl.us-east-1.rds.amazonaws.com:5432/erp",
|
|
150
|
+
"table": "public.orders",
|
|
151
|
+
"auth": {"type": "rds_iam", "username": "contractforge_app", "region": "us-east-1"},
|
|
152
|
+
}
|
|
153
|
+
)
|
|
154
|
+
ingestion["source"] = source
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _silver_scd2_history(templates: dict[str, ContractTemplate]) -> None:
|
|
158
|
+
ingestion = templates["silver_scd2_history"]["ingestion"]
|
|
159
|
+
ingestion.update(
|
|
160
|
+
{
|
|
161
|
+
"transform": {"deduplicate": {"keys": ["customer_id"], "order_by": "updated_at DESC NULLS LAST"}},
|
|
162
|
+
"hash_exclude_columns": ["updated_at", "ingestion_ts_utc", "__run_id"],
|
|
163
|
+
"quality_rules": {
|
|
164
|
+
"not_null": ["customer_id"],
|
|
165
|
+
"expressions": [{"name": "valid_period", "expression": "updated_at IS NOT NULL", "severity": "abort", "message": "updated_at is required for SCD2 history."}],
|
|
166
|
+
},
|
|
167
|
+
}
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _silver_snapshot_soft_delete(templates: dict[str, ContractTemplate]) -> None:
|
|
172
|
+
ingestion = templates["silver_snapshot_soft_delete"]["ingestion"]
|
|
173
|
+
source = dict(ingestion.get("source") or {})
|
|
174
|
+
source["read"] = {"source_complete": True}
|
|
175
|
+
ingestion.update({"source": source, "quality_rules": {"not_null": ["device_id"], "unique_key": ["device_id"]}})
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""Databricks SQL review rendering for portable transform intent."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from contractforge_core.semantic import SemanticContract
|
|
8
|
+
from contractforge_databricks.sql import quote_identifier, quote_table_name
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def render_transform_sql(
    contract: SemanticContract,
    *,
    source_view: str = "${prepared_source_view}",
    output_view: str = "${transformed_view}",
) -> str:
    """Render the contract's transform as a ``CREATE OR REPLACE TEMP VIEW`` statement.

    Args:
        contract: Contract whose ``transform.raw`` mapping is rendered.
        source_view: Name of the view selected from.
        output_view: Name of the view being created.

    Returns:
        A SQL comment line when the contract declares no transform; the
        deduplicating statement when ``transform.raw["deduplicate"]`` is a
        dictionary; otherwise a plain projection statement ending in ``;``.
    """
    declared = contract.transform
    if not declared:
        return "-- No transform declared.\n"

    raw = declared.raw
    projection = _select_items(raw)
    dedup_spec = raw.get("deduplicate")
    if isinstance(dedup_spec, dict):
        return _render_deduplicate(
            select_items=projection,
            source_view=source_view,
            output_view=output_view,
            deduplicate=dedup_spec,
        )

    # The output view is quoted before the source view, following statement order.
    create_line = f"CREATE OR REPLACE TEMP VIEW {quote_table_name(output_view)} AS"
    select_body = ",\n".join(f" {item}" for item in projection)
    from_line = f"FROM {quote_table_name(source_view)}"
    statement = "\n".join(
        (
            "-- Transform SQL review artifact.",
            "-- Databricks runtime may execute equivalent PySpark preparation for complex cases.",
            create_line,
            "SELECT",
            select_body,
            from_line,
        )
    )
    return statement + ";\n"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _select_items(transform: dict[str, Any]) -> list[str]:
|
|
41
|
+
items: list[str] = ["*"]
|
|
42
|
+
for column, data_type in transform.get("cast", {}).items():
|
|
43
|
+
items.append(f"CAST({quote_identifier(column)} AS {data_type}) AS {quote_identifier(column)}")
|
|
44
|
+
for column, config in transform.get("standardize", {}).items():
|
|
45
|
+
items.append(f"{_standardize_expr(column, config)} AS {quote_identifier(column)}")
|
|
46
|
+
for column, expression in transform.get("derive", {}).items():
|
|
47
|
+
items.append(f"{expression} AS {quote_identifier(column)}")
|
|
48
|
+
for column, source_columns in transform.get("composite_keys", {}).items():
|
|
49
|
+
columns = [source_columns] if isinstance(source_columns, str) else list(source_columns or ())
|
|
50
|
+
parts = ", ".join(f"coalesce(CAST({quote_identifier(str(item))} AS STRING), '')" for item in columns)
|
|
51
|
+
items.append(f"concat_ws('|', {parts}) AS {quote_identifier(column)}")
|
|
52
|
+
return items
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _standardize_expr(column: str, config: dict[str, Any]) -> str:
    """Wrap the quoted ``column`` in the SQL functions enabled by ``config``.

    Truthy flags are applied innermost to outermost in this fixed order:
    ``normalize_whitespace``, ``trim``, ``lower``, ``upper``,
    ``empty_as_null``. With no flag set the quoted column is returned as is.
    """
    wrappers = (
        ("normalize_whitespace", "regexp_replace({}, '\\\\s+', ' ')"),
        ("trim", "trim({})"),
        ("lower", "lower({})"),
        ("upper", "upper({})"),
        ("empty_as_null", "nullif({}, '')"),
    )
    expr = quote_identifier(column)
    for flag, wrapper in wrappers:
        if config.get(flag):
            expr = wrapper.format(expr)
    return expr
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _render_deduplicate(
    *,
    select_items: list[str],
    source_view: str,
    output_view: str,
    deduplicate: dict[str, Any],
) -> str:
    """Render a temp-view statement that keeps one row per deduplication key.

    The statement projects ``select_items`` from ``source_view``, numbers the
    rows of each key partition with ``row_number()`` using the configured
    ordering, and keeps only row number 1 while dropping the helper column.

    ``deduplicate["keys"]`` may be a single column name or an iterable of
    names. ``ValueError`` from ``_order_by`` propagates when ``order_by`` is
    not usable.
    """
    raw_keys = deduplicate.get("keys")
    key_columns = [raw_keys] if isinstance(raw_keys, str) else [str(key) for key in raw_keys or ()]
    ordering = _order_by(deduplicate.get("order_by"))
    partition_by = ", ".join(map(quote_identifier, key_columns))

    # The output view is quoted before the source view, following statement order.
    target = quote_table_name(output_view)
    projection = ",\n".join(f" {item}" for item in select_items)
    origin = quote_table_name(source_view)

    return (
        "-- Transform SQL review artifact.\n"
        "-- Databricks runtime may execute equivalent PySpark preparation for complex cases.\n"
        f"CREATE OR REPLACE TEMP VIEW {target} AS\n"
        "WITH transformed AS (\n"
        " SELECT\n"
        f"{projection}\n"
        f" FROM {origin}\n"
        "), ranked AS (\n"
        " SELECT *,\n"
        f" row_number() OVER (PARTITION BY {partition_by} ORDER BY {ordering}) AS __cf_row_number\n"
        " FROM transformed\n"
        ")\n"
        "SELECT * EXCEPT (__cf_row_number)\n"
        "FROM ranked\n"
        "WHERE __cf_row_number = 1"
        ";\n"
    )
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _order_by(value: object) -> str:
|
|
105
|
+
if isinstance(value, str):
|
|
106
|
+
return value
|
|
107
|
+
if isinstance(value, list):
|
|
108
|
+
parts = []
|
|
109
|
+
for item in value:
|
|
110
|
+
if not isinstance(item, dict):
|
|
111
|
+
continue
|
|
112
|
+
clause = quote_identifier(str(item["column"]))
|
|
113
|
+
clause += f" {str(item.get('direction', 'desc')).upper()}"
|
|
114
|
+
if item.get("nulls"):
|
|
115
|
+
clause += f" NULLS {str(item['nulls']).upper()}"
|
|
116
|
+
parts.append(clause)
|
|
117
|
+
return ", ".join(parts)
|
|
118
|
+
raise ValueError("transform.deduplicate.order_by is required")
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""Databricks SQL helpers for ContractForge typed watermarks."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from contractforge_core.watermark import decode_watermark_value
|
|
6
|
+
from contractforge_databricks.sql import quote_identifier, quote_table_name, sql_string
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def render_watermark_filter_predicate(*, columns: tuple[str, ...], watermark_value: str | None) -> str:
    """Render a SQL predicate selecting rows strictly after a stored watermark.

    Returns the literal ``true`` when there are no columns, no watermark
    value, or the watermark decodes to nothing. For one column the predicate
    is a single ``>`` comparison. For several columns it is a parenthesised
    OR of branches, where branch *i* requires equality on the first *i*
    columns and ``>`` on column *i* (lexicographic ordering).
    """
    if not (columns and watermark_value):
        return "true"
    decoded = decode_watermark_value(watermark_value, columns)
    if not decoded:
        return "true"

    def compare(name: str, operator: str) -> str:
        # Quote first, then look up the decoded entry, as one comparison term.
        quoted = quote_identifier(name)
        entry = decoded[name]
        return f"{quoted} {operator} {_typed_literal(entry.value, entry.type)}"

    if len(columns) == 1:
        return compare(columns[0], ">")

    branches: list[str] = []
    for position, column in enumerate(columns):
        terms = [compare(earlier, "=") for earlier in columns[:position]]
        terms.append(compare(column, ">"))
        branches.append(f"({' AND '.join(terms)})")
    return f"({' OR '.join(branches)})"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def render_select_watermark_candidate_sql(
    *,
    table_name: str,
    columns: tuple[str, ...],
    types: dict[str, str] | None = None,
) -> str:
    """Render SQL that yields the next watermark candidate as a JSON string.

    The query returns one ``watermark_value`` column: a JSON object mapping
    each watermark column to ``{"type": ..., "value": ...}``, where the value
    is cast to STRING and the type comes from ``types`` (default ``string``).
    A single column uses ``MAX(column)``; several columns take the ``MAX`` of
    a struct of all columns.

    Raises:
        ValueError: if ``columns`` is empty.
    """
    if not columns:
        raise ValueError("watermark columns must not be empty")
    type_map = types or {}

    if len(columns) > 1:
        struct_fields = ", ".join(f"{sql_string(name)}, {quote_identifier(name)}" for name in columns)
        json_fields = _candidate_json_fields(columns, type_map)
        statement = [
            "WITH candidate AS (",
            f" SELECT MAX(named_struct({struct_fields})) AS wm",
            f" FROM {quote_table_name(table_name)}",
            ")",
            "SELECT",
            f" to_json(named_struct({json_fields})) AS watermark_value",
            "FROM candidate",
        ]
        return "\n".join(statement)

    only = columns[0]
    name_literal = sql_string(only)
    type_literal = sql_string(type_map.get(only, "string"))
    quoted = quote_identifier(only)
    table = quote_table_name(table_name)
    return (
        "SELECT\n"
        f" to_json(named_struct({name_literal}, named_struct(\n"
        f" 'type', {type_literal},\n"
        f" 'value', CAST(MAX({quoted}) AS STRING)\n"
        " ))) AS watermark_value\n"
        f"FROM {table}"
    )
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _candidate_json_fields(columns: tuple[str, ...], types: dict[str, str]) -> str:
    """Render the ``named_struct`` arguments describing each watermark column.

    For every column this emits the quoted column name followed by a nested
    ``named_struct`` carrying its declared type (``'string'`` when the column is
    absent from ``types``) and the value read from the ``wm`` struct, cast to
    STRING. Entries are joined with ``", "``; an empty ``columns`` yields ``""``.
    """
    return ", ".join(
        f"{sql_string(name)}, "
        f"named_struct('type', {sql_string(types.get(name, 'string'))}, "
        f"'value', CAST(wm.{quote_identifier(name)} AS STRING))"
        for name in columns
    )
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _typed_literal(value: str | None, data_type: str) -> str:
    """Render ``value`` as a SQL string literal wrapped in a CAST to ``data_type``.

    ``data_type`` is inserted into the SQL text verbatim.
    NOTE(review): callers presumably pass only trusted type names - confirm.
    """
    literal = sql_string(value)
    return "CAST({} AS {})".format(literal, data_type)
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from contractforge_databricks.write_modes.registry import (
|
|
2
|
+
clear_write_mode_registry,
|
|
3
|
+
get_write_mode,
|
|
4
|
+
list_write_modes,
|
|
5
|
+
register_write_mode,
|
|
6
|
+
unregister_write_mode,
|
|
7
|
+
)
|
|
8
|
+
from contractforge_databricks.write_modes.sql import render_write_mode_sql_notes
|
|
9
|
+
from contractforge_databricks.write_modes.strategy import WriteStrategy, choose_write_strategy
|
|
10
|
+
|
|
11
|
+
# Public names re-exported by this package, kept in ASCII-sorted order
# (which is why the capitalised class name comes first).
__all__ = [
    "WriteStrategy",
    "choose_write_strategy",
    "clear_write_mode_registry",
    "get_write_mode",
    "list_write_modes",
    "register_write_mode",
    "render_write_mode_sql_notes",
    "unregister_write_mode",
]
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Databricks runtime registry for adapter-owned custom write modes."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Callable
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from contractforge_core.execution import ExecutionOutcome, canonical_custom_write_mode
|
|
9
|
+
|
|
10
|
+
# Shape of a custom write handler: any arguments in, an ExecutionOutcome out.
DatabricksWriteHandler = Callable[..., ExecutionOutcome]

# Process-wide table of registered handlers; register_write_mode stores each
# entry under the name returned by canonical_custom_write_mode.
WRITE_MODE_REGISTRY: dict[str, DatabricksWriteHandler] = {}
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def register_write_mode(mode: str, handler: DatabricksWriteHandler, *, overwrite: bool = False) -> str:
    """Register ``handler`` for a custom write mode and return the canonical name.

    Args:
        mode: Mode name; normalised with ``canonical_custom_write_mode`` before
            it is used as the registry key.
        handler: Callable stored for the mode.
        overwrite: When true, an existing registration for the same canonical
            name is replaced instead of being rejected.

    Returns:
        The canonical mode name the handler was stored under.

    Raises:
        ValueError: If ``handler`` is not callable, or the canonical name is
            already registered and ``overwrite`` is false.
    """
    key = canonical_custom_write_mode(mode)
    if not callable(handler):
        raise ValueError("write mode handler must be callable")

    already_present = key in WRITE_MODE_REGISTRY
    if already_present and not overwrite:
        raise ValueError(f"write mode already registered: {key}")

    WRITE_MODE_REGISTRY[key] = handler
    return key
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def unregister_write_mode(mode: str) -> None:
    """Remove the handler registered for ``mode``; do nothing if there is none."""
    key = canonical_custom_write_mode(mode)
    # The default argument makes removal of an unknown mode a silent no-op.
    WRITE_MODE_REGISTRY.pop(key, None)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def list_write_modes() -> tuple[str, ...]:
    """Return the registered mode names as a sorted tuple."""
    names = sorted(WRITE_MODE_REGISTRY)
    return tuple(names)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def get_write_mode(mode: str) -> DatabricksWriteHandler | None:
    """Look up the handler for ``mode``; return ``None`` when it is not registered."""
    key = canonical_custom_write_mode(mode)
    return WRITE_MODE_REGISTRY.get(key)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def execute_registered_write_mode(mode: str, **kwargs: Any) -> ExecutionOutcome:
    """Invoke the handler registered for ``mode`` with ``kwargs`` and return its result.

    Raises:
        ValueError: If no handler is registered for ``mode``.
    """
    handler = get_write_mode(mode)
    if handler is not None:
        return handler(**kwargs)
    raise ValueError(f"Unsupported Databricks write mode: {mode}")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def clear_write_mode_registry() -> None:
    """Drop every registered write mode handler."""
    # Clears the shared dict in place, so existing references to it see the change.
    WRITE_MODE_REGISTRY.clear()
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Render non-executing SQL notes for Databricks write modes."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from contractforge_core.semantic import SemanticContract
|
|
6
|
+
from contractforge_databricks.rendering.names import target_full_name
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def render_write_mode_sql_notes(contract: SemanticContract) -> str:
    """Render review-only SQL comments describing the contract's write mode.

    The output is a block of ``--`` comment lines: a fixed header, the target
    name, the mode, a blank line, and one line stating the expected
    implementation for that mode (or that the mode is unsupported). The text
    always ends with a newline.
    """
    # Ordered (mode, note) pairs; the first entry equal to the mode wins.
    expectations = (
        ("scd0_append", "-- Expected implementation: Delta append with schema policy applied by adapter."),
        ("scd0_overwrite", "-- Expected implementation: Delta overwrite or scoped replaceWhere when declared."),
        ("scd1_upsert", "-- Expected implementation: SCD1 Delta MERGE current-state upsert by merge keys."),
        ("scd1_hash_diff", "-- Expected implementation: SCD1 hash current source rows and append changed versions."),
        ("scd2_historical", "-- Expected implementation: Delta MERGE with valid_from, valid_to, is_current, row_hash."),
        ("snapshot_soft_delete", "-- Expected implementation: Delta MERGE with NOT MATCHED BY SOURCE soft-delete update."),
    )

    target = target_full_name(contract)
    mode = contract.write.mode

    header = (
        "-- Databricks write mode review notes.",
        "-- This artifact is not an executable job script.",
        f"-- Target: {target}",
        f"-- Mode: {mode}",
        "",
    )
    # Compare with ``==`` (mode on the left) rather than hashing the mode.
    note = next(
        (text for known, text in expectations if mode == known),
        "-- Unsupported write mode.",
    )
    return "\n".join([*header, note]) + "\n"
|