contractforge-databricks 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contractforge_databricks/__init__.py +172 -0
- contractforge_databricks/adapter.py +69 -0
- contractforge_databricks/annotations/__init__.py +10 -0
- contractforge_databricks/annotations/application.py +52 -0
- contractforge_databricks/annotations/audit.py +49 -0
- contractforge_databricks/annotations/sql.py +142 -0
- contractforge_databricks/api.py +65 -0
- contractforge_databricks/bundles/__init__.py +9 -0
- contractforge_databricks/bundles/assets.py +47 -0
- contractforge_databricks/bundles/project.py +213 -0
- contractforge_databricks/bundles/project_config.py +133 -0
- contractforge_databricks/capabilities/__init__.py +17 -0
- contractforge_databricks/capabilities/builders.py +43 -0
- contractforge_databricks/capabilities/evaluate.py +162 -0
- contractforge_databricks/capabilities/mapping.py +36 -0
- contractforge_databricks/capabilities/models.py +44 -0
- contractforge_databricks/capabilities/runtime.py +111 -0
- contractforge_databricks/capabilities/uc.py +47 -0
- contractforge_databricks/cli.py +196 -0
- contractforge_databricks/cli_deploy.py +98 -0
- contractforge_databricks/cli_governance.py +142 -0
- contractforge_databricks/cli_io.py +91 -0
- contractforge_databricks/cli_maintenance.py +69 -0
- contractforge_databricks/coercion.py +31 -0
- contractforge_databricks/contract_extensions.py +70 -0
- contractforge_databricks/cost/__init__.py +11 -0
- contractforge_databricks/cost/model.py +22 -0
- contractforge_databricks/cost/report.py +65 -0
- contractforge_databricks/cost/sql.py +136 -0
- contractforge_databricks/dashboards/__init__.py +15 -0
- contractforge_databricks/dashboards/control_tables.py +150 -0
- contractforge_databricks/diagnostics/__init__.py +7 -0
- contractforge_databricks/diagnostics/explain.py +40 -0
- contractforge_databricks/environment.py +53 -0
- contractforge_databricks/evidence/__init__.py +98 -0
- contractforge_databricks/evidence/ddl.py +35 -0
- contractforge_databricks/evidence/governance_log.py +175 -0
- contractforge_databricks/evidence/helpers.py +29 -0
- contractforge_databricks/evidence/ops_log.py +210 -0
- contractforge_databricks/evidence/records.py +27 -0
- contractforge_databricks/evidence/run_log.py +74 -0
- contractforge_databricks/evidence/schemas.py +7 -0
- contractforge_databricks/evidence/sql.py +144 -0
- contractforge_databricks/evidence/tables.py +20 -0
- contractforge_databricks/evidence/writer.py +118 -0
- contractforge_databricks/execution/__init__.py +70 -0
- contractforge_databricks/execution/delta_basic.py +57 -0
- contractforge_databricks/execution/hash_diff.py +126 -0
- contractforge_databricks/execution/hash_diff_latest.py +142 -0
- contractforge_databricks/execution/replace_partitions.py +40 -0
- contractforge_databricks/execution/results.py +5 -0
- contractforge_databricks/execution/retry.py +36 -0
- contractforge_databricks/execution/scd2.py +213 -0
- contractforge_databricks/execution/scd2_deletes.py +65 -0
- contractforge_databricks/execution/scd2_late.py +30 -0
- contractforge_databricks/execution/snapshot.py +77 -0
- contractforge_databricks/execution/sql_merge.py +85 -0
- contractforge_databricks/execution/tables.py +98 -0
- contractforge_databricks/execution/windows.py +58 -0
- contractforge_databricks/governance/__init__.py +30 -0
- contractforge_databricks/governance/access.py +185 -0
- contractforge_databricks/governance/application.py +93 -0
- contractforge_databricks/governance/drift.py +49 -0
- contractforge_databricks/governance/runtime.py +60 -0
- contractforge_databricks/governance/sql.py +31 -0
- contractforge_databricks/governance/validation.py +135 -0
- contractforge_databricks/lakeflow/__init__.py +21 -0
- contractforge_databricks/lakeflow/compatibility.py +194 -0
- contractforge_databricks/lakeflow/rendering.py +175 -0
- contractforge_databricks/lineage/__init__.py +7 -0
- contractforge_databricks/lineage/openlineage.py +182 -0
- contractforge_databricks/maintenance/__init__.py +27 -0
- contractforge_databricks/maintenance/retention.py +90 -0
- contractforge_databricks/maintenance/sql.py +68 -0
- contractforge_databricks/metrics/__init__.py +19 -0
- contractforge_databricks/metrics/history.py +21 -0
- contractforge_databricks/metrics/write.py +63 -0
- contractforge_databricks/operations/__init__.py +4 -0
- contractforge_databricks/operations/application.py +38 -0
- contractforge_databricks/operations/sql.py +95 -0
- contractforge_databricks/parity/__init__.py +18 -0
- contractforge_databricks/parity/catalog.py +59 -0
- contractforge_databricks/parity/models.py +7 -0
- contractforge_databricks/parity/scenarios.py +111 -0
- contractforge_databricks/partitioning/__init__.py +3 -0
- contractforge_databricks/partitioning/predicates.py +28 -0
- contractforge_databricks/preparation/__init__.py +47 -0
- contractforge_databricks/preparation/deduplicate.py +87 -0
- contractforge_databricks/preparation/encoding.py +37 -0
- contractforge_databricks/preparation/hashing.py +18 -0
- contractforge_databricks/preparation/pyspark.py +178 -0
- contractforge_databricks/preparation/pyspark_staging.py +70 -0
- contractforge_databricks/preparation/shape.py +209 -0
- contractforge_databricks/preparation/shape_validation.py +94 -0
- contractforge_databricks/preparation/staging.py +17 -0
- contractforge_databricks/preparation/zip_arrays.py +51 -0
- contractforge_databricks/presets/__init__.py +3 -0
- contractforge_databricks/presets/base.py +24 -0
- contractforge_databricks/presets/bronze.py +57 -0
- contractforge_databricks/presets/catalog.py +22 -0
- contractforge_databricks/presets/core.py +134 -0
- contractforge_databricks/presets/gold.py +62 -0
- contractforge_databricks/presets/modifiers.py +51 -0
- contractforge_databricks/presets/runtime.py +22 -0
- contractforge_databricks/presets/silver.py +101 -0
- contractforge_databricks/presets/write_engine.py +57 -0
- contractforge_databricks/quality/__init__.py +41 -0
- contractforge_databricks/quality/evaluation.py +178 -0
- contractforge_databricks/quality/persistence.py +81 -0
- contractforge_databricks/quality/registry.py +134 -0
- contractforge_databricks/quality/results.py +17 -0
- contractforge_databricks/quality/sql.py +113 -0
- contractforge_databricks/rendering/__init__.py +11 -0
- contractforge_databricks/rendering/bundle.py +93 -0
- contractforge_databricks/rendering/markdown.py +50 -0
- contractforge_databricks/rendering/names.py +56 -0
- contractforge_databricks/results.py +15 -0
- contractforge_databricks/runtime/__init__.py +101 -0
- contractforge_databricks/runtime/available_now.py +147 -0
- contractforge_databricks/runtime/bundles.py +211 -0
- contractforge_databricks/runtime/cache.py +20 -0
- contractforge_databricks/runtime/control_tables.py +19 -0
- contractforge_databricks/runtime/deploy.py +197 -0
- contractforge_databricks/runtime/detection.py +114 -0
- contractforge_databricks/runtime/dry_run.py +46 -0
- contractforge_databricks/runtime/errors.py +54 -0
- contractforge_databricks/runtime/file_selection.py +109 -0
- contractforge_databricks/runtime/finalization.py +168 -0
- contractforge_databricks/runtime/governance.py +37 -0
- contractforge_databricks/runtime/hooks.py +45 -0
- contractforge_databricks/runtime/http_file.py +37 -0
- contractforge_databricks/runtime/http_retry.py +15 -0
- contractforge_databricks/runtime/http_safety.py +9 -0
- contractforge_databricks/runtime/json_materialization.py +97 -0
- contractforge_databricks/runtime/lineage.py +164 -0
- contractforge_databricks/runtime/maintenance.py +43 -0
- contractforge_databricks/runtime/merge_validation.py +98 -0
- contractforge_databricks/runtime/metadata.py +21 -0
- contractforge_databricks/runtime/metrics.py +34 -0
- contractforge_databricks/runtime/models.py +32 -0
- contractforge_databricks/runtime/options.py +33 -0
- contractforge_databricks/runtime/orchestration_context.py +185 -0
- contractforge_databricks/runtime/orchestrator.py +147 -0
- contractforge_databricks/runtime/partitioning.py +93 -0
- contractforge_databricks/runtime/quality_quarantine.py +92 -0
- contractforge_databricks/runtime/rest_api.py +46 -0
- contractforge_databricks/runtime/rest_auth.py +21 -0
- contractforge_databricks/runtime/rest_pagination.py +21 -0
- contractforge_databricks/runtime/run_payload.py +177 -0
- contractforge_databricks/runtime/schema.py +106 -0
- contractforge_databricks/runtime/source_metadata.py +30 -0
- contractforge_databricks/runtime/source_registry.py +43 -0
- contractforge_databricks/runtime/source_schema.py +24 -0
- contractforge_databricks/runtime/sources.py +208 -0
- contractforge_databricks/runtime/spark.py +183 -0
- contractforge_databricks/runtime/spark_defaults.py +35 -0
- contractforge_databricks/runtime/storage_auth.py +132 -0
- contractforge_databricks/runtime/streaming.py +131 -0
- contractforge_databricks/runtime/success.py +104 -0
- contractforge_databricks/runtime/utils.py +52 -0
- contractforge_databricks/runtime/watermark.py +71 -0
- contractforge_databricks/runtime/windows.py +184 -0
- contractforge_databricks/runtime/write.py +66 -0
- contractforge_databricks/runtime/write_flow.py +146 -0
- contractforge_databricks/runtime/write_strategy.py +40 -0
- contractforge_databricks/schema/__init__.py +21 -0
- contractforge_databricks/schema/diff.py +11 -0
- contractforge_databricks/schema/policy.py +33 -0
- contractforge_databricks/schema/sync.py +23 -0
- contractforge_databricks/security/__init__.py +21 -0
- contractforge_databricks/security/errors.py +5 -0
- contractforge_databricks/security/redaction.py +5 -0
- contractforge_databricks/security/secrets.py +114 -0
- contractforge_databricks/security/source_policy.py +17 -0
- contractforge_databricks/shapes/__init__.py +3 -0
- contractforge_databricks/shapes/sql.py +123 -0
- contractforge_databricks/sources/__init__.py +67 -0
- contractforge_databricks/sources/artifacts.py +100 -0
- contractforge_databricks/sources/autoloader.py +48 -0
- contractforge_databricks/sources/bounded_streams.py +44 -0
- contractforge_databricks/sources/classification.py +115 -0
- contractforge_databricks/sources/delta_share.py +21 -0
- contractforge_databricks/sources/files.py +48 -0
- contractforge_databricks/sources/http_file.py +46 -0
- contractforge_databricks/sources/interpret.py +76 -0
- contractforge_databricks/sources/jdbc.py +32 -0
- contractforge_databricks/sources/metadata.py +18 -0
- contractforge_databricks/sources/native_passthrough.py +33 -0
- contractforge_databricks/sources/rds_iam.py +15 -0
- contractforge_databricks/sources/rds_iam_runtime.py +191 -0
- contractforge_databricks/sources/rest_api.py +33 -0
- contractforge_databricks/sources/support.py +50 -0
- contractforge_databricks/sources/table_refs.py +65 -0
- contractforge_databricks/sql/__init__.py +4 -0
- contractforge_databricks/sql/identifiers.py +17 -0
- contractforge_databricks/sql/literals.py +36 -0
- contractforge_databricks/state/__init__.py +39 -0
- contractforge_databricks/state/ddl.py +24 -0
- contractforge_databricks/state/migrations.py +146 -0
- contractforge_databricks/state/queries.py +149 -0
- contractforge_databricks/state/sql.py +116 -0
- contractforge_databricks/state/tables.py +9 -0
- contractforge_databricks/state/writer.py +83 -0
- contractforge_databricks/templates/__init__.py +15 -0
- contractforge_databricks/templates/catalog.py +205 -0
- contractforge_databricks/templates/catalog_parity.py +85 -0
- contractforge_databricks/templates/core.py +83 -0
- contractforge_databricks/templates/enrichment.py +175 -0
- contractforge_databricks/transforms/__init__.py +3 -0
- contractforge_databricks/transforms/sql.py +118 -0
- contractforge_databricks/watermark/__init__.py +6 -0
- contractforge_databricks/watermark/sql.py +91 -0
- contractforge_databricks/write_modes/__init__.py +20 -0
- contractforge_databricks/write_modes/registry.py +44 -0
- contractforge_databricks/write_modes/sql.py +33 -0
- contractforge_databricks/write_modes/strategy.py +192 -0
- contractforge_databricks-0.1.0.dist-info/METADATA +34 -0
- contractforge_databricks-0.1.0.dist-info/RECORD +220 -0
- contractforge_databricks-0.1.0.dist-info/WHEEL +4 -0
- contractforge_databricks-0.1.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""Databricks SQL lookup queries for control state."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from contractforge_databricks.sql import quote_table_name, sql_int, sql_string
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def render_lock_status_sql(*, target_table: str, locks_table: str = "main.ops.ctrl_ingestion_locks") -> str:
    """Render the query that reads the lock row stored for ``target_table``.

    Args:
        target_table: Value compared against the ``target_table`` column; it is
            rendered through ``sql_string``.
        locks_table: Name of the locks table; it is rendered through
            ``quote_table_name``.

    Returns:
        A ``SELECT ... LIMIT 1`` statement whose clauses are separated by newlines.
    """
    from_clause = f"FROM {quote_table_name(locks_table)}"
    where_clause = f"WHERE target_table = {sql_string(target_table)}"
    # No ORDER BY is emitted: LIMIT 1 simply caps the result at one row.
    return "\n".join(
        (
            "SELECT run_id, owner, status, acquired_at_utc, expires_at_utc, ttl_minutes",
            from_clause,
            where_clause,
            "LIMIT 1",
        )
    )
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def render_find_idempotent_run_sql(
    *,
    target_table: str,
    idempotency_key: str,
    status: str | None = None,
    runs_table: str = "main.ops.ctrl_ingestion_runs",
) -> str:
    """Render the lookup of the most recent run recorded for an idempotency key.

    Args:
        target_table: Value matched against the ``target_table`` column.
        idempotency_key: Value matched against the ``idempotency_key`` column.
        status: Optional extra equality filter on ``status``; any falsy value
            (``None`` or an empty string) leaves the filter out.
        runs_table: Name of the runs table, rendered through ``quote_table_name``.

    Returns:
        A statement selecting ``run_id`` and ``status`` of the newest matching row,
        ordered by ``run_ts_utc`` descending with NULLs last.
    """
    where_clause = (
        f"WHERE target_table = {sql_string(target_table)}"
        f" AND idempotency_key = {sql_string(idempotency_key)}"
    )
    if status:
        where_clause += f" AND status = {sql_string(status)}"
    from_clause = f"FROM {quote_table_name(runs_table)}"
    return "\n".join(
        (
            "SELECT run_id, status",
            from_clause,
            where_clause,
            "ORDER BY run_ts_utc DESC NULLS LAST",
            "LIMIT 1",
        )
    )
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def render_find_idempotent_stream_sql(
    *,
    target_table: str,
    idempotency_key: str,
    status: str | None = None,
    streams_table: str = "main.ops.ctrl_ingestion_streams",
) -> str:
    """Render the lookup of the most recent stream run for an idempotency key.

    Args:
        target_table: Value matched against the ``target_table`` column.
        idempotency_key: Value matched against the ``idempotency_key`` column.
        status: Optional extra equality filter on ``status``; skipped when falsy.
        streams_table: Name of the streams table, rendered through
            ``quote_table_name``.

    Returns:
        A statement selecting ``stream_run_id`` and ``status`` of the newest
        matching row, ordered by ``started_at_utc`` descending with NULLs last.
    """
    mandatory = [
        f"target_table = {sql_string(target_table)}",
        f"idempotency_key = {sql_string(idempotency_key)}",
    ]
    optional = [f"status = {sql_string(status)}"] if status else []
    predicate = " AND ".join(mandatory + optional)
    source = quote_table_name(streams_table)
    return (
        "SELECT stream_run_id, status\n"
        f"FROM {source}\n"
        f"WHERE {predicate}\n"
        "ORDER BY started_at_utc DESC NULLS LAST\n"
        "LIMIT 1"
    )
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def render_has_successful_run_sql(
    *,
    target_table: str,
    idempotency_key: str,
    runs_table: str = "main.ops.ctrl_ingestion_runs",
) -> str:
    """Render a query reporting whether a ``SUCCESS`` run exists for the key.

    Args:
        target_table: Value matched against the ``target_table`` column.
        idempotency_key: Value matched against the ``idempotency_key`` column.
        runs_table: Name of the runs table, rendered through ``quote_table_name``.

    Returns:
        A statement with a single column, ``has_successful_run``, computed as
        ``count(1) > 0`` over the matching rows.
    """
    source = quote_table_name(runs_table)
    # All three predicates share one WHERE line, separated by " AND ".
    conditions = " AND ".join(
        (
            f"target_table = {sql_string(target_table)}",
            f"idempotency_key = {sql_string(idempotency_key)}",
            "status = 'SUCCESS'",
        )
    )
    return f"SELECT count(1) > 0 AS has_successful_run\nFROM {source}\nWHERE {conditions}"
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def render_select_previous_watermark_sql(
    *,
    target_table: str,
    state_table: str = "main.ops.ctrl_ingestion_state",
) -> str:
    """Render the query that reads ``watermark_value`` for ``target_table``.

    Args:
        target_table: Value matched against the ``target_table`` column.
        state_table: Name of the state table, rendered through ``quote_table_name``.

    Returns:
        A ``SELECT watermark_value ... LIMIT 1`` statement joined with newlines.
    """
    statement_lines = ["SELECT watermark_value"]
    statement_lines.append(f"FROM {quote_table_name(state_table)}")
    statement_lines.append(f"WHERE target_table = {sql_string(target_table)}")
    statement_lines.append("LIMIT 1")
    return "\n".join(statement_lines)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def render_control_metadata_current_sql(
    *,
    framework_version: str,
    ctrl_schema_version: int,
    metadata_table: str = "main.ops.ctrl_ingestion_metadata",
) -> str:
    """Render a probe for a metadata row matching the given versions.

    The statement selects the constant ``1`` from ``metadata_table`` where
    ``component`` is ``'contractforge'`` and both version columns equal the
    supplied values, so it yields a row only when such an entry exists.

    Args:
        framework_version: Expected ``framework_version``, rendered via ``sql_string``.
        ctrl_schema_version: Expected ``ctrl_schema_version``, rendered via ``sql_int``.
        metadata_table: Name of the metadata table, rendered via ``quote_table_name``.

    Returns:
        The rendered ``SELECT 1 ... LIMIT 1`` statement.
    """
    table_ref = quote_table_name(metadata_table)
    version_literal = sql_string(framework_version)
    schema_literal = sql_int(ctrl_schema_version)
    return (
        "SELECT 1\n"
        f"FROM {table_ref}\n"
        "WHERE component = 'contractforge'\n"
        f" AND framework_version = {version_literal}\n"
        f" AND ctrl_schema_version = {schema_literal}\n"
        "LIMIT 1"
    )
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def render_record_control_metadata_sql(
    *,
    framework_version: str,
    ctrl_schema_version: int,
    metadata_table: str = "main.ops.ctrl_ingestion_metadata",
) -> str:
    """Render the MERGE that upserts the ``'contractforge'`` metadata row.

    Rows are matched on ``component``. A matched row has its two version
    columns and ``updated_at_utc`` overwritten; otherwise a new row is inserted
    with the same four columns. ``updated_at_utc`` is taken from
    ``current_timestamp()`` inside the statement.

    Args:
        framework_version: Version string to store, rendered via ``sql_string``.
        ctrl_schema_version: Schema version to store, rendered via ``sql_int``.
        metadata_table: Name of the metadata table, rendered via ``quote_table_name``.

    Returns:
        The rendered MERGE statement with surrounding whitespace stripped.
    """
    destination = quote_table_name(metadata_table)
    version_literal = sql_string(framework_version)
    schema_literal = sql_int(ctrl_schema_version)
    statement = f"""
MERGE INTO {destination} t
USING (
SELECT
'contractforge' AS component,
{version_literal} AS framework_version,
{schema_literal} AS ctrl_schema_version,
current_timestamp() AS updated_at_utc
) s
ON t.component = s.component
WHEN MATCHED THEN UPDATE SET
t.framework_version = s.framework_version,
t.ctrl_schema_version = s.ctrl_schema_version,
t.updated_at_utc = s.updated_at_utc
WHEN NOT MATCHED THEN INSERT (
component,
framework_version,
ctrl_schema_version,
updated_at_utc
) VALUES (
s.component,
s.framework_version,
s.ctrl_schema_version,
s.updated_at_utc
)
"""
    return statement.strip()
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Databricks SQL for locks, state and idempotency lookups."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from contractforge_databricks.sql import quote_table_name, sql_int, sql_string
|
|
6
|
+
from contractforge_databricks.state.tables import state_table_names
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def render_acquire_lock_sql(
    *,
    target_table: str,
    run_id: str,
    owner: str | None = None,
    ttl_minutes: int = 60,
    catalog: str = "main",
    schema: str = "ops",
) -> str:
    """Render the MERGE that tries to take the ingestion lock for ``target_table``.

    The statement matches on ``target_table``. An existing row is overwritten
    only when its status is not ``'ACTIVE'`` or its ``expires_at_utc`` is in the
    past; when no row exists a new one is inserted. A still-active, unexpired
    lock is left untouched by the statement.

    Args:
        target_table: Table being locked; rendered via ``sql_string``.
        run_id: Identifier of the run requesting the lock.
        owner: Optional owner label, passed to ``sql_string`` as given.
        ttl_minutes: Lock lifetime; used both for the ``INTERVAL ... MINUTES``
            expiry offset (through ``int``) and for the ``ttl_minutes`` column
            (through ``sql_int``).
        catalog: Catalog passed to ``state_table_names``.
        schema: Schema passed to ``state_table_names``.

    Returns:
        The rendered MERGE statement with surrounding whitespace stripped.
    """
    locks_table = state_table_names(catalog, schema)["locks"]
    merge_target = quote_table_name(locks_table)
    # One entry per column of the single-row source relation, in output order.
    source_columns = (
        f"{sql_string(target_table)} AS target_table",
        f"{sql_string(run_id)} AS run_id",
        f"{sql_string(owner)} AS owner",
        "current_timestamp() AS acquired_at_utc",
        f"current_timestamp() + INTERVAL {int(ttl_minutes)} MINUTES AS expires_at_utc",
        f"{sql_int(ttl_minutes)} AS ttl_minutes",
        "CAST(NULL AS TIMESTAMP) AS released_at_utc",
        "'ACTIVE' AS status",
    )
    statement = "\n".join(
        (
            f"MERGE INTO {merge_target} t",
            "USING (",
            "SELECT",
            ",\n".join(source_columns),
            ") s",
            "ON t.target_table = s.target_table",
            "WHEN MATCHED AND (t.status <> 'ACTIVE' OR t.expires_at_utc < current_timestamp()) THEN UPDATE SET *",
            "WHEN NOT MATCHED THEN INSERT *",
        )
    )
    return statement.strip()
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def render_release_lock_sql(
    *,
    target_table: str,
    run_id: str,
    catalog: str = "main",
    schema: str = "ops",
) -> str:
    """Render the UPDATE that marks a run's lock row as released.

    The row is selected by both ``target_table`` and ``run_id``; its ``status``
    becomes ``'RELEASED'`` and ``released_at_utc`` is set to
    ``current_timestamp()``.

    Args:
        target_table: Table whose lock is being released.
        run_id: Run that must own the row for it to be updated.
        catalog: Catalog passed to ``state_table_names``.
        schema: Schema passed to ``state_table_names``.

    Returns:
        The rendered UPDATE statement with surrounding whitespace stripped.
    """
    locks_table = quote_table_name(state_table_names(catalog, schema)["locks"])
    table_literal = sql_string(target_table)
    run_literal = sql_string(run_id)
    statement = f"""
UPDATE {locks_table}
SET status = 'RELEASED',
released_at_utc = current_timestamp()
WHERE target_table = {table_literal} AND run_id = {run_literal}
"""
    return statement.strip()
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def render_upsert_state_sql(
    *,
    target_table: str,
    run_id: str,
    status: str,
    rows_written: int,
    watermark_column: str | None = None,
    watermark_value: str | None = None,
    success_at_utc: str | None = None,
    error_message: str | None = None,
    table_version: str | None = None,
    write_completed_at_utc: str | None = None,
    watermark_candidate: str | None = None,
    parent_run_id: str | None = None,
    run_group_id: str | None = None,
    master_job_id: str | None = None,
    master_run_id: str | None = None,
    catalog: str = "main",
    schema: str = "ops",
) -> str:
    """Render the MERGE that upserts the state row of ``target_table``.

    A single-row source relation is built from the arguments and merged on
    ``target_table``. When a row already exists every column except
    ``target_table`` is overwritten from the source; otherwise the source row
    is inserted as-is (``INSERT *``).

    Args:
        target_table: Merge key identifying the state row.
        run_id: Stored as ``last_run_id``.
        status: Stored as ``last_status``.
        rows_written: Stored as ``last_rows_written`` (rendered via ``sql_int``).
        watermark_column: Stored as ``watermark_column``.
        watermark_value: Stored as ``watermark_value``.
        success_at_utc: Stored as ``last_success_at_utc`` after a SQL
            ``CAST(... AS TIMESTAMP)``.
        error_message: Stored as ``last_error_message`` after passing through
            ``_truncate``.
        table_version: Stored as ``last_table_version``.
        write_completed_at_utc: Stored as ``last_write_completed_at_utc`` after a
            SQL ``CAST(... AS TIMESTAMP)``.
        watermark_candidate: Stored as ``last_watermark_candidate``.
        parent_run_id: Stored as ``parent_run_id``.
        run_group_id: Stored as ``run_group_id``.
        master_job_id: Stored as ``master_job_id``.
        master_run_id: Stored as ``master_run_id``.
        catalog: Catalog passed to ``state_table_names``.
        schema: Schema passed to ``state_table_names``.

    Returns:
        The rendered MERGE statement with surrounding whitespace stripped.
    """
    destination = quote_table_name(state_table_names(catalog, schema)["state"])
    # (expression, alias) pairs in the order they appear in the source SELECT.
    # The first pair is the merge key; the rest are also the UPDATE targets.
    source_columns = [
        (sql_string(target_table), "target_table"),
        (sql_string(watermark_column), "watermark_column"),
        (sql_string(watermark_value), "watermark_value"),
        (f"CAST({sql_string(success_at_utc)} AS TIMESTAMP)", "last_success_at_utc"),
        (sql_string(run_id), "last_run_id"),
        (sql_string(status), "last_status"),
        (sql_int(rows_written), "last_rows_written"),
        (sql_string(_truncate(error_message)), "last_error_message"),
        (sql_string(parent_run_id), "parent_run_id"),
        (sql_string(run_group_id), "run_group_id"),
        (sql_string(master_job_id), "master_job_id"),
        (sql_string(master_run_id), "master_run_id"),
        (sql_string(table_version), "last_table_version"),
        (f"CAST({sql_string(write_completed_at_utc)} AS TIMESTAMP)", "last_write_completed_at_utc"),
        (sql_string(watermark_candidate), "last_watermark_candidate"),
        ("current_timestamp()", "last_updated_at_utc"),
    ]
    select_list = ",\n".join(f"{expression} AS {alias}" for expression, alias in source_columns)
    update_list = ",\n".join(f"t.{alias} = s.{alias}" for _, alias in source_columns[1:])
    statement = "\n".join(
        (
            f"MERGE INTO {destination} t",
            "USING (",
            "SELECT",
            select_list,
            ") s",
            "ON t.target_table = s.target_table",
            "WHEN MATCHED THEN UPDATE SET",
            update_list,
            "WHEN NOT MATCHED THEN INSERT *",
        )
    )
    return statement.strip()
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _truncate(value: str | None, limit: int = 4000) -> str | None:
|
|
114
|
+
if value is None or len(value) <= limit:
|
|
115
|
+
return value
|
|
116
|
+
return value[:limit]
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""Databricks operational state table names."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from contractforge_core.evidence import STATE_TABLES
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def state_table_names(catalog: str, schema: str) -> dict[str, str]:
    """Map each key of ``STATE_TABLES`` to its ``catalog.schema.table`` name.

    Args:
        catalog: First component of every qualified name.
        schema: Second component of every qualified name.

    Returns:
        A new dict with the keys of ``STATE_TABLES`` and dotted names as values.
    """
    qualified: dict[str, str] = {}
    for key, table in STATE_TABLES.items():
        qualified[key] = f"{catalog}.{schema}.{table}"
    return qualified
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""State writer using an injected SQL runner."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from contractforge_databricks.execution.sql_merge import SqlRunner
|
|
10
|
+
from contractforge_databricks.state.queries import render_lock_status_sql, render_record_control_metadata_sql
|
|
11
|
+
from contractforge_databricks.state.sql import (
|
|
12
|
+
render_acquire_lock_sql,
|
|
13
|
+
render_release_lock_sql,
|
|
14
|
+
render_upsert_state_sql,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger("contractforge_databricks")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class StateWriter:
    """Issue lock, state and control-metadata statements through an injected runner.

    Statements are produced by the ``render_*`` helpers and handed to
    ``runner.sql``. ``query_one``, when supplied, is used only to read the lock
    row back after an acquire attempt.
    """

    def __init__(
        self,
        runner: SqlRunner,
        *,
        catalog: str = "main",
        schema: str = "ops",
        query_one: Callable[[str], dict[str, Any] | None] | None = None,
    ) -> None:
        """Store the runner, the control-table location and the optional row reader."""
        self.runner, self.query_one = runner, query_one
        self.catalog, self.schema = catalog, schema

    def acquire_lock(self, *, target_table: str, run_id: str, owner: str | None = None, ttl_minutes: int = 60) -> None:
        """Run the acquire-lock MERGE and, when possible, confirm this run holds the lock.

        Raises:
            RuntimeError: If ``query_one`` is configured and the lock row read
                back is missing, carries another ``run_id`` or is not ``ACTIVE``.
        """
        merge_sql = render_acquire_lock_sql(
            target_table=target_table,
            run_id=run_id,
            owner=owner,
            ttl_minutes=ttl_minutes,
            catalog=self.catalog,
            schema=self.schema,
        )
        self.runner.sql(merge_sql)

        reader = self.query_one
        if reader is None:
            # Without a reader there is no read-back, so the MERGE outcome is not checked.
            return

        # NOTE(review): the locks table name is spelled out here, whereas the
        # acquire statement derives it from state_table_names(...)["locks"];
        # confirm both always resolve to the same table.
        status_sql = render_lock_status_sql(
            target_table=target_table,
            locks_table=f"{self.catalog}.{self.schema}.ctrl_ingestion_locks",
        )
        current = reader(status_sql)
        held_by_this_run = (
            bool(current)
            and current.get("run_id") == run_id
            and current.get("status") == "ACTIVE"
        )
        if held_by_this_run:
            return
        raise RuntimeError(
            f"Lock is busy for {target_table}. This run_id={run_id} did not acquire the lock. "
            f"Current lock: {current}"
        )

    def release_lock(self, *, target_table: str, run_id: str) -> None:
        """Run the release-lock UPDATE; any failure is logged as a warning, not raised."""
        try:
            release_sql = render_release_lock_sql(
                target_table=target_table,
                run_id=run_id,
                catalog=self.catalog,
                schema=self.schema,
            )
            self.runner.sql(release_sql)
        except Exception as exc:
            # Best effort: rendering and execution errors are both swallowed here.
            logger.warning("Failed to release lock for %s: %s", target_table, exc)

    def upsert_state(self, **kwargs: object) -> None:
        """Render the state upsert for this writer's catalog/schema and execute it.

        ``kwargs`` are forwarded unchanged to ``render_upsert_state_sql``.
        """
        statement = render_upsert_state_sql(catalog=self.catalog, schema=self.schema, **kwargs)
        self.runner.sql(statement)

    def record_control_metadata(self, *, framework_version: str, ctrl_schema_version: int) -> None:
        """Execute the MERGE that records the framework and control-schema versions."""
        metadata_table = f"{self.catalog}.{self.schema}.ctrl_ingestion_metadata"
        statement = render_record_control_metadata_sql(
            framework_version=framework_version,
            ctrl_schema_version=ctrl_schema_version,
            metadata_table=metadata_table,
        )
        self.runner.sql(statement)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from contractforge_databricks.templates.core import (
|
|
2
|
+
contract_template_details,
|
|
3
|
+
contract_template_files,
|
|
4
|
+
get_contract_template,
|
|
5
|
+
list_contract_templates,
|
|
6
|
+
recommend_contract_templates,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"contract_template_details",
|
|
11
|
+
"contract_template_files",
|
|
12
|
+
"get_contract_template",
|
|
13
|
+
"list_contract_templates",
|
|
14
|
+
"recommend_contract_templates",
|
|
15
|
+
]
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
"""Databricks template catalog for split ContractForge contracts."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
ContractTemplate = dict[str, Any]
|
|
8
|
+
TEMPLATE_META_KEY = "_template"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _template(
    name: str,
    category: str,
    description: str,
    ingestion: dict[str, Any],
    *,
    annotations: dict[str, Any] | None = None,
    operations: dict[str, Any] | None = None,
    access: dict[str, Any] | None = None,
    priority: int = 50,
) -> ContractTemplate:
    """Assemble a contract template dict from its metadata and sections.

    Args:
        name: Stored under the metadata key ``"name"``.
        category: Stored under the metadata key ``"category"``.
        description: Stored under the metadata key ``"description"``.
        ingestion: Stored as the ``"ingestion"`` section (always present).
        annotations: Optional ``"annotations"`` section; omitted when falsy.
        operations: Optional ``"operations"`` section; omitted when falsy.
        access: Optional ``"access"`` section; omitted when falsy.
        priority: Stored under the metadata key ``"recommendation_priority"``.

    Returns:
        A dict whose ``TEMPLATE_META_KEY`` entry holds the metadata, followed by
        ``"ingestion"`` and whichever optional sections were supplied.
    """
    metadata = {
        "name": name,
        "category": category,
        "description": description,
        "recommendation_priority": priority,
    }
    template: ContractTemplate = {TEMPLATE_META_KEY: metadata, "ingestion": ingestion}
    optional_sections = (
        ("annotations", annotations),
        ("operations", operations),
        ("access", access),
    )
    for section, payload in optional_sections:
        # Empty or missing sections are left out of the template entirely.
        if payload:
            template[section] = payload
    return template
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _target(schema: str, table: str) -> dict[str, str]:
|
|
41
|
+
return {"catalog": "main", "schema": schema, "table": table}
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _ops(domain: str) -> dict[str, Any]:
|
|
45
|
+
return {
|
|
46
|
+
"owner": "data-platform",
|
|
47
|
+
"domain": domain,
|
|
48
|
+
"criticality": "medium",
|
|
49
|
+
"expected_frequency": "daily",
|
|
50
|
+
"runbook_url": f"https://wiki.example.com/runbooks/{domain}",
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _ann(description: str) -> dict[str, Any]:
|
|
55
|
+
return {"policy": "warn", "table": {"comment": description, "tags": {"contractforge": "databricks"}}}
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _access(group: str) -> dict[str, Any]:
|
|
59
|
+
return {"access_policy": {"mode": "validate_only", "on_drift": "warn"}, "grants": [{"principal": group, "privileges": ["SELECT"]}]}
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
from contractforge_databricks.templates.catalog_parity import PARITY_CONTRACT_TEMPLATES # noqa: E402
|
|
63
|
+
from contractforge_databricks.templates.enrichment import enrich_contractforge_parity # noqa: E402
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# Registry of built-in templates, keyed by template name. Each key repeats the
# name passed to _template(), so keep the two in sync when adding entries.
BUILTIN_CONTRACT_TEMPLATES: dict[str, ContractTemplate] = {
    # Parity templates are unpacked first; any same-named key below would replace them.
    **PARITY_CONTRACT_TEMPLATES,
    # ------------------------------------------------------------------ bronze
    "bronze_rest_api_incremental": _template(
        "bronze_rest_api_incremental",
        "bronze",
        "REST API landing through reviewed native passthrough or bounded file fetch.",
        {
            "preset": "bronze_file_append",
            "source": {
                "type": "native_passthrough",
                "system": "rest_api",
                "object": "orders",
            },
            "target": _target("raw", "b_orders_api"),
        },
        annotations=_ann("Raw REST API order events."),
        operations=_ops("b_orders_api"),
    ),
    "bronze_http_file_csv_snapshot": _template(
        "bronze_http_file_csv_snapshot",
        "bronze",
        "HTTP CSV snapshot landing.",
        {
            "preset": "bronze_full_overwrite",
            "source": {
                "type": "http_csv",
                "url": "https://example.com/orders.csv",
            },
            "target": _target("raw", "b_orders_http"),
        },
        annotations=_ann("Raw HTTP CSV orders."),
        operations=_ops("b_orders_http"),
    ),
    "bronze_autoloader_json": _template(
        "bronze_autoloader_json",
        "bronze",
        "Portable incremental files rendered as Databricks Auto Loader.",
        {
            "preset": "bronze_autoloader_append",
            "source": {
                "type": "incremental_files",
                "format": "json",
                "path": "s3://bucket/landing/orders/",
            },
            "target": _target("raw", "b_orders_json"),
        },
        annotations=_ann("Raw incremental JSON orders."),
        operations=_ops("b_orders_json"),
    ),
    "bronze_autoloader_available_now_json": _template(
        "bronze_autoloader_available_now_json",
        "bronze",
        "Available-now incremental JSON ingestion.",
        {
            "preset": "bronze_autoloader_append",
            "source": {
                "type": "incremental_files",
                "format": "json",
                "trigger": "available_now",
                "path": "s3://bucket/landing/orders/",
            },
            "target": _target("raw", "b_orders_available_now"),
        },
        annotations=_ann("Available-now incremental JSON orders."),
        operations=_ops("b_orders_available_now"),
    ),
    "bronze_autoloader_governed_delta": _template(
        "bronze_autoloader_governed_delta",
        "bronze",
        "Governed Auto Loader landing with Delta optimization preview.",
        {
            "preset": [
                "bronze_autoloader_append",
                "delta_optimized_writes",
                "governance_uc_basic",
            ],
            "source": {
                "type": "incremental_files",
                "format": "json",
                "path": "s3://bucket/landing/governed/",
            },
            "target": _target("raw", "b_governed_delta"),
        },
        annotations=_ann("Governed raw landing table."),
        operations=_ops("b_governed_delta"),
        access=_access("data-engineers"),
    ),
    "bronze_object_storage_nested_json_shape": _template(
        "bronze_object_storage_nested_json_shape",
        "bronze",
        "Object-storage nested JSON with shape intent.",
        {
            "preset": "bronze_file_append",
            "source": {"type": "json", "path": "s3://bucket/events/"},
            "shape": {
                "parse_json": [
                    {
                        "column": "payload",
                        "schema": "STRUCT<id: STRING>",
                        "alias": "payload_obj",
                    },
                ],
            },
            "target": _target("raw", "b_nested_events"),
        },
        annotations=_ann("Nested JSON event landing."),
        operations=_ops("b_nested_events"),
    ),
    "bronze_object_storage_small_files": _template(
        "bronze_object_storage_small_files",
        "bronze",
        "Object-storage small files batch append.",
        {
            "preset": "bronze_file_append",
            "source": {"type": "parquet", "path": "s3://bucket/small-files/"},
            "target": _target("raw", "b_small_files"),
        },
        annotations=_ann("Small-file batch landing."),
        operations=_ops("b_small_files"),
    ),
    # ------------------------------------------------------------------ silver
    "silver_jdbc_scd1_upsert": _template(
        "silver_jdbc_scd1_upsert",
        "silver",
        "JDBC SCD1 current-state upsert.",
        {
            "preset": [
                "silver_incremental_watermark_upsert",
                "quality_quarantine",
                "delta_optimized_writes",
            ],
            "source": {"type": "jdbc", "table": "public.orders"},
            "target": _target("curated", "s_orders"),
            "merge_keys": ["order_id"],
            "watermark_columns": ["updated_at"],
        },
        annotations=_ann("Current-state orders from JDBC."),
        operations=_ops("s_orders"),
        access=_access("sales-analytics"),
    ),
    "silver_jdbc_rds_iam_hash_diff": _template(
        "silver_jdbc_rds_iam_hash_diff",
        "silver",
        "JDBC RDS IAM hash-diff append.",
        {
            "preset": ["silver_hash_diff_append", "quality_quarantine"],
            "source": {
                "type": "postgres",
                "table": "public.orders",
                "auth": {"type": "rds_iam"},
            },
            "target": _target("curated", "s_orders_hash_diff"),
            "hash_keys": ["order_id"],
        },
        annotations=_ann("Hash-diff order changes from JDBC."),
        operations=_ops("s_orders_hash_diff"),
    ),
    "silver_lakeflow_auto_cdc_scd1_preview": _template(
        "silver_lakeflow_auto_cdc_scd1_preview",
        "silver",
        "Lakeflow AUTO CDC SCD1 review artifact with Delta fallback semantics.",
        {
            "preset": ["silver_scd1_upsert", "delta_liquid_clustering"],
            "source": {"type": "table", "table": "main.raw.customer_cdc"},
            "target": _target("curated", "s_customers_current"),
            "merge_keys": ["customer_id"],
            "extensions": {
                "databricks": {
                    "cluster_columns": ["customer_id"],
                    "write_engine": {
                        "requested": "lakeflow_auto_cdc",
                        "fallback_policy": "preview_only",
                    },
                },
            },
        },
        annotations=_ann("Current customers with Lakeflow review evidence."),
        operations=_ops("s_customers_current"),
    ),
    "silver_lakeflow_auto_cdc_scd2_preview": _template(
        "silver_lakeflow_auto_cdc_scd2_preview",
        "silver",
        "Lakeflow AUTO CDC SCD2 review artifact with Delta baseline.",
        {
            "preset": ["silver_scd2_historical", "delta_liquid_clustering"],
            "source": {"type": "table", "table": "main.raw.product_cdc"},
            "target": _target("curated", "s_products_history"),
            "merge_keys": ["product_id"],
            "extensions": {
                "databricks": {
                    "cluster_columns": ["product_id"],
                    "write_engine": {
                        "requested": "lakeflow_auto_cdc",
                        "fallback_policy": "preview_only",
                    },
                },
            },
        },
        annotations=_ann("Product SCD2 history with Lakeflow review evidence."),
        operations=_ops("s_products_history"),
    ),
    "silver_raw_json_payload_shape": _template(
        "silver_raw_json_payload_shape",
        "silver",
        "Raw JSON payload parsing into a curated table.",
        {
            "preset": "silver_scd1_upsert",
            "source": {"type": "table", "table": "main.raw.b_events"},
            "shape": {
                "parse_json": [
                    {
                        "column": "payload",
                        "schema": "STRUCT<event_id: STRING>",
                        "alias": "payload_obj",
                    },
                ],
                "columns": {
                    "payload_obj.event_id": {"alias": "event_id", "cast": "STRING"},
                },
            },
            "target": _target("curated", "s_events"),
            "merge_keys": ["event_id"],
        },
        annotations=_ann("Curated event payloads."),
        operations=_ops("s_events"),
    ),
    "silver_parallel_arrays_shape": _template(
        "silver_parallel_arrays_shape",
        "silver",
        "Parallel array normalization review template.",
        {
            "preset": "silver_scd1_upsert",
            "source": {"type": "table", "table": "main.raw.b_forecast"},
            "shape": {
                "zip_arrays": [
                    {
                        "alias": "hour",
                        "columns": {"times": "time", "values": "value"},
                    },
                ],
                "arrays": [
                    {"path": "hour", "mode": "explode_outer", "alias": "hour"},
                ],
            },
            "target": _target("curated", "s_hourly_forecast"),
            "merge_keys": ["forecast_id"],
        },
        annotations=_ann("Forecast rows derived from parallel arrays."),
        operations=_ops("s_hourly_forecast"),
    ),
    "silver_snapshot_soft_delete": _template(
        "silver_snapshot_soft_delete",
        "silver",
        "Current-state snapshot with soft delete for missing rows.",
        {
            "preset": "silver_snapshot_soft_delete",
            "source": {"type": "table", "table": "main.raw.b_devices_snapshot"},
            "target": _target("curated", "s_devices"),
            "merge_keys": ["device_id"],
        },
        annotations=_ann("Device snapshot with soft delete semantics."),
        operations=_ops("s_devices"),
    ),
    "silver_scd2_history": _template(
        "silver_scd2_history",
        "silver",
        "SCD2 historical table.",
        {
            "preset": "silver_scd2_historical",
            "source": {"type": "table", "table": "main.raw.b_customers"},
            "target": _target("curated", "s_customers_history"),
            "merge_keys": ["customer_id"],
        },
        annotations=_ann("Customer SCD2 history."),
        operations=_ops("s_customers_history"),
    ),
    # -------------------------------------------------------------------- gold
    "gold_full_refresh_kpi": _template(
        "gold_full_refresh_kpi",
        "gold",
        "Gold KPI table recalculated by full refresh.",
        {
            "preset": "gold_full_refresh",
            "source": {
                "type": "sql",
                "query": "SELECT order_date, count(*) AS orders FROM main.curated.s_orders GROUP BY order_date",
            },
            "target": _target("analytics", "g_daily_orders"),
        },
        annotations=_ann("Daily order KPI table."),
        operations=_ops("g_daily_orders"),
        access=_access("executive-dashboards"),
    ),
}

# Runs at import time on the registry built above. The return value is
# discarded, so this presumably updates the mapping in place — confirm in
# templates/enrichment.py.
enrich_contractforge_parity(BUILTIN_CONTRACT_TEMPLATES)
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""Additional templates ported from the original ContractForge catalog."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from contractforge_databricks.templates.catalog import _access, _ann, _ops, _target, _template
|
|
8
|
+
|
|
9
|
+
# Local alias for the template mapping shape (same definition as in catalog.py).
ContractTemplate = dict[str, Any]

# Templates ported from the original ContractForge catalog, keyed by template
# name. Unlike the one-line built-ins, these spell out layer, mode and quality
# rules explicitly in the ingestion section.
PARITY_CONTRACT_TEMPLATES: dict[str, ContractTemplate] = {
    "bronze_blob_partitioned_files": _template(
        "bronze_blob_partitioned_files",
        "bronze",
        "Bronze batch ingestion for partitioned files in object storage.",
        {
            "preset": "bronze_file_append",
            "source": {
                "type": "s3",
                "format": "parquet",
                "path": "s3://company-landing/orders/",
                "options": {
                    "recursiveFileLookup": True,
                    "pathGlobFilter": "*.parquet",
                },
                "read": {
                    "source_complete": True,
                    "schema": "order_id STRING, order_date DATE, customer_id STRING, amount DOUBLE",
                    # Raw string: the backslashes are regex escapes, not Python escapes.
                    "file_regex": r"^year=2026/month=05/.*/orders_\d+\.parquet$",
                    "file_regex_scope": "relative_path",
                    "file_regex_max_listed": 50000,
                },
            },
            "target": _target("raw", "b_orders_files"),
            "layer": "bronze",
            "mode": "scd0_append",
            "schema_policy": "additive_only",
            "quality_rules": {
                "not_null": ["order_id"],
                "expressions": [
                    {
                        "name": "valid_amount",
                        "expression": "amount IS NULL OR amount >= 0",
                        "severity": "warn",
                        "message": "Negative amount in raw file.",
                    },
                ],
            },
        },
        annotations=_ann("Partitioned order files in object storage."),
        operations=_ops("b_orders_files"),
    ),
    "silver_scd1_hash_diff": _template(
        "silver_scd1_hash_diff",
        "silver",
        "Silver append-only hash diff retaining changed versions.",
        {
            "preset": "silver_hash_diff_append",
            "source": {"type": "table", "table": "main.raw.b_products"},
            "target": _target("curated", "s_products_hash_diff"),
            "layer": "silver",
            "mode": "scd1_hash_diff",
            "hash_keys": ["product_id"],
            "hash_exclude_columns": ["updated_at"],
            "transform": {
                "deduplicate": {
                    "keys": ["product_id"],
                    "order_by": "updated_at DESC NULLS LAST",
                },
            },
            "quality_rules": {
                "not_null": ["product_id"],
                "expressions": [
                    {
                        "name": "valid_product_status",
                        "expression": "status IS NULL OR status IN ('active', 'inactive', 'discontinued')",
                        "severity": "quarantine",
                        "message": "Invalid product status.",
                    },
                ],
            },
        },
        annotations=_ann("Changed product versions detected by hash diff."),
        operations=_ops("s_products_hash_diff"),
        access=_access("catalog-analytics"),
        # The only template in this module that overrides the default priority of 50.
        priority=10,
    ),
}
|