cfa-kernel 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cfa/__init__.py +39 -0
- cfa/_lazy.py +39 -0
- cfa/adapters/__init__.py +104 -0
- cfa/adapters/autogen.py +19 -0
- cfa/adapters/crewai.py +19 -0
- cfa/adapters/dspy.py +19 -0
- cfa/adapters/langgraph.py +19 -0
- cfa/adapters/openai_agents.py +19 -0
- cfa/audit/__init__.py +15 -0
- cfa/audit/context.py +205 -0
- cfa/audit/hashing.py +41 -0
- cfa/audit/trail.py +194 -0
- cfa/backends/__init__.py +132 -0
- cfa/backends/dbt.py +338 -0
- cfa/backends/pyspark.py +240 -0
- cfa/backends/sql.py +270 -0
- cfa/behavior/__init__.py +49 -0
- cfa/behavior/llm.py +244 -0
- cfa/behavior/spec.py +235 -0
- cfa/behavior/systematizer.py +222 -0
- cfa/cli/__init__.py +296 -0
- cfa/cli/__main__.py +6 -0
- cfa/cli/_helpers.py +109 -0
- cfa/cli/core/__init__.py +0 -0
- cfa/cli/core/evaluate.py +72 -0
- cfa/cli/core/validate.py +29 -0
- cfa/cli/formatters.py +280 -0
- cfa/cli/governance/__init__.py +0 -0
- cfa/cli/governance/audit.py +65 -0
- cfa/cli/governance/catalog.py +28 -0
- cfa/cli/governance/policy.py +119 -0
- cfa/cli/governance/rules.py +42 -0
- cfa/cli/governance/signature.py +31 -0
- cfa/cli/infrastructure/__init__.py +0 -0
- cfa/cli/infrastructure/backend_list.py +24 -0
- cfa/cli/infrastructure/storage.py +87 -0
- cfa/cli/project/__init__.py +0 -0
- cfa/cli/project/init.py +73 -0
- cfa/cli/project/lifecycle.py +92 -0
- cfa/cli/project/status.py +75 -0
- cfa/cli/project/taxonomy.py +38 -0
- cfa/cli/reporting/__init__.py +0 -0
- cfa/cli/reporting/report.py +109 -0
- cfa/cli/reporting/serve.py +43 -0
- cfa/config.py +103 -0
- cfa/core/__init__.py +19 -0
- cfa/core/codegen.py +65 -0
- cfa/core/conditions.py +129 -0
- cfa/core/kernel.py +224 -0
- cfa/core/phases/__init__.py +0 -0
- cfa/core/phases/runner.py +477 -0
- cfa/core/planner.py +290 -0
- cfa/execution/__init__.py +12 -0
- cfa/execution/partial.py +339 -0
- cfa/execution/state_projection.py +216 -0
- cfa/governance/__init__.py +76 -0
- cfa/lifecycle/__init__.py +51 -0
- cfa/mcp/__init__.py +347 -0
- cfa/mcp/__main__.py +4 -0
- cfa/normalizer/__init__.py +15 -0
- cfa/normalizer/base.py +441 -0
- cfa/normalizer/llm.py +426 -0
- cfa/observability/__init__.py +14 -0
- cfa/observability/indices.py +177 -0
- cfa/observability/metrics.py +91 -0
- cfa/observability/notify.py +79 -0
- cfa/observability/otel.py +81 -0
- cfa/observability/promotion.py +367 -0
- cfa/policy/__init__.py +12 -0
- cfa/policy/bundle.py +317 -0
- cfa/policy/catalog.py +117 -0
- cfa/policy/engine.py +306 -0
- cfa/reporting/__init__.py +42 -0
- cfa/reporting/charts.py +223 -0
- cfa/reporting/engine.py +456 -0
- cfa/resolution/__init__.py +62 -0
- cfa/runtime/__init__.py +13 -0
- cfa/runtime/gate.py +287 -0
- cfa/sandbox/__init__.py +189 -0
- cfa/sandbox/executor.py +92 -0
- cfa/sandbox/mock.py +89 -0
- cfa/sandbox/panic.py +52 -0
- cfa/storage/__init__.py +591 -0
- cfa/testing/__init__.py +60 -0
- cfa/testing/asserts.py +77 -0
- cfa/testing/evaluate.py +168 -0
- cfa/testing/fixtures.py +89 -0
- cfa/testing/markers.py +36 -0
- cfa/types.py +489 -0
- cfa/validation/__init__.py +14 -0
- cfa/validation/runtime.py +285 -0
- cfa/validation/signature.py +146 -0
- cfa/validation/static.py +252 -0
- cfa_kernel-0.1.0.dist-info/METADATA +32 -0
- cfa_kernel-0.1.0.dist-info/RECORD +98 -0
- cfa_kernel-0.1.0.dist-info/WHEEL +4 -0
- cfa_kernel-0.1.0.dist-info/entry_points.txt +3 -0
- cfa_kernel-0.1.0.dist-info/licenses/LICENSE +21 -0
cfa/audit/trail.py
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CFA Audit Trail
|
|
3
|
+
================
|
|
4
|
+
Append-only, causally ordered record of all decision events.
|
|
5
|
+
|
|
6
|
+
Phase 1: in-memory list.
|
|
7
|
+
Phase 4: persistent backend (JSON Lines file, extensible to S3/Kafka/OpenLineage).
|
|
8
|
+
|
|
9
|
+
Properties (Invariant I5):
|
|
10
|
+
- Immutable after write
|
|
11
|
+
- Causal ordering per intent_id
|
|
12
|
+
- Complete per intent (start and end recorded)
|
|
13
|
+
- Cryptographic hash chain for tamper detection
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import hashlib
|
|
19
|
+
import json
|
|
20
|
+
from abc import ABC, abstractmethod
|
|
21
|
+
from dataclasses import asdict, dataclass, field
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from typing import Any
|
|
24
|
+
|
|
25
|
+
from cfa.types import _utcnow
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class AuditEvent:
    """Single typed event in the audit trail.

    The first four fields are mandatory and positional; the remaining ones
    have defaults so that an event can be created from a handful of values.
    """

    # What was decided, where in the pipeline, and with what result.
    intent_id: str
    stage: str
    event_type: str
    outcome: str
    policy_bundle_version: str = ""
    # Arbitrary caller-supplied payload; a fresh dict per instance.
    details: dict[str, Any] = field(default_factory=dict)
    # ISO-formatted creation time taken from the project's _utcnow() helper.
    timestamp: str = field(default_factory=lambda: _utcnow().isoformat())
    # Hash-chain links; empty until a trail assigns them.
    event_hash: str = ""
    previous_hash: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return all fields as a plain dictionary (built by ``asdict``)."""
        as_mapping = asdict(self)
        return as_mapping
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# ── Audit Storage Backend ───────────────────────────────────────────────────
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class AuditStorageBackend(ABC):
    """Extension point: pluggable persistence for audit events.

    Concrete backends must implement all three operations below.
    """

    @abstractmethod
    def append(self, event: AuditEvent) -> None:
        """Store *event* in the backend."""

    @abstractmethod
    def load_all(self) -> list[AuditEvent]:
        """Return every stored event."""

    @abstractmethod
    def load_by_intent(self, intent_id: str) -> list[AuditEvent]:
        """Return the stored events whose ``intent_id`` equals *intent_id*."""
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class InMemoryAuditStorage(AuditStorageBackend):
    """In-memory storage for testing; events live in a plain Python list."""

    def __init__(self) -> None:
        self._events: list[AuditEvent] = []

    def append(self, event: AuditEvent) -> None:
        """Add *event* to the end of the internal list."""
        self._events.append(event)

    def load_all(self) -> list[AuditEvent]:
        """Return a shallow copy so callers cannot alter the internal list."""
        return self._events.copy()

    def load_by_intent(self, intent_id: str) -> list[AuditEvent]:
        """Return, in insertion order, the events recorded for *intent_id*."""
        matching: list[AuditEvent] = []
        for stored in self._events:
            if stored.intent_id == intent_id:
                matching.append(stored)
        return matching
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class JsonLinesAuditStorage(AuditStorageBackend):
    """
    JSON Lines file-based persistent audit storage.

    Every event is written as one JSON object per line. The file is only
    ever opened in append mode for writing, so existing lines are never
    rewritten (Invariant I5).
    """

    def __init__(self, file_path: str | Path) -> None:
        """Remember *file_path* and make sure its parent directory exists."""
        self.file_path = Path(file_path)
        parent_dir = self.file_path.parent
        parent_dir.mkdir(parents=True, exist_ok=True)

    def append(self, event: AuditEvent) -> None:
        """Serialize *event* and add it as a new line at the end of the file."""
        with open(self.file_path, "a", encoding="utf-8") as handle:
            # default=str stringifies any value json cannot encode natively.
            serialized = json.dumps(event.to_dict(), default=str)
            handle.write(serialized + "\n")

    def load_all(self) -> list[AuditEvent]:
        """Read the whole file back into ``AuditEvent`` objects.

        Returns an empty list when the file does not exist yet. Empty lines
        are skipped.
        """
        if not self.file_path.exists():
            return []
        raw_text = self.file_path.read_text(encoding="utf-8")
        return [
            AuditEvent(**json.loads(row))
            for row in raw_text.strip().splitlines()
            if row
        ]

    def load_by_intent(self, intent_id: str) -> list[AuditEvent]:
        """Return the stored events for *intent_id* (loads the full file first)."""
        selected: list[AuditEvent] = []
        for stored in self.load_all():
            if stored.intent_id == intent_id:
                selected.append(stored)
        return selected
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
# ── Audit Trail ─────────────────────────────────────────────────────────────
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class AuditTrail:
    """
    Append-only audit trail with hash chain.

    Two consumption modes (per whitepaper):
    - Operational (engineering): JSON, debugging, tuning
    - Regulatory (audit): normalized + cryptographic hash chain

    The hash chain ensures tamper detection: each event's hash
    includes the previous event's hash, creating a causal chain.
    """

    def __init__(self, storage: AuditStorageBackend | None = None) -> None:
        """Attach to *storage* (in-memory when omitted) and resume its chain.

        The hash of the last stored event, if any, becomes the chain head so
        that new events link onto what is already persisted.
        """
        # Compare with None explicitly: a backend that defines __len__ or
        # __bool__ is falsy while empty and must not be silently replaced.
        self._storage = InMemoryAuditStorage() if storage is None else storage
        self._last_hash = ""
        # Restore hash chain from existing events
        existing = self._storage.load_all()
        if existing:
            self._last_hash = existing[-1].event_hash

    def record(
        self,
        intent_id: str,
        stage: str,
        event_type: str,
        outcome: str,
        policy_bundle_version: str = "",
        **details: Any,
    ) -> AuditEvent:
        """Create, hash, persist and return a new event.

        Extra keyword arguments are stored as the event's ``details``.
        Any exception raised by the storage backend propagates to the caller;
        in that case the chain head is left untouched.
        """
        event = AuditEvent(
            intent_id=intent_id,
            stage=stage,
            event_type=event_type,
            outcome=outcome,
            policy_bundle_version=policy_bundle_version,
            details=details,
            previous_hash=self._last_hash,
        )
        # Compute hash chain
        event.event_hash = self._compute_hash(event)
        # Persist first and advance the head only afterwards: if the write
        # fails, the next event must still link to the last stored event,
        # otherwise the chain would reference a hash that was never written.
        self._storage.append(event)
        self._last_hash = event.event_hash
        return event

    def get_events_for_intent(self, intent_id: str) -> list[AuditEvent]:
        """Return the stored events for *intent_id*, as given by the backend."""
        return self._storage.load_by_intent(intent_id)

    def get_all_events(self) -> list[AuditEvent]:
        """Return every stored event, as given by the backend."""
        return self._storage.load_all()

    @property
    def event_count(self) -> int:
        """Number of stored events (loads all events from the backend)."""
        return len(self._storage.load_all())

    def verify_chain(self) -> bool:
        """Verify the integrity of the hash chain. Returns True if valid.

        Each event must point at the hash of the event before it (the first
        one at the empty string) and carry a hash that matches either the
        current digest or the truncated legacy digest of its content.
        """
        events = self._storage.load_all()
        prev_hash = ""
        for event in events:
            if event.previous_hash != prev_hash:
                return False
            expected = self._compute_hash(event)
            legacy_expected = self._compute_hash_legacy(event)
            if event.event_hash not in (expected, legacy_expected):
                return False
            prev_hash = event.event_hash
        return True

    @staticmethod
    def _compute_hash(event: AuditEvent) -> str:
        """Return the SHA-256 hex digest of the event's hashed fields.

        The digest covers intent_id, stage, event_type, outcome, timestamp,
        previous_hash and details. ``policy_bundle_version`` and
        ``event_hash`` are not part of the payload.
        """
        payload = json.dumps({
            "intent_id": event.intent_id,
            "stage": event.stage,
            "event_type": event.event_type,
            "outcome": event.outcome,
            "timestamp": event.timestamp,
            "previous_hash": event.previous_hash,
            "details": event.details,
        }, sort_keys=True, default=str)  # sort_keys keeps the digest stable
        return hashlib.sha256(payload.encode()).hexdigest()

    @staticmethod
    def _compute_hash_legacy(event: AuditEvent) -> str:
        """Return the pre-3.1 truncated hash for backward verification only."""
        return AuditTrail._compute_hash(event)[:16]
|
cfa/backends/__init__.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CFA Backend Registry
|
|
3
|
+
====================
|
|
4
|
+
Pluggable backends for code generation and execution.
|
|
5
|
+
|
|
6
|
+
A BackendAdapter extends CodeGenBackend with capability introspection,
|
|
7
|
+
enabling the Policy Engine to validate whether a backend can satisfy
|
|
8
|
+
a given intent's constraints before attempting code generation.
|
|
9
|
+
|
|
10
|
+
Usage:
|
|
11
|
+
from cfa.backends import BackendRegistry, BackendAdapter, BackendCapabilities
|
|
12
|
+
|
|
13
|
+
registry = BackendRegistry()
|
|
14
|
+
registry.register("pyspark", PySparkBackend)
|
|
15
|
+
registry.register("duckdb", DuckDBBackend)
|
|
16
|
+
|
|
17
|
+
backend = registry.get("pyspark")()
|
|
18
|
+
caps = backend.get_capabilities()
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations

import threading
from abc import abstractmethod
from collections.abc import Callable
from dataclasses import dataclass, field
from typing import Any

from cfa.core.codegen import CodeGenBackend
from cfa.validation.static import ForbiddenToken
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class BackendCapabilities:
    """Capabilities exposed by a backend for policy validation.

    The PolicyEngine queries these before approving an intent —
    if a backend cannot satisfy a required constraint (e.g., merge_key
    on a Silver write), the intent is blocked before code generation.
    """

    # Identification of the backend these capabilities describe.
    backend_name: str = ""
    backend_version: str = ""

    # Feature flags; everything is off unless a backend turns it on.
    supports_merge: bool = False
    supports_partition_overwrite: bool = False
    supports_anonymization: bool = False
    supports_schema_enforcement: bool = False

    # List-valued defaults use factories so instances never share a list.
    pii_anonymization_methods: list[str] = field(
        default_factory=lambda: ["sha256", "drop"]
    )
    cost_model_available: bool = False
    max_recommended_rows: int = 10_000_000
    supported_languages: list[str] = field(default_factory=lambda: ["python"])

    # Backend-specific patterns for the static validator.
    forbidden_tokens: list[ForbiddenToken] = field(default_factory=list)

    # Free-form extension slot for anything not modelled above.
    custom: dict[str, Any] = field(default_factory=dict)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class BackendAdapter(CodeGenBackend):
    """Extension of CodeGenBackend with capability introspection.

    Implement this to create a pluggable backend for any target:
    PySpark, DuckDB, BigQuery, REST API, LLM chain, etc.
    """

    @abstractmethod
    def get_capabilities(self) -> BackendCapabilities:
        """Return the capabilities this backend declares."""
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# Zero-argument callable that returns a BackendAdapter; the registry stores
# one of these per backend name.
BackendFactory = Callable[[], BackendAdapter]
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class BackendRegistry:
    """Global registry of available backend factories.

    Supports:
    - register(name, factory) — add a backend
    - get(name) -> factory — retrieve by name
    - list() -> list of registered names
    - remove(name) — unregister a backend
    """

    _instance: BackendRegistry | None = None
    # Created once at class-definition time. Creating the lock lazily inside
    # singleton() would itself be a race: two threads could each build their
    # own lock and then both enter the "critical" section.
    _lock: Any = threading.Lock()

    def __init__(self) -> None:
        self._backends: dict[str, BackendFactory] = {}

    @classmethod
    def singleton(cls) -> BackendRegistry:
        """Return the shared registry, creating and bootstrapping it on first use.

        If bootstrapping raises, nothing is cached and the exception
        propagates, so a later call retries instead of handing out an empty
        registry.
        """
        if cls._lock is None:
            # Tolerate external code that reset the lock to None.
            cls._lock = threading.Lock()
        with cls._lock:
            if cls._instance is None:
                # Publish the instance only after bootstrap succeeded.
                candidate = cls()
                candidate._bootstrap()
                cls._instance = candidate
            return cls._instance

    def _bootstrap(self) -> None:
        """Register the built-in backends unless something is registered already."""
        if self._backends:
            return
        # Imported here rather than at module level; these modules import
        # from this package themselves.
        from .dbt import DbtBackend  # noqa: F811
        from .pyspark import PySparkBackend  # noqa: F811
        from .sql import SqlBackend  # noqa: F811

        self.register("dbt", lambda: DbtBackend())
        self.register("pyspark", lambda: PySparkBackend())
        self.register("sql", lambda: SqlBackend())

    def register(self, name: str, factory: BackendFactory) -> None:
        """Store *factory* under *name*, replacing any previous entry."""
        self._backends[name] = factory

    def get(self, name: str) -> BackendFactory:
        """Return the factory registered as *name*.

        Raises:
            KeyError: if *name* is unknown; the message lists the registered
                names (or ``(none)``).
        """
        try:
            return self._backends[name]
        except KeyError:
            available = ", ".join(sorted(self._backends))
            raise KeyError(
                f"Unknown backend '{name}'. Registered backends: {available or '(none)'}"
            ) from None

    def list(self) -> list[str]:
        """Return the registered names in sorted order."""
        return sorted(self._backends)

    def remove(self, name: str) -> None:
        """Unregister *name*; unknown names are ignored."""
        self._backends.pop(name, None)

    def clear(self) -> None:
        """Remove every registered backend."""
        self._backends.clear()

    def __contains__(self, name: str) -> bool:
        return name in self._backends
|
cfa/backends/dbt.py
ADDED
|
@@ -0,0 +1,338 @@
|
|
|
1
|
+
"""
|
|
2
|
+
dbt Backend
|
|
3
|
+
============
|
|
4
|
+
Code generation backend targeting dbt (data build tool).
|
|
5
|
+
|
|
6
|
+
Generates governed dbt models from an ExecutionPlan. Output includes:
|
|
7
|
+
- ``model.sql`` with dbt Jinja templating and governed SQL
|
|
8
|
+
- ``schema.yml`` with data quality tests derived from governance rules
|
|
9
|
+
- Partition configs, merge keys as uniqueness tests, PII annotations
|
|
10
|
+
|
|
11
|
+
Template-based — no LLM involved.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from cfa.core.codegen import GeneratedCode
|
|
17
|
+
from cfa.core.planner import ExecutionPlan, ExecutionStep, StepType, WriteMode
|
|
18
|
+
from cfa.types import FaultSeverity
|
|
19
|
+
from cfa.validation.static import ForbiddenToken
|
|
20
|
+
|
|
21
|
+
from . import BackendAdapter, BackendCapabilities
|
|
22
|
+
|
|
23
|
+
# Statements that must never appear in a governed dbt model. Each row is
# (regex pattern, fault code, severity, message); every entry is a regex.
_DBT_FORBIDDEN_TOKENS: list[ForbiddenToken] = [
    ForbiddenToken(
        pattern=pattern,
        fault_code=fault_code,
        severity=severity,
        message=message,
        is_regex=True,
    )
    for pattern, fault_code, severity, message in (
        (
            r"\bDROP\s+TABLE\b",
            "STATIC_DBT_DROP_TABLE",
            FaultSeverity.CRITICAL,
            "DROP TABLE in governed dbt model forbidden.",
        ),
        (
            r"\bDROP\s+DATABASE\b",
            "STATIC_DBT_DROP_DATABASE",
            FaultSeverity.CRITICAL,
            "DROP DATABASE in governed dbt model forbidden.",
        ),
        (
            r"\bTRUNCATE\b",
            "STATIC_DBT_TRUNCATE",
            FaultSeverity.CRITICAL,
            "TRUNCATE forbidden in dbt governed models.",
        ),
        (
            r"\bDELETE\s+FROM\b",
            "STATIC_DBT_DELETE",
            FaultSeverity.CRITICAL,
            "DELETE FROM forbidden in dbt governed models.",
        ),
        (
            r"\bALTER\s+TABLE\b",
            "STATIC_DBT_ALTER_TABLE",
            FaultSeverity.HIGH,
            "ALTER TABLE forbidden — use dbt model rebuild.",
        ),
    )
]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class DbtBackend(BackendAdapter):
|
|
43
|
+
"""Generates governed dbt model + schema.yml from an ExecutionPlan."""
|
|
44
|
+
|
|
45
|
+
def get_capabilities(self) -> BackendCapabilities:
|
|
46
|
+
return BackendCapabilities(
|
|
47
|
+
backend_name="dbt",
|
|
48
|
+
backend_version="dbt-core-1.x",
|
|
49
|
+
supports_merge=True,
|
|
50
|
+
supports_partition_overwrite=True,
|
|
51
|
+
supports_anonymization=True,
|
|
52
|
+
supports_schema_enforcement=True,
|
|
53
|
+
pii_anonymization_methods=["sha256", "drop", "md5", "tokenize"],
|
|
54
|
+
cost_model_available=False,
|
|
55
|
+
max_recommended_rows=500_000_000,
|
|
56
|
+
supported_languages=["sql", "yaml"],
|
|
57
|
+
forbidden_tokens=_DBT_FORBIDDEN_TOKENS,
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
def generate(self, plan: ExecutionPlan) -> GeneratedCode:
|
|
61
|
+
sql_lines: list[str] = []
|
|
62
|
+
schema_columns: list[dict[str, object]] = []
|
|
63
|
+
model_name = self._derive_model_name(plan)
|
|
64
|
+
pii_columns_anonymized: set[str] = set()
|
|
65
|
+
merge_keys: list[str] = []
|
|
66
|
+
partition_cols: list[str] = []
|
|
67
|
+
step_code: dict[str, str] = {}
|
|
68
|
+
ordered = plan.execution_order()
|
|
69
|
+
|
|
70
|
+
for step in ordered:
|
|
71
|
+
code = self._generate_step(step, plan)
|
|
72
|
+
step_code[step.id] = code
|
|
73
|
+
sql_lines.append(f"-- Step: {step.id} ({step.step_type.value})")
|
|
74
|
+
sql_lines.append(code)
|
|
75
|
+
sql_lines.append("")
|
|
76
|
+
|
|
77
|
+
merge_keys = step.config.get("merge_keys", merge_keys)
|
|
78
|
+
partition_cols = step.config.get("partition_by", partition_cols)
|
|
79
|
+
if step.step_type == StepType.ANONYMIZE:
|
|
80
|
+
for col in step.config.get("pii_columns", []):
|
|
81
|
+
pii_columns_anonymized.add(str(col))
|
|
82
|
+
|
|
83
|
+
schema = self._build_schema_yml(
|
|
84
|
+
model_name=model_name,
|
|
85
|
+
plan=plan,
|
|
86
|
+
merge_keys=merge_keys,
|
|
87
|
+
partition_cols=partition_cols,
|
|
88
|
+
pii_columns_anonymized=pii_columns_anonymized,
|
|
89
|
+
schema_columns=schema_columns,
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
config_block = self._build_config(
|
|
93
|
+
plan=plan,
|
|
94
|
+
partition_cols=partition_cols,
|
|
95
|
+
merge_keys=merge_keys,
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
full_body = "\n".join(sql_lines)
|
|
99
|
+
model_sql = f"{config_block}\n\n{full_body}"
|
|
100
|
+
|
|
101
|
+
combined = (
|
|
102
|
+
f"-- dbt model: {model_name}\n"
|
|
103
|
+
f"-- ============={'=' * len(model_name)}\n\n"
|
|
104
|
+
f"{model_sql}\n\n"
|
|
105
|
+
f"-- schema.yml (generated by CFA governance)\n"
|
|
106
|
+
f"-- ==========================================\n\n"
|
|
107
|
+
f"{schema}"
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
return GeneratedCode(
|
|
111
|
+
plan_signature_hash=plan.signature_hash,
|
|
112
|
+
intent_id=plan.intent_id,
|
|
113
|
+
language="dbt",
|
|
114
|
+
code=combined,
|
|
115
|
+
step_code_map=step_code,
|
|
116
|
+
metadata={
|
|
117
|
+
"model_name": model_name,
|
|
118
|
+
"write_mode": plan.write_mode.value,
|
|
119
|
+
"consistency_unit": plan.consistency_unit.value,
|
|
120
|
+
"step_count": plan.step_count,
|
|
121
|
+
},
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
# ── Step generators ───────────────────────────────────────────────────
|
|
125
|
+
|
|
126
|
+
def _generate_step(self, step: ExecutionStep, plan: ExecutionPlan) -> str:
|
|
127
|
+
match step.step_type:
|
|
128
|
+
case StepType.EXTRACT:
|
|
129
|
+
return self._gen_extract(step)
|
|
130
|
+
case StepType.ANONYMIZE:
|
|
131
|
+
return self._gen_anonymize(step)
|
|
132
|
+
case StepType.JOIN:
|
|
133
|
+
return self._gen_join(step, plan)
|
|
134
|
+
case StepType.AGGREGATE:
|
|
135
|
+
return self._gen_aggregate(step)
|
|
136
|
+
case StepType.LOAD:
|
|
137
|
+
return self._gen_load(step, plan)
|
|
138
|
+
case StepType.FILTER:
|
|
139
|
+
return self._gen_filter(step)
|
|
140
|
+
case StepType.TRANSFORM:
|
|
141
|
+
return self._gen_transform(step)
|
|
142
|
+
case _:
|
|
143
|
+
return f"-- TODO: unsupported step type {step.step_type.value}"
|
|
144
|
+
|
|
145
|
+
def _gen_extract(self, step: ExecutionStep) -> str:
|
|
146
|
+
source = step.source or "unknown_source"
|
|
147
|
+
model_ref = _ref_name(source)
|
|
148
|
+
lines = [f"-- EXTRACT: {source}"]
|
|
149
|
+
lines.append(f"SELECT * FROM {{{{ ref('{model_ref}') }}}}")
|
|
150
|
+
|
|
151
|
+
filt = step.config.get("filter")
|
|
152
|
+
if filt:
|
|
153
|
+
col = _quote_ident(filt["column"])
|
|
154
|
+
pred = filt["predicate"]
|
|
155
|
+
lines.append(f"WHERE {col} {pred} '{{{{ var(\"date_param\") }}}}'")
|
|
156
|
+
|
|
157
|
+
return "\n".join(lines)
|
|
158
|
+
|
|
159
|
+
def _gen_anonymize(self, step: ExecutionStep) -> str:
|
|
160
|
+
source = step.source or "source"
|
|
161
|
+
pii_cols = step.config.get("pii_columns", [])
|
|
162
|
+
strategy = step.config.get("strategy", "sha256")
|
|
163
|
+
|
|
164
|
+
if not pii_cols:
|
|
165
|
+
return f"-- No PII columns to anonymize for {source}"
|
|
166
|
+
|
|
167
|
+
col_list = ", ".join(_quote_ident(c) for c in pii_cols)
|
|
168
|
+
lines: list[str] = [
|
|
169
|
+
f"-- ANONYMIZE: {source} (strategy={strategy}, columns={col_list})",
|
|
170
|
+
]
|
|
171
|
+
for col in pii_cols:
|
|
172
|
+
safe = _quote_ident(col)
|
|
173
|
+
if strategy == "sha256":
|
|
174
|
+
lines.append(f"-- {safe} replaced with SHA256({safe})")
|
|
175
|
+
elif strategy == "drop":
|
|
176
|
+
lines.append(f"-- {safe} dropped")
|
|
177
|
+
else:
|
|
178
|
+
lines.append(f"-- {safe} anonymized ({strategy})")
|
|
179
|
+
return "\n".join(lines)
|
|
180
|
+
|
|
181
|
+
def _gen_join(self, step: ExecutionStep, plan: ExecutionPlan) -> str:
|
|
182
|
+
datasets = step.config.get("datasets", [])
|
|
183
|
+
merge_keys = step.config.get("merge_keys", ["id"])
|
|
184
|
+
|
|
185
|
+
if len(datasets) < 2:
|
|
186
|
+
return "-- Join requires at least 2 datasets"
|
|
187
|
+
|
|
188
|
+
left_alias = _cte_name(datasets[0])
|
|
189
|
+
right_alias = _cte_name(datasets[1])
|
|
190
|
+
left_ref = _ref_name(datasets[0])
|
|
191
|
+
right_ref = _ref_name(datasets[1])
|
|
192
|
+
on_clause = " AND ".join(
|
|
193
|
+
f"{left_alias}.{_quote_ident(k)} = {right_alias}.{_quote_ident(k)}"
|
|
194
|
+
for k in merge_keys
|
|
195
|
+
)
|
|
196
|
+
|
|
197
|
+
lines: list[str] = [f"-- JOIN: {datasets[0]} + {datasets[1]}"]
|
|
198
|
+
lines.append(f"SELECT {left_alias}.*, {right_alias}.*")
|
|
199
|
+
lines.append(f"FROM {{{{ ref('{left_ref}') }}}} {left_alias}")
|
|
200
|
+
lines.append(f"INNER JOIN {{{{ ref('{right_ref}') }}}} {right_alias}")
|
|
201
|
+
lines.append(f" ON {on_clause}")
|
|
202
|
+
|
|
203
|
+
return "\n".join(lines)
|
|
204
|
+
|
|
205
|
+
def _gen_aggregate(self, step: ExecutionStep) -> str:
|
|
206
|
+
group_by = step.config.get("group_by", [])
|
|
207
|
+
lines: list[str] = ["-- AGGREGATE"]
|
|
208
|
+
if not group_by:
|
|
209
|
+
lines.append("SELECT COUNT(*) AS cnt FROM joined_cte")
|
|
210
|
+
else:
|
|
211
|
+
cols = ", ".join(_quote_ident(c) for c in group_by)
|
|
212
|
+
lines.append(f"SELECT {cols}, COUNT(*) AS cnt")
|
|
213
|
+
lines.append("FROM joined_cte")
|
|
214
|
+
lines.append(f"GROUP BY {cols}")
|
|
215
|
+
return "\n".join(lines)
|
|
216
|
+
|
|
217
|
+
def _gen_load(self, step: ExecutionStep, plan: ExecutionPlan) -> str:
|
|
218
|
+
target = step.target or "target_model"
|
|
219
|
+
merge_keys = step.config.get("merge_keys", ["id"])
|
|
220
|
+
partition_by = step.config.get("partition_by", [])
|
|
221
|
+
|
|
222
|
+
lines: list[str] = [f"-- LOAD: {target}"]
|
|
223
|
+
lines.append(f"-- Model: {{{{ ref('{_ref_name(target)}') }}}}")
|
|
224
|
+
if merge_keys:
|
|
225
|
+
lines.append(f"-- Merge keys: {', '.join(merge_keys)}")
|
|
226
|
+
if partition_by:
|
|
227
|
+
lines.append(f"-- Partition by: {', '.join(partition_by)}")
|
|
228
|
+
lines.append("SELECT * FROM joined_cte")
|
|
229
|
+
return "\n".join(lines)
|
|
230
|
+
|
|
231
|
+
def _gen_filter(self, step: ExecutionStep) -> str:
|
|
232
|
+
condition = step.config.get("condition", "1=1")
|
|
233
|
+
return f"-- FILTER: WHERE {condition}"
|
|
234
|
+
|
|
235
|
+
def _gen_transform(self, step: ExecutionStep) -> str:
|
|
236
|
+
return f"-- TRANSFORM: {step.config}"
|
|
237
|
+
|
|
238
|
+
# ── Config & schema generation ────────────────────────────────────────
|
|
239
|
+
|
|
240
|
+
def _build_config(
|
|
241
|
+
self,
|
|
242
|
+
plan: ExecutionPlan,
|
|
243
|
+
partition_cols: list[str],
|
|
244
|
+
merge_keys: list[str],
|
|
245
|
+
) -> str:
|
|
246
|
+
materialized = "table" if plan.write_mode == WriteMode.MERGE else "incremental"
|
|
247
|
+
lines = [
|
|
248
|
+
"{{ config(",
|
|
249
|
+
f" materialized='{materialized}',",
|
|
250
|
+
]
|
|
251
|
+
if partition_cols:
|
|
252
|
+
field_list = ", ".join(f"'{p}'" for p in partition_cols)
|
|
253
|
+
lines.append(f" partition_by={{'field': [{field_list}], 'data_type': 'date'}},")
|
|
254
|
+
if merge_keys and plan.write_mode == WriteMode.MERGE:
|
|
255
|
+
key_list = ", ".join(f"'{k}'" for k in merge_keys)
|
|
256
|
+
lines.append(f" unique_key=['{key_list}'],")
|
|
257
|
+
lines.append(") }}")
|
|
258
|
+
return "\n".join(lines)
|
|
259
|
+
|
|
260
|
+
def _build_schema_yml(
|
|
261
|
+
self,
|
|
262
|
+
model_name: str,
|
|
263
|
+
plan: ExecutionPlan,
|
|
264
|
+
merge_keys: list[str],
|
|
265
|
+
partition_cols: list[str],
|
|
266
|
+
pii_columns_anonymized: set[str],
|
|
267
|
+
schema_columns: list[dict[str, object]],
|
|
268
|
+
) -> str:
|
|
269
|
+
lines = [
|
|
270
|
+
"version: 2",
|
|
271
|
+
"",
|
|
272
|
+
"models:",
|
|
273
|
+
f" - name: {_ref_name(model_name)}",
|
|
274
|
+
f' description: "Governed model generated by CFA. Intent hash: {plan.signature_hash}"',
|
|
275
|
+
]
|
|
276
|
+
|
|
277
|
+
columns: list[str] = list({
|
|
278
|
+
str(k) for k in merge_keys
|
|
279
|
+
} | {
|
|
280
|
+
str(p) for p in partition_cols
|
|
281
|
+
})
|
|
282
|
+
|
|
283
|
+
if columns:
|
|
284
|
+
lines.append(" columns:")
|
|
285
|
+
for col in sorted(columns):
|
|
286
|
+
lines.append(f" - name: {_quote_ident(col)}")
|
|
287
|
+
tests: list[str] = []
|
|
288
|
+
if col in merge_keys:
|
|
289
|
+
tests.append("not_null")
|
|
290
|
+
if len(merge_keys) == 1:
|
|
291
|
+
tests.append("unique")
|
|
292
|
+
if col in pii_columns_anonymized:
|
|
293
|
+
lines.append(' description: "PII column anonimizada via SHA256"')
|
|
294
|
+
for test in tests:
|
|
295
|
+
lines.append(" tests:")
|
|
296
|
+
lines.append(f" - {test}")
|
|
297
|
+
|
|
298
|
+
if len(merge_keys) > 1:
|
|
299
|
+
lines.append(" tests:")
|
|
300
|
+
lines.append(" - dbt_utils.unique_combination_of_columns:")
|
|
301
|
+
lines.append(" combination_of_columns:")
|
|
302
|
+
for k in merge_keys:
|
|
303
|
+
lines.append(f" - {_quote_ident(k)}")
|
|
304
|
+
|
|
305
|
+
return "\n".join(lines)
|
|
306
|
+
|
|
307
|
+
@staticmethod
|
|
308
|
+
def _derive_model_name(plan: ExecutionPlan) -> str:
|
|
309
|
+
target = plan.write_mode.value
|
|
310
|
+
domain = plan.metadata.get("domain", "governed")
|
|
311
|
+
layer = plan.metadata.get("target_layer", "silver")
|
|
312
|
+
return f"{layer}_{domain}_{target}"
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
# ── dbt-specific helpers ────────────────────────────────────────────────────
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def _ref_name(source: str) -> str:
|
|
319
|
+
clean = source.replace("-", "_").replace(".", "_").replace(" ", "_").lower()
|
|
320
|
+
return clean
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
def _cte_name(source: str) -> str:
    """Derive a short alias from *source*.

    Normalized names of up to 8 characters are used as-is. Longer names
    become ``<first part[:4]>_<last part[:4]>`` when they contain an
    underscore, otherwise their first 8 characters.
    """
    normalized = _ref_name(source)
    if len(normalized) > 8:
        pieces = normalized.split("_")
        if len(pieces) < 2:
            return normalized[:8]
        return f"{pieces[0][:4]}_{pieces[-1][:4]}"
    return normalized
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def _quote_ident(name: str) -> str:
|
|
334
|
+
return str(name)
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
def _var_name(source: str) -> str:
    """Return the normalized form of *source* prefixed with ``cte_``."""
    return "cte_" + _ref_name(source)
|