cfa-kernel 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. cfa/__init__.py +39 -0
  2. cfa/_lazy.py +39 -0
  3. cfa/adapters/__init__.py +104 -0
  4. cfa/adapters/autogen.py +19 -0
  5. cfa/adapters/crewai.py +19 -0
  6. cfa/adapters/dspy.py +19 -0
  7. cfa/adapters/langgraph.py +19 -0
  8. cfa/adapters/openai_agents.py +19 -0
  9. cfa/audit/__init__.py +15 -0
  10. cfa/audit/context.py +205 -0
  11. cfa/audit/hashing.py +41 -0
  12. cfa/audit/trail.py +194 -0
  13. cfa/backends/__init__.py +132 -0
  14. cfa/backends/dbt.py +338 -0
  15. cfa/backends/pyspark.py +240 -0
  16. cfa/backends/sql.py +270 -0
  17. cfa/behavior/__init__.py +49 -0
  18. cfa/behavior/llm.py +244 -0
  19. cfa/behavior/spec.py +235 -0
  20. cfa/behavior/systematizer.py +222 -0
  21. cfa/cli/__init__.py +296 -0
  22. cfa/cli/__main__.py +6 -0
  23. cfa/cli/_helpers.py +109 -0
  24. cfa/cli/core/__init__.py +0 -0
  25. cfa/cli/core/evaluate.py +72 -0
  26. cfa/cli/core/validate.py +29 -0
  27. cfa/cli/formatters.py +280 -0
  28. cfa/cli/governance/__init__.py +0 -0
  29. cfa/cli/governance/audit.py +65 -0
  30. cfa/cli/governance/catalog.py +28 -0
  31. cfa/cli/governance/policy.py +119 -0
  32. cfa/cli/governance/rules.py +42 -0
  33. cfa/cli/governance/signature.py +31 -0
  34. cfa/cli/infrastructure/__init__.py +0 -0
  35. cfa/cli/infrastructure/backend_list.py +24 -0
  36. cfa/cli/infrastructure/storage.py +87 -0
  37. cfa/cli/project/__init__.py +0 -0
  38. cfa/cli/project/init.py +73 -0
  39. cfa/cli/project/lifecycle.py +92 -0
  40. cfa/cli/project/status.py +75 -0
  41. cfa/cli/project/taxonomy.py +38 -0
  42. cfa/cli/reporting/__init__.py +0 -0
  43. cfa/cli/reporting/report.py +109 -0
  44. cfa/cli/reporting/serve.py +43 -0
  45. cfa/config.py +103 -0
  46. cfa/core/__init__.py +19 -0
  47. cfa/core/codegen.py +65 -0
  48. cfa/core/conditions.py +129 -0
  49. cfa/core/kernel.py +224 -0
  50. cfa/core/phases/__init__.py +0 -0
  51. cfa/core/phases/runner.py +477 -0
  52. cfa/core/planner.py +290 -0
  53. cfa/execution/__init__.py +12 -0
  54. cfa/execution/partial.py +339 -0
  55. cfa/execution/state_projection.py +216 -0
  56. cfa/governance/__init__.py +76 -0
  57. cfa/lifecycle/__init__.py +51 -0
  58. cfa/mcp/__init__.py +347 -0
  59. cfa/mcp/__main__.py +4 -0
  60. cfa/normalizer/__init__.py +15 -0
  61. cfa/normalizer/base.py +441 -0
  62. cfa/normalizer/llm.py +426 -0
  63. cfa/observability/__init__.py +14 -0
  64. cfa/observability/indices.py +177 -0
  65. cfa/observability/metrics.py +91 -0
  66. cfa/observability/notify.py +79 -0
  67. cfa/observability/otel.py +81 -0
  68. cfa/observability/promotion.py +367 -0
  69. cfa/policy/__init__.py +12 -0
  70. cfa/policy/bundle.py +317 -0
  71. cfa/policy/catalog.py +117 -0
  72. cfa/policy/engine.py +306 -0
  73. cfa/reporting/__init__.py +42 -0
  74. cfa/reporting/charts.py +223 -0
  75. cfa/reporting/engine.py +456 -0
  76. cfa/resolution/__init__.py +62 -0
  77. cfa/runtime/__init__.py +13 -0
  78. cfa/runtime/gate.py +287 -0
  79. cfa/sandbox/__init__.py +189 -0
  80. cfa/sandbox/executor.py +92 -0
  81. cfa/sandbox/mock.py +89 -0
  82. cfa/sandbox/panic.py +52 -0
  83. cfa/storage/__init__.py +591 -0
  84. cfa/testing/__init__.py +60 -0
  85. cfa/testing/asserts.py +77 -0
  86. cfa/testing/evaluate.py +168 -0
  87. cfa/testing/fixtures.py +89 -0
  88. cfa/testing/markers.py +36 -0
  89. cfa/types.py +489 -0
  90. cfa/validation/__init__.py +14 -0
  91. cfa/validation/runtime.py +285 -0
  92. cfa/validation/signature.py +146 -0
  93. cfa/validation/static.py +252 -0
  94. cfa_kernel-0.1.0.dist-info/METADATA +32 -0
  95. cfa_kernel-0.1.0.dist-info/RECORD +98 -0
  96. cfa_kernel-0.1.0.dist-info/WHEEL +4 -0
  97. cfa_kernel-0.1.0.dist-info/entry_points.txt +3 -0
  98. cfa_kernel-0.1.0.dist-info/licenses/LICENSE +21 -0
cfa/audit/trail.py ADDED
@@ -0,0 +1,194 @@
1
+ """
2
+ CFA Audit Trail
3
+ ================
4
+ Append-only, causally ordered record of all decision events.
5
+
6
+ Phase 1: in-memory list.
7
+ Phase 4: persistent backend (JSON Lines file, extensible to S3/Kafka/OpenLineage).
8
+
9
+ Properties (Invariant I5):
10
+ - Immutable after write
11
+ - Causal ordering per intent_id
12
+ - Complete per intent (start and end recorded)
13
+ - Cryptographic hash chain for tamper detection
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import hashlib
19
+ import json
20
+ from abc import ABC, abstractmethod
21
+ from dataclasses import asdict, dataclass, field
22
+ from pathlib import Path
23
+ from typing import Any
24
+
25
+ from cfa.types import _utcnow
26
+
27
+
28
@dataclass
class AuditEvent:
    """One typed record in the audit trail.

    The two hash fields default to empty strings; in this module they are
    filled in by ``AuditTrail.record`` when the event is chained.
    """

    # What happened, and for which intent.
    intent_id: str
    stage: str
    event_type: str
    outcome: str
    policy_bundle_version: str = ""
    # Free-form payload; ``AuditTrail.record`` stores its ``**details`` kwargs here.
    details: dict[str, Any] = field(default_factory=dict)
    # ISO-8601 text taken from ``_utcnow()`` when the event is constructed.
    timestamp: str = field(default_factory=lambda: _utcnow().isoformat())
    # Hash-chain links: this event's digest and the digest of its predecessor.
    event_hash: str = ""
    previous_hash: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the event as a plain dict built by ``dataclasses.asdict``."""
        as_mapping = asdict(self)
        return as_mapping
44
+
45
+
46
+ # ── Audit Storage Backend ───────────────────────────────────────────────────
47
+
48
+
49
class AuditStorageBackend(ABC):
    """Extension point: pluggable persistence for audit events.

    ``AuditTrail`` treats the last element returned by ``load_all`` as the
    current head of the hash chain.
    """

    @abstractmethod
    def append(self, event: AuditEvent) -> None:
        """Add ``event`` to the store."""

    @abstractmethod
    def load_all(self) -> list[AuditEvent]:
        """Return every stored event."""

    @abstractmethod
    def load_by_intent(self, intent_id: str) -> list[AuditEvent]:
        """Return the stored events whose ``intent_id`` equals the argument."""
60
+
61
+
62
class InMemoryAuditStorage(AuditStorageBackend):
    """List-backed storage for testing; events live only in this object."""

    def __init__(self) -> None:
        self._events: list[AuditEvent] = []

    def append(self, event: AuditEvent) -> None:
        """Add ``event`` to the end of the internal list."""
        self._events.append(event)

    def load_all(self) -> list[AuditEvent]:
        """Return a shallow copy so callers cannot mutate the internal list."""
        return self._events.copy()

    def load_by_intent(self, intent_id: str) -> list[AuditEvent]:
        """Return, in insertion order, the events recorded for ``intent_id``."""
        matching: list[AuditEvent] = []
        for stored in self._events:
            if stored.intent_id == intent_id:
                matching.append(stored)
        return matching
76
+
77
+
78
class JsonLinesAuditStorage(AuditStorageBackend):
    """
    JSON Lines file-based persistent audit storage.
    Each line is a JSON-serialized AuditEvent.
    Append-only — never modifies existing lines (Invariant I5).
    """

    def __init__(self, file_path: str | Path) -> None:
        """Remember ``file_path`` and create its parent directory if missing."""
        self.file_path = Path(file_path)
        self.file_path.parent.mkdir(parents=True, exist_ok=True)

    def append(self, event: AuditEvent) -> None:
        """Serialize ``event`` and append it to the file as one line.

        Values that ``json`` cannot encode natively are stringified
        (``default=str``).
        """
        # Serialize before opening the file so a serialization error never
        # leaves a partially written line behind.
        record = json.dumps(event.to_dict(), default=str)
        with open(self.file_path, "a", encoding="utf-8") as f:
            f.write(record + "\n")

    def load_all(self) -> list[AuditEvent]:
        """Read every event from the file, in file order.

        Returns an empty list when the file does not exist. Blank and
        whitespace-only lines are skipped instead of being handed to
        ``json.loads``.

        Raises:
            json.JSONDecodeError: if a non-blank line is not valid JSON.
            TypeError: if a line holds keys ``AuditEvent`` does not accept.
        """
        events: list[AuditEvent] = []
        try:
            # Stream line by line rather than loading the whole log into memory.
            with open(self.file_path, encoding="utf-8") as f:
                for raw_line in f:
                    line = raw_line.strip()
                    if not line:
                        continue
                    events.append(AuditEvent(**json.loads(line)))
        except FileNotFoundError:
            return []
        return events

    def load_by_intent(self, intent_id: str) -> list[AuditEvent]:
        """Return the events for ``intent_id`` (re-reads the whole file)."""
        return [e for e in self.load_all() if e.intent_id == intent_id]
105
+
106
+
107
+ # ── Audit Trail ─────────────────────────────────────────────────────────────
108
+
109
+
110
class AuditTrail:
    """
    Append-only audit trail with hash chain.

    Two consumption modes (per whitepaper):
    - Operational (engineering): JSON, debugging, tuning
    - Regulatory (audit): normalized + cryptographic hash chain

    The hash chain ensures tamper detection: each event's hash
    includes the previous event's hash, creating a causal chain.
    """

    def __init__(self, storage: AuditStorageBackend | None = None) -> None:
        """Attach to ``storage`` (in-memory when omitted) and resume its chain.

        The chain head is restored from the last event the backend returns.
        """
        # Compare against None explicitly: a backend that defines ``__len__``
        # or ``__bool__`` may be falsy while empty and must not be swapped
        # for an in-memory store.
        self._storage = InMemoryAuditStorage() if storage is None else storage
        self._last_hash = ""
        # Restore hash chain from existing events
        existing = self._storage.load_all()
        if existing:
            self._last_hash = existing[-1].event_hash

    def record(
        self,
        intent_id: str,
        stage: str,
        event_type: str,
        outcome: str,
        policy_bundle_version: str = "",
        **details: Any,
    ) -> AuditEvent:
        """Create, chain, persist, and return a new event.

        Extra keyword arguments become the event's ``details`` dict. Any
        exception raised by the storage backend propagates; in that case the
        chain head is left untouched.
        """
        event = AuditEvent(
            intent_id=intent_id,
            stage=stage,
            event_type=event_type,
            outcome=outcome,
            policy_bundle_version=policy_bundle_version,
            details=details,
            previous_hash=self._last_hash,
        )
        event.event_hash = self._compute_hash(event)
        # Persist first and only then advance the head. If the write fails,
        # the next event must still link to the last event actually stored,
        # otherwise verify_chain() would report a broken chain.
        self._storage.append(event)
        self._last_hash = event.event_hash
        return event

    def get_events_for_intent(self, intent_id: str) -> list[AuditEvent]:
        """Return the stored events for ``intent_id``."""
        return self._storage.load_by_intent(intent_id)

    def get_all_events(self) -> list[AuditEvent]:
        """Return every stored event."""
        return self._storage.load_all()

    @property
    def event_count(self) -> int:
        """Number of stored events (loads all events from the backend)."""
        return len(self._storage.load_all())

    def verify_chain(self) -> bool:
        """Verify the integrity of the hash chain. Returns True if valid.

        Each event must point at its predecessor's hash (the first at ``""``)
        and carry either the full digest or the truncated legacy digest of
        its own content.
        """
        prev_hash = ""
        for event in self._storage.load_all():
            if event.previous_hash != prev_hash:
                return False
            expected = self._compute_hash(event)
            legacy_expected = self._compute_hash_legacy(event)
            if event.event_hash not in (expected, legacy_expected):
                return False
            prev_hash = event.event_hash
        return True

    @staticmethod
    def _compute_hash(event: AuditEvent) -> str:
        """Return the SHA-256 hex digest of the event's chained fields.

        The payload layout must stay exactly as is: changing it would make
        previously stored events fail verification.
        """
        # NOTE(review): ``policy_bundle_version`` is not part of the digest,
        # so changes to that field are not detected by verify_chain().
        payload = json.dumps({
            "intent_id": event.intent_id,
            "stage": event.stage,
            "event_type": event.event_type,
            "outcome": event.outcome,
            "timestamp": event.timestamp,
            "previous_hash": event.previous_hash,
            "details": event.details,
        }, sort_keys=True, default=str)
        return hashlib.sha256(payload.encode()).hexdigest()

    @staticmethod
    def _compute_hash_legacy(event: AuditEvent) -> str:
        """Return the pre-3.1 truncated hash for backward verification only."""
        return AuditTrail._compute_hash(event)[:16]
cfa/backends/__init__.py ADDED
@@ -0,0 +1,132 @@
1
+ """
2
+ CFA Backend Registry
3
+ ====================
4
+ Pluggable backends for code generation and execution.
5
+
6
+ A BackendAdapter extends CodeGenBackend with capability introspection,
7
+ enabling the Policy Engine to validate whether a backend can satisfy
8
+ a given intent's constraints before attempting code generation.
9
+
10
+ Usage:
11
+ from cfa.backends import BackendRegistry, BackendAdapter, BackendCapabilities
12
+
13
+ registry = BackendRegistry()
14
+ registry.register("pyspark", PySparkBackend)
15
+ registry.register("duckdb", DuckDBBackend)
16
+
17
+ backend = registry.get("pyspark")()
18
+ caps = backend.get_capabilities()
19
+ """
20
+
21
from __future__ import annotations

import threading
from abc import abstractmethod
from collections.abc import Callable
from dataclasses import dataclass, field
from typing import Any

from cfa.core.codegen import CodeGenBackend
from cfa.validation.static import ForbiddenToken
30
+
31
+
32
@dataclass
class BackendCapabilities:
    """Capabilities exposed by a backend for policy validation.

    The PolicyEngine queries these before approving an intent —
    if a backend cannot satisfy a required constraint (e.g., merge_key
    on a Silver write), the intent is blocked before code generation.
    """

    # Identification.
    backend_name: str = ""
    backend_version: str = ""

    # Feature switches; every one is off unless a backend opts in.
    supports_merge: bool = False
    supports_partition_overwrite: bool = False
    supports_anonymization: bool = False
    supports_schema_enforcement: bool = False

    # Mutable defaults are built per instance through default_factory.
    pii_anonymization_methods: list[str] = field(default_factory=lambda: ["sha256", "drop"])
    cost_model_available: bool = False
    max_recommended_rows: int = 10_000_000
    supported_languages: list[str] = field(default_factory=lambda: ["python"])

    # Patterns the backend declares as forbidden in its generated code.
    forbidden_tokens: list[ForbiddenToken] = field(default_factory=list)

    # Free-form, backend-specific extras.
    custom: dict[str, Any] = field(default_factory=dict)
57
+
58
+
59
class BackendAdapter(CodeGenBackend):
    """Extension of CodeGenBackend with capability introspection.

    Implement this to create a pluggable backend for any target:
    PySpark, DuckDB, BigQuery, REST API, LLM chain, etc.
    """

    @abstractmethod
    def get_capabilities(self) -> BackendCapabilities:
        """Return the ``BackendCapabilities`` describing this backend."""


# Zero-argument callable that builds a backend; the value type kept by
# BackendRegistry.
BackendFactory = Callable[[], BackendAdapter]
71
+
72
+
73
class BackendRegistry:
    """Global registry of available backend factories.

    Supports:
    - register(name, factory) — add a backend
    - get(name) -> factory — retrieve by name
    - list() -> list of registered names
    - remove(name) — unregister a backend
    """

    _instance: BackendRegistry | None = None
    # Created once at class-definition time. Creating the lock lazily inside
    # singleton() was itself a race: two threads could each install their
    # own lock and both enter the critical section.
    _lock: Any = threading.Lock()

    def __init__(self) -> None:
        self._backends: dict[str, BackendFactory] = {}

    @classmethod
    def singleton(cls) -> BackendRegistry:
        """Return the shared registry, creating and bootstrapping it once.

        Raises whatever ``_bootstrap`` raises (e.g. ``ImportError``). A
        failed bootstrap is not cached, so the next call retries.
        """
        with cls._lock:
            if cls._instance is None:
                # Publish only after a successful bootstrap; otherwise an
                # import failure would leave an empty registry installed.
                instance = cls()
                instance._bootstrap()
                cls._instance = instance
            return cls._instance

    def _bootstrap(self) -> None:
        """Register the built-in dbt, pyspark and sql backends.

        Does nothing when the registry already holds any backend.
        """
        if self._backends:
            return
        # Imported here rather than at module top: the backend modules
        # import names from this package.
        from .dbt import DbtBackend  # noqa: F811
        from .pyspark import PySparkBackend  # noqa: F811
        from .sql import SqlBackend  # noqa: F811

        self.register("dbt", lambda: DbtBackend())
        self.register("pyspark", lambda: PySparkBackend())
        self.register("sql", lambda: SqlBackend())

    def register(self, name: str, factory: BackendFactory) -> None:
        """Store ``factory`` under ``name``, replacing any previous entry."""
        self._backends[name] = factory

    def get(self, name: str) -> BackendFactory:
        """Return the factory registered as ``name``.

        Raises:
            KeyError: if ``name`` is unknown; the message lists the
                registered names.
        """
        try:
            return self._backends[name]
        except KeyError:
            available = ", ".join(sorted(self._backends))
            raise KeyError(
                f"Unknown backend '{name}'. Registered backends: {available or '(none)'}"
            ) from None

    def list(self) -> list[str]:
        """Return the registered names in sorted order."""
        return sorted(self._backends)

    def remove(self, name: str) -> None:
        """Unregister ``name``; unknown names are ignored."""
        self._backends.pop(name, None)

    def clear(self) -> None:
        """Unregister every backend."""
        self._backends.clear()

    def __contains__(self, name: str) -> bool:
        return name in self._backends
cfa/backends/dbt.py ADDED
@@ -0,0 +1,338 @@
1
+ """
2
+ dbt Backend
3
+ ============
4
+ Code generation backend targeting dbt (data build tool).
5
+
6
+ Generates governed dbt models from an ExecutionPlan. Output includes:
7
+ - ``model.sql`` with dbt Jinja templating and governed SQL
8
+ - ``schema.yml`` with data quality tests derived from governance rules
9
+ - Partition configs, merge keys as uniqueness tests, PII annotations
10
+
11
+ Template-based — no LLM involved.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from cfa.core.codegen import GeneratedCode
17
+ from cfa.core.planner import ExecutionPlan, ExecutionStep, StepType, WriteMode
18
+ from cfa.types import FaultSeverity
19
+ from cfa.validation.static import ForbiddenToken
20
+
21
+ from . import BackendAdapter, BackendCapabilities
22
+
23
# Statements that must never appear in a governed dbt model. Each row is
# (regex pattern, fault code, severity, message).
_DBT_FORBIDDEN_TOKENS: list[ForbiddenToken] = [
    ForbiddenToken(
        pattern=pattern,
        fault_code=fault_code,
        severity=severity,
        message=message,
        is_regex=True,
    )
    for pattern, fault_code, severity, message in (
        (
            r"\bDROP\s+TABLE\b",
            "STATIC_DBT_DROP_TABLE",
            FaultSeverity.CRITICAL,
            "DROP TABLE in governed dbt model forbidden.",
        ),
        (
            r"\bDROP\s+DATABASE\b",
            "STATIC_DBT_DROP_DATABASE",
            FaultSeverity.CRITICAL,
            "DROP DATABASE in governed dbt model forbidden.",
        ),
        (
            r"\bTRUNCATE\b",
            "STATIC_DBT_TRUNCATE",
            FaultSeverity.CRITICAL,
            "TRUNCATE forbidden in dbt governed models.",
        ),
        (
            r"\bDELETE\s+FROM\b",
            "STATIC_DBT_DELETE",
            FaultSeverity.CRITICAL,
            "DELETE FROM forbidden in dbt governed models.",
        ),
        (
            r"\bALTER\s+TABLE\b",
            "STATIC_DBT_ALTER_TABLE",
            FaultSeverity.HIGH,
            "ALTER TABLE forbidden — use dbt model rebuild.",
        ),
    )
]
40
+
41
+
42
+ class DbtBackend(BackendAdapter):
43
+ """Generates governed dbt model + schema.yml from an ExecutionPlan."""
44
+
45
+ def get_capabilities(self) -> BackendCapabilities:
46
+ return BackendCapabilities(
47
+ backend_name="dbt",
48
+ backend_version="dbt-core-1.x",
49
+ supports_merge=True,
50
+ supports_partition_overwrite=True,
51
+ supports_anonymization=True,
52
+ supports_schema_enforcement=True,
53
+ pii_anonymization_methods=["sha256", "drop", "md5", "tokenize"],
54
+ cost_model_available=False,
55
+ max_recommended_rows=500_000_000,
56
+ supported_languages=["sql", "yaml"],
57
+ forbidden_tokens=_DBT_FORBIDDEN_TOKENS,
58
+ )
59
+
60
+ def generate(self, plan: ExecutionPlan) -> GeneratedCode:
61
+ sql_lines: list[str] = []
62
+ schema_columns: list[dict[str, object]] = []
63
+ model_name = self._derive_model_name(plan)
64
+ pii_columns_anonymized: set[str] = set()
65
+ merge_keys: list[str] = []
66
+ partition_cols: list[str] = []
67
+ step_code: dict[str, str] = {}
68
+ ordered = plan.execution_order()
69
+
70
+ for step in ordered:
71
+ code = self._generate_step(step, plan)
72
+ step_code[step.id] = code
73
+ sql_lines.append(f"-- Step: {step.id} ({step.step_type.value})")
74
+ sql_lines.append(code)
75
+ sql_lines.append("")
76
+
77
+ merge_keys = step.config.get("merge_keys", merge_keys)
78
+ partition_cols = step.config.get("partition_by", partition_cols)
79
+ if step.step_type == StepType.ANONYMIZE:
80
+ for col in step.config.get("pii_columns", []):
81
+ pii_columns_anonymized.add(str(col))
82
+
83
+ schema = self._build_schema_yml(
84
+ model_name=model_name,
85
+ plan=plan,
86
+ merge_keys=merge_keys,
87
+ partition_cols=partition_cols,
88
+ pii_columns_anonymized=pii_columns_anonymized,
89
+ schema_columns=schema_columns,
90
+ )
91
+
92
+ config_block = self._build_config(
93
+ plan=plan,
94
+ partition_cols=partition_cols,
95
+ merge_keys=merge_keys,
96
+ )
97
+
98
+ full_body = "\n".join(sql_lines)
99
+ model_sql = f"{config_block}\n\n{full_body}"
100
+
101
+ combined = (
102
+ f"-- dbt model: {model_name}\n"
103
+ f"-- ============={'=' * len(model_name)}\n\n"
104
+ f"{model_sql}\n\n"
105
+ f"-- schema.yml (generated by CFA governance)\n"
106
+ f"-- ==========================================\n\n"
107
+ f"{schema}"
108
+ )
109
+
110
+ return GeneratedCode(
111
+ plan_signature_hash=plan.signature_hash,
112
+ intent_id=plan.intent_id,
113
+ language="dbt",
114
+ code=combined,
115
+ step_code_map=step_code,
116
+ metadata={
117
+ "model_name": model_name,
118
+ "write_mode": plan.write_mode.value,
119
+ "consistency_unit": plan.consistency_unit.value,
120
+ "step_count": plan.step_count,
121
+ },
122
+ )
123
+
124
+ # ── Step generators ───────────────────────────────────────────────────
125
+
126
+ def _generate_step(self, step: ExecutionStep, plan: ExecutionPlan) -> str:
127
+ match step.step_type:
128
+ case StepType.EXTRACT:
129
+ return self._gen_extract(step)
130
+ case StepType.ANONYMIZE:
131
+ return self._gen_anonymize(step)
132
+ case StepType.JOIN:
133
+ return self._gen_join(step, plan)
134
+ case StepType.AGGREGATE:
135
+ return self._gen_aggregate(step)
136
+ case StepType.LOAD:
137
+ return self._gen_load(step, plan)
138
+ case StepType.FILTER:
139
+ return self._gen_filter(step)
140
+ case StepType.TRANSFORM:
141
+ return self._gen_transform(step)
142
+ case _:
143
+ return f"-- TODO: unsupported step type {step.step_type.value}"
144
+
145
+ def _gen_extract(self, step: ExecutionStep) -> str:
146
+ source = step.source or "unknown_source"
147
+ model_ref = _ref_name(source)
148
+ lines = [f"-- EXTRACT: {source}"]
149
+ lines.append(f"SELECT * FROM {{{{ ref('{model_ref}') }}}}")
150
+
151
+ filt = step.config.get("filter")
152
+ if filt:
153
+ col = _quote_ident(filt["column"])
154
+ pred = filt["predicate"]
155
+ lines.append(f"WHERE {col} {pred} '{{{{ var(\"date_param\") }}}}'")
156
+
157
+ return "\n".join(lines)
158
+
159
+ def _gen_anonymize(self, step: ExecutionStep) -> str:
160
+ source = step.source or "source"
161
+ pii_cols = step.config.get("pii_columns", [])
162
+ strategy = step.config.get("strategy", "sha256")
163
+
164
+ if not pii_cols:
165
+ return f"-- No PII columns to anonymize for {source}"
166
+
167
+ col_list = ", ".join(_quote_ident(c) for c in pii_cols)
168
+ lines: list[str] = [
169
+ f"-- ANONYMIZE: {source} (strategy={strategy}, columns={col_list})",
170
+ ]
171
+ for col in pii_cols:
172
+ safe = _quote_ident(col)
173
+ if strategy == "sha256":
174
+ lines.append(f"-- {safe} replaced with SHA256({safe})")
175
+ elif strategy == "drop":
176
+ lines.append(f"-- {safe} dropped")
177
+ else:
178
+ lines.append(f"-- {safe} anonymized ({strategy})")
179
+ return "\n".join(lines)
180
+
181
+ def _gen_join(self, step: ExecutionStep, plan: ExecutionPlan) -> str:
182
+ datasets = step.config.get("datasets", [])
183
+ merge_keys = step.config.get("merge_keys", ["id"])
184
+
185
+ if len(datasets) < 2:
186
+ return "-- Join requires at least 2 datasets"
187
+
188
+ left_alias = _cte_name(datasets[0])
189
+ right_alias = _cte_name(datasets[1])
190
+ left_ref = _ref_name(datasets[0])
191
+ right_ref = _ref_name(datasets[1])
192
+ on_clause = " AND ".join(
193
+ f"{left_alias}.{_quote_ident(k)} = {right_alias}.{_quote_ident(k)}"
194
+ for k in merge_keys
195
+ )
196
+
197
+ lines: list[str] = [f"-- JOIN: {datasets[0]} + {datasets[1]}"]
198
+ lines.append(f"SELECT {left_alias}.*, {right_alias}.*")
199
+ lines.append(f"FROM {{{{ ref('{left_ref}') }}}} {left_alias}")
200
+ lines.append(f"INNER JOIN {{{{ ref('{right_ref}') }}}} {right_alias}")
201
+ lines.append(f" ON {on_clause}")
202
+
203
+ return "\n".join(lines)
204
+
205
+ def _gen_aggregate(self, step: ExecutionStep) -> str:
206
+ group_by = step.config.get("group_by", [])
207
+ lines: list[str] = ["-- AGGREGATE"]
208
+ if not group_by:
209
+ lines.append("SELECT COUNT(*) AS cnt FROM joined_cte")
210
+ else:
211
+ cols = ", ".join(_quote_ident(c) for c in group_by)
212
+ lines.append(f"SELECT {cols}, COUNT(*) AS cnt")
213
+ lines.append("FROM joined_cte")
214
+ lines.append(f"GROUP BY {cols}")
215
+ return "\n".join(lines)
216
+
217
+ def _gen_load(self, step: ExecutionStep, plan: ExecutionPlan) -> str:
218
+ target = step.target or "target_model"
219
+ merge_keys = step.config.get("merge_keys", ["id"])
220
+ partition_by = step.config.get("partition_by", [])
221
+
222
+ lines: list[str] = [f"-- LOAD: {target}"]
223
+ lines.append(f"-- Model: {{{{ ref('{_ref_name(target)}') }}}}")
224
+ if merge_keys:
225
+ lines.append(f"-- Merge keys: {', '.join(merge_keys)}")
226
+ if partition_by:
227
+ lines.append(f"-- Partition by: {', '.join(partition_by)}")
228
+ lines.append("SELECT * FROM joined_cte")
229
+ return "\n".join(lines)
230
+
231
+ def _gen_filter(self, step: ExecutionStep) -> str:
232
+ condition = step.config.get("condition", "1=1")
233
+ return f"-- FILTER: WHERE {condition}"
234
+
235
+ def _gen_transform(self, step: ExecutionStep) -> str:
236
+ return f"-- TRANSFORM: {step.config}"
237
+
238
+ # ── Config & schema generation ────────────────────────────────────────
239
+
240
+ def _build_config(
241
+ self,
242
+ plan: ExecutionPlan,
243
+ partition_cols: list[str],
244
+ merge_keys: list[str],
245
+ ) -> str:
246
+ materialized = "table" if plan.write_mode == WriteMode.MERGE else "incremental"
247
+ lines = [
248
+ "{{ config(",
249
+ f" materialized='{materialized}',",
250
+ ]
251
+ if partition_cols:
252
+ field_list = ", ".join(f"'{p}'" for p in partition_cols)
253
+ lines.append(f" partition_by={{'field': [{field_list}], 'data_type': 'date'}},")
254
+ if merge_keys and plan.write_mode == WriteMode.MERGE:
255
+ key_list = ", ".join(f"'{k}'" for k in merge_keys)
256
+ lines.append(f" unique_key=['{key_list}'],")
257
+ lines.append(") }}")
258
+ return "\n".join(lines)
259
+
260
+ def _build_schema_yml(
261
+ self,
262
+ model_name: str,
263
+ plan: ExecutionPlan,
264
+ merge_keys: list[str],
265
+ partition_cols: list[str],
266
+ pii_columns_anonymized: set[str],
267
+ schema_columns: list[dict[str, object]],
268
+ ) -> str:
269
+ lines = [
270
+ "version: 2",
271
+ "",
272
+ "models:",
273
+ f" - name: {_ref_name(model_name)}",
274
+ f' description: "Governed model generated by CFA. Intent hash: {plan.signature_hash}"',
275
+ ]
276
+
277
+ columns: list[str] = list({
278
+ str(k) for k in merge_keys
279
+ } | {
280
+ str(p) for p in partition_cols
281
+ })
282
+
283
+ if columns:
284
+ lines.append(" columns:")
285
+ for col in sorted(columns):
286
+ lines.append(f" - name: {_quote_ident(col)}")
287
+ tests: list[str] = []
288
+ if col in merge_keys:
289
+ tests.append("not_null")
290
+ if len(merge_keys) == 1:
291
+ tests.append("unique")
292
+ if col in pii_columns_anonymized:
293
+ lines.append(' description: "PII column anonimizada via SHA256"')
294
+ for test in tests:
295
+ lines.append(" tests:")
296
+ lines.append(f" - {test}")
297
+
298
+ if len(merge_keys) > 1:
299
+ lines.append(" tests:")
300
+ lines.append(" - dbt_utils.unique_combination_of_columns:")
301
+ lines.append(" combination_of_columns:")
302
+ for k in merge_keys:
303
+ lines.append(f" - {_quote_ident(k)}")
304
+
305
+ return "\n".join(lines)
306
+
307
+ @staticmethod
308
+ def _derive_model_name(plan: ExecutionPlan) -> str:
309
+ target = plan.write_mode.value
310
+ domain = plan.metadata.get("domain", "governed")
311
+ layer = plan.metadata.get("target_layer", "silver")
312
+ return f"{layer}_{domain}_{target}"
313
+
314
+
315
+ # ── dbt-specific helpers ────────────────────────────────────────────────────
316
+
317
+
318
+ def _ref_name(source: str) -> str:
319
+ clean = source.replace("-", "_").replace(".", "_").replace(" ", "_").lower()
320
+ return clean
321
+
322
+
323
def _cte_name(source: str) -> str:
    """Return a short alias (at most 9 characters) derived from ``source``.

    Normalized names of up to 8 characters are used as-is. Longer names made
    of several ``_``-separated parts become the first four characters of the
    first part, ``_``, and the first four of the last part. Longer names
    without ``_`` are cut to 8 characters.
    """
    clean = _ref_name(source)
    if len(clean) > 8:
        head, *remainder = clean.split("_")
        if remainder:
            return f"{head[:4]}_{remainder[-1][:4]}"
        return clean[:8]
    return clean
331
+
332
+
333
+ def _quote_ident(name: str) -> str:
334
+ return str(name)
335
+
336
+
337
def _var_name(source: str) -> str:
    """Return the normalized form of ``source`` prefixed with ``cte_``."""
    return "cte_" + _ref_name(source)