cfa-kernel 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. cfa/__init__.py +39 -0
  2. cfa/_lazy.py +39 -0
  3. cfa/adapters/__init__.py +104 -0
  4. cfa/adapters/autogen.py +19 -0
  5. cfa/adapters/crewai.py +19 -0
  6. cfa/adapters/dspy.py +19 -0
  7. cfa/adapters/langgraph.py +19 -0
  8. cfa/adapters/openai_agents.py +19 -0
  9. cfa/audit/__init__.py +15 -0
  10. cfa/audit/context.py +205 -0
  11. cfa/audit/hashing.py +41 -0
  12. cfa/audit/trail.py +194 -0
  13. cfa/backends/__init__.py +132 -0
  14. cfa/backends/dbt.py +338 -0
  15. cfa/backends/pyspark.py +240 -0
  16. cfa/backends/sql.py +270 -0
  17. cfa/behavior/__init__.py +49 -0
  18. cfa/behavior/llm.py +244 -0
  19. cfa/behavior/spec.py +235 -0
  20. cfa/behavior/systematizer.py +222 -0
  21. cfa/cli/__init__.py +296 -0
  22. cfa/cli/__main__.py +6 -0
  23. cfa/cli/_helpers.py +109 -0
  24. cfa/cli/core/__init__.py +0 -0
  25. cfa/cli/core/evaluate.py +72 -0
  26. cfa/cli/core/validate.py +29 -0
  27. cfa/cli/formatters.py +280 -0
  28. cfa/cli/governance/__init__.py +0 -0
  29. cfa/cli/governance/audit.py +65 -0
  30. cfa/cli/governance/catalog.py +28 -0
  31. cfa/cli/governance/policy.py +119 -0
  32. cfa/cli/governance/rules.py +42 -0
  33. cfa/cli/governance/signature.py +31 -0
  34. cfa/cli/infrastructure/__init__.py +0 -0
  35. cfa/cli/infrastructure/backend_list.py +24 -0
  36. cfa/cli/infrastructure/storage.py +87 -0
  37. cfa/cli/project/__init__.py +0 -0
  38. cfa/cli/project/init.py +73 -0
  39. cfa/cli/project/lifecycle.py +92 -0
  40. cfa/cli/project/status.py +75 -0
  41. cfa/cli/project/taxonomy.py +38 -0
  42. cfa/cli/reporting/__init__.py +0 -0
  43. cfa/cli/reporting/report.py +109 -0
  44. cfa/cli/reporting/serve.py +43 -0
  45. cfa/config.py +103 -0
  46. cfa/core/__init__.py +19 -0
  47. cfa/core/codegen.py +65 -0
  48. cfa/core/conditions.py +129 -0
  49. cfa/core/kernel.py +224 -0
  50. cfa/core/phases/__init__.py +0 -0
  51. cfa/core/phases/runner.py +477 -0
  52. cfa/core/planner.py +290 -0
  53. cfa/execution/__init__.py +12 -0
  54. cfa/execution/partial.py +339 -0
  55. cfa/execution/state_projection.py +216 -0
  56. cfa/governance/__init__.py +76 -0
  57. cfa/lifecycle/__init__.py +51 -0
  58. cfa/mcp/__init__.py +347 -0
  59. cfa/mcp/__main__.py +4 -0
  60. cfa/normalizer/__init__.py +15 -0
  61. cfa/normalizer/base.py +441 -0
  62. cfa/normalizer/llm.py +426 -0
  63. cfa/observability/__init__.py +14 -0
  64. cfa/observability/indices.py +177 -0
  65. cfa/observability/metrics.py +91 -0
  66. cfa/observability/notify.py +79 -0
  67. cfa/observability/otel.py +81 -0
  68. cfa/observability/promotion.py +367 -0
  69. cfa/policy/__init__.py +12 -0
  70. cfa/policy/bundle.py +317 -0
  71. cfa/policy/catalog.py +117 -0
  72. cfa/policy/engine.py +306 -0
  73. cfa/reporting/__init__.py +42 -0
  74. cfa/reporting/charts.py +223 -0
  75. cfa/reporting/engine.py +456 -0
  76. cfa/resolution/__init__.py +62 -0
  77. cfa/runtime/__init__.py +13 -0
  78. cfa/runtime/gate.py +287 -0
  79. cfa/sandbox/__init__.py +189 -0
  80. cfa/sandbox/executor.py +92 -0
  81. cfa/sandbox/mock.py +89 -0
  82. cfa/sandbox/panic.py +52 -0
  83. cfa/storage/__init__.py +591 -0
  84. cfa/testing/__init__.py +60 -0
  85. cfa/testing/asserts.py +77 -0
  86. cfa/testing/evaluate.py +168 -0
  87. cfa/testing/fixtures.py +89 -0
  88. cfa/testing/markers.py +36 -0
  89. cfa/types.py +489 -0
  90. cfa/validation/__init__.py +14 -0
  91. cfa/validation/runtime.py +285 -0
  92. cfa/validation/signature.py +146 -0
  93. cfa/validation/static.py +252 -0
  94. cfa_kernel-0.1.0.dist-info/METADATA +32 -0
  95. cfa_kernel-0.1.0.dist-info/RECORD +98 -0
  96. cfa_kernel-0.1.0.dist-info/WHEEL +4 -0
  97. cfa_kernel-0.1.0.dist-info/entry_points.txt +3 -0
  98. cfa_kernel-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,240 @@
1
+ """
2
+ PySpark Backend
3
+ ===============
4
+ Code generation backend targeting Apache Spark (PySpark) with Delta Lake.
5
+
6
+ Generates deterministic PySpark code from an ExecutionPlan.
7
+ Template-based — no LLM involved.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from cfa.core.codegen import GeneratedCode
13
+ from cfa.core.planner import ExecutionPlan, ExecutionStep, StepType, WriteMode
14
+ from cfa.types import FaultSeverity
15
+ from cfa.validation.static import ForbiddenToken
16
+
17
+ from . import BackendAdapter, BackendCapabilities
18
+
19
# Deny-list advertised by the PySpark backend through
# ``BackendCapabilities.forbidden_tokens``.
#
# The first five rules are plain substring patterns (``is_regex`` is left at
# the ``ForbiddenToken`` default) and are all CRITICAL. The last rule is a
# regular expression, rated HIGH, that looks for an ``append`` write followed
# on the same line by ``silver`` or ``gold``.
_PYSPARK_FORBIDDEN_TOKENS: list[ForbiddenToken] = [
    *(
        ForbiddenToken(
            pattern=pattern,
            fault_code=fault_code,
            severity=FaultSeverity.CRITICAL,
            message=message,
        )
        for pattern, fault_code, message in (
            (
                ".collect()",
                "STATIC_FORBIDDEN_COLLECT",
                "collect() brings all data to driver.",
            ),
            (
                ".toPandas()",
                "STATIC_FORBIDDEN_TOPANDAS",
                "toPandas() brings all data to driver.",
            ),
            (
                "crossJoin(",
                "STATIC_FORBIDDEN_CROSSJOIN",
                "crossJoin() produces cartesian product.",
            ),
            (
                "import os",
                "STATIC_FORBIDDEN_IMPORT_OS",
                "os module import forbidden in sandboxed execution.",
            ),
            (
                "import subprocess",
                "STATIC_FORBIDDEN_IMPORT_SUBPROCESS",
                "subprocess module import forbidden in sandboxed execution.",
            ),
        )
    ),
    ForbiddenToken(
        pattern=r'\.mode\(\"append\"\).*(?:silver|gold)',
        fault_code="STATIC_APPEND_TO_PROTECTED",
        severity=FaultSeverity.HIGH,
        message="Append mode to Silver/Gold detected.",
        is_regex=True,
    ),
]
40
+
41
+
42
+ class PySparkBackend(BackendAdapter):
43
+ """Generates PySpark code from an ExecutionPlan."""
44
+
45
+ def get_capabilities(self) -> BackendCapabilities:
46
+ return BackendCapabilities(
47
+ backend_name="pyspark",
48
+ backend_version="delta-3.x",
49
+ supports_merge=True,
50
+ supports_partition_overwrite=True,
51
+ supports_anonymization=True,
52
+ supports_schema_enforcement=True,
53
+ pii_anonymization_methods=["sha256", "drop", "tokenize", "mask"],
54
+ cost_model_available=True,
55
+ max_recommended_rows=100_000_000,
56
+ supported_languages=["python", "pyspark"],
57
+ forbidden_tokens=_PYSPARK_FORBIDDEN_TOKENS,
58
+ )
59
+
60
+ def generate(self, plan: ExecutionPlan) -> GeneratedCode:
61
+ lines: list[str] = []
62
+ step_code: dict[str, str] = {}
63
+
64
+ lines.append("from pyspark.sql import SparkSession, functions as F")
65
+ lines.append("")
66
+ lines.append("spark = SparkSession.builder.getOrCreate()")
67
+ lines.append("")
68
+
69
+ ordered = plan.execution_order()
70
+ for step in ordered:
71
+ code = self._generate_step(step, plan)
72
+ step_code[step.id] = code
73
+ lines.append(f"# ── Step: {step.id} ({step.step_type.value}) ──")
74
+ lines.append(code)
75
+ lines.append("")
76
+
77
+ full_code = "\n".join(lines)
78
+
79
+ return GeneratedCode(
80
+ plan_signature_hash=plan.signature_hash,
81
+ intent_id=plan.intent_id,
82
+ language="pyspark",
83
+ code=full_code,
84
+ step_code_map=step_code,
85
+ metadata={
86
+ "write_mode": plan.write_mode.value,
87
+ "consistency_unit": plan.consistency_unit.value,
88
+ "step_count": plan.step_count,
89
+ },
90
+ )
91
+
92
+ def _generate_step(self, step: ExecutionStep, plan: ExecutionPlan) -> str:
93
+ match step.step_type:
94
+ case StepType.EXTRACT:
95
+ return self._gen_extract(step)
96
+ case StepType.ANONYMIZE:
97
+ return self._gen_anonymize(step)
98
+ case StepType.JOIN:
99
+ return self._gen_join(step, plan)
100
+ case StepType.AGGREGATE:
101
+ return self._gen_aggregate(step)
102
+ case StepType.LOAD:
103
+ return self._gen_load(step, plan)
104
+ case StepType.FILTER:
105
+ return self._gen_filter(step)
106
+ case StepType.TRANSFORM:
107
+ return self._gen_transform(step)
108
+ case _:
109
+ return f"# TODO: unsupported step type {step.step_type.value}"
110
+
111
+ def _gen_extract(self, step: ExecutionStep) -> str:
112
+ var = _var_name(step.source or "data")
113
+ lines = [f'{var} = spark.read.format("delta").load("{step.source}")']
114
+
115
+ filt = step.config.get("filter")
116
+ if filt:
117
+ col = filt["column"]
118
+ pred = filt["predicate"]
119
+ lines.append(
120
+ f'{var} = {var}.filter(F.col("{col}") {pred} F.lit("{{date_param}}"))'
121
+ )
122
+
123
+ return "\n".join(lines)
124
+
125
+ def _gen_anonymize(self, step: ExecutionStep) -> str:
126
+ var = _var_name(step.source or "data")
127
+ pii_cols = step.config.get("pii_columns", [])
128
+ strategy = step.config.get("strategy", "sha256")
129
+
130
+ lines: list[str] = []
131
+ for col in pii_cols:
132
+ if strategy == "sha256":
133
+ lines.append(
134
+ f'{var} = {var}.withColumn("{col}_hash", F.sha2(F.col("{col}").cast("string"), 256))'
135
+ )
136
+ lines.append(f'{var} = {var}.drop("{col}")')
137
+ else:
138
+ lines.append(f'{var} = {var}.drop("{col}")')
139
+
140
+ return "\n".join(lines) if lines else f"# No PII columns to anonymize in {step.source}"
141
+
142
+ def _gen_join(self, step: ExecutionStep, plan: ExecutionPlan) -> str:
143
+ datasets = step.config.get("datasets", [])
144
+ join_type = step.config.get("type", "sort_merge")
145
+ merge_keys = step.config.get("merge_keys", ["id"])
146
+ on_clause = ", ".join(f'"{k}"' for k in merge_keys)
147
+
148
+ if len(datasets) < 2:
149
+ return "# Join requires at least 2 datasets"
150
+
151
+ left_var = _var_name(datasets[0])
152
+ right_var = _var_name(datasets[1])
153
+ result_var = "df_joined"
154
+
155
+ lines: list[str] = []
156
+ if join_type == "broadcast":
157
+ lines.append("from pyspark.sql.functions import broadcast")
158
+ lines.append(
159
+ f'{result_var} = {left_var}.join(broadcast({right_var}), on=[{on_clause}], how="inner")'
160
+ )
161
+ else:
162
+ lines.append(
163
+ f'{result_var} = {left_var}.join({right_var}, on=[{on_clause}], how="inner")'
164
+ )
165
+
166
+ return "\n".join(lines)
167
+
168
+ def _gen_aggregate(self, step: ExecutionStep) -> str:
169
+ group_by = step.config.get("group_by", [])
170
+ if not group_by:
171
+ return "df_agg = df_joined.groupBy().count() # WARNING: no group_by specified"
172
+
173
+ cols = ", ".join(f'"{c}"' for c in group_by)
174
+ return f"df_agg = df_joined.groupBy({cols}).agg(F.count(F.lit(1)).alias(\"count\"))"
175
+
176
+ def _gen_load(self, step: ExecutionStep, plan: ExecutionPlan) -> str:
177
+ target = step.target or "target"
178
+ source_var = self._resolve_source_var(step, plan)
179
+ write_mode = step.config.get("write_mode", plan.write_mode.value)
180
+ partition_by = step.config.get("partition_by", [])
181
+ merge_keys = step.config.get("merge_keys", ["id"])
182
+ merge_on = " AND ".join(f't.{k} = s.{k}' for k in merge_keys)
183
+
184
+ lines: list[str] = []
185
+
186
+ if write_mode == WriteMode.MERGE.value:
187
+ lines.append("from delta.tables import DeltaTable")
188
+ lines.append("")
189
+ lines.append(f'if DeltaTable.isDeltaTable(spark, "{target}"):')
190
+ lines.append(f' target_table = DeltaTable.forPath(spark, "{target}")')
191
+ lines.append(' target_table.alias("t").merge(')
192
+ lines.append(f' {source_var}.alias("s"),')
193
+ lines.append(f' "{merge_on}"')
194
+ lines.append(" ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()")
195
+ lines.append("else:")
196
+
197
+ writer = f' {source_var}.write.format("delta")'
198
+ if partition_by:
199
+ cols = ", ".join(f'"{c}"' for c in partition_by)
200
+ writer += f".partitionBy({cols})"
201
+ writer += f'.mode("overwrite").save("{target}")'
202
+ lines.append(writer)
203
+
204
+ elif write_mode == WriteMode.OVERWRITE_PARTITION.value:
205
+ writer = f'{source_var}.write.format("delta")'
206
+ if partition_by:
207
+ cols = ", ".join(f'"{c}"' for c in partition_by)
208
+ writer += f".partitionBy({cols})"
209
+ writer += f'.mode("overwrite").option("replaceWhere", "{{partition_predicate}}").save("{target}")'
210
+ lines.append(writer)
211
+
212
+ else:
213
+ writer = f'{source_var}.write.format("delta").mode("append").save("{target}")'
214
+ lines.append(writer)
215
+
216
+ return "\n".join(lines)
217
+
218
+ def _gen_filter(self, step: ExecutionStep) -> str:
219
+ var = _var_name(step.source or "data")
220
+ condition = step.config.get("condition", "1=1")
221
+ return f'{var} = {var}.filter("{condition}")'
222
+
223
+ def _gen_transform(self, step: ExecutionStep) -> str:
224
+ return f"# Transform step: {step.config}"
225
+
226
+ def _resolve_source_var(self, step: ExecutionStep, plan: ExecutionPlan) -> str:
227
+ if step.depends_on:
228
+ dep = step.depends_on[0]
229
+ if "join" in dep:
230
+ return "df_joined"
231
+ if "aggregate" in dep or "agg" in dep:
232
+ return "df_agg"
233
+ dep_step = plan.get_step(dep)
234
+ if dep_step and dep_step.source:
235
+ return _var_name(dep_step.source)
236
+ return "df"
237
+
238
+
239
+ def _var_name(name: str) -> str:
240
+ return f"df_{name.replace('-', '_').replace('.', '_')}"
cfa/backends/sql.py ADDED
@@ -0,0 +1,270 @@
1
+ """
2
+ SQL Backend
3
+ ============
4
+ Code generation backend targeting standard SQL.
5
+
6
+ Generates governed SQL from an ExecutionPlan. The output is dialect-agnostic
7
+ SQL that runs on Snowflake, BigQuery, Postgres, DuckDB, and similar engines.
8
+
9
+ Template-based — no LLM involved. Every SQL statement is traceable back to
10
+ the execution step that produced it.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from cfa.core.codegen import GeneratedCode
16
+ from cfa.core.planner import ExecutionPlan, ExecutionStep, StepType, WriteMode
17
+ from cfa.types import FaultSeverity
18
+ from cfa.validation.static import ForbiddenToken
19
+
20
+ from . import BackendAdapter, BackendCapabilities
21
+
22
# Deny-list advertised by the SQL backend through
# ``BackendCapabilities.forbidden_tokens``. Every rule is a regular
# expression; each row below is (pattern, fault code, severity, message).
_SQL_FORBIDDEN_TOKENS: list[ForbiddenToken] = [
    ForbiddenToken(
        pattern=pattern,
        fault_code=fault_code,
        severity=severity,
        message=message,
        is_regex=True,
    )
    for pattern, fault_code, severity, message in (
        (
            r"\bDROP\s+TABLE\b",
            "STATIC_SQL_DROP_TABLE",
            FaultSeverity.CRITICAL,
            "DROP TABLE in governed SQL forbidden.",
        ),
        (
            r"\bDROP\s+DATABASE\b",
            "STATIC_SQL_DROP_DATABASE",
            FaultSeverity.CRITICAL,
            "DROP DATABASE in governed SQL forbidden.",
        ),
        (
            r"\bTRUNCATE\b",
            "STATIC_SQL_TRUNCATE",
            FaultSeverity.CRITICAL,
            "TRUNCATE forbidden — use MERGE or INSERT OVERWRITE.",
        ),
        (
            # Negative look-ahead: DELETE FROM with no WHERE later on the line.
            r"\bDELETE\s+FROM\b(?!.*WHERE)",
            "STATIC_SQL_DELETE_WITHOUT_WHERE",
            FaultSeverity.CRITICAL,
            "DELETE FROM without WHERE forbidden.",
        ),
        (
            r"\bALTER\s+TABLE\b",
            "STATIC_SQL_ALTER_TABLE",
            FaultSeverity.HIGH,
            "ALTER TABLE requires explicit approval.",
        ),
    )
]
39
+
40
+
41
+ class SqlBackend(BackendAdapter):
42
+ """Generates governed SQL from an ExecutionPlan."""
43
+
44
+ def get_capabilities(self) -> BackendCapabilities:
45
+ return BackendCapabilities(
46
+ backend_name="sql",
47
+ backend_version="ansi-sql-2023",
48
+ supports_merge=True,
49
+ supports_partition_overwrite=True,
50
+ supports_anonymization=True,
51
+ supports_schema_enforcement=True,
52
+ pii_anonymization_methods=["sha256", "drop", "md5", "tokenize"],
53
+ cost_model_available=False,
54
+ max_recommended_rows=1_000_000_000,
55
+ supported_languages=["sql"],
56
+ forbidden_tokens=_SQL_FORBIDDEN_TOKENS,
57
+ )
58
+
59
+ def generate(self, plan: ExecutionPlan) -> GeneratedCode:
60
+ lines: list[str] = []
61
+ step_code: dict[str, str] = {}
62
+ ordered = plan.execution_order()
63
+
64
+ for step in ordered:
65
+ code = self._generate_step(step, plan)
66
+ step_code[step.id] = code
67
+ lines.append(f"-- Step: {step.id} ({step.step_type.value})")
68
+ lines.append(code)
69
+ lines.append("")
70
+
71
+ full_code = "\n".join(lines)
72
+
73
+ return GeneratedCode(
74
+ plan_signature_hash=plan.signature_hash,
75
+ intent_id=plan.intent_id,
76
+ language="sql",
77
+ code=full_code,
78
+ step_code_map=step_code,
79
+ metadata={
80
+ "write_mode": plan.write_mode.value,
81
+ "consistency_unit": plan.consistency_unit.value,
82
+ "step_count": plan.step_count,
83
+ },
84
+ )
85
+
86
+ def _generate_step(self, step: ExecutionStep, plan: ExecutionPlan) -> str:
87
+ match step.step_type:
88
+ case StepType.EXTRACT:
89
+ return self._gen_extract(step)
90
+ case StepType.ANONYMIZE:
91
+ return self._gen_anonymize(step)
92
+ case StepType.JOIN:
93
+ return self._gen_join(step, plan)
94
+ case StepType.AGGREGATE:
95
+ return self._gen_aggregate(step)
96
+ case StepType.LOAD:
97
+ return self._gen_load(step, plan)
98
+ case StepType.FILTER:
99
+ return self._gen_filter(step)
100
+ case StepType.TRANSFORM:
101
+ return self._gen_transform(step)
102
+ case _:
103
+ return f"-- TODO: unsupported step type {step.step_type.value}"
104
+
105
+ # ── Step generators ───────────────────────────────────────────────────
106
+
107
+ def _gen_extract(self, step: ExecutionStep) -> str:
108
+ source = step.source or "unknown_source"
109
+ columns = self._resolve_extract_columns(step)
110
+ lines = [f"-- EXTRACT: {source}"]
111
+ lines.append(f"SELECT {columns} FROM {_quote_ident(source)}")
112
+
113
+ filt = step.config.get("filter")
114
+ if filt:
115
+ col = _quote_ident(filt["column"])
116
+ pred = filt["predicate"]
117
+ lines.append(f"WHERE {col} {pred} '{{date_param}}'")
118
+
119
+ return "\n".join(lines)
120
+
121
+ def _gen_anonymize(self, step: ExecutionStep) -> str:
122
+ source = f"cte_{step.source}" if step.source else "source_cte"
123
+ pii_cols = step.config.get("pii_columns", [])
124
+ strategy = step.config.get("strategy", "sha256")
125
+
126
+ if not pii_cols:
127
+ return f"-- No PII columns to anonymize for {source}"
128
+
129
+ lines: list[str] = [f"-- ANONYMIZE: {source} (strategy={strategy})"]
130
+ for col in pii_cols:
131
+ safe = _quote_ident(col)
132
+ if strategy == "sha256":
133
+ lines.append(f"-- {safe} → SHA256({safe})")
134
+ elif strategy == "drop":
135
+ lines.append(f"-- {safe} → DROPPED")
136
+ elif strategy == "md5":
137
+ lines.append(f"-- {safe} → MD5({safe})")
138
+ else:
139
+ lines.append(f"-- {safe} → anonymized ({strategy})")
140
+ return "\n".join(lines)
141
+
142
+ def _gen_join(self, step: ExecutionStep, plan: ExecutionPlan) -> str:
143
+ datasets = step.config.get("datasets", [])
144
+ merge_keys = step.config.get("merge_keys", ["id"])
145
+ join_type = step.config.get("type", "sort_merge")
146
+
147
+ if len(datasets) < 2:
148
+ return "-- Join requires at least 2 datasets"
149
+
150
+ left_alias = _cte_name(datasets[0])
151
+ right_alias = _cte_name(datasets[1])
152
+ on_clause = " AND ".join(
153
+ f"{left_alias}.{_quote_ident(k)} = {right_alias}.{_quote_ident(k)}"
154
+ for k in merge_keys
155
+ )
156
+
157
+ lines: list[str] = [f"-- JOIN: {datasets[0]} + {datasets[1]}"]
158
+ hint = "/*+ BROADCAST */ " if join_type == "broadcast" else ""
159
+ lines.append(
160
+ f"SELECT {left_alias}.*, {right_alias}.*"
161
+ )
162
+ lines.append(f"FROM {_quote_ident(datasets[0])} {left_alias}")
163
+ lines.append(f"{hint}INNER JOIN {_quote_ident(datasets[1])} {right_alias}")
164
+ lines.append(f" ON {on_clause}")
165
+
166
+ return "\n".join(lines)
167
+
168
+ def _gen_aggregate(self, step: ExecutionStep) -> str:
169
+ group_by = step.config.get("group_by", [])
170
+
171
+ lines: list[str] = ["-- AGGREGATE"]
172
+ if not group_by:
173
+ lines.append("SELECT COUNT(*) AS cnt FROM joined_cte")
174
+ else:
175
+ cols = ", ".join(_quote_ident(c) for c in group_by)
176
+ lines.append(f"SELECT {cols}, COUNT(*) AS cnt")
177
+ lines.append("FROM joined_cte")
178
+ lines.append(f"GROUP BY {cols}")
179
+
180
+ return "\n".join(lines)
181
+
182
+ def _gen_load(self, step: ExecutionStep, plan: ExecutionPlan) -> str:
183
+ target = step.target or "target_table"
184
+ write_mode = step.config.get("write_mode", plan.write_mode.value)
185
+ partition_by = step.config.get("partition_by", [])
186
+ merge_keys = step.config.get("merge_keys", ["id"])
187
+ source_cte = self._resolve_source_cte(step, plan)
188
+
189
+ lines: list[str] = [f"-- LOAD: {target} (mode={write_mode})"]
190
+
191
+ if write_mode == WriteMode.MERGE.value:
192
+ merge_on = " AND ".join(
193
+ f"target.{_quote_ident(k)} = source.{_quote_ident(k)}"
194
+ for k in merge_keys
195
+ )
196
+ set_clause = ", ".join(
197
+ f"{_quote_ident(k)} = source.{_quote_ident(k)}"
198
+ for k in merge_keys
199
+ )
200
+ lines.append(f"MERGE INTO {_quote_ident(target)} AS target")
201
+ lines.append(f"USING ({source_cte}) AS source")
202
+ lines.append(f" ON {merge_on}")
203
+ lines.append("WHEN MATCHED THEN")
204
+ lines.append(f" UPDATE SET {set_clause}")
205
+ lines.append("WHEN NOT MATCHED THEN")
206
+ lines.append(" INSERT (*)")
207
+
208
+ elif write_mode == WriteMode.OVERWRITE_PARTITION.value:
209
+ partition_clause = ""
210
+ if partition_by:
211
+ parts = ", ".join(_quote_ident(p) for p in partition_by)
212
+ partition_clause = f" PARTITION ({parts})"
213
+ lines.append(f"INSERT OVERWRITE {_quote_ident(target)}{partition_clause}")
214
+ lines.append(source_cte)
215
+
216
+ elif write_mode == WriteMode.APPEND.value:
217
+ lines.append(f"INSERT INTO {_quote_ident(target)}")
218
+ lines.append(source_cte)
219
+
220
+ else:
221
+ lines.append(f"-- Unsupported write mode: {write_mode}")
222
+
223
+ return "\n".join(lines)
224
+
225
+ def _gen_filter(self, step: ExecutionStep) -> str:
226
+ condition = step.config.get("condition", "1=1")
227
+ return f"-- FILTER: WHERE {condition}"
228
+
229
+ def _gen_transform(self, step: ExecutionStep) -> str:
230
+ return f"-- TRANSFORM: {step.config}"
231
+
232
+ # ── Helpers ────────────────────────────────────────────────────────────
233
+
234
+ def _resolve_extract_columns(self, step: ExecutionStep) -> str:
235
+ target_columns = step.config.get("target_columns")
236
+ if target_columns and isinstance(target_columns, list):
237
+ return ", ".join(_quote_ident(c) for c in target_columns)
238
+ return "*"
239
+
240
+ def _resolve_source_cte(self, step: ExecutionStep, plan: ExecutionPlan) -> str:
241
+ if step.depends_on:
242
+ dep = step.depends_on[0]
243
+ if "join" in dep:
244
+ return "joined_cte"
245
+ if "aggregate" in dep or "agg" in dep:
246
+ return "aggregated_cte"
247
+ dep_step = plan.get_step(dep)
248
+ if dep_step and dep_step.source:
249
+ return f"SELECT * FROM {_quote_ident(dep_step.source)}"
250
+ return "source_cte"
251
+
252
+
253
+ # ── SQL helpers ──────────────────────────────────────────────────────────────
254
+
255
+
256
+ def _quote_ident(name: str) -> str:
257
+ """Quote a SQL identifier if it contains special characters or is a reserved word."""
258
+ sanitized = str(name).replace('"', '""')
259
+ return f'"{sanitized}"'
260
+
261
+
262
+ def _cte_name(source: str) -> str:
263
+ """Generate a short CTE alias from a dataset name."""
264
+ clean = source.replace("-", "_").replace(".", "_").lower()
265
+ if len(clean) <= 8:
266
+ return clean
267
+ parts = clean.split("_")
268
+ if len(parts) >= 2:
269
+ return parts[0][:4] + "_" + parts[-1][:4]
270
+ return clean[:8]
@@ -0,0 +1,49 @@
1
+ """
2
+ CFA Behavior — specification-driven governance
3
+ ===============================================
4
+ Bridge between human-written governance intent and executable policy rules.
5
+
6
+ Usage:
7
+ from cfa.behavior import BehaviorSpec, Systematizer
8
+
9
+ spec = BehaviorSpec.from_yaml("fiscal_governance.yaml")
10
+ taxonomy, rules = Systematizer().systematize(spec)
11
+
12
+ from cfa import KernelOrchestrator
13
+ kernel = KernelOrchestrator(policy_rules=rules)
14
+ result = kernel.process("agregar vendas com PII")
15
+
16
+ # Generate test intents
17
+ intents = taxonomy.generate_test_intents(5)
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ from .spec import (
23
+ BehaviorCategory,
24
+ BehaviorSpec,
25
+ BehaviorTaxonomy,
26
+ ConditionType,
27
+ )
28
+ from .systematizer import Systematizer
29
+
30
# Optional LLM backend: when importing ``.llm`` raises ImportError, the three
# LLM names are still bound (to None) so they remain importable from this
# package, and ``_HAS_LLM`` records whether the real classes are available.
try:
    from .llm import LLMSystematizer, LLMSystematizerBackend, OpenAISystematizerBackend
except ImportError:
    _HAS_LLM = False
    LLMSystematizerBackend = None  # type: ignore
    OpenAISystematizerBackend = None  # type: ignore
    LLMSystematizer = None  # type: ignore
else:
    _HAS_LLM = True

# Public API of the package; the LLM names are listed whether or not the
# optional import above succeeded.
__all__ = [
    "BehaviorSpec",
    "BehaviorCategory",
    "BehaviorTaxonomy",
    "ConditionType",
    "Systematizer",
    "LLMSystematizerBackend",
    "OpenAISystematizerBackend",
    "LLMSystematizer",
]