cfa-kernel 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. cfa/__init__.py +39 -0
  2. cfa/_lazy.py +39 -0
  3. cfa/adapters/__init__.py +104 -0
  4. cfa/adapters/autogen.py +19 -0
  5. cfa/adapters/crewai.py +19 -0
  6. cfa/adapters/dspy.py +19 -0
  7. cfa/adapters/langgraph.py +19 -0
  8. cfa/adapters/openai_agents.py +19 -0
  9. cfa/audit/__init__.py +15 -0
  10. cfa/audit/context.py +205 -0
  11. cfa/audit/hashing.py +41 -0
  12. cfa/audit/trail.py +194 -0
  13. cfa/backends/__init__.py +132 -0
  14. cfa/backends/dbt.py +338 -0
  15. cfa/backends/pyspark.py +240 -0
  16. cfa/backends/sql.py +270 -0
  17. cfa/behavior/__init__.py +49 -0
  18. cfa/behavior/llm.py +244 -0
  19. cfa/behavior/spec.py +235 -0
  20. cfa/behavior/systematizer.py +222 -0
  21. cfa/cli/__init__.py +296 -0
  22. cfa/cli/__main__.py +6 -0
  23. cfa/cli/_helpers.py +109 -0
  24. cfa/cli/core/__init__.py +0 -0
  25. cfa/cli/core/evaluate.py +72 -0
  26. cfa/cli/core/validate.py +29 -0
  27. cfa/cli/formatters.py +280 -0
  28. cfa/cli/governance/__init__.py +0 -0
  29. cfa/cli/governance/audit.py +65 -0
  30. cfa/cli/governance/catalog.py +28 -0
  31. cfa/cli/governance/policy.py +119 -0
  32. cfa/cli/governance/rules.py +42 -0
  33. cfa/cli/governance/signature.py +31 -0
  34. cfa/cli/infrastructure/__init__.py +0 -0
  35. cfa/cli/infrastructure/backend_list.py +24 -0
  36. cfa/cli/infrastructure/storage.py +87 -0
  37. cfa/cli/project/__init__.py +0 -0
  38. cfa/cli/project/init.py +73 -0
  39. cfa/cli/project/lifecycle.py +92 -0
  40. cfa/cli/project/status.py +75 -0
  41. cfa/cli/project/taxonomy.py +38 -0
  42. cfa/cli/reporting/__init__.py +0 -0
  43. cfa/cli/reporting/report.py +109 -0
  44. cfa/cli/reporting/serve.py +43 -0
  45. cfa/config.py +103 -0
  46. cfa/core/__init__.py +19 -0
  47. cfa/core/codegen.py +65 -0
  48. cfa/core/conditions.py +129 -0
  49. cfa/core/kernel.py +224 -0
  50. cfa/core/phases/__init__.py +0 -0
  51. cfa/core/phases/runner.py +477 -0
  52. cfa/core/planner.py +290 -0
  53. cfa/execution/__init__.py +12 -0
  54. cfa/execution/partial.py +339 -0
  55. cfa/execution/state_projection.py +216 -0
  56. cfa/governance/__init__.py +76 -0
  57. cfa/lifecycle/__init__.py +51 -0
  58. cfa/mcp/__init__.py +347 -0
  59. cfa/mcp/__main__.py +4 -0
  60. cfa/normalizer/__init__.py +15 -0
  61. cfa/normalizer/base.py +441 -0
  62. cfa/normalizer/llm.py +426 -0
  63. cfa/observability/__init__.py +14 -0
  64. cfa/observability/indices.py +177 -0
  65. cfa/observability/metrics.py +91 -0
  66. cfa/observability/notify.py +79 -0
  67. cfa/observability/otel.py +81 -0
  68. cfa/observability/promotion.py +367 -0
  69. cfa/policy/__init__.py +12 -0
  70. cfa/policy/bundle.py +317 -0
  71. cfa/policy/catalog.py +117 -0
  72. cfa/policy/engine.py +306 -0
  73. cfa/reporting/__init__.py +42 -0
  74. cfa/reporting/charts.py +223 -0
  75. cfa/reporting/engine.py +456 -0
  76. cfa/resolution/__init__.py +62 -0
  77. cfa/runtime/__init__.py +13 -0
  78. cfa/runtime/gate.py +287 -0
  79. cfa/sandbox/__init__.py +189 -0
  80. cfa/sandbox/executor.py +92 -0
  81. cfa/sandbox/mock.py +89 -0
  82. cfa/sandbox/panic.py +52 -0
  83. cfa/storage/__init__.py +591 -0
  84. cfa/testing/__init__.py +60 -0
  85. cfa/testing/asserts.py +77 -0
  86. cfa/testing/evaluate.py +168 -0
  87. cfa/testing/fixtures.py +89 -0
  88. cfa/testing/markers.py +36 -0
  89. cfa/types.py +489 -0
  90. cfa/validation/__init__.py +14 -0
  91. cfa/validation/runtime.py +285 -0
  92. cfa/validation/signature.py +146 -0
  93. cfa/validation/static.py +252 -0
  94. cfa_kernel-0.1.0.dist-info/METADATA +32 -0
  95. cfa_kernel-0.1.0.dist-info/RECORD +98 -0
  96. cfa_kernel-0.1.0.dist-info/WHEEL +4 -0
  97. cfa_kernel-0.1.0.dist-info/entry_points.txt +3 -0
  98. cfa_kernel-0.1.0.dist-info/licenses/LICENSE +21 -0
cfa/behavior/llm.py ADDED
@@ -0,0 +1,244 @@
1
+ """
2
+ CFA LLM Systematizer
3
+ ====================
4
+ Optional LLM-backed plugin for behavior specification.
5
+
6
+ Transforms natural-language governance descriptions into BehaviorSpecs
7
+ that feed the deterministic Systematizer. The LLM is used only for the
8
+ "understanding" step — all rules are still generated deterministically.
9
+
10
+ Usage:
11
+ from cfa.behavior.llm import OpenAISystematizerBackend
12
+ from cfa.behavior import Systematizer
13
+
14
+ backend = OpenAISystematizerBackend(model="gpt-4o-mini")
15
+ taxonomy, rules = Systematizer().systematize_from_nl(
16
+ "Pipeline must protect PII, enforce merge keys, and stay within budget.",
17
+ backend=backend,
18
+ )
19
+
20
+ Architecture:
21
+ NL description → LLM → BehaviorSpec (JSON) → Systematizer → (Taxonomy, Rules)
22
+ ↑ optional ↑ deterministic
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import json
28
+ from abc import ABC, abstractmethod
29
+ from typing import Any
30
+
31
+ from .spec import BehaviorSpec
32
+
33
+ _SYSTEMATIZER_SYSTEM_PROMPT = """\
34
+ You are a data governance specification expert. Given a description of a data \
35
+ pipeline's requirements, constraints, and context, produce a structured \
36
+ behavior specification in JSON format.
37
+
38
+ For each potential failure mode, classify it using ONE of these condition types:
39
+ {pii_in_protected_layer, missing_merge_key, missing_partition, schema_mismatch,
40
+ cost_budget_exceeded, sensitive_without_partition, enforce_types_disabled,
41
+ pii_without_policy, unauthorized_gold_write, custom}
42
+
43
+ Condition type meanings:
44
+ - pii_in_protected_layer: PII exposed in Silver/Gold without anonymization
45
+ - missing_merge_key: Write to Silver/Gold without merge_key enforcement
46
+ - missing_partition: High-volume or sensitive dataset processed without partition filter
47
+ - schema_mismatch: Output schema differs from contract
48
+ - cost_budget_exceeded: Estimated cost exceeds configured ceiling
49
+ - sensitive_without_partition: Sensitive dataset without partition declaration
50
+ - enforce_types_disabled: Type enforcement disabled on protected layer write
51
+ - pii_without_policy: PII present without no_pii_raw constraint
52
+ - unauthorized_gold_write: Unauthorized write to Gold layer
53
+ - custom: Any other custom governance condition
54
+
55
+ Output ONLY valid JSON. No markdown fences, no explanation outside the JSON.
56
+
57
+ JSON schema:
58
+ {
59
+ "behavior": {
60
+ "name": "<snake_case_name>",
61
+ "description": "<markdown_description_of_governance_rules>",
62
+ "failure_modes": [
63
+ {
64
+ "code": "<unique_snake_case_code>",
65
+ "label": "<Short human-readable label>",
66
+ "description": "<When this failure occurs and why it matters>",
67
+ "condition": "<condition_type>",
68
+ "severity": "<critical|high|medium|warning|info>",
69
+ "action": "<replan|block>",
70
+ "target_layer": "<bronze|silver|gold>",
71
+ "remediation": ["<actionable step 1>", "<actionable step 2>"]
72
+ }
73
+ ]
74
+ }
75
+ }
76
+
77
+ Rules:
78
+ - Generate at least 2 failure modes covering the most important constraints.
79
+ - Use "action": "replan" for automatically fixable issues, "action": "block" for
80
+ issues that require human review (e.g., PII in Gold without anonymization).
81
+ - Severity: "critical" for PII/security, "high" for data quality, "medium" for
82
+ cost/performance, "warning" for informational.
83
+ - Remediation steps must be actionable and specific.
84
+ """
85
+
86
+ _SYSTEMATIZER_USER_TEMPLATE = """\
87
+ Pipeline description:
88
+ {description}
89
+
90
+ Context:
91
+ {context}
92
+ """
93
+
94
+
95
class LLMSystematizerBackend(ABC):
    """Abstract interface to an LLM used for behavior specification.

    Subclass this and implement :meth:`complete` to plug in any provider
    (OpenAI, Anthropic, Azure, a local model, ...).
    """

    @abstractmethod
    def complete(self, system_prompt: str, user_message: str) -> str:
        """Return the completion text for the given system and user prompts."""
105
+
106
+
107
class OpenAISystematizerBackend(LLMSystematizerBackend):
    """OpenAI-compatible backend for NL → BehaviorSpec.

    Requires: pip install openai

    Args:
        model: Model name (default: gpt-4o-mini).
        temperature: Sampling temperature (default: 0.0).
        api_key: API key passed to the client. When falsy it is not passed,
            leaving key discovery to the ``openai`` client itself.
        base_url: Custom API base URL (for Azure, local models, etc.).
            When falsy it is not passed to the client.
        max_tokens: Maximum completion tokens.
    """

    def __init__(
        self,
        model: str = "gpt-4o-mini",
        temperature: float = 0.0,
        api_key: str | None = None,
        base_url: str | None = None,
        max_tokens: int = 2048,
    ) -> None:
        self.model = model
        self.temperature = temperature
        self.api_key = api_key
        self.base_url = base_url
        self.max_tokens = max_tokens

    def complete(self, system_prompt: str, user_message: str) -> str:
        """Send both prompts as one chat completion and return the reply text.

        Returns:
            The content of the first choice, or ``""`` when the response has
            no choices or the message content is empty/None.

        Raises:
            ImportError: If the ``openai`` package is not installed.
        """
        # Imported lazily so the rest of the package works without openai.
        try:
            from openai import OpenAI
        except ImportError as err:
            raise ImportError(
                "openai package is required for OpenAISystematizerBackend. "
                "Install it with: pip install openai"
            ) from err

        # Only forward the options that were actually configured.
        client_kwargs: dict[str, Any] = {}
        if self.api_key:
            client_kwargs["api_key"] = self.api_key
        if self.base_url:
            client_kwargs["base_url"] = self.base_url

        client = OpenAI(**client_kwargs)
        response = client.chat.completions.create(
            model=self.model,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message},
            ],
        )

        # Guard against an empty choices list instead of raising IndexError;
        # an empty string is the "no content" signal for callers.
        choices = response.choices
        if not choices:
            return ""
        return choices[0].message.content or ""
161
+
162
+
163
class LLMSystematizer:
    """Transforms NL descriptions into BehaviorSpecs via an LLM backend.

    This is the "Phase 6" plugin — adds NL understanding on top of the
    deterministic Systematizer. A backend is required: it is the only
    source of the specification text.

    Usage:
        backend = OpenAISystematizerBackend()
        spec = LLMSystematizer().systematize_nl(
            "Pipeline must protect PII and enforce merge keys.",
            backend=backend,
        )
        # spec is a BehaviorSpec ready for Systematizer
    """

    def systematize_nl(
        self,
        description: str,
        *,
        backend: LLMSystematizerBackend,
        context: str = "",
    ) -> BehaviorSpec:
        """Transform a natural language description into a BehaviorSpec.

        Args:
            description: Natural language description of governance requirements.
            backend: LLM backend implementation.
            context: Optional context about the target system.

        Returns:
            A BehaviorSpec built from the parsed LLM response.

        Raises:
            ValueError: If the LLM response is empty or cannot be parsed.
        """
        user_message = _SYSTEMATIZER_USER_TEMPLATE.format(
            description=description,
            context=context or "No additional context provided.",
        )

        raw = backend.complete(_SYSTEMATIZER_SYSTEM_PROMPT, user_message)

        # ``not raw`` also covers a backend that returns None.
        if not raw or not raw.strip():
            raise ValueError("LLM returned empty response.")

        return BehaviorSpec.from_dict(self._parse_llm_response(raw))

    def _parse_llm_response(self, raw: str) -> dict[str, Any]:
        """Extract the JSON object carrying a ``"behavior"`` key from *raw*.

        Tolerates a surrounding markdown code fence and leading/trailing
        prose around the JSON object.

        Raises:
            ValueError: If no JSON can be decoded, the decoded value is not
                an object, or the ``"behavior"`` entry is missing or is not
                an object.
        """
        text = raw.strip()

        # Remove markdown code fences if present.
        if text.startswith("```"):
            lines = text.split("\n")[1:]
            if lines and lines[-1].strip() == "```":
                lines = lines[:-1]
            text = "\n".join(lines)

        try:
            data = json.loads(text)
        except json.JSONDecodeError as err:
            # Fall back to the outermost {...} span embedded in the text.
            start = text.find("{")
            end = text.rfind("}")
            if start < 0 or end <= start:
                raise ValueError(
                    f"LLM response does not contain JSON. Raw response:\n{text[:500]}"
                ) from err
            try:
                data = json.loads(text[start : end + 1])
            except json.JSONDecodeError as inner:
                raise ValueError(
                    f"LLM response is not valid JSON. Raw response:\n{text[:500]}"
                ) from inner

        # Valid JSON that is not an object (list, string, number, ...) has
        # neither a usable ``in`` test nor ``.keys()``; reject it explicitly.
        if not isinstance(data, dict):
            raise ValueError(
                f"LLM response must be a JSON object, got {type(data).__name__}."
            )

        if "behavior" not in data:
            raise ValueError(
                f"LLM response missing 'behavior' key. Got keys: {list(data.keys())}"
            )

        if not isinstance(data["behavior"], dict):
            raise ValueError(
                "LLM response 'behavior' must be a JSON object, got "
                f"{type(data['behavior']).__name__}."
            )

        return data
cfa/behavior/spec.py ADDED
@@ -0,0 +1,235 @@
1
+ """
2
+ CFA Behavior Spec
3
+ =================
4
+ Structured specification of allowed and prohibited behaviors.
5
+
6
+ A BehaviorSpec bridges the gap between human-written governance policies
7
+ (in natural language or YAML) and executable CFA policy rules.
8
+
9
+ Inspired by ASSERT's systematization: BehaviorSpec → BehaviorTaxonomy → PolicyRules.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from dataclasses import dataclass, field
15
+ from enum import StrEnum
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+
20
+ class ConditionType(StrEnum):
21
+ """Condition types that map to CFA constraint checks."""
22
+
23
+ PII_IN_PROTECTED_LAYER = "pii_in_protected_layer"
24
+ MISSING_MERGE_KEY = "missing_merge_key"
25
+ SCHEMA_MISMATCH = "schema_mismatch"
26
+ SHUFFLE_BUDGET_EXCEEDED = "shuffle_budget_exceeded"
27
+ MISSING_PARTITION = "missing_partition"
28
+ COST_BUDGET_EXCEEDED = "cost_budget_exceeded"
29
+ UNAUTHORIZED_GOLD_WRITE = "unauthorized_gold_write"
30
+ ENFORCE_TYPES_DISABLED = "enforce_types_disabled"
31
+ PII_WITHOUT_POLICY = "pii_without_policy"
32
+ SENSITIVE_WITHOUT_PARTITION = "sensitive_without_partition"
33
+ CUSTOM = "custom"
34
+
35
+
36
@dataclass
class BehaviorCategory:
    """One entry of a behavior taxonomy.

    Only ``code``, ``label`` and ``description`` are required; every other
    field has a default.
    """

    # Unique identifier, e.g. "raw_pii_exposure".
    code: str
    # Human-readable label, e.g. "Raw PII in Silver/Gold".
    label: str
    # Detailed description of the behavior.
    description: str
    # True if this behavior is permitted, False if prohibited.
    allowed: bool = True
    # The ConditionType associated with this behavior.
    condition_type: ConditionType = ConditionType.CUSTOM
    # Fault severity name used when this behavior is detected.
    severity: str = "high"
    # Ordered list of remediation actions.
    remediation: list[str] = field(default_factory=list)
    # Custom key-value pairs for condition refinements.
    metadata: dict[str, Any] = field(default_factory=dict)
59
+
60
+
61
@dataclass
class BehaviorTaxonomy:
    """Named collection of behavior categories for one governance domain.

    Categories are kept in two lists: ``allowed`` (permissible) and
    ``not_allowed`` (prohibited).
    """

    name: str
    description: str = ""
    context: str = ""

    allowed: list[BehaviorCategory] = field(default_factory=list)
    not_allowed: list[BehaviorCategory] = field(default_factory=list)

    spec_version: str = "v1.0"
    source_yaml: str = ""

    @property
    def categories(self) -> list[BehaviorCategory]:
        """All categories: allowed first, then not_allowed."""
        return self.allowed + self.not_allowed

    @property
    def category_count(self) -> int:
        """Total number of categories across both lists."""
        return len(self.categories)

    def generate_test_intents(self, count: int = 3) -> list[str]:
        """Build test intent strings for every prohibited category.

        At most three intents are produced per category, regardless of
        *count*. Each intent ends with ``#<code>#<index>``.
        """
        templates = {
            ConditionType.PII_IN_PROTECTED_LAYER: (
                "Join {datasets} with PII columns and persist to {layer}"
            ),
            ConditionType.MISSING_MERGE_KEY: (
                "Write {datasets} directly to {layer} without merge key"
            ),
            ConditionType.MISSING_PARTITION: (
                "Scan full {datasets} without partition filter"
            ),
            ConditionType.SCHEMA_MISMATCH: (
                "Write {datasets} to {layer} with modified schema"
            ),
            ConditionType.SHUFFLE_BUDGET_EXCEEDED: (
                "Join massive {datasets} with cross join"
            ),
            ConditionType.COST_BUDGET_EXCEEDED: (
                "Process full {datasets} without budget limit"
            ),
        }
        # Used for condition types that have no dedicated template.
        fallback = "Process {datasets} in {layer} layer"
        indices = range(min(count, 3))

        return [
            templates.get(cat.condition_type, fallback).format(
                datasets=cat.code.replace("_", " "),
                layer=cat.metadata.get("target_layer", "Silver"),
            )
            + f" #{cat.code}#{idx}"
            for cat in self.not_allowed
            for idx in indices
        ]

    def to_dict(self) -> dict[str, Any]:
        """Serialize the taxonomy to plain dicts and lists.

        Prohibited categories additionally carry ``severity`` and
        ``remediation``; ``source_yaml`` and category metadata are not
        included.
        """

        def summary(cat: BehaviorCategory) -> dict[str, Any]:
            # Fields shared by allowed and prohibited entries.
            return {
                "code": cat.code,
                "label": cat.label,
                "description": cat.description,
                "condition_type": cat.condition_type.value,
            }

        permitted = [summary(cat) for cat in self.allowed]
        prohibited = [
            {**summary(cat), "severity": cat.severity, "remediation": cat.remediation}
            for cat in self.not_allowed
        ]
        return {
            "name": self.name,
            "description": self.description,
            "context": self.context,
            "allowed": permitted,
            "not_allowed": prohibited,
            "spec_version": self.spec_version,
        }
155
+
156
+
157
@dataclass
class BehaviorSpec:
    """Top-level behavior specification, typically loaded from YAML.

    Schema:
        behavior:
          name: fiscal_reconciliation
          description: |
            # Fiscal Data Reconciliation Governance
            ...
          failure_modes:
            - code: raw_pii_exposure
              ...
        context: |
          Target is a PySpark ETL pipeline...
        generate:
          taxonomy: true
          test_cases: true

    ``generate`` may also be nested under a top-level ``pipeline`` section,
    next to ``pipeline.policy.auto_generate_rules``; the nested form wins
    when both are present.
    """

    name: str
    description: str = ""
    context: str = ""
    failure_modes: list[dict[str, Any]] = field(default_factory=list)
    target_layer: str = "silver"
    backend: str = "pyspark"
    auto_generate_rules: bool = True
    generate_test_cases: bool = True

    @staticmethod
    def _as_mapping(value: Any) -> dict[str, Any]:
        """Return *value* if it is a dict, otherwise an empty dict.

        Lets an empty section (``pipeline:`` with no body parses as None)
        behave like a missing one.
        """
        return value if isinstance(value, dict) else {}

    @classmethod
    def from_yaml(cls, path: str | Path) -> BehaviorSpec:
        """Parse a BehaviorSpec from a YAML file.

        The document is loaded with ``yaml.safe_load`` and handed to
        :meth:`from_dict`, so both constructors interpret it identically.

        Raises:
            FileNotFoundError: If *path* does not exist.
            ImportError: If PyYAML is not installed.
            ValueError: If the file is empty or its top level is not a mapping.
        """
        p = Path(path)
        if not p.exists():
            raise FileNotFoundError(f"Behavior spec file not found: {path}")

        try:
            import yaml
        except ImportError as err:
            raise ImportError(
                "PyYAML is required to load BehaviorSpec from YAML. "
                "Install it with: pip install pyyaml"
            ) from err

        raw = yaml.safe_load(p.read_text(encoding="utf-8"))
        # safe_load yields None for an empty document and may yield a
        # list/scalar; neither can be read as a spec.
        if not isinstance(raw, dict):
            raise ValueError(
                f"Behavior spec file must contain a YAML mapping: {path}"
            )
        return cls.from_dict(raw)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> BehaviorSpec:
        """Build from a dictionary (e.g. loaded from JSON or programmatic).

        The behavior fields are read from ``data["behavior"]`` when that key
        exists, otherwise from *data* itself. Sections that are missing or
        not mappings are treated as empty.
        """
        behavior = cls._as_mapping(data["behavior"] if "behavior" in data else data)
        pipeline = cls._as_mapping(data.get("pipeline"))
        policy = cls._as_mapping(pipeline.get("policy"))
        # pipeline.generate takes precedence over the top-level generate.
        generate = cls._as_mapping(
            pipeline["generate"] if "generate" in pipeline else data.get("generate")
        )
        defaults = cls._as_mapping(data.get("default_model"))

        return cls(
            name=behavior.get("name", "unnamed"),
            description=behavior.get("description", ""),
            context=data.get("context", ""),
            failure_modes=behavior.get("failure_modes") or [],
            target_layer=behavior.get(
                "target_layer", defaults.get("target_layer", "silver")
            ),
            backend=behavior.get("backend", defaults.get("backend", "pyspark")),
            auto_generate_rules=policy.get("auto_generate_rules", True),
            generate_test_cases=generate.get("test_cases", True),
        )
@@ -0,0 +1,222 @@
1
+ """
2
+ CFA Systematizer
3
+ ================
4
+ Transforms a BehaviorSpec into a BehaviorTaxonomy and optionally
5
+ auto-generates PolicyRules for the CFA Policy Engine.
6
+
7
+ This is the systematization step: bridge between human-written
8
+ governance intent and executable rules.
9
+
10
+ Supports two modes:
11
+ - Template-based (MVP, no LLM): maps failure_modes → PolicyRules via conditions
12
+ - LLM-assisted (Phase 6): NL description → behavior spec via Systematizer.systematize_from_nl()
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from typing import Any
18
+
19
+ from cfa.core.conditions import build_condition
20
+ from cfa.policy.engine import PolicyRule
21
+ from cfa.types import (
22
+ FaultFamily,
23
+ FaultSeverity,
24
+ PolicyAction,
25
+ )
26
+
27
+ from .spec import (
28
+ BehaviorCategory,
29
+ BehaviorSpec,
30
+ BehaviorTaxonomy,
31
+ ConditionType,
32
+ )
33
+
34
+ # Re-export for convenience
35
+ try:
36
+ from .llm import LLMSystematizer, LLMSystematizerBackend # noqa: F401
37
+ _HAS_LLM = True
38
+ except ImportError:
39
+ _HAS_LLM = False
40
+
41
+ # Map ConditionType enum values to ConditionRegistry names
42
+ _CONDITION_TYPE_MAP: dict[ConditionType, str] = {
43
+ ConditionType.PII_IN_PROTECTED_LAYER: "pii_in_protected_layer",
44
+ ConditionType.MISSING_MERGE_KEY: "missing_merge_key",
45
+ ConditionType.MISSING_PARTITION: "missing_partition",
46
+ ConditionType.ENFORCE_TYPES_DISABLED: "enforce_types_disabled",
47
+ ConditionType.PII_WITHOUT_POLICY: "pii_without_policy",
48
+ ConditionType.SENSITIVE_WITHOUT_PARTITION: "sensitive_without_partition",
49
+ ConditionType.COST_BUDGET_EXCEEDED: "cost_budget_exceeded",
50
+ ConditionType.SCHEMA_MISMATCH: "schema_mismatch",
51
+ ConditionType.SHUFFLE_BUDGET_EXCEEDED: "shuffle_budget_exceeded",
52
+ ConditionType.UNAUTHORIZED_GOLD_WRITE: "unauthorized_gold_write",
53
+ }
54
+
55
+ _SEVERITY_MAP: dict[str, FaultSeverity] = {
56
+ "info": FaultSeverity.INFO,
57
+ "warning": FaultSeverity.WARNING,
58
+ "high": FaultSeverity.HIGH,
59
+ "critical": FaultSeverity.CRITICAL,
60
+ }
61
+
62
+ _ACTION_MAP: dict[str, PolicyAction] = {
63
+ "block": PolicyAction.BLOCK,
64
+ "replan": PolicyAction.REPLAN,
65
+ "approve": PolicyAction.APPROVE,
66
+ }
67
+
68
+
69
class Systematizer:
    """Transforms a BehaviorSpec into a BehaviorTaxonomy and PolicyRules."""

    def systematize(
        self, spec: BehaviorSpec
    ) -> tuple[BehaviorTaxonomy, list[PolicyRule]]:
        """Main entry point: spec → (taxonomy, rules).

        Args:
            spec: Parsed BehaviorSpec from YAML or programmatic construction.

        Returns:
            Tuple of (BehaviorTaxonomy, list of PolicyRules). The rule list
            is empty when ``spec.auto_generate_rules`` is false.
        """
        taxonomy = self._build_taxonomy(spec)
        rules: list[PolicyRule] = []

        if spec.auto_generate_rules:
            rules = self._generate_rules(spec, taxonomy)

        return taxonomy, rules

    def systematize_from_nl(
        self,
        description: str,
        *,
        backend: Any = None,
        context: str = "",
        target_layer: str = "silver",
    ) -> tuple[BehaviorTaxonomy, list[PolicyRule]]:
        """Natural language → BehaviorTaxonomy + PolicyRules via LLM.

        Requires an LLM backend implementing LLMSystematizerBackend.

        Args:
            description: NL description of governance requirements.
            backend: LLM backend instance (e.g. OpenAISystematizerBackend).
            context: Optional context about the target system.
            target_layer: Target layer assigned to the resulting spec; it is
                the fallback for failure modes without their own target_layer.

        Returns:
            Tuple of (BehaviorTaxonomy, list of PolicyRules).

        Raises:
            ValueError: If no backend is given, or the LLM response is
                empty or cannot be parsed.
        """
        # Fail with a clear message instead of an AttributeError on None
        # deep inside the LLM plugin.
        if backend is None:
            raise ValueError(
                "systematize_from_nl() requires an LLM backend; "
                "pass backend=<LLMSystematizerBackend instance>."
            )

        from .llm import LLMSystematizer

        spec = LLMSystematizer().systematize_nl(
            description, backend=backend, context=context
        )
        spec.target_layer = target_layer
        return self.systematize(spec)

    def _category_from_mode(
        self, mode: dict[str, Any], default_layer: str
    ) -> BehaviorCategory:
        """Convert one failure-mode dict into a prohibited BehaviorCategory."""
        code = mode.get("code", "unnamed")

        # Unknown condition names degrade to CUSTOM rather than failing.
        try:
            condition_type = ConditionType(mode.get("condition", "custom"))
        except ValueError:
            condition_type = ConditionType.CUSTOM

        # A single remediation given as a bare string must stay one step;
        # tuple()/list() on a str would split it into characters.
        remediation = mode.get("remediation") or []
        if isinstance(remediation, str):
            remediation = [remediation]

        return BehaviorCategory(
            code=code,
            label=mode.get("label", code.replace("_", " ").title()),
            description=mode.get("description", ""),
            allowed=False,
            condition_type=condition_type,
            severity=mode.get("severity", "high"),
            remediation=list(remediation),
            metadata={
                "target_layer": mode.get("target_layer", default_layer),
                "max_dbu": mode.get("max_dbu"),
                "min_size_gb": mode.get("min_size_gb", 1.0),
                # Carry the failure mode's own action so rule generation can
                # honor it; an explicit metadata["action"] still wins below.
                "action": mode.get("action", "replan"),
                **(mode.get("metadata") or {}),
            },
        )

    def _build_taxonomy(self, spec: BehaviorSpec) -> BehaviorTaxonomy:
        """Build the taxonomy: one prohibited category per failure mode plus
        a single implicit allowed category."""
        not_allowed = [
            self._category_from_mode(mode, spec.target_layer)
            for mode in spec.failure_modes
        ]

        # Implicit allowed behaviors (the inverse of what we test for)
        allowed = [
            BehaviorCategory(
                code="valid_governed_processing",
                label="Valid Governed Processing",
                description=(
                    "All pipeline operations that respect PII, schema, budget, "
                    "and partition constraints."
                ),
                allowed=True,
                condition_type=ConditionType.CUSTOM,
            )
        ]

        return BehaviorTaxonomy(
            name=spec.name,
            description=spec.description,
            context=spec.context,
            allowed=allowed,
            not_allowed=not_allowed,
        )

    def _generate_rules(
        self, spec: BehaviorSpec, taxonomy: BehaviorTaxonomy
    ) -> list[PolicyRule]:
        """Auto-generate PolicyRules from the taxonomy's not_allowed categories.

        Categories whose condition type has no entry in _CONDITION_TYPE_MAP
        (e.g. CUSTOM) are skipped, as are categories for which
        build_condition raises KeyError. Unknown severity and action names
        fall back to HIGH and REPLAN; both are matched case-insensitively.
        """
        rules: list[PolicyRule] = []

        for category in taxonomy.not_allowed:
            condition_name = _CONDITION_TYPE_MAP.get(category.condition_type)
            if condition_name is None:
                continue

            try:
                condition_fn = build_condition(condition_name, category.metadata)
            except KeyError:
                # Deliberate best-effort skip; presumably the condition is
                # not registered — TODO confirm against build_condition.
                continue

            severity_key = str(category.severity).lower()
            action_key = str(category.metadata.get("action") or "replan").lower()

            rules.append(
                PolicyRule(
                    name=f"behavior_spec_{category.code}",
                    condition=condition_fn,
                    action=_ACTION_MAP.get(action_key, PolicyAction.REPLAN),
                    fault_code=f"BEHAVIOR_{category.code.upper()}",
                    fault_family=FaultFamily.SEMANTIC,
                    severity=_SEVERITY_MAP.get(severity_key, FaultSeverity.HIGH),
                    message=f"{category.label}: {category.description}",
                    remediation=tuple(category.remediation),
                )
            )

        return rules

    def generate_test_intents(
        self, spec: BehaviorSpec, count: int = 3
    ) -> list[str]:
        """Generate test intent strings that exercise each failure mode.

        Useful for automated governance testing in CI. Returns an empty
        list when ``spec.generate_test_cases`` is false.
        """
        if not spec.generate_test_cases:
            return []
        # Only the taxonomy is needed here, so rule generation is skipped.
        return self._build_taxonomy(spec).generate_test_intents(count)