cfa-kernel 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. cfa/__init__.py +39 -0
  2. cfa/_lazy.py +39 -0
  3. cfa/adapters/__init__.py +104 -0
  4. cfa/adapters/autogen.py +19 -0
  5. cfa/adapters/crewai.py +19 -0
  6. cfa/adapters/dspy.py +19 -0
  7. cfa/adapters/langgraph.py +19 -0
  8. cfa/adapters/openai_agents.py +19 -0
  9. cfa/audit/__init__.py +15 -0
  10. cfa/audit/context.py +205 -0
  11. cfa/audit/hashing.py +41 -0
  12. cfa/audit/trail.py +194 -0
  13. cfa/backends/__init__.py +132 -0
  14. cfa/backends/dbt.py +338 -0
  15. cfa/backends/pyspark.py +240 -0
  16. cfa/backends/sql.py +270 -0
  17. cfa/behavior/__init__.py +49 -0
  18. cfa/behavior/llm.py +244 -0
  19. cfa/behavior/spec.py +235 -0
  20. cfa/behavior/systematizer.py +222 -0
  21. cfa/cli/__init__.py +296 -0
  22. cfa/cli/__main__.py +6 -0
  23. cfa/cli/_helpers.py +109 -0
  24. cfa/cli/core/__init__.py +0 -0
  25. cfa/cli/core/evaluate.py +72 -0
  26. cfa/cli/core/validate.py +29 -0
  27. cfa/cli/formatters.py +280 -0
  28. cfa/cli/governance/__init__.py +0 -0
  29. cfa/cli/governance/audit.py +65 -0
  30. cfa/cli/governance/catalog.py +28 -0
  31. cfa/cli/governance/policy.py +119 -0
  32. cfa/cli/governance/rules.py +42 -0
  33. cfa/cli/governance/signature.py +31 -0
  34. cfa/cli/infrastructure/__init__.py +0 -0
  35. cfa/cli/infrastructure/backend_list.py +24 -0
  36. cfa/cli/infrastructure/storage.py +87 -0
  37. cfa/cli/project/__init__.py +0 -0
  38. cfa/cli/project/init.py +73 -0
  39. cfa/cli/project/lifecycle.py +92 -0
  40. cfa/cli/project/status.py +75 -0
  41. cfa/cli/project/taxonomy.py +38 -0
  42. cfa/cli/reporting/__init__.py +0 -0
  43. cfa/cli/reporting/report.py +109 -0
  44. cfa/cli/reporting/serve.py +43 -0
  45. cfa/config.py +103 -0
  46. cfa/core/__init__.py +19 -0
  47. cfa/core/codegen.py +65 -0
  48. cfa/core/conditions.py +129 -0
  49. cfa/core/kernel.py +224 -0
  50. cfa/core/phases/__init__.py +0 -0
  51. cfa/core/phases/runner.py +477 -0
  52. cfa/core/planner.py +290 -0
  53. cfa/execution/__init__.py +12 -0
  54. cfa/execution/partial.py +339 -0
  55. cfa/execution/state_projection.py +216 -0
  56. cfa/governance/__init__.py +76 -0
  57. cfa/lifecycle/__init__.py +51 -0
  58. cfa/mcp/__init__.py +347 -0
  59. cfa/mcp/__main__.py +4 -0
  60. cfa/normalizer/__init__.py +15 -0
  61. cfa/normalizer/base.py +441 -0
  62. cfa/normalizer/llm.py +426 -0
  63. cfa/observability/__init__.py +14 -0
  64. cfa/observability/indices.py +177 -0
  65. cfa/observability/metrics.py +91 -0
  66. cfa/observability/notify.py +79 -0
  67. cfa/observability/otel.py +81 -0
  68. cfa/observability/promotion.py +367 -0
  69. cfa/policy/__init__.py +12 -0
  70. cfa/policy/bundle.py +317 -0
  71. cfa/policy/catalog.py +117 -0
  72. cfa/policy/engine.py +306 -0
  73. cfa/reporting/__init__.py +42 -0
  74. cfa/reporting/charts.py +223 -0
  75. cfa/reporting/engine.py +456 -0
  76. cfa/resolution/__init__.py +62 -0
  77. cfa/runtime/__init__.py +13 -0
  78. cfa/runtime/gate.py +287 -0
  79. cfa/sandbox/__init__.py +189 -0
  80. cfa/sandbox/executor.py +92 -0
  81. cfa/sandbox/mock.py +89 -0
  82. cfa/sandbox/panic.py +52 -0
  83. cfa/storage/__init__.py +591 -0
  84. cfa/testing/__init__.py +60 -0
  85. cfa/testing/asserts.py +77 -0
  86. cfa/testing/evaluate.py +168 -0
  87. cfa/testing/fixtures.py +89 -0
  88. cfa/testing/markers.py +36 -0
  89. cfa/types.py +489 -0
  90. cfa/validation/__init__.py +14 -0
  91. cfa/validation/runtime.py +285 -0
  92. cfa/validation/signature.py +146 -0
  93. cfa/validation/static.py +252 -0
  94. cfa_kernel-0.1.0.dist-info/METADATA +32 -0
  95. cfa_kernel-0.1.0.dist-info/RECORD +98 -0
  96. cfa_kernel-0.1.0.dist-info/WHEEL +4 -0
  97. cfa_kernel-0.1.0.dist-info/entry_points.txt +3 -0
  98. cfa_kernel-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,15 @@
1
+ """CFA Normalizer — intent normalization."""
2
+ from cfa._lazy import LazyLoader
3
+
4
# Module-level ``__getattr__`` hook (PEP 562). Each exported name maps to a
# ``(module path, attribute name)`` pair handed to LazyLoader.
# NOTE(review): presumably LazyLoader imports the target module on first
# attribute access -- confirm against cfa._lazy.
__getattr__ = LazyLoader({
    # Core normalizer, rule-based/mock backends and confirmation helpers.
    **{
        export: ("cfa.normalizer.base", export)
        for export in (
            "IntentNormalizer",
            "NormalizerBackend",
            "MockNormalizerBackend",
            "RuleBasedNormalizerBackend",
            "ConfirmationOrchestrator",
            "AutoApproveHandler",
            "AutoRejectHandler",
        )
    },
    # LLM-backed backend and its providers.
    **{
        export: ("cfa.normalizer.llm", export)
        for export in (
            "LLMNormalizerBackend",
            "LLMProvider",
            "OpenAILMProvider",
        )
    },
})
cfa/normalizer/base.py ADDED
@@ -0,0 +1,441 @@
1
+ """
2
+ CFA Intent Normalizer + Confirmation Orchestrator
3
+ ==================================================
4
+ Transforms natural language into a typed State Signature.
5
+
6
+ The Normalizer is the most critical pipeline component:
7
+ an error here contaminates the entire system with deterministic perfection.
8
+
9
+ Architecture:
10
+ - NormalizerBackend ABC — LLM-agnostic
11
+ - IntentNormalizer — orchestrates resolution, context and signature
12
+ - ConfirmationOrchestrator — risk-based escalation
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from abc import ABC, abstractmethod
18
+ from dataclasses import dataclass, field
19
+ from typing import Any, Protocol
20
+
21
+ from cfa.types import (
22
+ AmbiguityLevel,
23
+ ConfirmationMode,
24
+ DatasetClassification,
25
+ DatasetRef,
26
+ ExecutionContext,
27
+ Fault,
28
+ FaultFamily,
29
+ FaultSeverity,
30
+ PolicyAction,
31
+ SemanticResolution,
32
+ SignatureConstraints,
33
+ StateSignature,
34
+ TargetLayer,
35
+ )
36
+
37
+ # ── Backend contract ─────────────────────────────────────────────────────────
38
+
39
+
40
@dataclass
class NormalizerInput:
    """Request payload handed to a ``NormalizerBackend.resolve`` call.

    ``IntentNormalizer.normalize`` builds one instance per request from its
    own arguments plus the version identifiers it was configured with.
    """

    # Natural-language request text, exactly as supplied by the caller.
    raw_intent: str
    # Environment snapshot; the rule-based backend reads
    # ``environment_state["datasets"][<name>]["state"]`` from it.
    environment_state: dict[str, Any]
    # Data catalog; the rule-based backend reads ``catalog["datasets"]`` as a
    # mapping of dataset name -> metadata dict.
    catalog: dict[str, Any]
    # Version identifiers carried along with the request.
    policy_bundle_version: str
    catalog_snapshot_version: str
    context_registry_version_id: str
48
+
49
+
50
@dataclass
class NormalizerOutput:
    """Untyped result produced by a ``NormalizerBackend``.

    All values are plain strings/dicts/lists; ``IntentNormalizer`` converts
    them into the typed signature objects.
    """

    # Resolved domain label and intent label.
    domain: str
    intent: str
    # Layer name as a string (e.g. "bronze", "silver", "gold").
    target_layer: str
    # One metadata dict per dataset matched for this intent.
    datasets: list[dict[str, Any]]
    # Constraint flags/values keyed by constraint name.
    constraints: dict[str, Any]
    # Confidence of the resolution, and its ambiguity label
    # ("low" / "medium" / "high").
    confidence_score: float
    ambiguity_level: str
    # Optional diagnostics; every instance gets its own fresh list.
    competing_interpretations: list[str] = field(default_factory=list)
    environment_constraints_injected: list[str] = field(default_factory=list)
    # Free-text explanation of how the result was derived.
    reasoning: str = ""
62
+
63
+
64
class NormalizerBackend(ABC):
    """Abstract contract for semantic resolution backends.

    This is the extension point for concrete strategies (LLM, rule-based,
    hybrid, mock): subclasses must implement :meth:`resolve`.
    """

    @abstractmethod
    def resolve(self, inp: NormalizerInput) -> NormalizerOutput:
        """Resolve the request described by ``inp`` into a ``NormalizerOutput``."""
72
+
73
+
74
+ # ── Shared keyword maps ───────────────────────────────────────────────────────
75
+
76
# Keyword tables used by the rule-based backend. A keyword matches when it
# occurs as a substring of the lower-cased intent, which is why several
# entries are truncated stems ("reconcil", "tribut", "transac", ...). Some
# entries are Portuguese terms.
#
# The backend returns the FIRST entry (in insertion order) that has a matching
# keyword, so the order of the entries below is significant.

# Target layer -> trigger keywords. Checked gold, then silver, then bronze.
_LAYER_KEYWORDS: dict[TargetLayer, list[str]] = {
    TargetLayer.GOLD: [
        "gold",
        "ouro",
        "master",
        "curated",
        "final",
    ],
    TargetLayer.SILVER: [
        "silver",
        "prata",
        "refined",
        "trusted",
        "join",
        "reconcil",
    ],
    TargetLayer.BRONZE: [
        "bronze",
        "raw",
        "ingest",
        "landing",
    ],
}

# Domain label -> trigger keywords.
_DOMAIN_KEYWORDS: dict[str, list[str]] = {
    "fiscal_data_processing": [
        "nfe",
        "nota fiscal",
        "fiscal",
        "tribut",
    ],
    "customer_data": [
        "client",
        "customer",
        "cpf",
        "cadastro",
    ],
    "financial_data": [
        "payment",
        "transac",
        "financ",
        "pagamento",
    ],
}

# Intent label -> trigger keywords. "transform_and_persist" has no keywords:
# it is the label the backend falls back to when nothing else matches.
_INTENT_KEYWORDS: dict[str, list[str]] = {
    "reconciliation_and_persist": [
        "join",
        "reconcil",
        "merg",
    ],
    "ingest": [
        "ingest",
        "load",
        "import",
        "carregar",
    ],
    "aggregate_and_persist": [
        "aggregat",
        "summ",
        "group",
    ],
    "transform_and_persist": [],
}
94
+
95
+
96
+ # ── Rule-based production backend ────────────────────────────────────────────
97
+
98
+
99
class RuleBasedNormalizerBackend(NormalizerBackend):
    """Deterministic production baseline normalizer.

    This backend is intentionally simple and catalog-grounded: layer, domain,
    intent and datasets are detected by substring matching against the
    lower-cased intent. It is not a semantic oracle. When the intent cannot be
    grounded in the catalog the result is reported with high ambiguity, and
    when ``strict`` is enabled any result whose confidence is below
    ``min_confidence`` is also reported with high ambiguity, so callers can
    stop before policy evaluation instead of silently approving an
    underspecified operation.

    Args:
        strict: Force high ambiguity for results below ``min_confidence``.
        min_confidence: Confidence threshold applied in strict mode.
    """

    def __init__(self, *, strict: bool = False, min_confidence: float = 0.65) -> None:
        self.strict = strict
        self.min_confidence = min_confidence

    def resolve(self, inp: NormalizerInput) -> NormalizerOutput:
        """Resolve ``inp`` into a ``NormalizerOutput`` using keyword rules.

        A missing/empty catalog caps confidence at 0.20 and a catalog with no
        matching dataset caps it at 0.30; both cases return early with high
        ambiguity and an explanatory competing interpretation.
        """
        raw = inp.raw_intent.lower()

        target_layer = self._detect_layer(raw)
        datasets = self._detect_datasets(raw, inp.catalog)
        domain = self._detect_domain(raw)
        intent = self._detect_intent(raw)
        has_pii = any(d.get("pii_columns") for d in datasets)

        confidence = self._compute_confidence(datasets, has_pii, target_layer, inp.catalog)
        env_constraints = self._detect_env_constraints(inp.environment_state)

        ambiguity = self._derive_ambiguity(confidence, self.strict)

        output = NormalizerOutput(
            domain=domain,
            intent=intent,
            target_layer=target_layer,
            datasets=datasets,
            constraints={
                "no_pii_raw": True,
                "merge_key_required": target_layer in ("silver", "gold"),
                "enforce_types": True,
                "partition_by": ["processing_date"] if datasets else [],
            },
            confidence_score=round(confidence, 2),
            ambiguity_level=ambiguity,
            environment_constraints_injected=env_constraints,
            reasoning=(
                f"Rule-based: layer={target_layer}, "
                f"datasets={[d['name'] for d in datasets]}, "
                f"pii={has_pii}"
            ),
        )

        # ``or {}`` keeps an explicit ``"datasets": None`` from crashing here;
        # it is treated exactly like a missing or empty catalog.
        if not (inp.catalog.get("datasets") or {}):
            output.confidence_score = min(output.confidence_score, 0.20)
            output.ambiguity_level = "high"
            output.competing_interpretations.append("No catalog provided; datasets cannot be grounded.")
            output.reasoning = "Rule-based: no catalog available to ground the requested transition."
            return output

        if not output.datasets:
            output.confidence_score = min(output.confidence_score, 0.30)
            output.ambiguity_level = "high"
            output.competing_interpretations.append("No dataset from the catalog matched the intent.")
            output.reasoning = "Rule-based: intent did not reference any known catalog dataset."
            return output

        # Compared against the rounded score so the message matches the value
        # actually reported to the caller.
        if self.strict and output.confidence_score < self.min_confidence:
            output.ambiguity_level = "high"
            output.competing_interpretations.append(
                f"Confidence {output.confidence_score:.2f} is below strict threshold {self.min_confidence:.2f}."
            )

        return output

    # ── Private helpers ───────────────────────────────────────────────────

    def _compute_confidence(
        self,
        datasets: list[dict[str, Any]],
        has_pii: bool,
        target_layer: str,
        catalog: dict[str, Any],
    ) -> float:
        """Return a heuristic confidence clamped to ``[0.05, 0.99]``.

        Starts at 0.85 when at least one dataset matched (0.45 otherwise),
        minus 0.1 for PII heading to silver/gold, minus 0.3 without a catalog.
        """
        confidence = 0.85 if datasets else 0.45
        if has_pii and target_layer in ("silver", "gold"):
            confidence -= 0.1
        if not catalog.get("datasets"):
            confidence -= 0.3
        return max(0.05, min(0.99, confidence))

    def _derive_ambiguity(self, confidence: float, strict: bool) -> str:
        """Map ``confidence`` to "low" / "medium" / "high".

        In strict mode anything below the configured ``min_confidence`` is
        "high"; this previously used a hard-coded 0.65 and ignored the
        constructor argument.
        """
        if strict and confidence < self.min_confidence:
            return "high"
        if confidence > 0.80:
            return "low"
        if confidence > 0.60:
            return "medium"
        return "high"

    def _detect_layer(self, raw: str) -> str:
        """Return the value of the first layer whose keyword occurs in ``raw``.

        Falls back to "silver" when no keyword matches.
        """
        for layer, keywords in _LAYER_KEYWORDS.items():
            if any(kw in raw for kw in keywords):
                return layer.value
        return "silver"

    def _detect_datasets(self, raw: str, catalog: dict[str, Any]) -> list[dict[str, Any]]:
        """Return metadata for every catalog dataset whose name occurs in ``raw``.

        Matching is a case-insensitive substring test. Missing metadata fields
        are filled with defaults; a ``None`` datasets mapping or ``None``
        per-dataset metadata is treated as empty.
        """
        found: list[dict[str, Any]] = []
        for name, meta in (catalog.get("datasets") or {}).items():
            if name.lower() not in raw:
                continue
            meta = meta or {}
            found.append({
                "name": name,
                "classification": meta.get("classification", "internal"),
                "pii_columns": meta.get("pii_columns", []),
                "size_gb": meta.get("size_gb", 0.0),
                "partition_column": meta.get("partition_column"),
                "merge_keys": meta.get("merge_keys", []),
            })
        return found

    def _detect_domain(self, raw: str) -> str:
        """Return the first domain whose keyword occurs in ``raw``, else "general"."""
        for domain, keywords in _DOMAIN_KEYWORDS.items():
            if any(kw in raw for kw in keywords):
                return domain
        return "general"

    def _detect_intent(self, raw: str) -> str:
        """Return the first intent whose keyword occurs in ``raw``.

        Falls back to "transform_and_persist" when nothing matches.
        """
        for intent_name, keywords in _INTENT_KEYWORDS.items():
            if keywords and any(w in raw for w in keywords):
                return intent_name
        return "transform_and_persist"

    def _detect_env_constraints(self, env_state: dict[str, Any]) -> list[str]:
        """Return one constraint string per dataset in state ``partially_committed``.

        A ``None`` datasets mapping or ``None`` per-dataset state is skipped
        rather than raising.
        """
        return [
            f"{name}.state=partially_committed -> publish_allowed=false"
            for name, state in (env_state.get("datasets") or {}).items()
            if state and state.get("state") == "partially_committed"
        ]
235
+
236
+
237
+ # ── Mock backend (test-only) ──────────────────────────────────────────────────
238
+
239
+
240
class MockNormalizerBackend(NormalizerBackend):
    """Deterministic keyword-matching backend intended for tests.

    Every call is forwarded to a freshly constructed, default-configured
    ``RuleBasedNormalizerBackend`` so test behaviour stays consistent with
    production. The class name is kept for backward-compatible test imports.
    """

    def resolve(self, inp: NormalizerInput) -> NormalizerOutput:
        """Return what a default ``RuleBasedNormalizerBackend`` resolves for ``inp``."""
        return RuleBasedNormalizerBackend().resolve(inp)
251
+
252
+
253
+ # ── Intent Normalizer ────────────────────────────────────────────────────────
254
+
255
+
256
class IntentNormalizer:
    """
    Turns a natural-language intent into a typed State Signature.

    Mandatory inputs (per whitepaper):
    1. user_intent (natural language)
    2. context_registry.environment_state
    3. data_catalog

    The actual interpretation is delegated to the configured backend; this
    class converts the backend's untyped output into typed objects.
    """

    def __init__(
        self,
        backend: NormalizerBackend,
        policy_bundle_version: str = "v1.0",
        catalog_snapshot_version: str = "catalog_default",
    ) -> None:
        self.backend = backend
        self.policy_bundle_version = policy_bundle_version
        self.catalog_snapshot_version = catalog_snapshot_version

    def normalize(
        self,
        raw_intent: str,
        environment_state: dict[str, Any],
        catalog: dict[str, Any],
        context_registry_version_id: str = "v_initial",
    ) -> SemanticResolution:
        """Resolve ``raw_intent`` through the backend and wrap the result.

        An ambiguity label other than "low"/"medium"/"high" is mapped to
        ``AmbiguityLevel.MEDIUM``.
        """
        backend_output = self.backend.resolve(
            NormalizerInput(
                raw_intent=raw_intent,
                environment_state=environment_state,
                catalog=catalog,
                policy_bundle_version=self.policy_bundle_version,
                catalog_snapshot_version=self.catalog_snapshot_version,
                context_registry_version_id=context_registry_version_id,
            )
        )
        typed_signature = self._build_signature(
            backend_output, raw_intent, context_registry_version_id
        )

        level_by_label = {
            "low": AmbiguityLevel.LOW,
            "medium": AmbiguityLevel.MEDIUM,
            "high": AmbiguityLevel.HIGH,
        }
        resolved_level = level_by_label.get(
            backend_output.ambiguity_level, AmbiguityLevel.MEDIUM
        )

        return SemanticResolution(
            signature=typed_signature,
            confidence_score=backend_output.confidence_score,
            ambiguity_level=resolved_level,
            competing_interpretations=backend_output.competing_interpretations,
            environment_constraints_injected=backend_output.environment_constraints_injected,
            reasoning=backend_output.reasoning,
        )

    def _build_signature(
        self,
        output: NormalizerOutput,
        raw_intent: str,
        context_registry_version_id: str,
    ) -> StateSignature:
        """Convert the backend's string/dict output into a ``StateSignature``.

        Unknown layer names fall back to ``TargetLayer.SILVER`` and unknown
        classifications to ``DatasetClassification.INTERNAL``.
        """
        layer_by_name = {
            "bronze": TargetLayer.BRONZE,
            "silver": TargetLayer.SILVER,
            "gold": TargetLayer.GOLD,
        }
        classification_by_name = {
            "public": DatasetClassification.PUBLIC,
            "internal": DatasetClassification.INTERNAL,
            "sensitive": DatasetClassification.SENSITIVE,
            "high_volume": DatasetClassification.HIGH_VOLUME,
        }

        resolved_layer = layer_by_name.get(output.target_layer, TargetLayer.SILVER)

        # One DatasetRef per backend dataset dict; list-valued fields become
        # tuples.
        dataset_refs = []
        for entry in output.datasets:
            dataset_refs.append(
                DatasetRef(
                    name=entry["name"],
                    classification=classification_by_name.get(
                        entry.get("classification", "internal"),
                        DatasetClassification.INTERNAL,
                    ),
                    size_gb=entry.get("size_gb", 0.0),
                    pii_columns=tuple(entry.get("pii_columns", [])),
                    partition_column=entry.get("partition_column"),
                    merge_keys=tuple(entry.get("merge_keys", [])),
                )
            )

        raw_constraints = output.constraints
        typed_constraints = SignatureConstraints(
            no_pii_raw=raw_constraints.get("no_pii_raw", True),
            merge_key_required=raw_constraints.get("merge_key_required", True),
            enforce_types=raw_constraints.get("enforce_types", True),
            partition_by=tuple(raw_constraints.get("partition_by", [])),
            max_cost_dbu=raw_constraints.get("max_cost_dbu"),
        )

        return StateSignature(
            domain=output.domain,
            intent=output.intent,
            target_layer=resolved_layer,
            datasets=tuple(dataset_refs),
            constraints=typed_constraints,
            execution_context=ExecutionContext(
                policy_bundle_version=self.policy_bundle_version,
                catalog_snapshot_version=self.catalog_snapshot_version,
                context_registry_version_id=context_registry_version_id,
            ),
            source_intent_raw=raw_intent,
        )
361
+
362
+
363
+ # ── Confirmation Orchestrator ────────────────────────────────────────────────
364
+
365
+
366
class ConfirmationHandler(Protocol):
    """Structural interface for confirmation handlers (Slack bot, web UI, mock, etc.).

    Any object exposing a compatible ``confirm`` method satisfies it.
    """

    def confirm(self, resolution: SemanticResolution, reason: str) -> bool:
        """Return ``True`` to approve ``resolution``, ``False`` to reject it."""
370
+
371
+
372
class AutoApproveHandler:
    """Confirmation handler that approves every request unconditionally."""

    def confirm(self, resolution: SemanticResolution, reason: str) -> bool:
        """Approve; ``resolution`` and ``reason`` are ignored."""
        return True
375
+
376
+
377
class AutoRejectHandler:
    """Confirmation handler that rejects every request unconditionally."""

    def confirm(self, resolution: SemanticResolution, reason: str) -> bool:
        """Reject; ``resolution`` and ``reason`` are ignored."""
        return False
380
+
381
+
382
class ConfirmationOrchestrator:
    """
    Interposes escalation between Semantic Resolution and Policy Engine.
    Selectively activated by risk — no friction in 90% of cases.

    Behaviour per ``resolution.confirmation_mode``:
    - auto: approved without consulting the handler
    - soft: approved without consulting the handler; the returned reason
      carries the confidence score
    - hard / any other mode (e.g. human escalation): the handler decides;
      a rejection produces a blocking ``Fault``

    Args:
        handler: Object whose ``confirm(resolution, reason)`` decides the
            non-auto/non-soft cases. Defaults to ``AutoApproveHandler`` only
            when ``None`` is passed.
        timeout_seconds: Stored on the instance; not consulted anywhere in
            this class.
    """

    def __init__(
        self,
        handler: ConfirmationHandler | None = None,
        timeout_seconds: int = 300,
    ) -> None:
        # Test against None explicitly: ``handler or AutoApproveHandler()``
        # would silently swap a supplied-but-falsy handler (one defining
        # ``__bool__``/``__len__``) for the approve-everything default.
        self.handler = AutoApproveHandler() if handler is None else handler
        self.timeout_seconds = timeout_seconds

    def process(self, resolution: SemanticResolution) -> tuple[bool, str, Fault | None]:
        """Returns (approved, reason, fault_or_none).

        ``fault_or_none`` is set only when the handler rejects the request.
        """
        mode = resolution.confirmation_mode

        if mode == ConfirmationMode.AUTO:
            return True, "Auto-confirmed: low risk.", None

        if mode == ConfirmationMode.SOFT:
            return True, f"Soft-confirmed: confidence={resolution.confidence_score:.2f}", None

        # Everything else needs an explicit decision from the handler.
        reason = self._build_reason(resolution)
        approved = self.handler.confirm(resolution, reason)

        if approved:
            label = "Hard" if mode == ConfirmationMode.HARD else "Human escalation"
            return True, f"{label} approved.", None

        # Rejection: HARD is reported as HIGH severity, any other mode as
        # CRITICAL; the mandatory action is always BLOCK.
        fault = Fault(
            code=f"CONFIRMATION_{mode.value.upper()}_REJECTED",
            family=FaultFamily.SEMANTIC,
            severity=FaultSeverity.HIGH if mode == ConfirmationMode.HARD else FaultSeverity.CRITICAL,
            stage="confirmation_orchestrator",
            message=f"Confirmation rejected (mode={mode.value}).",
            mandatory_action=PolicyAction.BLOCK,
            remediation=("Review the intent and resubmit.",),
        )
        return False, f"Confirmation rejected (mode={mode.value}).", fault

    def _build_reason(self, resolution: SemanticResolution) -> str:
        """Summarise why confirmation is requested, as a "; "-joined string.

        Returns "elevated risk" when none of the specific triggers apply.
        """
        sig = resolution.signature
        reasons: list[str] = []
        if sig.target_layer == TargetLayer.GOLD:
            reasons.append("Gold layer write")
        if sig.writes_to_protected_layer and sig.contains_pii:
            reasons.append("protected layer write with PII")
        if resolution.confidence_score < 0.65:
            reasons.append(f"low confidence ({resolution.confidence_score:.2f})")
        if len(resolution.competing_interpretations) > 1:
            reasons.append(f"{len(resolution.competing_interpretations)} competing interpretations")
        return "; ".join(reasons) or "elevated risk"