PyPI - cfa-kernel - Versions diffs - 0.1.0__py3-none-any.whl - Mend

cfa-kernel 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (98) hide show

cfa/__init__.py +39 -0
cfa/_lazy.py +39 -0
cfa/adapters/__init__.py +104 -0
cfa/adapters/autogen.py +19 -0
cfa/adapters/crewai.py +19 -0
cfa/adapters/dspy.py +19 -0
cfa/adapters/langgraph.py +19 -0
cfa/adapters/openai_agents.py +19 -0
cfa/audit/__init__.py +15 -0
cfa/audit/context.py +205 -0
cfa/audit/hashing.py +41 -0
cfa/audit/trail.py +194 -0
cfa/backends/__init__.py +132 -0
cfa/backends/dbt.py +338 -0
cfa/backends/pyspark.py +240 -0
cfa/backends/sql.py +270 -0
cfa/behavior/__init__.py +49 -0
cfa/behavior/llm.py +244 -0
cfa/behavior/spec.py +235 -0
cfa/behavior/systematizer.py +222 -0
cfa/cli/__init__.py +296 -0
cfa/cli/__main__.py +6 -0
cfa/cli/_helpers.py +109 -0
cfa/cli/core/__init__.py +0 -0
cfa/cli/core/evaluate.py +72 -0
cfa/cli/core/validate.py +29 -0
cfa/cli/formatters.py +280 -0
cfa/cli/governance/__init__.py +0 -0
cfa/cli/governance/audit.py +65 -0
cfa/cli/governance/catalog.py +28 -0
cfa/cli/governance/policy.py +119 -0
cfa/cli/governance/rules.py +42 -0
cfa/cli/governance/signature.py +31 -0
cfa/cli/infrastructure/__init__.py +0 -0
cfa/cli/infrastructure/backend_list.py +24 -0
cfa/cli/infrastructure/storage.py +87 -0
cfa/cli/project/__init__.py +0 -0
cfa/cli/project/init.py +73 -0
cfa/cli/project/lifecycle.py +92 -0
cfa/cli/project/status.py +75 -0
cfa/cli/project/taxonomy.py +38 -0
cfa/cli/reporting/__init__.py +0 -0
cfa/cli/reporting/report.py +109 -0
cfa/cli/reporting/serve.py +43 -0
cfa/config.py +103 -0
cfa/core/__init__.py +19 -0
cfa/core/codegen.py +65 -0
cfa/core/conditions.py +129 -0
cfa/core/kernel.py +224 -0
cfa/core/phases/__init__.py +0 -0
cfa/core/phases/runner.py +477 -0
cfa/core/planner.py +290 -0
cfa/execution/__init__.py +12 -0
cfa/execution/partial.py +339 -0
cfa/execution/state_projection.py +216 -0
cfa/governance/__init__.py +76 -0
cfa/lifecycle/__init__.py +51 -0
cfa/mcp/__init__.py +347 -0
cfa/mcp/__main__.py +4 -0
cfa/normalizer/__init__.py +15 -0
cfa/normalizer/base.py +441 -0
cfa/normalizer/llm.py +426 -0
cfa/observability/__init__.py +14 -0
cfa/observability/indices.py +177 -0
cfa/observability/metrics.py +91 -0
cfa/observability/notify.py +79 -0
cfa/observability/otel.py +81 -0
cfa/observability/promotion.py +367 -0
cfa/policy/__init__.py +12 -0
cfa/policy/bundle.py +317 -0
cfa/policy/catalog.py +117 -0
cfa/policy/engine.py +306 -0
cfa/reporting/__init__.py +42 -0
cfa/reporting/charts.py +223 -0
cfa/reporting/engine.py +456 -0
cfa/resolution/__init__.py +62 -0
cfa/runtime/__init__.py +13 -0
cfa/runtime/gate.py +287 -0
cfa/sandbox/__init__.py +189 -0
cfa/sandbox/executor.py +92 -0
cfa/sandbox/mock.py +89 -0
cfa/sandbox/panic.py +52 -0
cfa/storage/__init__.py +591 -0
cfa/testing/__init__.py +60 -0
cfa/testing/asserts.py +77 -0
cfa/testing/evaluate.py +168 -0
cfa/testing/fixtures.py +89 -0
cfa/testing/markers.py +36 -0
cfa/types.py +489 -0
cfa/validation/__init__.py +14 -0
cfa/validation/runtime.py +285 -0
cfa/validation/signature.py +146 -0
cfa/validation/static.py +252 -0
cfa_kernel-0.1.0.dist-info/METADATA +32 -0
cfa_kernel-0.1.0.dist-info/RECORD +98 -0
cfa_kernel-0.1.0.dist-info/WHEEL +4 -0
cfa_kernel-0.1.0.dist-info/entry_points.txt +3 -0
cfa_kernel-0.1.0.dist-info/licenses/LICENSE +21 -0

cfa/normalizer/__init__.py ADDED Viewed

@@ -0,0 +1,15 @@
+"""CFA Normalizer — intent normalization."""
+from cfa._lazy import LazyLoader
+# Module-level ``__getattr__`` hook: each exported name maps to the
+# (module path, attribute name) pair it is served from.
+# NOTE(review): presumably LazyLoader defers importing the target module until
+# the name is first accessed — confirm against cfa/_lazy.py.
+__getattr__ = LazyLoader({
+    # Names served from cfa.normalizer.base.
+    "IntentNormalizer": ("cfa.normalizer.base", "IntentNormalizer"),
+    "NormalizerBackend": ("cfa.normalizer.base", "NormalizerBackend"),
+    "MockNormalizerBackend": ("cfa.normalizer.base", "MockNormalizerBackend"),
+    "RuleBasedNormalizerBackend": ("cfa.normalizer.base", "RuleBasedNormalizerBackend"),
+    "ConfirmationOrchestrator": ("cfa.normalizer.base", "ConfirmationOrchestrator"),
+    "AutoApproveHandler": ("cfa.normalizer.base", "AutoApproveHandler"),
+    "AutoRejectHandler": ("cfa.normalizer.base", "AutoRejectHandler"),
+    # Names served from cfa.normalizer.llm.
+    "LLMNormalizerBackend": ("cfa.normalizer.llm", "LLMNormalizerBackend"),
+    "LLMProvider": ("cfa.normalizer.llm", "LLMProvider"),
+    # NOTE(review): "OpenAILMProvider" may be a misspelling of
+    # "OpenAILLMProvider" — verify against cfa/normalizer/llm.py before renaming.
+    "OpenAILMProvider": ("cfa.normalizer.llm", "OpenAILMProvider"),
+})

cfa/normalizer/base.py ADDED Viewed

@@ -0,0 +1,441 @@
+"""
+CFA Intent Normalizer + Confirmation Orchestrator
+==================================================
+Transforms natural language into a typed State Signature.
+The Normalizer is the most critical pipeline component:
+an error here contaminates the entire system with deterministic perfection.
+Architecture:
+- NormalizerBackend ABC — LLM-agnostic
+- IntentNormalizer — orchestrates resolution, context and signature
+- ConfirmationOrchestrator — risk-based escalation
+"""
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Any, Protocol
+from cfa.types import (
+    AmbiguityLevel,
+    ConfirmationMode,
+    DatasetClassification,
+    DatasetRef,
+    ExecutionContext,
+    Fault,
+    FaultFamily,
+    FaultSeverity,
+    PolicyAction,
+    SemanticResolution,
+    SignatureConstraints,
+    StateSignature,
+    TargetLayer,
+)
+# ── Backend contract ─────────────────────────────────────────────────────────
+@dataclass
+class NormalizerInput:
+    # Everything a NormalizerBackend.resolve() call receives, bundled in one
+    # record. IntentNormalizer.normalize() builds one instance per request.
+    # Natural-language intent text, passed through unchanged from the caller.
+    raw_intent: str
+    # Environment snapshot. The rule-based backend reads
+    # environment_state["datasets"][<name>]["state"]; other keys are not
+    # inspected in this module.
+    environment_state: dict[str, Any]
+    # Data catalog. The rule-based backend reads catalog["datasets"], a mapping
+    # of dataset name -> metadata dict.
+    catalog: dict[str, Any]
+    # Version identifiers; IntentNormalizer copies the same values into the
+    # signature's ExecutionContext.
+    policy_bundle_version: str
+    catalog_snapshot_version: str
+    context_registry_version_id: str
+@dataclass
+class NormalizerOutput:
+    # Loosely typed result of NormalizerBackend.resolve(); IntentNormalizer
+    # converts it into the typed StateSignature / SemanticResolution objects.
+    domain: str
+    intent: str
+    # Layer name as a string. IntentNormalizer recognises "bronze", "silver"
+    # and "gold"; any other value falls back to TargetLayer.SILVER.
+    target_layer: str
+    # One dict per dataset. IntentNormalizer requires the "name" key and reads
+    # "classification", "size_gb", "pii_columns", "partition_column" and
+    # "merge_keys" with defaults.
+    datasets: list[dict[str, Any]]
+    # Keys read by IntentNormalizer: "no_pii_raw", "merge_key_required",
+    # "enforce_types", "partition_by", "max_cost_dbu".
+    constraints: dict[str, Any]
+    confidence_score: float
+    # "low" / "medium" / "high"; IntentNormalizer maps any other value to
+    # AmbiguityLevel.MEDIUM.
+    ambiguity_level: str
+    # default_factory gives every instance its own list, so backends may
+    # append to these without affecting other outputs.
+    competing_interpretations: list[str] = field(default_factory=list)
+    environment_constraints_injected: list[str] = field(default_factory=list)
+    # Free-text explanation of how the backend reached its result.
+    reasoning: str = ""
+class NormalizerBackend(ABC):
+    """
+    Interface for any semantic resolution backend.
+    Extension point: LLM, rule-based, hybrid, mock.
+    """
+    # The single method a backend must implement: turn one NormalizerInput
+    # into one NormalizerOutput. IntentNormalizer.normalize() calls it exactly
+    # once per request.
+    @abstractmethod
+    def resolve(self, inp: NormalizerInput) -> NormalizerOutput: ...
+# ── Shared keyword maps ───────────────────────────────────────────────────────
+# Keyword tables used by RuleBasedNormalizerBackend. Every keyword is tested
+# with a plain substring check against the lower-cased intent, so short stems
+# such as "reconcil" or "tribut" match any text that contains them.
+# The tables are scanned in insertion order and the first entry with a hit
+# wins, so the order of the keys below is part of the behaviour.
+#
+# Layer detection: GOLD is checked before SILVER before BRONZE. When nothing
+# matches, the backend falls back to "silver".
+_LAYER_KEYWORDS: dict[TargetLayer, list[str]] = {
+    TargetLayer.GOLD: ["gold", "ouro", "master", "curated", "final"],
+    TargetLayer.SILVER: ["silver", "prata", "refined", "trusted", "join", "reconcil"],
+    TargetLayer.BRONZE: ["bronze", "raw", "ingest", "landing"],
+}
+# Domain detection. When nothing matches, the backend reports "general".
+_DOMAIN_KEYWORDS: dict[str, list[str]] = {
+    "fiscal_data_processing": ["nfe", "nota fiscal", "fiscal", "tribut"],
+    "customer_data": ["client", "customer", "cpf", "cadastro"],
+    "financial_data": ["payment", "transac", "financ", "pagamento"],
+}
+# Intent detection. "transform_and_persist" has no keywords on purpose: the
+# backend skips empty keyword lists and uses that name as its fallback.
+_INTENT_KEYWORDS: dict[str, list[str]] = {
+    "reconciliation_and_persist": ["join", "reconcil", "merg"],
+    "ingest": ["ingest", "load", "import", "carregar"],
+    "aggregate_and_persist": ["aggregat", "summ", "group"],
+    "transform_and_persist": [],
+}
+# ── Rule-based production backend ────────────────────────────────────────────
+class RuleBasedNormalizerBackend(NormalizerBackend):
+    """Deterministic production baseline normalizer.
+    This backend is intentionally simple and catalog-grounded. It is not a
+    semantic oracle: if ``strict`` is enabled and the intent cannot be mapped to
+    catalog datasets with enough confidence, the kernel blocks before policy
+    evaluation instead of silently approving an underspecified operation.
+    """
+    def __init__(self, *, strict: bool = False, min_confidence: float = 0.65) -> None:
+        self.strict = strict
+        self.min_confidence = min_confidence
+    def resolve(self, inp: NormalizerInput) -> NormalizerOutput:
+        raw = inp.raw_intent.lower()
+        target_layer = self._detect_layer(raw)
+        datasets = self._detect_datasets(raw, inp.catalog)
+        domain = self._detect_domain(raw)
+        intent = self._detect_intent(raw)
+        has_pii = any(d.get("pii_columns") for d in datasets)
+        confidence = self._compute_confidence(datasets, has_pii, target_layer, inp.catalog)
+        env_constraints = self._detect_env_constraints(inp.environment_state)
+        ambiguity = self._derive_ambiguity(confidence, self.strict)
+        output = NormalizerOutput(
+            domain=domain,
+            intent=intent,
+            target_layer=target_layer,
+            datasets=datasets,
+            constraints={
+                "no_pii_raw": True,
+                "merge_key_required": target_layer in ("silver", "gold"),
+                "enforce_types": True,
+                "partition_by": ["processing_date"] if datasets else [],
+            },
+            confidence_score=round(confidence, 2),
+            ambiguity_level=ambiguity,
+            environment_constraints_injected=env_constraints,
+            reasoning=(
+                f"Rule-based: layer={target_layer}, "
+                f"datasets={[d['name'] for d in datasets]}, "
+                f"pii={has_pii}"
+            ),
+        )
+        catalog_names = set(inp.catalog.get("datasets", {}).keys())
+        if not catalog_names:
+            output.confidence_score = min(output.confidence_score, 0.20)
+            output.ambiguity_level = "high"
+            output.competing_interpretations.append("No catalog provided; datasets cannot be grounded.")
+            output.reasoning = "Rule-based: no catalog available to ground the requested transition."
+            return output
+        if not output.datasets:
+            output.confidence_score = min(output.confidence_score, 0.30)
+            output.ambiguity_level = "high"
+            output.competing_interpretations.append("No dataset from the catalog matched the intent.")
+            output.reasoning = "Rule-based: intent did not reference any known catalog dataset."
+            return output
+        if self.strict and output.confidence_score < self.min_confidence:
+            output.ambiguity_level = "high"
+            output.competing_interpretations.append(
+                f"Confidence {output.confidence_score:.2f} is below strict threshold {self.min_confidence:.2f}."
+            )
+        return output
+    # ── Private helpers ───────────────────────────────────────────────────
+    def _compute_confidence(
+        self,
+        datasets: list[dict[str, Any]],
+        has_pii: bool,
+        target_layer: str,
+        catalog: dict[str, Any],
+    ) -> float:
+        confidence = 0.85 if datasets else 0.45
+        if has_pii and target_layer in ("silver", "gold"):
+            confidence -= 0.1
+        if not catalog.get("datasets"):
+            confidence -= 0.3
+        return max(0.05, min(0.99, confidence))
+    def _derive_ambiguity(self, confidence: float, strict: bool) -> str:
+        if strict and confidence < 0.65:
+            return "high"
+        if confidence > 0.80:
+            return "low"
+        if confidence > 0.60:
+            return "medium"
+        return "high"
+    def _detect_layer(self, raw: str) -> str:
+        for layer, keywords in _LAYER_KEYWORDS.items():
+            if any(kw in raw for kw in keywords):
+                return layer.value
+        return "silver"
+    def _detect_datasets(self, raw: str, catalog: dict[str, Any]) -> list[dict[str, Any]]:
+        found: list[dict[str, Any]] = []
+        for name, meta in catalog.get("datasets", {}).items():
+            if name.lower() in raw:
+                found.append({
+                    "name": name,
+                    "classification": meta.get("classification", "internal"),
+                    "pii_columns": meta.get("pii_columns", []),
+                    "size_gb": meta.get("size_gb", 0.0),
+                    "partition_column": meta.get("partition_column"),
+                    "merge_keys": meta.get("merge_keys", []),
+                })
+        return found
+    def _detect_domain(self, raw: str) -> str:
+        for domain, keywords in _DOMAIN_KEYWORDS.items():
+            if any(kw in raw for kw in keywords):
+                return domain
+        return "general"
+    def _detect_intent(self, raw: str) -> str:
+        for intent_name, keywords in _INTENT_KEYWORDS.items():
+            if keywords and any(w in raw for w in keywords):
+                return intent_name
+        return "transform_and_persist"
+    def _detect_env_constraints(self, env_state: dict[str, Any]) -> list[str]:
+        constraints: list[str] = []
+        for name, state in env_state.get("datasets", {}).items():
+            if state.get("state") == "partially_committed":
+                constraints.append(
+                    f"{name}.state=partially_committed -> publish_allowed=false"
+                )
+        return constraints
+# ── Mock backend (test-only) ──────────────────────────────────────────────────
+class MockNormalizerBackend(NormalizerBackend):
+    """Deterministic backend for tests. Uses keyword matching.
+    This class delegates to the production RuleBasedNormalizerBackend internally
+    so test behaviour stays consistent with production. The name is preserved for
+    backward-compatible test imports.
+    """
+    def resolve(self, inp: NormalizerInput) -> NormalizerOutput:
+        backend = RuleBasedNormalizerBackend()
+        return backend.resolve(inp)
+# ── Intent Normalizer ────────────────────────────────────────────────────────
+class IntentNormalizer:
+    """
+    Transforms natural language into a typed State Signature.
+    Mandatory inputs (per whitepaper):
+    1. user_intent (natural language)
+    2. context_registry.environment_state
+    3. data_catalog
+    """
+    def __init__(
+        self,
+        backend: NormalizerBackend,
+        policy_bundle_version: str = "v1.0",
+        catalog_snapshot_version: str = "catalog_default",
+    ) -> None:
+        self.backend = backend
+        self.policy_bundle_version = policy_bundle_version
+        self.catalog_snapshot_version = catalog_snapshot_version
+    def normalize(
+        self,
+        raw_intent: str,
+        environment_state: dict[str, Any],
+        catalog: dict[str, Any],
+        context_registry_version_id: str = "v_initial",
+    ) -> SemanticResolution:
+        inp = NormalizerInput(
+            raw_intent=raw_intent,
+            environment_state=environment_state,
+            catalog=catalog,
+            policy_bundle_version=self.policy_bundle_version,
+            catalog_snapshot_version=self.catalog_snapshot_version,
+            context_registry_version_id=context_registry_version_id,
+        )
+        output = self.backend.resolve(inp)
+        signature = self._build_signature(output, raw_intent, context_registry_version_id)
+        ambiguity_map = {
+            "low": AmbiguityLevel.LOW,
+            "medium": AmbiguityLevel.MEDIUM,
+            "high": AmbiguityLevel.HIGH,
+        }
+        return SemanticResolution(
+            signature=signature,
+            confidence_score=output.confidence_score,
+            ambiguity_level=ambiguity_map.get(output.ambiguity_level, AmbiguityLevel.MEDIUM),
+            competing_interpretations=output.competing_interpretations,
+            environment_constraints_injected=output.environment_constraints_injected,
+            reasoning=output.reasoning,
+        )
+    def _build_signature(
+        self,
+        output: NormalizerOutput,
+        raw_intent: str,
+        context_registry_version_id: str,
+    ) -> StateSignature:
+        layer_map = {"bronze": TargetLayer.BRONZE, "silver": TargetLayer.SILVER, "gold": TargetLayer.GOLD}
+        target_layer = layer_map.get(output.target_layer, TargetLayer.SILVER)
+        cls_map = {
+            "public": DatasetClassification.PUBLIC,
+            "internal": DatasetClassification.INTERNAL,
+            "sensitive": DatasetClassification.SENSITIVE,
+            "high_volume": DatasetClassification.HIGH_VOLUME,
+        }
+        datasets = tuple(
+            DatasetRef(
+                name=d["name"],
+                classification=cls_map.get(d.get("classification", "internal"), DatasetClassification.INTERNAL),
+                size_gb=d.get("size_gb", 0.0),
+                pii_columns=tuple(d.get("pii_columns", [])),
+                partition_column=d.get("partition_column"),
+                merge_keys=tuple(d.get("merge_keys", [])),
+            )
+            for d in output.datasets
+        )
+        c = output.constraints
+        constraints = SignatureConstraints(
+            no_pii_raw=c.get("no_pii_raw", True),
+            merge_key_required=c.get("merge_key_required", True),
+            enforce_types=c.get("enforce_types", True),
+            partition_by=tuple(c.get("partition_by", [])),
+            max_cost_dbu=c.get("max_cost_dbu"),
+        )
+        execution_context = ExecutionContext(
+            policy_bundle_version=self.policy_bundle_version,
+            catalog_snapshot_version=self.catalog_snapshot_version,
+            context_registry_version_id=context_registry_version_id,
+        )
+        return StateSignature(
+            domain=output.domain,
+            intent=output.intent,
+            target_layer=target_layer,
+            datasets=datasets,
+            constraints=constraints,
+            execution_context=execution_context,
+            source_intent_raw=raw_intent,
+        )
+# ── Confirmation Orchestrator ────────────────────────────────────────────────
+class ConfirmationHandler(Protocol):
+    """Interface for confirmation handlers (Slack bot, web UI, mock, etc.)."""
+    # Structural interface: any object with a matching ``confirm`` method
+    # qualifies, no inheritance required. ConfirmationOrchestrator.process()
+    # treats a truthy return as approval and a falsy one as rejection; ``reason``
+    # is the text produced by ConfirmationOrchestrator._build_reason().
+    def confirm(self, resolution: SemanticResolution, reason: str) -> bool: ...
+class AutoApproveHandler:
+    # Handler that approves every request without inspecting it.
+    # ConfirmationOrchestrator uses it when no handler is supplied.
+    def confirm(self, resolution: SemanticResolution, reason: str) -> bool:
+        return True
+class AutoRejectHandler:
+    # Handler that rejects every request without inspecting it.
+    def confirm(self, resolution: SemanticResolution, reason: str) -> bool:
+        return False
+class ConfirmationOrchestrator:
+    """
+    Interposes escalation between Semantic Resolution and Policy Engine.
+    Selectively activated by risk — no friction in 90% of cases.
+    Modes:
+    - auto:             pass through
+    - soft:             log and pass
+    - hard:             require explicit confirmation
+    - human_escalation: send for human review with timeout
+    """
+    def __init__(
+        self,
+        handler: ConfirmationHandler | None = None,
+        timeout_seconds: int = 300,
+    ) -> None:
+        self.handler = handler or AutoApproveHandler()
+        self.timeout_seconds = timeout_seconds
+    def process(self, resolution: SemanticResolution) -> tuple[bool, str, Fault | None]:
+        """Returns (approved, reason, fault_or_none)."""
+        mode = resolution.confirmation_mode
+        if mode == ConfirmationMode.AUTO:
+            return True, "Auto-confirmed: low risk.", None
+        if mode == ConfirmationMode.SOFT:
+            return True, f"Soft-confirmed: confidence={resolution.confidence_score:.2f}", None
+        reason = self._build_reason(resolution)
+        approved = self.handler.confirm(resolution, reason)
+        if approved:
+            label = "Hard" if mode == ConfirmationMode.HARD else "Human escalation"
+            return True, f"{label} approved.", None
+        fault = Fault(
+            code=f"CONFIRMATION_{mode.value.upper()}_REJECTED",
+            family=FaultFamily.SEMANTIC,
+            severity=FaultSeverity.HIGH if mode == ConfirmationMode.HARD else FaultSeverity.CRITICAL,
+            stage="confirmation_orchestrator",
+            message=f"Confirmation rejected (mode={mode.value}).",
+            mandatory_action=PolicyAction.BLOCK,
+            remediation=("Review the intent and resubmit.",),
+        )
+        return False, f"Confirmation rejected (mode={mode.value}).", fault
+    def _build_reason(self, resolution: SemanticResolution) -> str:
+        sig = resolution.signature
+        reasons: list[str] = []
+        if sig.target_layer == TargetLayer.GOLD:
+            reasons.append("Gold layer write")
+        if sig.writes_to_protected_layer and sig.contains_pii:
+            reasons.append("protected layer write with PII")
+        if resolution.confidence_score < 0.65:
+            reasons.append(f"low confidence ({resolution.confidence_score:.2f})")
+        if len(resolution.competing_interpretations) > 1:
+            reasons.append(f"{len(resolution.competing_interpretations)} competing interpretations")
+        return "; ".join(reasons) or "elevated risk"