PyPI - sum-engine - Versions diffs - 0.1.0__py3-none-any.whl - Mend

sum-engine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

internal/__init__.py +8 -0
internal/algorithms/__init__.py +1 -0
internal/algorithms/causal_discovery.py +96 -0
internal/algorithms/predicate_canon.py +137 -0
internal/algorithms/semantic_arithmetic.py +890 -0
internal/algorithms/syntactic_sieve.py +452 -0
internal/algorithms/zk_semantics.py +90 -0
internal/ensemble/__init__.py +1 -0
internal/ensemble/automated_scientist.py +138 -0
internal/ensemble/autonomous_agent.py +157 -0
internal/ensemble/causal_triggers.py +121 -0
internal/ensemble/confidence_calibrator.py +284 -0
internal/ensemble/epistemic_arbiter.py +159 -0
internal/ensemble/epistemic_loop.py +136 -0
internal/ensemble/extraction_validator.py +172 -0
internal/ensemble/gauge_orchestrator.py +150 -0
internal/ensemble/live_llm_adapter.py +183 -0
internal/ensemble/llm_entailment.py +117 -0
internal/ensemble/mass_semantic_engine.py +138 -0
internal/ensemble/ouroboros.py +281 -0
internal/ensemble/semantic_dedup.py +261 -0
internal/ensemble/tome_generator.py +286 -0
internal/ensemble/tome_sliders.py +104 -0
internal/ensemble/vector_bridge.py +195 -0
internal/ensemble/venn_abers.py +211 -0
internal/infrastructure/__init__.py +1 -0
internal/infrastructure/akashic_ledger.py +812 -0
internal/infrastructure/canonical_codec.py +452 -0
internal/infrastructure/jcs.py +115 -0
internal/infrastructure/key_manager.py +239 -0
internal/infrastructure/p2p_mesh.py +168 -0
internal/infrastructure/prov_o.py +159 -0
internal/infrastructure/provenance.py +181 -0
internal/infrastructure/rate_limiter.py +81 -0
internal/infrastructure/resource_guards.py +117 -0
internal/infrastructure/scheme_registry.py +136 -0
internal/infrastructure/state_encoding.py +94 -0
internal/infrastructure/telemetry.py +91 -0
internal/infrastructure/tome_parser.py +55 -0
internal/infrastructure/verifiable_credential.py +412 -0
internal/infrastructure/zig_bridge.py +256 -0
sum_cli/__init__.py +18 -0
sum_cli/main.py +688 -0
sum_engine-0.1.0.dist-info/METADATA +590 -0
sum_engine-0.1.0.dist-info/RECORD +49 -0
sum_engine-0.1.0.dist-info/WHEEL +5 -0
sum_engine-0.1.0.dist-info/entry_points.txt +2 -0
sum_engine-0.1.0.dist-info/licenses/LICENSE +201 -0
sum_engine-0.1.0.dist-info/top_level.txt +2 -0

internal/ensemble/live_llm_adapter.py ADDED Viewed

@@ -0,0 +1,183 @@
+"""
+Live LLM Adapter — The Reality Bridge
+Replaces mock generators and extractors with real AI models.
+Uses Pydantic structured outputs to enforce strict (subject, predicate,
+object) schemas, and OpenAI's embedding endpoint for the Vector Bridge.
+Author: ototao
+License: Apache License 2.0
+"""
+import os
+import logging
+from typing import List, Tuple, Optional
+from pydantic import BaseModel, Field
+from openai import AsyncOpenAI
+logger = logging.getLogger(__name__)
+# ─── Pydantic schemas for structured LLM output ──────────────────────
+# Schema for one extracted fact. It is nested inside ExtractionResponse,
+# which is what gets passed as the structured-output ``response_format``.
+# NOTE(review): pydantic exposes a model's docstring as its schema
+# description, so the docstring below is deliberately left unchanged.
+class SemanticTriplet(BaseModel):
+    """A single irreducible fact as a subject-predicate-object triple."""
+    # Core triple: three required strings, each bounded to 2-200 characters.
+    subject: str = Field(
+        min_length=2, max_length=200,
+        description="The core entity or subject (lowercased, concise)",
+    )
+    predicate: str = Field(
+        min_length=2, max_length=200,
+        description="The relational verb or attribute (snake_case)",
+    )
+    # The Python attribute is ``object_``; the alias exposes it as ``object``
+    # in the schema / incoming payload.
+    object_: str = Field(
+        alias="object",
+        min_length=2, max_length=200,
+        description="The target entity or value (lowercased, concise)",
+    )
+    # Phase 19A: Metadata fields — do NOT alter algebra semantics
+    # All three are optional and default to None when the model omits them.
+    source_span: Optional[str] = Field(
+        default=None,
+        description="The exact text span this triplet was extracted from",
+    )
+    # Free-form string (not an enum); the description lists the expected
+    # values but nothing here enforces them.
+    certainty: Optional[str] = Field(
+        default=None,
+        description="Model's assessment: 'definite', 'hedged', or 'speculative'",
+    )
+    extraction_notes: Optional[str] = Field(
+        default=None,
+        description="Any caveats about this extraction (negation, conditional, etc.)",
+    )
+    # Accept the field name (``object_``) as well as its alias (``object``)
+    # when populating the model.
+    model_config = {"populate_by_name": True}
+# Top-level schema handed to the structured-output parser: a single
+# ``triplets`` list whose items are validated as SemanticTriplet.
+# NOTE(review): the docstring is part of the pydantic schema description,
+# so it is intentionally left as-is.
+class ExtractionResponse(BaseModel):
+    """Structured output wrapper for a list of extracted triplets."""
+    # Required field; an empty list is valid (no facts found).
+    triplets: List[SemanticTriplet]
+# ─── Adapter ─────────────────────────────────────────────────────────
+class LiveLLMAdapter:
+    """
+    Production connector that maps natural language to the Gödel universe
+    via constrained LLM calls.
+    Three capabilities:
+        1. ``extract_triplets``  — text → List[(subj, pred, obj)]
+        2. ``generate_text``     — axioms + negative constraints → narrative
+        3. ``get_embedding``     — text → List[float] (continuous vector)
+    """
+    def __init__(
+        self,
+        api_key: str | None = None,
+        model: str = "gpt-4o-mini",
+        embedding_model: str = "text-embedding-3-small",
+    ):
+        self.client = AsyncOpenAI(
+            api_key=api_key or os.getenv("OPENAI_API_KEY")
+        )
+        self.model = model
+        self.embedding_model = embedding_model
+    # ------------------------------------------------------------------
+    # Extraction (Tags)
+    # ------------------------------------------------------------------
+    async def extract_triplets(
+        self, chunk: str
+    ) -> List[Tuple[str, str, str]]:
+        """
+        Maps natural language into strict topological triplets via
+        Pydantic-constrained structured output.
+        Phase 19A: Enhanced prompt with negation awareness, certainty
+        metadata, and source span tracking.
+        """
+        response = await self.client.beta.chat.completions.parse(
+            model=self.model,
+            messages=[
+                {
+                    "role": "system",
+                    "content": (
+                        "Extract all distinct factual claims from the text "
+                        "as subject-predicate-object triplets.\n\n"
+                        "Rules:\n"
+                        "- Keep subject, predicate, and object concise and lowercased\n"
+                        "- Use snake_case for multi-word predicates (e.g., 'is_part_of')\n"
+                        "- Do NOT extract opinions, questions, or hypotheticals as facts\n"
+                        "- If a statement is negated (e.g., 'X does NOT cause Y'), "
+                        "set certainty to 'speculative' and note 'negation' in extraction_notes\n"
+                        "- If language is hedged ('may', 'might', 'possibly'), "
+                        "set certainty to 'hedged'\n"
+                        "- For definite factual statements, set certainty to 'definite'\n"
+                        "- Include the source_span: the exact phrase from the text"
+                    ),
+                },
+                {"role": "user", "content": chunk},
+            ],
+            response_format=ExtractionResponse,
+        )
+        parsed = response.choices[0].message.parsed
+        return [
+            (t.subject.lower(), t.predicate.lower(), t.object_.lower())
+            for t in parsed.triplets
+            # Phase 19A: Skip speculative extractions (negations)
+            if t.certainty != "speculative"
+        ]
+    # ------------------------------------------------------------------
+    # Generation (Tomes)
+    # ------------------------------------------------------------------
+    async def generate_text(
+        self,
+        target_axioms: List[str],
+        negative_constraints: List[str],
+    ) -> str:
+        """
+        The Tomes generator for the Epistemic Loop.
+        Produces a cohesive narrative from verified axioms while honouring
+        negative constraints (previously identified hallucinations).
+        """
+        sys_prompt = (
+            "You are a precise technical writer. Extrapolate the "
+            "following absolute facts into a cohesive narrative. "
+            "Do not invent facts."
+        )
+        user_prompt = (
+            f"FACTS TO INCLUDE:\n{chr(10).join(target_axioms)}\n\n"
+        )
+        if negative_constraints:
+            user_prompt += (
+                "CRITICAL NEGATIVE CONSTRAINTS "
+                "(DO NOT INCLUDE THESE HALLUCINATIONS):\n"
+                f"{chr(10).join(negative_constraints)}"
+            )
+        response = await self.client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": sys_prompt},
+                {"role": "user", "content": user_prompt},
+            ],
+        )
+        return response.choices[0].message.content
+    # ------------------------------------------------------------------
+    # Embeddings (Vector Bridge)
+    # ------------------------------------------------------------------
+    async def get_embedding(self, text: str) -> List[float]:
+        """Continuous mapping for the Continuous-Discrete Vector Bridge."""
+        response = await self.client.embeddings.create(
+            model=self.embedding_model,
+            input=text,
+        )
+        return response.data[0].embedding

internal/ensemble/llm_entailment.py ADDED Viewed

@@ -0,0 +1,117 @@
+"""
+LLM Entailment Checker — Structured Entailment for Regeneration Faithfulness
+Wraps an LLM call behind a strict entailment-check interface. Given a passage
+and a claim triple, returns a boolean entailed + confidence score.
+This is the symbolic-boundary verifier for the regeneration path: once an LLM
+has rendered prose from a structured axiom set, each source axiom is
+independently checked for entailment against the prose. Non-entailed axioms
+are counted as drift for FActScore / MiniCheck-equivalent metrics surfaced by
+the bench harness.
+The model SHOULD be pinned (with date suffix) so reproducibility is
+preserved; note that construction only rejects empty or blank identifiers,
+it does not verify that the identifier is actually pinned.
+Author: ototao
+License: Apache License 2.0
+"""
+from __future__ import annotations
+import os
+from dataclasses import dataclass
+from openai import AsyncOpenAI
+from pydantic import BaseModel, Field
+# Schema passed as ``response_format`` for a single entailment decision.
+# NOTE(review): the docstring feeds the pydantic schema description, so it
+# is intentionally left unchanged.
+class EntailmentJudgment(BaseModel):
+    """Pydantic-enforced LLM output schema for one entailment decision."""
+    # Required boolean verdict.
+    entailed: bool = Field(description="Does the passage support the claim?")
+    # Required float; validation rejects values outside the closed
+    # interval [0.0, 1.0].
+    confidence: float = Field(
+        ge=0.0,
+        le=1.0,
+        description="Model's confidence in its judgment, 0.0-1.0",
+    )
+@dataclass(frozen=True)
+class EntailmentResult:
+    """Decoded entailment outcome for one (passage, claim) pair.
+    Instances are immutable (``frozen=True``).
+    """
+    # Whether the passage was judged to support the claim.
+    entailed: bool
+    # Confidence reported for that judgment.
+    confidence: float
+    # The claim rendered as a single sentence, as it was shown to the model.
+    claim_sentence: str
+class LlmEntailmentChecker:
+    """Structured entailment verifier via OpenAI structured-output parsing.
+    Single method: ``check(passage, claim_triple) -> EntailmentResult``.
+    The claim triple is rendered as ``"{s} {p} {o}"`` and submitted alongside
+    the passage. The model answers with a boolean ``entailed`` + a confidence
+    in [0, 1]. The judgement prompt is intentionally conservative — paraphrases
+    of the same fact count as entailed; reinterpretations or unsupported
+    inferences do not.
+    """
+    def __init__(
+        self,
+        model_id: str,
+        api_key: str | None = None,
+    ) -> None:
+        if not model_id or not model_id.strip():
+            raise ValueError(
+                "LlmEntailmentChecker requires a pinned model_id "
+                "(e.g. 'gpt-4o-2024-08-06')."
+            )
+        self.model_id = model_id
+        self.client = AsyncOpenAI(
+            api_key=api_key or os.getenv("OPENAI_API_KEY")
+        )
+    async def check(
+        self, passage: str, claim: tuple[str, str, str]
+    ) -> EntailmentResult:
+        s, p, o = claim
+        claim_sentence = f"{s} {p} {o}"
+        response = await self.client.beta.chat.completions.parse(
+            model=self.model_id,
+            messages=[
+                {
+                    "role": "system",
+                    "content": (
+                        "You are a strict entailment checker. Given a "
+                        "passage and a claim, decide whether the passage "
+                        "supports the claim. Be conservative: set "
+                        "entailed=true only if the passage explicitly or "
+                        "strongly implies the claim. Paraphrases of the "
+                        "same fact count as entailed; reinterpretations or "
+                        "unsupported inferences do not."
+                    ),
+                },
+                {
+                    "role": "user",
+                    "content": (
+                        f"PASSAGE:\n{passage}\n\n"
+                        f"CLAIM: {claim_sentence}\n\n"
+                        f"Does the passage entail the claim?"
+                    ),
+                },
+            ],
+            response_format=EntailmentJudgment,
+        )
+        judgment = response.choices[0].message.parsed
+        if judgment is None:
+            return EntailmentResult(
+                entailed=False,
+                confidence=0.0,
+                claim_sentence=claim_sentence,
+            )
+        return EntailmentResult(
+            entailed=judgment.entailed,
+            confidence=judgment.confidence,
+            claim_sentence=claim_sentence,
+        )

internal/ensemble/mass_semantic_engine.py ADDED Viewed

@@ -0,0 +1,138 @@
+"""
+Mass Semantic Engine — MapReduce Gödel-State Parallelization
+Wire the SPNT bounding and Gödel-State Algebra into a fully async
+MAP → ENCODE → REDUCE → AUDIT pipeline for mass-parallel semantic
+extraction.
+Architecture:
+    1. MAP:    asyncio.gather all chunk extractions (lock-free)
+    2. ENCODE: Convert string triplets to Gödel integers
+    3. REDUCE: LCM merge all chunk states into one global integer
+    4. AUDIT:  Paradox detection + SPNT compression bound check
+Author: ototao
+License: Apache License 2.0
+"""
+import asyncio
+import logging
+from typing import Callable, Awaitable, List, Tuple, Dict, Any
+from internal.algorithms.semantic_arithmetic import (
+    GodelStateAlgebra,
+    SemanticPrimeNumberTheorem,
+)
+from internal.ensemble.extraction_validator import ExtractionValidator
+logger = logging.getLogger(__name__)
+class MassSemanticEngine:
+    """
+    Async MapReduce engine that extracts semantic triplets from parallel
+    text chunks, encodes them as Gödel integers, and merges them via LCM.
+    Complements the existing MassDocumentEngine (string-based hierarchical
+    summarization) with a mathematical, lock-free alternative.
+    """
+    def __init__(
+        self,
+        extractor_llm_func: Callable[
+            [str], Awaitable[List[Tuple[str, str, str]]]
+        ],
+    ):
+        """
+        Args:
+            extractor_llm_func: An async callable that accepts a text chunk
+                and returns a list of (subject, predicate, object) triplets.
+        """
+        self.extract_triplets = extractor_llm_func
+        self.algebra = GodelStateAlgebra()
+    async def tomes_to_tags(
+        self,
+        raw_claims_count: int,
+        chunks: List[str],
+    ) -> Dict[str, Any]:
+        """
+        Mass-parallel extraction mapping text chunks to a single Global
+        Integer State.
+        Args:
+            raw_claims_count: Estimated total raw claims across all chunks.
+            chunks:           List of text chunks to extract from in parallel.
+        Returns:
+            Dictionary with:
+                global_state        – the merged Gödel integer
+                total_unique_primes – count of unique axioms in the state
+                spnt_limit          – theoretical compression bound
+                compression_ok      – True if within SPNT bound
+                paradoxes           – list of detected curvature conflicts
+        """
+        # ── 1. MAP: Mass-parallel lock-free extraction ──────────────
+        tasks = [self.extract_triplets(chunk) for chunk in chunks]
+        extracted_chunks: List[List[Tuple[str, str, str]]] = list(
+            await asyncio.gather(*tasks)
+        )
+        # ── 1.5 VALIDATE: Structural gate (Phase 19A) ─────────────
+        validator = ExtractionValidator()
+        validated_chunks = []
+        total_rejected = 0
+        all_rejection_reasons = []
+        for chunk_triplets in extracted_chunks:
+            result = validator.validate_batch(chunk_triplets)
+            validated_chunks.append(result.accepted)
+            total_rejected += result.rejected_count
+            for rej in result.rejected:
+                all_rejection_reasons.append({
+                    "triplet": f"{rej.subject}||{rej.predicate}||{rej.object_}",
+                    "reason": rej.reason,
+                })
+        # ── 2. ENCODE: Convert validated triplets to Gödel integers ──
+        chunk_states = [
+            self.algebra.encode_chunk_state(triplets)
+            for triplets in validated_chunks
+        ]
+        # ── 3. REDUCE: Mathematical Merge via LCM ──────────────────
+        global_state = self.algebra.merge_parallel_states(chunk_states)
+        # ── 4. AUDIT: Paradox detection & compression bounds ───────
+        paradoxes = self.algebra.detect_curvature_paradoxes(global_state)
+        if paradoxes:
+            logger.warning("Curvature paradoxes detected: %s", paradoxes)
+        # Count unique axioms present in the global state
+        total_unique_primes = sum(
+            1
+            for p in self.algebra.prime_to_axiom
+            if global_state % p == 0
+        )
+        spnt_limit = SemanticPrimeNumberTheorem.asymptotic_bound(
+            raw_claims_count
+        )
+        compression_ok = total_unique_primes <= spnt_limit
+        if not compression_ok:
+            logger.warning(
+                "SPNT compression failed: %d primes exceeds bound %d",
+                total_unique_primes,
+                spnt_limit,
+            )
+        return {
+            "global_state": global_state,
+            "total_unique_primes": total_unique_primes,
+            "spnt_limit": spnt_limit,
+            "compression_ok": compression_ok,
+            "paradoxes": paradoxes,
+            "rejected_count": total_rejected,
+            "rejection_reasons": all_rejection_reasons,
+        }