sum-engine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. internal/__init__.py +8 -0
  2. internal/algorithms/__init__.py +1 -0
  3. internal/algorithms/causal_discovery.py +96 -0
  4. internal/algorithms/predicate_canon.py +137 -0
  5. internal/algorithms/semantic_arithmetic.py +890 -0
  6. internal/algorithms/syntactic_sieve.py +452 -0
  7. internal/algorithms/zk_semantics.py +90 -0
  8. internal/ensemble/__init__.py +1 -0
  9. internal/ensemble/automated_scientist.py +138 -0
  10. internal/ensemble/autonomous_agent.py +157 -0
  11. internal/ensemble/causal_triggers.py +121 -0
  12. internal/ensemble/confidence_calibrator.py +284 -0
  13. internal/ensemble/epistemic_arbiter.py +159 -0
  14. internal/ensemble/epistemic_loop.py +136 -0
  15. internal/ensemble/extraction_validator.py +172 -0
  16. internal/ensemble/gauge_orchestrator.py +150 -0
  17. internal/ensemble/live_llm_adapter.py +183 -0
  18. internal/ensemble/llm_entailment.py +117 -0
  19. internal/ensemble/mass_semantic_engine.py +138 -0
  20. internal/ensemble/ouroboros.py +281 -0
  21. internal/ensemble/semantic_dedup.py +261 -0
  22. internal/ensemble/tome_generator.py +286 -0
  23. internal/ensemble/tome_sliders.py +104 -0
  24. internal/ensemble/vector_bridge.py +195 -0
  25. internal/ensemble/venn_abers.py +211 -0
  26. internal/infrastructure/__init__.py +1 -0
  27. internal/infrastructure/akashic_ledger.py +812 -0
  28. internal/infrastructure/canonical_codec.py +452 -0
  29. internal/infrastructure/jcs.py +115 -0
  30. internal/infrastructure/key_manager.py +239 -0
  31. internal/infrastructure/p2p_mesh.py +168 -0
  32. internal/infrastructure/prov_o.py +159 -0
  33. internal/infrastructure/provenance.py +181 -0
  34. internal/infrastructure/rate_limiter.py +81 -0
  35. internal/infrastructure/resource_guards.py +117 -0
  36. internal/infrastructure/scheme_registry.py +136 -0
  37. internal/infrastructure/state_encoding.py +94 -0
  38. internal/infrastructure/telemetry.py +91 -0
  39. internal/infrastructure/tome_parser.py +55 -0
  40. internal/infrastructure/verifiable_credential.py +412 -0
  41. internal/infrastructure/zig_bridge.py +256 -0
  42. sum_cli/__init__.py +18 -0
  43. sum_cli/main.py +688 -0
  44. sum_engine-0.1.0.dist-info/METADATA +590 -0
  45. sum_engine-0.1.0.dist-info/RECORD +49 -0
  46. sum_engine-0.1.0.dist-info/WHEEL +5 -0
  47. sum_engine-0.1.0.dist-info/entry_points.txt +2 -0
  48. sum_engine-0.1.0.dist-info/licenses/LICENSE +201 -0
  49. sum_engine-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,183 @@
1
+ """
2
+ Live LLM Adapter — The Reality Bridge
3
+
4
+ Replaces mock generators and extractors with real AI models.
5
+ Uses Pydantic structured outputs to enforce strict (subject, predicate,
6
+ object) schemas, and OpenAI's embedding endpoint for the Vector Bridge.
7
+
8
+ Author: ototao
9
+ License: Apache License 2.0
10
+ """
11
+
12
+ import os
13
+ import logging
14
+ from typing import List, Tuple, Optional
15
+
16
+ from pydantic import BaseModel, Field
17
+ from openai import AsyncOpenAI
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ # ─── Pydantic schemas for structured LLM output ──────────────────────
23
+
24
class SemanticTriplet(BaseModel):
    """A single irreducible fact as a subject-predicate-object triple.

    The three core fields (``subject``, ``predicate``, ``object_``) are
    required strings constrained to 2-200 characters. The remaining fields
    are optional metadata that default to ``None``.
    """
    # Core triple. The ``description`` strings state the formatting the
    # extractor is expected to follow (lowercased / snake_case); the only
    # constraints enforced by these declarations are the length bounds.
    subject: str = Field(
        min_length=2, max_length=200,
        description="The core entity or subject (lowercased, concise)",
    )
    predicate: str = Field(
        min_length=2, max_length=200,
        description="The relational verb or attribute (snake_case)",
    )
    # Declared as ``object_`` and exposed under the alias "object"
    # (presumably to avoid shadowing the ``object`` builtin).
    object_: str = Field(
        alias="object",
        min_length=2, max_length=200,
        description="The target entity or value (lowercased, concise)",
    )
    # Phase 19A: Metadata fields — do NOT alter algebra semantics
    source_span: Optional[str] = Field(
        default=None,
        description="The exact text span this triplet was extracted from",
    )
    # Free-form string: the three values named in the description are not
    # enforced by the schema.
    certainty: Optional[str] = Field(
        default=None,
        description="Model's assessment: 'definite', 'hedged', or 'speculative'",
    )
    extraction_notes: Optional[str] = Field(
        default=None,
        description="Any caveats about this extraction (negation, conditional, etc.)",
    )

    # Allow ``object_`` to be populated by its field name as well as by the
    # "object" alias.
    model_config = {"populate_by_name": True}
54
+
55
+
56
class ExtractionResponse(BaseModel):
    """Structured output wrapper for a list of extracted triplets.

    Passed as ``response_format`` to the structured-output ``parse`` call in
    ``LiveLLMAdapter.extract_triplets``.
    """
    # Required list of SemanticTriplet items; no default is declared.
    triplets: List[SemanticTriplet]
59
+
60
+
61
+ # ─── Adapter ─────────────────────────────────────────────────────────
62
+
63
class LiveLLMAdapter:
    """
    Production connector that maps natural language to the Gödel universe
    via constrained LLM calls.

    Three capabilities:
        1. ``extract_triplets`` — text → List[(subj, pred, obj)]
        2. ``generate_text``    — axioms + negative constraints → narrative
        3. ``get_embedding``    — text → List[float] (continuous vector)

    Each capability is a coroutine that issues one request through the
    ``AsyncOpenAI`` client created in ``__init__``.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        model: str = "gpt-4o-mini",
        embedding_model: str = "text-embedding-3-small",
    ):
        """
        Args:
            api_key: OpenAI API key. When falsy, the ``OPENAI_API_KEY``
                environment variable is used instead.
            model: Chat model used by ``extract_triplets`` and
                ``generate_text``.
            embedding_model: Model used by ``get_embedding``.
        """
        # NOTE: ``Optional[str]`` (not ``str | None``) because this module
        # does not enable postponed annotations, so the signature is
        # evaluated at definition time and ``str | None`` fails before 3.10.
        self.client = AsyncOpenAI(
            api_key=api_key or os.getenv("OPENAI_API_KEY")
        )
        self.model = model
        self.embedding_model = embedding_model

    # ------------------------------------------------------------------
    # Extraction (Tags)
    # ------------------------------------------------------------------

    async def extract_triplets(
        self, chunk: str
    ) -> List[Tuple[str, str, str]]:
        """
        Maps natural language into strict topological triplets via
        Pydantic-constrained structured output.

        Phase 19A: Enhanced prompt with negation awareness, certainty
        metadata, and source span tracking.

        Args:
            chunk: The text to extract factual claims from.

        Returns:
            Lowercased ``(subject, predicate, object)`` tuples. Triplets the
            model marked as ``speculative`` (which the prompt also uses for
            negations) are dropped. An empty list is returned when the
            response carries no parsed payload.
        """
        response = await self.client.beta.chat.completions.parse(
            model=self.model,
            messages=[
                {
                    "role": "system",
                    "content": (
                        "Extract all distinct factual claims from the text "
                        "as subject-predicate-object triplets.\n\n"
                        "Rules:\n"
                        "- Keep subject, predicate, and object concise and lowercased\n"
                        "- Use snake_case for multi-word predicates (e.g., 'is_part_of')\n"
                        "- Do NOT extract opinions, questions, or hypotheticals as facts\n"
                        "- If a statement is negated (e.g., 'X does NOT cause Y'), "
                        "set certainty to 'speculative' and note 'negation' in extraction_notes\n"
                        "- If language is hedged ('may', 'might', 'possibly'), "
                        "set certainty to 'hedged'\n"
                        "- For definite factual statements, set certainty to 'definite'\n"
                        "- Include the source_span: the exact phrase from the text"
                    ),
                },
                {"role": "user", "content": chunk},
            ],
            response_format=ExtractionResponse,
        )

        parsed = response.choices[0].message.parsed
        if parsed is None:
            # No structured payload (e.g. the model refused). Yield no facts
            # rather than failing on ``None.triplets``.
            logger.warning(
                "Triplet extraction returned no parsed payload; "
                "treating chunk as containing no facts"
            )
            return []

        return [
            (t.subject.lower(), t.predicate.lower(), t.object_.lower())
            for t in parsed.triplets
            # Phase 19A: Skip speculative extractions (negations). Normalise
            # first so "Speculative" or " speculative " is also skipped.
            if (t.certainty or "").strip().lower() != "speculative"
        ]

    # ------------------------------------------------------------------
    # Generation (Tomes)
    # ------------------------------------------------------------------

    async def generate_text(
        self,
        target_axioms: List[str],
        negative_constraints: List[str],
    ) -> str:
        """
        The Tomes generator for the Epistemic Loop.

        Produces a cohesive narrative from verified axioms while honouring
        negative constraints (previously identified hallucinations).

        Args:
            target_axioms: Facts to include, one per line of the prompt.
            negative_constraints: Statements the narrative must not include.
                The constraints section is omitted when this is empty.

        Returns:
            The generated narrative, or ``""`` when the completion carries
            no text content.
        """
        sys_prompt = (
            "You are a precise technical writer. Extrapolate the "
            "following absolute facts into a cohesive narrative. "
            "Do not invent facts."
        )
        facts = "\n".join(target_axioms)
        user_prompt = f"FACTS TO INCLUDE:\n{facts}\n\n"

        if negative_constraints:
            forbidden = "\n".join(negative_constraints)
            user_prompt += (
                "CRITICAL NEGATIVE CONSTRAINTS "
                "(DO NOT INCLUDE THESE HALLUCINATIONS):\n"
                f"{forbidden}"
            )

        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": sys_prompt},
                {"role": "user", "content": user_prompt},
            ],
        )
        content = response.choices[0].message.content
        # ``content`` may be None; keep the declared ``str`` return type.
        return content if content is not None else ""

    # ------------------------------------------------------------------
    # Embeddings (Vector Bridge)
    # ------------------------------------------------------------------

    async def get_embedding(self, text: str) -> List[float]:
        """Continuous mapping for the Continuous-Discrete Vector Bridge.

        Args:
            text: The text to embed with ``self.embedding_model``.

        Returns:
            The embedding of the first (only) item in the response.
        """
        response = await self.client.embeddings.create(
            model=self.embedding_model,
            input=text,
        )
        return response.data[0].embedding
@@ -0,0 +1,117 @@
1
+ """
2
+ LLM Entailment Checker — Structured Entailment for Regeneration Faithfulness
3
+
4
+ Wraps an LLM call behind a strict entailment-check interface. Given a passage
5
+ and a claim triple, returns a boolean entailed + confidence score.
6
+
7
+ This is the symbolic-boundary verifier for the regeneration path: once an LLM
8
+ has rendered prose from a structured axiom set, each source axiom is
9
+ independently checked for entailment against the prose. Non-entailed axioms
10
+ are counted as drift for FActScore / MiniCheck-equivalent metrics surfaced by
11
+ the bench harness.
12
+
13
+ The model MUST be pinned (with date suffix); unpinned identifiers raise at
14
+ construction time so reproducibility is preserved.
15
+
16
+ Author: ototao
17
+ License: Apache License 2.0
18
+ """
19
+ from __future__ import annotations
20
+
21
+ import os
22
+ from dataclasses import dataclass
23
+
24
+ from openai import AsyncOpenAI
25
+ from pydantic import BaseModel, Field
26
+
27
+
28
class EntailmentJudgment(BaseModel):
    """Pydantic-enforced LLM output schema for one entailment decision.

    Passed as ``response_format`` to the structured-output ``parse`` call in
    ``LlmEntailmentChecker.check``.
    """

    # True when the model judges that the passage supports the claim.
    entailed: bool = Field(description="Does the passage support the claim?")
    # Validation bounds the value to the closed interval [0.0, 1.0].
    confidence: float = Field(
        ge=0.0,
        le=1.0,
        description="Model's confidence in its judgment, 0.0-1.0",
    )
37
+
38
+
39
@dataclass(frozen=True)
class EntailmentResult:
    """Decoded entailment outcome for one (passage, claim) pair.

    Instances are immutable (``frozen=True``).
    """

    # Whether the passage was judged to entail the claim.
    entailed: bool
    # Confidence reported with the judgment; ``LlmEntailmentChecker.check``
    # uses 0.0 when no parsed judgment is available.
    confidence: float
    # The claim triple rendered as "{s} {p} {o}" by
    # ``LlmEntailmentChecker.check``.
    claim_sentence: str
46
+
47
+
48
class LlmEntailmentChecker:
    """Entailment verifier backed by OpenAI structured-output parsing.

    Exposes one coroutine, ``check(passage, claim) -> EntailmentResult``.
    The claim triple is flattened to ``"{s} {p} {o}"`` and sent together
    with the passage; the model replies with a boolean ``entailed`` and a
    confidence in [0, 1]. The instructions are deliberately conservative:
    a paraphrase of the same fact is entailed, a reinterpretation or an
    unsupported inference is not.
    """

    def __init__(
        self,
        model_id: str,
        api_key: str | None = None,
    ) -> None:
        """Store the model id and build the async client.

        Args:
            model_id: Model identifier, intended to be a pinned one. Only
                emptiness is validated here.
            api_key: OpenAI API key; when falsy, ``OPENAI_API_KEY`` from the
                environment is used.

        Raises:
            ValueError: If ``model_id`` is empty or whitespace-only.
        """
        if not (model_id and model_id.strip()):
            raise ValueError(
                "LlmEntailmentChecker requires a pinned model_id "
                "(e.g. 'gpt-4o-2024-08-06')."
            )
        self.model_id = model_id
        resolved_key = api_key or os.getenv("OPENAI_API_KEY")
        self.client = AsyncOpenAI(api_key=resolved_key)

    async def check(
        self, passage: str, claim: tuple[str, str, str]
    ) -> EntailmentResult:
        """Ask the model whether ``passage`` entails ``claim``.

        Args:
            passage: The text to check the claim against.
            claim: A ``(subject, predicate, object)`` triple.

        Returns:
            An ``EntailmentResult`` carrying the model's judgment, or
            ``entailed=False`` with ``confidence=0.0`` when the response
            contains no parsed judgment.
        """
        subject, predicate, obj = claim
        claim_sentence = f"{subject} {predicate} {obj}"

        instructions = (
            "You are a strict entailment checker. Given a "
            "passage and a claim, decide whether the passage "
            "supports the claim. Be conservative: set "
            "entailed=true only if the passage explicitly or "
            "strongly implies the claim. Paraphrases of the "
            "same fact count as entailed; reinterpretations or "
            "unsupported inferences do not."
        )
        question = (
            f"PASSAGE:\n{passage}\n\n"
            f"CLAIM: {claim_sentence}\n\n"
            "Does the passage entail the claim?"
        )

        completion = await self.client.beta.chat.completions.parse(
            model=self.model_id,
            messages=[
                {"role": "system", "content": instructions},
                {"role": "user", "content": question},
            ],
            response_format=EntailmentJudgment,
        )

        judgment = completion.choices[0].message.parsed
        # A missing judgment is reported as "not entailed" with zero
        # confidence.
        if judgment is None:
            entailed, confidence = False, 0.0
        else:
            entailed, confidence = judgment.entailed, judgment.confidence

        return EntailmentResult(
            entailed=entailed,
            confidence=confidence,
            claim_sentence=claim_sentence,
        )
@@ -0,0 +1,138 @@
1
+ """
2
+ Mass Semantic Engine — MapReduce Gödel-State Parallelization
3
+
4
+ Wire the SPNT bounding and Gödel-State Algebra into a fully async
5
+ MAP → ENCODE → REDUCE → AUDIT pipeline for mass-parallel semantic
6
+ extraction.
7
+
8
+ Architecture:
9
+ 1. MAP: asyncio.gather all chunk extractions (lock-free)
10
+ 2. ENCODE: Convert string triplets to Gödel integers
11
+ 3. REDUCE: LCM merge all chunk states into one global integer
12
+ 4. AUDIT: Paradox detection + SPNT compression bound check
13
+
14
+ Author: ototao
15
+ License: Apache License 2.0
16
+ """
17
+
18
+ import asyncio
19
+ import logging
20
+ from typing import Callable, Awaitable, List, Tuple, Dict, Any
21
+
22
+ from internal.algorithms.semantic_arithmetic import (
23
+ GodelStateAlgebra,
24
+ SemanticPrimeNumberTheorem,
25
+ )
26
+ from internal.ensemble.extraction_validator import ExtractionValidator
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
class MassSemanticEngine:
    """
    Async MapReduce pipeline: extract triplets from text chunks in parallel,
    validate them, encode each chunk as a Gödel integer, and merge the
    per-chunk states into a single global state.

    Complements the existing MassDocumentEngine (string-based hierarchical
    summarization) with a mathematical, lock-free alternative.
    """

    def __init__(
        self,
        extractor_llm_func: Callable[
            [str], Awaitable[List[Tuple[str, str, str]]]
        ],
    ):
        """
        Args:
            extractor_llm_func: Async callable taking one text chunk and
                returning its (subject, predicate, object) triplets.
        """
        self.algebra = GodelStateAlgebra()
        self.extract_triplets = extractor_llm_func

    async def tomes_to_tags(
        self,
        raw_claims_count: int,
        chunks: List[str],
    ) -> Dict[str, Any]:
        """
        Run MAP → VALIDATE → ENCODE → REDUCE → AUDIT over ``chunks``.

        Args:
            raw_claims_count: Estimated total raw claims across all chunks;
                fed to the SPNT asymptotic bound.
            chunks: Text chunks to extract from concurrently.

        Returns:
            Dictionary with:
                global_state        – the merged Gödel integer
                total_unique_primes – count of known primes dividing the state
                spnt_limit          – theoretical compression bound
                compression_ok      – True if within SPNT bound
                paradoxes           – detected curvature conflicts
                rejected_count      – number of triplets the validator rejected
                rejection_reasons   – one {"triplet", "reason"} dict per rejection
        """
        # ── 1. MAP: fire every extraction, then await them together ──
        pending = (self.extract_triplets(piece) for piece in chunks)
        extracted_chunks: List[List[Tuple[str, str, str]]] = list(
            await asyncio.gather(*pending)
        )

        # ── 1.5 VALIDATE: Structural gate (Phase 19A) ─────────────
        validator = ExtractionValidator()
        batch_results = [
            validator.validate_batch(batch) for batch in extracted_chunks
        ]
        validated_chunks = [outcome.accepted for outcome in batch_results]
        total_rejected = sum(
            outcome.rejected_count for outcome in batch_results
        )
        all_rejection_reasons = [
            {
                "triplet": f"{rej.subject}||{rej.predicate}||{rej.object_}",
                "reason": rej.reason,
            }
            for outcome in batch_results
            for rej in outcome.rejected
        ]

        # ── 2. ENCODE: one Gödel integer per validated chunk ───────
        chunk_states = list(
            map(self.algebra.encode_chunk_state, validated_chunks)
        )

        # ── 3. REDUCE: merge all chunk states into the global state ─
        global_state = self.algebra.merge_parallel_states(chunk_states)

        # ── 4. AUDIT: paradox detection & compression bounds ───────
        paradoxes = self.algebra.detect_curvature_paradoxes(global_state)
        if paradoxes:
            logger.warning("Curvature paradoxes detected: %s", paradoxes)

        # Primes known to the algebra that divide the merged state.
        dividing_primes = [
            prime
            for prime in self.algebra.prime_to_axiom
            if global_state % prime == 0
        ]
        total_unique_primes = len(dividing_primes)

        spnt_limit = SemanticPrimeNumberTheorem.asymptotic_bound(
            raw_claims_count
        )
        compression_ok = total_unique_primes <= spnt_limit
        if not compression_ok:
            logger.warning(
                "SPNT compression failed: %d primes exceeds bound %d",
                total_unique_primes,
                spnt_limit,
            )

        return dict(
            global_state=global_state,
            total_unique_primes=total_unique_primes,
            spnt_limit=spnt_limit,
            compression_ok=compression_ok,
            paradoxes=paradoxes,
            rejected_count=total_rejected,
            rejection_reasons=all_rejection_reasons,
        )