sum-engine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. internal/__init__.py +8 -0
  2. internal/algorithms/__init__.py +1 -0
  3. internal/algorithms/causal_discovery.py +96 -0
  4. internal/algorithms/predicate_canon.py +137 -0
  5. internal/algorithms/semantic_arithmetic.py +890 -0
  6. internal/algorithms/syntactic_sieve.py +452 -0
  7. internal/algorithms/zk_semantics.py +90 -0
  8. internal/ensemble/__init__.py +1 -0
  9. internal/ensemble/automated_scientist.py +138 -0
  10. internal/ensemble/autonomous_agent.py +157 -0
  11. internal/ensemble/causal_triggers.py +121 -0
  12. internal/ensemble/confidence_calibrator.py +284 -0
  13. internal/ensemble/epistemic_arbiter.py +159 -0
  14. internal/ensemble/epistemic_loop.py +136 -0
  15. internal/ensemble/extraction_validator.py +172 -0
  16. internal/ensemble/gauge_orchestrator.py +150 -0
  17. internal/ensemble/live_llm_adapter.py +183 -0
  18. internal/ensemble/llm_entailment.py +117 -0
  19. internal/ensemble/mass_semantic_engine.py +138 -0
  20. internal/ensemble/ouroboros.py +281 -0
  21. internal/ensemble/semantic_dedup.py +261 -0
  22. internal/ensemble/tome_generator.py +286 -0
  23. internal/ensemble/tome_sliders.py +104 -0
  24. internal/ensemble/vector_bridge.py +195 -0
  25. internal/ensemble/venn_abers.py +211 -0
  26. internal/infrastructure/__init__.py +1 -0
  27. internal/infrastructure/akashic_ledger.py +812 -0
  28. internal/infrastructure/canonical_codec.py +452 -0
  29. internal/infrastructure/jcs.py +115 -0
  30. internal/infrastructure/key_manager.py +239 -0
  31. internal/infrastructure/p2p_mesh.py +168 -0
  32. internal/infrastructure/prov_o.py +159 -0
  33. internal/infrastructure/provenance.py +181 -0
  34. internal/infrastructure/rate_limiter.py +81 -0
  35. internal/infrastructure/resource_guards.py +117 -0
  36. internal/infrastructure/scheme_registry.py +136 -0
  37. internal/infrastructure/state_encoding.py +94 -0
  38. internal/infrastructure/telemetry.py +91 -0
  39. internal/infrastructure/tome_parser.py +55 -0
  40. internal/infrastructure/verifiable_credential.py +412 -0
  41. internal/infrastructure/zig_bridge.py +256 -0
  42. sum_cli/__init__.py +18 -0
  43. sum_cli/main.py +688 -0
  44. sum_engine-0.1.0.dist-info/METADATA +590 -0
  45. sum_engine-0.1.0.dist-info/RECORD +49 -0
  46. sum_engine-0.1.0.dist-info/WHEEL +5 -0
  47. sum_engine-0.1.0.dist-info/entry_points.txt +2 -0
  48. sum_engine-0.1.0.dist-info/licenses/LICENSE +201 -0
  49. sum_engine-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,281 @@
1
+ """
2
+ Ouroboros Verifier — Proof of Semantic Conservation
3
+
4
+ Proves that the Gödel-State Engine performs lossless round-tripping
5
+ over its canonical axiom representation:
6
+
7
+ Integer A → Canonical Tome → Parse Axiom Keys → Integer B
8
+
9
+ If ``A == B``, semantic mass is conserved through the encode-decode cycle.
10
+
11
+ The proof operates on the **canonical layer**: the tome renderer emits
12
+ deterministic ``"The {s} {p} {o}."`` sentences, and the verifier parses
13
+ those exact templates back into axiom keys. The NLP sieve is **never**
14
+ used for the proof — it is a lossy projection (lemmatization, POS
15
+ parsing) that would break the bijection.
16
+
17
+ For ``verify_from_text``, the sieve encodes the initial text into
18
+ Integer A, but the conservation proof from A onward uses the canonical
19
+ path exclusively.
20
+
21
+ Diagnostics:
22
+ When the round-trip fails, the verifier reports exactly which axioms
23
+ were lost, which were spuriously added, and counts.
24
+
25
+ Phase 14: The Ouroboros Protocol.
26
+
27
+ Author: ototao
28
+ License: Apache License 2.0
29
+ """
30
+
31
+ import math
32
+ import re
33
+ import logging
34
+ from datetime import datetime, timezone
35
+ from typing import List, Tuple
36
+ from dataclasses import dataclass, field
37
+
38
+ from internal.algorithms.semantic_arithmetic import GodelStateAlgebra
39
+ from internal.algorithms.syntactic_sieve import DeterministicSieve
40
+ from internal.ensemble.tome_generator import (
41
+ AutoregressiveTomeGenerator,
42
+ CANONICAL_FORMAT_VERSION,
43
+ )
44
+
45
+ logger = logging.getLogger(__name__)
46
+
47
+
48
+ def _zig():
49
+ try:
50
+ from internal.infrastructure.zig_bridge import zig_engine
51
+ return zig_engine
52
+ except ImportError:
53
+ return None
54
+
55
+
56
@dataclass
class ConservationProof:
    """
    Outcome record of one semantic-conservation round-trip.

    The three trailing fields are optional; every other field must be
    supplied by the caller.

    Attributes:
        is_conserved: Whether the round-trip was lossless.
        format_version: Canonical format version read from the tome.
        proof_mode: Proof strategy label (the verifier passes ``"canonical"``).
        timestamp: ISO 8601 timestamp of the verification.
        source_state: Integer A, the state that was verified.
        reconstructed_state: Integer B, re-encoded from the canonical tome.
        state_a_digits: Number of decimal digits in Integer A.
        state_b_digits: Number of decimal digits in Integer B.
        source_axiom_count: How many axioms Integer A contains.
        reconstructed_axiom_count: How many axioms Integer B contains.
        missing_axioms: Axioms present in A but absent from B.
        extra_axioms: Axioms present in B but absent from A.
        canonical_tome: The intermediate canonical text.
    """

    # Verdict and provenance
    is_conserved: bool
    format_version: str
    proof_mode: str
    timestamp: str

    # Both ends of the round-trip and their sizes
    source_state: int
    reconstructed_state: int
    state_a_digits: int
    state_b_digits: int
    source_axiom_count: int
    reconstructed_axiom_count: int

    # Diagnostics; each instance gets its own fresh lists
    missing_axioms: List[str] = field(default_factory=list)
    extra_axioms: List[str] = field(default_factory=list)
    canonical_tome: str = ""
89
+
90
+
91
class OuroborosVerifier:
    """
    Checks semantic conservation over the canonical tome representation.

    The proof path is:
        Integer A → Canonical Tome (deterministic template) →
        Parse "The {s} {p} {o}." lines → Integer B

    The NLP sieve is only used for the initial text→triplets encoding in
    ``verify_from_text``; the conservation check itself never calls it.

    Parsing note: the line pattern captures subject and predicate with
    ``\\S+``, so only whitespace-free subjects and predicates are
    recognised; the object is everything up to the final period.
    """

    # Supported canonical format versions
    SUPPORTED_VERSIONS = {"1.0.0"}

    # Regex to extract axiom components from canonical "The {s} {p} {o}." lines
    _CANONICAL_LINE_RE = re.compile(r"^The (\S+) (\S+) (.+)\.$")
    # Regex to extract version header
    _VERSION_RE = re.compile(r"^@canonical_version:\s*(.+)$")

    def __init__(
        self,
        algebra: "GodelStateAlgebra",
        sieve: "DeterministicSieve",
        tome_generator: "AutoregressiveTomeGenerator",
    ):
        """
        Store the collaborators used by the verifier.

        Args:
            algebra: Provides ``get_or_mint_prime`` and ``prime_to_axiom``.
            sieve: Provides ``extract_triplets`` for raw text.
            tome_generator: Provides ``generate_canonical``.
        """
        self.algebra = algebra
        self.sieve = sieve
        self.tome_generator = tome_generator

    @staticmethod
    def _merge_prime(state: int, prime: int, zig) -> int:
        """
        Fold ``prime`` into ``state`` via LCM.

        The state is returned untouched when ``prime`` already divides it.
        Otherwise the Zig engine's ``bigint_lcm`` is tried first (when an
        engine is supplied) and ``math.lcm`` is the fallback whenever the
        engine is absent or returns ``None``.
        """
        if state % prime == 0:
            return state
        accelerated = zig.bigint_lcm(state, prime) if zig else None
        if accelerated is not None:
            return accelerated
        return math.lcm(state, prime)

    @staticmethod
    def _digit_count(value: int) -> int:
        """
        Return ``len(str(value))`` without tripping the int→str size limit.

        Python 3.11+ raises ``ValueError`` when converting an integer with
        more digits than the configured limit (4300 by default). Large
        states can exceed that, so the count falls back to arithmetic.
        """
        try:
            return len(str(value))
        except ValueError:
            magnitude = abs(value)
            # Estimate from the bit length, then correct with exact integer
            # comparisons so floating-point rounding cannot skew the result.
            digits = int((magnitude.bit_length() - 1) * math.log10(2)) + 1
            while 10 ** digits <= magnitude:
                digits += 1
            while digits > 1 and 10 ** (digits - 1) > magnitude:
                digits -= 1
            # str() of a negative number also counts the minus sign.
            return digits + 1 if value < 0 else digits

    def _reconstruct_from_canonical(
        self, canonical_tome: str
    ) -> Tuple[int, List[str], str]:
        """
        Parse canonical tome text back into axiom keys and re-encode.

        Recognised lines (after stripping surrounding whitespace)::

            @canonical_version: 1.0.0
            The {subject} {predicate} {object}.

        Any other line is ignored. If the extracted version is not in
        ``SUPPORTED_VERSIONS`` (including the ``"unknown"`` default used
        when no header is present) a warning is logged; parsing still
        completes so the caller receives full diagnostics.

        Args:
            canonical_tome: Text produced by the canonical renderer.

        Returns:
            (reconstructed_state, list_of_axiom_keys, format_version)
        """
        state = 1
        axiom_keys: List[str] = []
        format_version = "unknown"
        # Resolve the optional accelerator once rather than once per line.
        zig = _zig()

        for raw_line in canonical_tome.splitlines():
            line = raw_line.strip()

            version_match = self._VERSION_RE.match(line)
            if version_match:
                format_version = version_match.group(1).strip()
                continue

            axiom_match = self._CANONICAL_LINE_RE.match(line)
            if not axiom_match:
                continue

            s, p, o = axiom_match.groups()
            prime = self.algebra.get_or_mint_prime(s, p, o)
            state = self._merge_prime(state, prime, zig)
            # NOTE(review): a key is recorded for every matched line,
            # including repeated axioms — confirm this is the intent.
            axiom_keys.append(f"{s}||{p}||{o}")

        if format_version not in self.SUPPORTED_VERSIONS:
            logger.warning(
                "Ouroboros: unsupported canonical format version %r (supported: %s)",
                format_version,
                ", ".join(sorted(self.SUPPORTED_VERSIONS)),
            )

        return state, axiom_keys, format_version

    def _extract_axiom_keys(self, state: int) -> set:
        """Extract the set of axiom keys whose primes divide the state."""
        return {
            axiom
            for prime, axiom in self.algebra.prime_to_axiom.items()
            if state % prime == 0
        }

    def verify_from_state(self, target_state: int) -> "ConservationProof":
        """
        Verify lossless conservation of a Gödel integer.

        Pipeline:
            Integer A → Canonical Tome → Parse Axiom Keys → Integer B
            Conservation iff A == B

        Args:
            target_state: The Gödel integer to verify.

        Returns:
            A ``ConservationProof`` with full diagnostics: both states,
            their digit and axiom counts, and the sorted lists of axioms
            missing from or added to the reconstructed state.
        """
        # Step 1: Canonical decode
        canonical_tome = self.tome_generator.generate_canonical(
            target_state, "Ouroboros Verification Tome"
        )

        # Step 2: Re-encode from canonical template lines (NOT via NLP sieve)
        reconstructed_state, _, format_version = self._reconstruct_from_canonical(
            canonical_tome
        )

        # Step 3: Diagnose which axioms differ between the two states
        source_axioms = self._extract_axiom_keys(target_state)
        reconstructed_axioms = self._extract_axiom_keys(reconstructed_state)
        missing = sorted(source_axioms - reconstructed_axioms)
        extra = sorted(reconstructed_axioms - source_axioms)

        is_conserved = target_state == reconstructed_state

        proof = ConservationProof(
            is_conserved=is_conserved,
            format_version=format_version,
            proof_mode="canonical",
            timestamp=datetime.now(timezone.utc).isoformat(),
            source_state=target_state,
            reconstructed_state=reconstructed_state,
            state_a_digits=self._digit_count(target_state),
            state_b_digits=self._digit_count(reconstructed_state),
            source_axiom_count=len(source_axioms),
            reconstructed_axiom_count=len(reconstructed_axioms),
            missing_axioms=missing,
            extra_axioms=extra,
            canonical_tome=canonical_tome,
        )

        if is_conserved:
            logger.info(
                "Ouroboros Proof: CONSERVED — %d axioms round-tripped losslessly.",
                len(source_axioms),
            )
        else:
            logger.warning(
                "Ouroboros Proof: DIVERGED — missing=%d, extra=%d",
                len(missing), len(extra),
            )

        return proof

    def verify_from_text(self, text: str) -> "ConservationProof":
        """
        Full Ouroboros: Text → Sieve → Integer A → Canonical → Parse → Integer B.

        The sieve is used ONLY to turn the text into triplets; the
        resulting Integer A is then checked by ``verify_from_state``.

        Args:
            text: Raw input text.

        Returns:
            A ``ConservationProof`` with full diagnostics.
        """
        # Step 1: Text → Triplets → Integer A (via NLP sieve)
        zig = _zig()
        state_a = 1
        for s, p, o in self.sieve.extract_triplets(text):
            prime = self.algebra.get_or_mint_prime(s, p, o)
            state_a = self._merge_prime(state_a, prime, zig)

        # Step 2: Verify conservation from Integer A (canonical path only)
        return self.verify_from_state(state_a)

    def proof_to_dict(self, proof: "ConservationProof") -> dict:
        """
        Serialize a ConservationProof for API responses.

        The raw integer states and the canonical tome are not included;
        only their digit counts and a ``states_match`` flag are exposed.
        """
        return {
            "is_conserved": proof.is_conserved,
            "format_version": proof.format_version,
            "proof_mode": proof.proof_mode,
            "timestamp": proof.timestamp,
            "state_a_digits": proof.state_a_digits,
            "state_b_digits": proof.state_b_digits,
            "source_axiom_count": proof.source_axiom_count,
            "reconstructed_axiom_count": proof.reconstructed_axiom_count,
            "missing_axioms": proof.missing_axioms,
            "extra_axioms": proof.extra_axioms,
            "states_match": proof.source_state == proof.reconstructed_state,
        }
281
+
@@ -0,0 +1,261 @@
1
+ """
2
+ Semantic Deduplicator — Near-Duplicate Axiom Detection
3
+
4
+ Phase 25: Prevents state bloat from semantically equivalent axioms
5
+ that differ only in surface text (e.g., "orbits" vs "revolves_around",
6
+ "New York" vs "new york").
7
+
8
+ Uses zero-cost string similarity:
9
+ 1. Normalization — lowercase, strip, collapse whitespace
10
+ 2. Jaccard token overlap — set intersection / union of words
11
+ 3. Levenshtein ratio — 1 − (edit distance / max length)
12
+
13
+ No LLM or embedding calls required.
14
+
15
+ Usage:
16
+ dedup = SemanticDeduplicator()
17
+ result = dedup.deduplicate(
18
+ "earth||revolves_around||sun",
19
+ existing_axioms=["earth||orbits||sun", "mars||has||moons"],
20
+ )
21
+ # result = DedupResult(canonical_key="earth||orbits||sun", is_duplicate=True, ...)
22
+
23
+ Author: ototao
24
+ License: Apache License 2.0
25
+ """
26
+
27
+ import re
28
+ import logging
29
+ from dataclasses import dataclass
30
+ from typing import List, Optional
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
+ # ─── Predicate Synonym Groups ────────────────────────────────────────
36
+ # Common predicates that are semantically equivalent.
37
+ # Each group maps to its canonical form (first element).
38
+
39
# Each tuple is one synonym group; its first member is the canonical form.
_PREDICATE_SYNONYM_GROUPS = (
    ("orbits", "revolves_around", "circles", "goes_around"),
    ("is_a", "is_an", "is_type", "type_of", "kind_of"),
    ("has", "has_a", "possesses", "owns"),
    ("contains", "includes", "has_part"),
    ("located_in", "lives_in", "resides_in", "is_in", "situated_in"),
    ("created_by", "made_by", "authored_by", "written_by", "invented_by"),
    ("causes", "leads_to", "results_in", "produces"),
    ("part_of", "belongs_to", "member_of", "component_of"),
)

# Flat lookup: every predicate (canonical ones included) → canonical form.
PREDICATE_SYNONYMS = {
    predicate: group[0]
    for group in _PREDICATE_SYNONYM_GROUPS
    for predicate in group
}

# Default similarity cut-off for reporting a fuzzy match as a duplicate.
DEFAULT_THRESHOLD = 0.80
77
+
78
+
79
@dataclass
class DedupResult:
    """Outcome of checking one axiom key against the existing axioms.

    Attributes:
        canonical_key: The resolved canonical axiom key.
        is_duplicate: True when a near-duplicate was found.
        duplicate_of: The existing axiom it duplicates ('' when none).
        similarity: Similarity score (0.0 when not a duplicate).
        method: The signal that triggered the match:
            'exact' | 'predicate' | 'fuzzy' | 'none'.
    """

    canonical_key: str
    is_duplicate: bool
    duplicate_of: str
    similarity: float
    method: str
87
+
88
+
89
class SemanticDeduplicator:
    """Zero-cost semantic deduplication engine.

    Detects near-duplicate axioms using layered string similarity:
      1. Exact match — keys agree after lowercasing, trimming and
         collapsing whitespace/underscores
      2. Predicate match — keys agree only once the predicate has been
         canonicalized through the synonym table
      3. Fuzzy matching via Jaccard + Levenshtein
    """

    @staticmethod
    def _clean_parts(axiom_key: str) -> Optional[List[str]]:
        """Return the cleaned ``[subject, predicate, object]`` parts.

        Each part is lowercased and stripped, runs of whitespace and
        underscores are collapsed to one underscore, and leading/trailing
        underscores are removed. No synonym substitution is applied.
        Returns ``None`` when the key does not split into exactly three
        ``||``-separated parts.
        """
        parts = axiom_key.split("||")
        if len(parts) != 3:
            return None
        return [
            re.sub(r"[\s_]+", "_", part.strip().lower()).strip("_")
            for part in parts
        ]

    def _surface_form(self, axiom_key: str) -> str:
        """Normalize surface text only, leaving the predicate as written."""
        parts = self._clean_parts(axiom_key)
        if parts is None:
            return axiom_key.strip().lower()
        return "||".join(parts)

    def normalize(self, axiom_key: str) -> str:
        """Normalize axiom key to canonical form.

        - Lowercase
        - Strip whitespace
        - Collapse multiple underscores/spaces
        - Canonicalize predicate via synonym table

        Keys that do not have exactly three ``||``-separated parts are
        only stripped and lowercased.
        """
        parts = self._clean_parts(axiom_key)
        if parts is None:
            return axiom_key.strip().lower()

        s, p, o = parts
        p = PREDICATE_SYNONYMS.get(p, p)
        return f"{s}||{p}||{o}"

    @staticmethod
    def _tokenize(s: str) -> set:
        """Split a string into a set of lowercase word tokens.

        Separators are runs of underscores, pipes and whitespace. Empty
        fragments (produced by ``re.split`` for empty input or when the
        string starts or ends with a separator) are dropped so they are
        never counted as a shared word.
        """
        return {token for token in re.split(r"[_\|\s]+", s.lower()) if token}

    @staticmethod
    def jaccard_similarity(a: str, b: str) -> float:
        """Jaccard token-overlap coefficient.

        |A ∩ B| / |A ∪ B|

        Returns 1.0 when both strings have no tokens and 0.0 when exactly
        one of them has none.
        """
        tokens_a = SemanticDeduplicator._tokenize(a)
        tokens_b = SemanticDeduplicator._tokenize(b)

        if not tokens_a and not tokens_b:
            return 1.0
        if not tokens_a or not tokens_b:
            return 0.0

        return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)

    @staticmethod
    def levenshtein_ratio(a: str, b: str) -> float:
        """Edit distance ratio: 1.0 = identical, 0.0 = completely different.

        Computed as ``1 - distance / max(len(a), len(b))`` using the
        dynamic-programming Levenshtein distance with two rolling rows.
        """
        if a == b:
            return 1.0
        if not a or not b:
            return 0.0

        m, n = len(a), len(b)

        prev = list(range(n + 1))
        curr = [0] * (n + 1)

        for i in range(1, m + 1):
            curr[0] = i
            for j in range(1, n + 1):
                cost = 0 if a[i - 1] == b[j - 1] else 1
                curr[j] = min(
                    prev[j] + 1,        # deletion
                    curr[j - 1] + 1,    # insertion
                    prev[j - 1] + cost  # substitution
                )
            # Every cell of ``curr`` is rewritten on the next pass, so the
            # two rows can simply trade places instead of reallocating.
            prev, curr = curr, prev

        distance = prev[n]
        return 1.0 - (distance / max(m, n))

    def combined_similarity(self, a: str, b: str) -> float:
        """Weighted combination of Jaccard and Levenshtein.

        Weights: 60% Jaccard (semantic), 40% Levenshtein (syntactic).
        """
        jaccard = self.jaccard_similarity(a, b)
        edit_ratio = self.levenshtein_ratio(a, b)
        return 0.6 * jaccard + 0.4 * edit_ratio

    def find_near_duplicates(
        self,
        axiom_key: str,
        existing_axioms: List[str],
        threshold: float = DEFAULT_THRESHOLD,
    ) -> List[tuple]:
        """Find axioms similar to axiom_key above the threshold.

        Each existing axiom is classified by the first layer that matches:

        - ``"exact"``: identical after surface normalization alone.
        - ``"predicate"``: identical only after predicate synonyms are
          canonicalized.
        - ``"fuzzy"``: combined similarity of the normalized keys is at
          least ``threshold``.

        Returns list of (existing_axiom, similarity, method) sorted by
        similarity descending; ties keep the order of ``existing_axioms``.
        """
        surface = self._surface_form(axiom_key)
        normalized = self.normalize(axiom_key)
        results = []

        for existing in existing_axioms:
            norm_existing = self.normalize(existing)

            # Layers 1 and 2: identical once normalized. Whether the
            # synonym table was needed decides the reported method.
            if normalized == norm_existing:
                if surface == self._surface_form(existing):
                    method = "exact"
                else:
                    method = "predicate"
                results.append((existing, 1.0, method))
                continue

            # Layer 3: Fuzzy similarity
            sim = self.combined_similarity(normalized, norm_existing)
            if sim >= threshold:
                results.append((existing, sim, "fuzzy"))

        results.sort(key=lambda match: match[1], reverse=True)
        return results

    def deduplicate(
        self,
        axiom_key: str,
        existing_axioms: List[str],
        threshold: float = DEFAULT_THRESHOLD,
    ) -> DedupResult:
        """Check if axiom_key is a near-duplicate of any existing axiom.

        Returns a DedupResult. When a match exists, ``canonical_key`` is
        the normalized form of the best-scoring existing axiom; otherwise
        it is the normalized form of ``axiom_key`` itself.
        """
        duplicates = self.find_near_duplicates(
            axiom_key, existing_axioms, threshold
        )

        if not duplicates:
            return DedupResult(
                canonical_key=self.normalize(axiom_key),
                is_duplicate=False,
                duplicate_of="",
                similarity=0.0,
                method="none",
            )

        best_match, sim, method = duplicates[0]
        logger.info(
            "Dedup: '%s' → duplicate of '%s' (%.2f, %s)",
            axiom_key, best_match, sim, method,
        )
        return DedupResult(
            canonical_key=self.normalize(best_match),
            is_duplicate=True,
            duplicate_of=best_match,
            similarity=sim,
            method=method,
        )