sum-engine 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- internal/__init__.py +8 -0
- internal/algorithms/__init__.py +1 -0
- internal/algorithms/causal_discovery.py +96 -0
- internal/algorithms/predicate_canon.py +137 -0
- internal/algorithms/semantic_arithmetic.py +890 -0
- internal/algorithms/syntactic_sieve.py +452 -0
- internal/algorithms/zk_semantics.py +90 -0
- internal/ensemble/__init__.py +1 -0
- internal/ensemble/automated_scientist.py +138 -0
- internal/ensemble/autonomous_agent.py +157 -0
- internal/ensemble/causal_triggers.py +121 -0
- internal/ensemble/confidence_calibrator.py +284 -0
- internal/ensemble/epistemic_arbiter.py +159 -0
- internal/ensemble/epistemic_loop.py +136 -0
- internal/ensemble/extraction_validator.py +172 -0
- internal/ensemble/gauge_orchestrator.py +150 -0
- internal/ensemble/live_llm_adapter.py +183 -0
- internal/ensemble/llm_entailment.py +117 -0
- internal/ensemble/mass_semantic_engine.py +138 -0
- internal/ensemble/ouroboros.py +281 -0
- internal/ensemble/semantic_dedup.py +261 -0
- internal/ensemble/tome_generator.py +286 -0
- internal/ensemble/tome_sliders.py +104 -0
- internal/ensemble/vector_bridge.py +195 -0
- internal/ensemble/venn_abers.py +211 -0
- internal/infrastructure/__init__.py +1 -0
- internal/infrastructure/akashic_ledger.py +812 -0
- internal/infrastructure/canonical_codec.py +452 -0
- internal/infrastructure/jcs.py +115 -0
- internal/infrastructure/key_manager.py +239 -0
- internal/infrastructure/p2p_mesh.py +168 -0
- internal/infrastructure/prov_o.py +159 -0
- internal/infrastructure/provenance.py +181 -0
- internal/infrastructure/rate_limiter.py +81 -0
- internal/infrastructure/resource_guards.py +117 -0
- internal/infrastructure/scheme_registry.py +136 -0
- internal/infrastructure/state_encoding.py +94 -0
- internal/infrastructure/telemetry.py +91 -0
- internal/infrastructure/tome_parser.py +55 -0
- internal/infrastructure/verifiable_credential.py +412 -0
- internal/infrastructure/zig_bridge.py +256 -0
- sum_cli/__init__.py +18 -0
- sum_cli/main.py +688 -0
- sum_engine-0.1.0.dist-info/METADATA +590 -0
- sum_engine-0.1.0.dist-info/RECORD +49 -0
- sum_engine-0.1.0.dist-info/WHEEL +5 -0
- sum_engine-0.1.0.dist-info/entry_points.txt +2 -0
- sum_engine-0.1.0.dist-info/licenses/LICENSE +201 -0
- sum_engine-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Epistemic Arbiter & Event Broadcaster — Wave Function Collapse Engine
|
|
3
|
+
|
|
4
|
+
The Arbiter resolves Level 3 Curvature (semantic contradictions) by
|
|
5
|
+
invoking an LLM judge to determine which conflicting fact survives.
|
|
6
|
+
The EventBroadcaster streams the internal "thinking" process to the
|
|
7
|
+
frontend via Server-Sent Events (SSE).
|
|
8
|
+
|
|
9
|
+
Author: ototao
|
|
10
|
+
License: Apache License 2.0
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import asyncio
|
|
14
|
+
import logging
|
|
15
|
+
from typing import Callable, Dict, List, Tuple
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class EventBroadcaster:
    """Fan-out hub that relays telemetry messages to subscriber queues (SSE)."""

    def __init__(self):
        # subscribe() adds one asyncio.Queue here per subscriber.
        self.queues: List[asyncio.Queue] = []

    async def broadcast(self, message: str):
        """Log ``message`` and enqueue it on every registered queue."""
        logger.info("[KOS Telemetry] %s", message)
        for subscriber in self.queues:
            await subscriber.put(message)

    def subscribe(self) -> asyncio.Queue:
        """Register a fresh queue and hand it back to the caller."""
        new_queue: asyncio.Queue = asyncio.Queue()
        self.queues.append(new_queue)
        return new_queue

    def unsubscribe(self, queue: asyncio.Queue):
        """Detach ``queue``; a queue that is not registered is ignored."""
        if queue not in self.queues:
            return
        self.queues.remove(queue)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# Global singleton — imported by quantum_router for SSE streaming
|
|
45
|
+
# Shared module-level instance; the arbiter classes in this module publish
# their telemetry messages through it.
kos_telemetry = EventBroadcaster()
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class EpistemicArbiter:
    """
    Resolves Level 3 Curvature (semantic contradictions) with an LLM judge.

    For every conflict the judge is asked which of two competing objects
    is correct; the chosen object becomes the resolution for that
    (subject, predicate) pair.
    """

    def __init__(self, llm_judge: Callable):
        self.judge = llm_judge  # async func(prompt: str) -> str

    @staticmethod
    def _pick_object(verdict, obj_a: str, obj_b: str) -> str:
        """
        Map the judge's raw reply onto one of the two candidate objects.

        The comparison ignores surrounding whitespace and letter case, but
        the value returned is always ``obj_a`` or ``obj_b`` exactly as it
        was supplied, so callers can match it against the original facts.
        A reply that is not a string, or that names neither candidate,
        falls back to ``obj_a``.
        """
        if not isinstance(verdict, str):
            return obj_a

        normalized = verdict.strip().lower()
        for candidate in (obj_a, obj_b):
            if normalized == candidate.lower():
                return candidate

        # The judge answered with something that is not one of the claims.
        return obj_a

    async def collapse_wave_function(
        self, conflicts: List[Tuple[str, str, str, str]]
    ) -> Dict[Tuple[str, str], str]:
        """
        Ask the judge to settle each conflict and collect the winners.

        Args:
            conflicts: (subject, predicate, object_a, object_b) tuples.

        Returns:
            ``{(subject, predicate): winning_object}`` where the winning
            object is one of the two objects from the conflict, unchanged.
            If the same (subject, predicate) appears more than once, the
            last conflict processed determines the entry.
        """
        resolutions: Dict[Tuple[str, str], str] = {}

        for subject, predicate, obj_a, obj_b in conflicts:
            await kos_telemetry.broadcast(
                f"⚠️ Level 3 Curvature Detected: "
                f"{subject} {predicate} [{obj_a} OR {obj_b}]"
            )
            await kos_telemetry.broadcast(
                "🌀 Entering Epistemic Superposition..."
            )

            prompt = (
                f"You are a strict logic arbiter. We have a contradiction "
                f"regarding '{subject}'.\n"
                f"Claim A: {subject} {predicate} {obj_a}\n"
                f"Claim B: {subject} {predicate} {obj_b}\n"
                f"Analyze standard logical precedence, general knowledge, "
                f"or temporal recency, and return ONLY the correct object "
                f"value. If tied, pick the most specific."
            )

            # Call the LLM judge, then translate its free-text answer back
            # into one of the two original objects.
            verdict = await self.judge(prompt)
            winner = self._pick_object(verdict, obj_a, obj_b)

            resolutions[(subject, predicate)] = winner
            await kos_telemetry.broadcast(
                f"⚡ Wave Function Collapsed: "
                f"{subject} {predicate} → {winner}"
            )

        return resolutions
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class DeterministicArbiter:
    """
    Deterministic contradiction resolution without an LLM.

    Resolves Level 3 Curvature using SHA-256 lexicographic ordering:
    for each conflict (subject, predicate, obj_a, obj_b), the winner
    is whichever object has the lower SHA-256 hex digest of
    ``f"{subject}||{predicate}||{object}"``.

    This guarantees:
      - Identical resolution on every node (deterministic)
      - No LLM cost or latency
      - Consistent ordering regardless of minting order, including when
        several conflicts share the same (subject, predicate)
      - Reproducibility across runtimes (SHA-256 is universal)
    """

    @staticmethod
    def _canonical_hash(subject: str, predicate: str, obj: str) -> str:
        """Return the SHA-256 hex digest of the canonical triplet key."""
        import hashlib

        return hashlib.sha256(
            f"{subject}||{predicate}||{obj}".encode()
        ).hexdigest()

    @classmethod
    def _lowest_hash_object(cls, subject: str, predicate: str, *objects: str) -> str:
        """Return the object whose canonical hash is lowest (first one wins a tie)."""
        return min(
            objects,
            key=lambda obj: cls._canonical_hash(subject, predicate, obj),
        )

    async def collapse_wave_function(
        self, conflicts: List[Tuple[str, str, str, str]]
    ) -> Dict[Tuple[str, str], str]:
        """
        Resolve conflicts deterministically via SHA-256 ordering.

        For each (subject, predicate, obj_a, obj_b), the object with the
        lexicographically lower SHA-256 hex digest wins. When the same
        (subject, predicate) occurs in several conflicts, the overall
        winner is the lowest-hash object among all of them, so the result
        does not depend on the order of ``conflicts``.

        Returns:
            ``{(subject, predicate): winning_object}``.
        """
        resolutions: Dict[Tuple[str, str], str] = {}

        for subject, predicate, obj_a, obj_b in conflicts:
            hash_a = self._canonical_hash(subject, predicate, obj_a)
            hash_b = self._canonical_hash(subject, predicate, obj_b)

            winner = obj_a if hash_a <= hash_b else obj_b

            # A previous conflict on the same key already produced a
            # winner: keep whichever of the two has the lower hash rather
            # than letting the later conflict overwrite the earlier one.
            key = (subject, predicate)
            if key in resolutions:
                winner = self._lowest_hash_object(
                    subject, predicate, resolutions[key], winner
                )

            await kos_telemetry.broadcast(
                f"⚠️ Level 3 Curvature: "
                f"{subject} {predicate} [{obj_a} OR {obj_b}]"
            )
            await kos_telemetry.broadcast(
                f"🔬 Deterministic Resolution: SHA-256({obj_a})={hash_a[:8]}… "
                f"vs SHA-256({obj_b})={hash_b[:8]}…"
            )
            await kos_telemetry.broadcast(
                f"⚡ Collapsed → {subject} {predicate} → {winner}"
            )

            resolutions[key] = winner

        return resolutions
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Epistemic Feedback Loop — "Tags to Tomes then Back"
|
|
3
|
+
|
|
4
|
+
Governs the closed-loop extrapolation pipeline:
|
|
5
|
+
1. TOMES: Generate narrative text from verified Gödel axioms.
|
|
6
|
+
2. TAGS: Extract triplets from the narrative and re-encode as a
|
|
7
|
+
Gödel integer.
|
|
8
|
+
3. VERIFY: modulo check — ``global_state % generated_state == 0``.
|
|
9
|
+
4. DIAGNOSE: If verification fails, GCD-based hallucination isolation
|
|
10
|
+
identifies the exact fabricated claims.
|
|
11
|
+
5. SELF-CORRECT: Feed hallucinated axioms back as strict negative
|
|
12
|
+
constraints and re-generate.
|
|
13
|
+
|
|
14
|
+
The loop refuses to return a string until it is *mathematically proven*
|
|
15
|
+
to be a pure subset of the global truth.
|
|
16
|
+
|
|
17
|
+
Author: ototao
|
|
18
|
+
License: Apache License 2.0
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
import logging
|
|
22
|
+
from typing import Callable, Awaitable, List, Tuple, Dict, Any
|
|
23
|
+
|
|
24
|
+
from internal.algorithms.semantic_arithmetic import GodelStateAlgebra
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class QuantumExtrapolator:
    """
    Generate narrative text from axioms and accept it only once verified.

    Each draft produced by the generator is converted back into triplets,
    encoded by the algebra, and checked against the global state via
    ``algebra.verify_entailment``. Drafts that fail are diagnosed and the
    offending claims are fed back to the generator as negative constraints.
    """

    def __init__(
        self,
        godel_algebra: GodelStateAlgebra,
        llm_generator: Callable[
            [List[str], List[str]], Awaitable[str]
        ],
        llm_extractor: Callable[
            [str], Awaitable[List[Tuple[str, str, str]]]
        ],
        max_retries: int = 3,
    ):
        """
        Args:
            godel_algebra: Algebra used to encode triplets, verify
                entailment and isolate hallucinations.
            llm_generator: Async callable (axioms, negative_constraints) → text.
            llm_extractor: Async callable (text) → List[(subj, pred, obj)].
            max_retries: Number of generate/verify rounds before giving up.
        """
        self.max_retries = max_retries
        self.algebra = godel_algebra
        self.extract_triplets = llm_extractor
        self.generate_text = llm_generator

    async def extrapolate_with_proof(
        self,
        global_state: int,
        target_axioms: List[str],
    ) -> str:
        """
        Run the generate → extract → verify → correct loop.

        Args:
            global_state: The verified global state integer.
            target_axioms: Axiom key strings to expand into narrative.

        Returns:
            The first narrative whose re-encoded state passes
            ``algebra.verify_entailment(global_state, ...)``.

        Raises:
            RuntimeError: If no draft passes within ``max_retries`` rounds.
        """
        # Grows across rounds; the same list object is handed to the
        # generator on every call.
        feedback: List[str] = []

        for round_no in range(1, self.max_retries + 1):
            # 1. TOMES: draft a narrative from the requested axioms.
            draft = await self.generate_text(target_axioms, feedback)

            # 2. TAGS: turn the draft back into triplets.
            triplets = await self.extract_triplets(draft)
            if not triplets:
                # Nothing to verify; ask for a more explicit draft.
                feedback.append(
                    "Failed to extract any verifiable axioms. "
                    "Be more explicit."
                )
                continue

            draft_state = self.algebra.encode_chunk_state(triplets)

            # 3. VERIFY: accept the draft as soon as it is entailed.
            if self.algebra.verify_entailment(global_state, draft_state):
                logger.info(
                    "Mathematical Proof of Zero Hallucination achieved "
                    "on attempt %d.",
                    round_no,
                )
                return draft

            # 4. DIAGNOSE: find the claims not backed by the global state.
            fabricated = self.algebra.isolate_hallucinations(
                global_state, draft_state
            )
            logger.warning(
                "Modulo check failed (attempt %d). "
                "Hallucinations detected: %s",
                round_no,
                fabricated,
            )

            # 5. SELF-CORRECT: forbid those claims in the next round.
            if fabricated:
                feedback.extend(fabricated)

        raise RuntimeError(
            f"Epistemic Failure: LLM failed to mathematically align "
            f"after {self.max_retries} attempts. "
            f"Residual hallucinations: {feedback[-5:]}"
        )
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Extraction Validator — Structural Gate for LLM→Algebra Boundary
|
|
3
|
+
|
|
4
|
+
Phase 19A: Validates, canonicalizes, and deduplicates extracted triplets
|
|
5
|
+
BEFORE they enter the Gödel State Algebra. Malformed or underspecified
|
|
6
|
+
outputs are rejected with audit reasons, not silently ingested.
|
|
7
|
+
|
|
8
|
+
This is the system's immune system at the NLP boundary.
|
|
9
|
+
|
|
10
|
+
Pipeline:
|
|
11
|
+
1. Structural validation (non-empty, length bounds, illegal chars)
|
|
12
|
+
2. Predicate canonicalization (synonym collapse)
|
|
13
|
+
3. Batch deduplication (identical triplets within one extraction)
|
|
14
|
+
4. Return accepted + rejected with audit trail
|
|
15
|
+
|
|
16
|
+
Author: ototao
|
|
17
|
+
License: Apache License 2.0
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import re
|
|
21
|
+
import logging
|
|
22
|
+
from dataclasses import dataclass, field
|
|
23
|
+
from typing import List, Tuple, Optional
|
|
24
|
+
|
|
25
|
+
from internal.algorithms.predicate_canon import canonicalize
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
# ── Constraints ───────────────────────────────────────────────────────
|
|
30
|
+
|
|
31
|
+
# Length bounds applied to a triplet field after surrounding whitespace
# has been stripped.
MIN_FIELD_LENGTH = 2       # Single-char subjects/objects are garbage
MAX_FIELD_LENGTH = 200     # Absurdly long strings indicate extraction failure
# ASCII control characters other than tab, line feed and carriage return,
# plus DEL (0x7f).
CONTROL_CHAR_PATTERN = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]')
# Any curly brace or square bracket.
JSON_FRAGMENT_PATTERN = re.compile(r'[{}\[\]]')
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class RejectedTriplet:
    """Audit record for one triplet that did not pass validation."""

    # The three triplet fields as they were when the triplet was rejected.
    subject: str
    predicate: str
    object_: str
    # Human-readable explanation of the rejection.
    reason: str
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class ValidatedExtraction:
    """Outcome of one validation pass: kept triplets plus audited rejects."""

    # Triplets that passed every check.
    accepted: List[Tuple[str, str, str]] = field(default_factory=list)
    # Triplets that were refused, each carrying its rejection reason.
    rejected: List[RejectedTriplet] = field(default_factory=list)

    @property
    def accepted_count(self) -> int:
        """Number of accepted triplets."""
        return len(self.accepted)

    @property
    def rejected_count(self) -> int:
        """Number of rejected triplets."""
        return len(self.rejected)

    @property
    def valid_schema_rate(self) -> float:
        """Accepted share of all processed triplets; 0.0 when there are none."""
        processed = self.accepted_count + self.rejected_count
        if processed > 0:
            return self.accepted_count / processed
        return 0.0
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class ExtractionValidator:
    """
    Structural gate between LLM extraction and the algebra layer.

    Validates each triplet, canonicalizes predicates, deduplicates
    within a batch, and returns an auditable result. Malformed entries
    (wrong number of fields, non-string fields) are rejected with a
    reason instead of raising.
    """

    def validate_field(self, value: str, field_name: str) -> Optional[str]:
        """
        Validate a single triplet field.

        Args:
            value: The field content; expected to be a string.
            field_name: Label used in the rejection message.

        Returns:
            A rejection reason, or None if the field is acceptable.
        """
        if not value:
            return f"{field_name} is empty"

        # LLM output is untrusted: a number, list, etc. must be rejected
        # here rather than crash on ``.strip()`` below.
        if not isinstance(value, str):
            return f"{field_name} is not a string (got {type(value).__name__})"

        stripped = value.strip()
        if not stripped:
            return f"{field_name} is empty"

        if len(stripped) < MIN_FIELD_LENGTH:
            return f"{field_name} too short ({len(stripped)} chars, min {MIN_FIELD_LENGTH})"

        if len(stripped) > MAX_FIELD_LENGTH:
            return f"{field_name} too long ({len(stripped)} chars, max {MAX_FIELD_LENGTH})"

        if CONTROL_CHAR_PATTERN.search(stripped):
            return f"{field_name} contains control characters"

        # Brackets are only treated as a JSON leak in short fields.
        if JSON_FRAGMENT_PATTERN.search(stripped) and len(stripped) < 10:
            return f"{field_name} appears to be a JSON fragment"

        return None

    def validate_triplet(
        self,
        subject: str,
        predicate: str,
        object_: str,
    ) -> Optional[str]:
        """
        Validate a full triplet.

        Fields are checked in subject, predicate, object order and the
        first failure is reported.

        Returns:
            A rejection reason, or None if all three fields are valid.
        """
        for val, name in [
            (subject, "subject"),
            (predicate, "predicate"),
            (object_, "object"),
        ]:
            reason = self.validate_field(val, name)
            if reason:
                return reason

        return None

    def validate_batch(
        self,
        triplets: List[Tuple[str, str, str]],
        canonicalize_predicates: bool = True,
    ) -> "ValidatedExtraction":
        """
        Validate, canonicalize, and deduplicate a batch of extracted triplets.

        Args:
            triplets: Raw (subject, predicate, object) tuples from the LLM.
            canonicalize_predicates: If True, run predicate canonicalization.

        Returns:
            ValidatedExtraction with accepted and rejected lists. Accepted
            triplets are stripped and lower-cased, with spaces in the
            predicate replaced by underscores.
        """
        result = ValidatedExtraction()
        seen: set = set()

        for raw in triplets:
            # Entries that are not 3-item sequences cannot be unpacked;
            # record them instead of aborting the whole batch.
            try:
                s, p, o = raw
            except (TypeError, ValueError):
                result.rejected.append(
                    RejectedTriplet(
                        repr(raw)[:MAX_FIELD_LENGTH],
                        "",
                        "",
                        "malformed triplet (expected 3 fields)",
                    )
                )
                continue

            # Non-string fields cannot be normalized; reject them with the
            # reason reported for the first offending field.
            if not (
                isinstance(s, str) and isinstance(p, str) and isinstance(o, str)
            ):
                reason = self.validate_triplet(s, p, o) or "non-string field"
                shown = [
                    "" if part is None else str(part)[:MAX_FIELD_LENGTH]
                    for part in (s, p, o)
                ]
                result.rejected.append(
                    RejectedTriplet(shown[0], shown[1], shown[2], reason)
                )
                continue

            # Normalize
            s_clean = s.strip().lower()
            p_clean = p.strip().lower().replace(" ", "_")
            o_clean = o.strip().lower()

            # Structural validation
            reason = self.validate_triplet(s_clean, p_clean, o_clean)
            if reason:
                result.rejected.append(
                    RejectedTriplet(s_clean, p_clean, o_clean, reason)
                )
                continue

            # Predicate canonicalization
            if canonicalize_predicates:
                p_clean = canonicalize(p_clean)

            # Batch deduplication (on the canonicalized form)
            key = (s_clean, p_clean, o_clean)
            if key in seen:
                result.rejected.append(
                    RejectedTriplet(s_clean, p_clean, o_clean, "duplicate in batch")
                )
                continue

            seen.add(key)
            result.accepted.append(key)

        if result.rejected_count > 0:
            logger.info(
                "Extraction gate: %d accepted, %d rejected (%.0f%% valid)",
                result.accepted_count,
                result.rejected_count,
                result.valid_schema_rate * 100,
            )

        return result
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Gauge-Theoretic Orchestrator — Commutativity Hierarchy Engine
|
|
3
|
+
|
|
4
|
+
Implements Yaroslavtsev's three-level commutativity detection:
|
|
5
|
+
- Level 1 (Commutative): Independent facts, safe to merge in any order.
|
|
6
|
+
- Level 2 (Conditionally commutative): Same entity, different predicates.
|
|
7
|
+
- Level 3 (Curvature): Same entity, same predicate, different objects.
|
|
8
|
+
These MUST be serialized — triggers the EpistemicArbiter.
|
|
9
|
+
|
|
10
|
+
Author: ototao
|
|
11
|
+
License: Apache License 2.0
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import logging
|
|
15
|
+
from enum import IntEnum
|
|
16
|
+
from typing import Callable, List, Tuple, Optional
|
|
17
|
+
|
|
18
|
+
import networkx as nx
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class CommutativityLevel(IntEnum):
    """Levels of Yaroslavtsev's Commutativity Hierarchy, ordered by severity."""

    # Independent facts: safe to merge in parallel, in any order.
    L1_COMMUTATIVE = 1
    # Same entity, but the predicates differ.
    L2_CONDITIONAL = 2
    # Same entity and same predicate with different objects.
    L3_CURVATURE = 3
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class GaugeTheoreticOrchestrator:
    """
    Manages the merge of knowledge graph extractions according to
    the Commutativity Hierarchy.

    When Level 3 Curvature is detected, the ``arbitrate`` callback
    (e.g. an arbiter's ``collapse_wave_function``) is invoked to select
    the winning fact. Without a callback, the newly extracted object wins.
    """

    # Relation name assumed for edges that carry no "relation" attribute.
    # Detection, removal and merging must all use the same default.
    _DEFAULT_RELATION = "related_to"

    def __init__(self, arbitrate_fn: Optional[Callable] = None):
        self.arbitrate = arbitrate_fn  # async func(conflicts) -> resolutions

    def detect_curvature(
        self,
        base_graph: nx.MultiDiGraph,
        new_graph: nx.MultiDiGraph,
    ) -> Tuple[CommutativityLevel, List[Tuple[str, str, str, str]]]:
        """
        Detect the commutativity level between two knowledge graphs.

        Args:
            base_graph: Existing knowledge graph.
            new_graph: Newly extracted graph.

        Returns:
            (level, conflicts) where ``level`` is the highest level seen
            and ``conflicts`` is a list of
            (subject, predicate, old_object, new_object) tuples.
        """
        conflicts: List[Tuple[str, str, str, str]] = []
        max_level = CommutativityLevel.L1_COMMUTATIVE

        # Index (source, relation) -> target for the base graph. If the
        # base holds several targets for one key, the last one seen wins.
        base_index: dict[Tuple[str, str], str] = {}
        for src, tgt, data in base_graph.edges(data=True):
            rel = data.get("relation", self._DEFAULT_RELATION)
            base_index[(src, rel)] = tgt

        # Every base edge contributes a key, so the key sources are exactly
        # the base graph's edge sources. Built once instead of per new edge.
        base_subjects = {src for src, _rel in base_index}

        # Check new graph edges against the base index
        for src, tgt, data in new_graph.edges(data=True):
            rel = data.get("relation", self._DEFAULT_RELATION)

            if (src, rel) in base_index:
                old_tgt = base_index[(src, rel)]
                if old_tgt != tgt:
                    # Level 3: Same subject + predicate, different object
                    conflicts.append((src, rel, old_tgt, tgt))
                    max_level = CommutativityLevel.L3_CURVATURE
                # else: identical fact, no conflict
            elif src in base_subjects:
                # Level 2: Same entity, different predicates
                if max_level < CommutativityLevel.L2_CONDITIONAL:
                    max_level = CommutativityLevel.L2_CONDITIONAL

        return max_level, conflicts

    async def merge_extractions(
        self,
        base_graph: nx.MultiDiGraph,
        new_graphs: List[nx.MultiDiGraph],
    ) -> nx.MultiDiGraph:
        """
        Merge multiple extraction graphs into a copy of the base graph,
        resolving Level 3 Curvature via arbitration.

        Graphs are merged one after another, each against the result of
        the previous merges; ``base_graph`` itself is not modified.

        Args:
            base_graph: The canonical knowledge graph.
            new_graphs: List of newly extracted graphs to merge.

        Returns:
            Merged graph with contradictions resolved.
        """
        merged = base_graph.copy()

        for new_graph in new_graphs:
            level, conflicts = self.detect_curvature(merged, new_graph)

            if level == CommutativityLevel.L3_CURVATURE and conflicts:
                if self.arbitrate:
                    resolutions = await self.arbitrate(conflicts)
                else:
                    # Default: new information wins (recency bias)
                    resolutions = {
                        (subj, pred): new_obj
                        for subj, pred, _old, new_obj in conflicts
                    }

                # Apply resolutions
                for (subj, pred), winner in resolutions.items():
                    # Collect first, remove afterwards: the graph must not
                    # change while its edge view is being iterated. The
                    # relation default matches detect_curvature, so edges
                    # without a "relation" attribute are removed as well.
                    edges_to_remove = [
                        (u, v, key)
                        for u, v, key, data in merged.edges(
                            keys=True, data=True
                        )
                        if u == subj
                        and data.get("relation", self._DEFAULT_RELATION) == pred
                    ]

                    for u, v, key in edges_to_remove:
                        merged.remove_edge(u, v, key=key)

                    # Add the winner
                    merged.add_edge(
                        subj,
                        winner,
                        relation=pred,
                        verified_curvature=True,
                    )

                    logger.info(
                        "Curvature resolved: %s %s → %s", subj, pred, winner
                    )

            # Add all non-conflicting edges from new_graph; conflicting
            # ones were already settled by the resolutions above.
            conflict_keys = {(s, p) for s, p, _, _ in conflicts}
            for src, tgt, data in new_graph.edges(data=True):
                rel = data.get("relation", self._DEFAULT_RELATION)
                if (src, rel) not in conflict_keys:
                    merged.add_edge(src, tgt, **data)

        return merged
|