sum-engine 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- internal/__init__.py +8 -0
- internal/algorithms/__init__.py +1 -0
- internal/algorithms/causal_discovery.py +96 -0
- internal/algorithms/predicate_canon.py +137 -0
- internal/algorithms/semantic_arithmetic.py +890 -0
- internal/algorithms/syntactic_sieve.py +452 -0
- internal/algorithms/zk_semantics.py +90 -0
- internal/ensemble/__init__.py +1 -0
- internal/ensemble/automated_scientist.py +138 -0
- internal/ensemble/autonomous_agent.py +157 -0
- internal/ensemble/causal_triggers.py +121 -0
- internal/ensemble/confidence_calibrator.py +284 -0
- internal/ensemble/epistemic_arbiter.py +159 -0
- internal/ensemble/epistemic_loop.py +136 -0
- internal/ensemble/extraction_validator.py +172 -0
- internal/ensemble/gauge_orchestrator.py +150 -0
- internal/ensemble/live_llm_adapter.py +183 -0
- internal/ensemble/llm_entailment.py +117 -0
- internal/ensemble/mass_semantic_engine.py +138 -0
- internal/ensemble/ouroboros.py +281 -0
- internal/ensemble/semantic_dedup.py +261 -0
- internal/ensemble/tome_generator.py +286 -0
- internal/ensemble/tome_sliders.py +104 -0
- internal/ensemble/vector_bridge.py +195 -0
- internal/ensemble/venn_abers.py +211 -0
- internal/infrastructure/__init__.py +1 -0
- internal/infrastructure/akashic_ledger.py +812 -0
- internal/infrastructure/canonical_codec.py +452 -0
- internal/infrastructure/jcs.py +115 -0
- internal/infrastructure/key_manager.py +239 -0
- internal/infrastructure/p2p_mesh.py +168 -0
- internal/infrastructure/prov_o.py +159 -0
- internal/infrastructure/provenance.py +181 -0
- internal/infrastructure/rate_limiter.py +81 -0
- internal/infrastructure/resource_guards.py +117 -0
- internal/infrastructure/scheme_registry.py +136 -0
- internal/infrastructure/state_encoding.py +94 -0
- internal/infrastructure/telemetry.py +91 -0
- internal/infrastructure/tome_parser.py +55 -0
- internal/infrastructure/verifiable_credential.py +412 -0
- internal/infrastructure/zig_bridge.py +256 -0
- sum_cli/__init__.py +18 -0
- sum_cli/main.py +688 -0
- sum_engine-0.1.0.dist-info/METADATA +590 -0
- sum_engine-0.1.0.dist-info/RECORD +49 -0
- sum_engine-0.1.0.dist-info/WHEEL +5 -0
- sum_engine-0.1.0.dist-info/entry_points.txt +2 -0
- sum_engine-0.1.0.dist-info/licenses/LICENSE +201 -0
- sum_engine-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,452 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Deterministic Syntactic Sieve — High-Fidelity Edge NLP
|
|
3
|
+
|
|
4
|
+
Extracts topological (Subject, Predicate, Object) triplets using strict
|
|
5
|
+
grammatical dependency parsing via spaCy. Replaces the LLM for bulk
|
|
6
|
+
ingestion, parsing text at bare-metal CPU speeds.
|
|
7
|
+
|
|
8
|
+
Cost: $0. Speed: 10,000+ words per second. Deterministic: always.
|
|
9
|
+
|
|
10
|
+
Phase 13: Zenith of Process Intensification.
|
|
11
|
+
Stage 4 — Hedging detection for linguistic confidence signals.
|
|
12
|
+
|
|
13
|
+
Author: ototao
|
|
14
|
+
License: Apache License 2.0
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import re
|
|
18
|
+
from datetime import datetime, timezone
|
|
19
|
+
from typing import Dict, List, Optional, Tuple
|
|
20
|
+
|
|
21
|
+
from internal.infrastructure.provenance import (
|
|
22
|
+
EXCERPT_MAX_CHARS,
|
|
23
|
+
ProvenanceRecord,
|
|
24
|
+
sha256_uri_for_text,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
# Identifier recorded as ``extractor_id`` on every ProvenanceRecord this
# module produces.
SIEVE_EXTRACTOR_ID = "sum.sieve:deterministic_v1"


# ─── Hedging / Epistemic Markers ──────────────────────────────────────
# Words and phrases that indicate uncertainty in the source text.
# Presence reduces confidence at the linguistic level.
#
# The patterns are applied with ``findall`` to raw sentence text (not to
# lemmas), so every surface form that should count has to be spelled out
# in the pattern itself.

HEDGING_MARKERS = [
    # Modal verbs of uncertainty
    re.compile(r"\b(may|might|could|would)\b", re.IGNORECASE),
    # Epistemic adverbs
    re.compile(r"\b(possibly|probably|perhaps|likely|unlikely|apparently|"
               r"allegedly|purportedly|supposedly|seemingly|arguably|"
               r"conceivably|presumably|ostensibly)\b", re.IGNORECASE),
    # Hedging verbs: base form plus third-person singular ("suggests",
    # "seems", "implies"). Past participles are deliberately left out so
    # that "it is believed" / "it is estimated" are counted once, by the
    # phrase pattern below, rather than twice.
    re.compile(r"\b(suggests?|impl(?:y|ies)|indicates?|appears?|seems?|"
               r"tends?|believes?|estimates?|speculates?|hypothesizes?|"
               r"proposes?|conjectures?)\b",
               re.IGNORECASE),
    # Hedging phrases
    re.compile(r"\b(it is (thought|believed|estimated|assumed)|"
               r"according to( some)?|some (researchers|scientists|experts)|"
               r"there is (some )?evidence|in (some|certain) cases|"
               r"not (entirely |fully )?clear)\b", re.IGNORECASE),
]

# Each matched marker reduces certainty by this factor
HEDGE_PENALTY_PER_MARKER = 0.15
HEDGE_FLOOR = 0.20  # minimum confidence from hedging alone


# Coarse POS tags treated as "content" tokens by the POS fallback extractor.
_FALLBACK_CONTENT_POS = frozenset({"NOUN", "PROPN", "VERB", "ADJ"})
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _is_negated(sent) -> bool:
    """Report whether ``sent`` contains any negation dependent.

    The check is deliberately broad: a single token anywhere in the
    sentence whose ``dep_`` label is ``"neg"`` is enough to answer True,
    regardless of which head it attaches to. Callers in this module use
    the result to suppress extraction entirely, because a bare
    (s, p, o) triple taken from a negated sentence would assert the
    opposite of what the source says. Missing a triple is considered
    preferable to emitting an inverted one.

    Args:
        sent: An iterable of tokens exposing a ``dep_`` attribute
            (a spaCy sentence span in production).

    Returns:
        True if at least one token is labelled ``neg``; False otherwise,
        including for an empty sentence.
    """
    return any(tok.dep_ == "neg" for tok in sent)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _is_passive(sent) -> bool:
    """Report whether the sentence ROOT has a passive grammatical subject.

    Only the direct children of ``sent.root`` are inspected; the sentence
    counts as passive when one of them carries ``dep_ == "nsubjpass"``.

    In a passive clause the grammatical subject is the semantic object,
    so emitting a triple in surface order would invert the fact
    ("Hamlet was written by Shakespeare" must not become
    (hamlet, write, shakespeare)). Callers should either route the
    sentence through ``_extract_passive`` or skip it.

    Args:
        sent: Object with a ``root`` attribute whose ``children`` are
            tokens exposing ``dep_``.

    Returns:
        True if any direct child of the root is labelled ``nsubjpass``.
    """
    root_labels = (kid.dep_ for kid in sent.root.children)
    return "nsubjpass" in root_labels
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _extract_passive(sent) -> Optional[Tuple[str, str, str]]:
    """Build an active-voice triple from a passive sentence.

    Mapping applied to the ROOT verb's direct children:

        subject   <- the ``pobj`` under an ``agent`` child (the "by" phrase)
        object    <- the first ``nsubjpass`` child
        predicate <- the ROOT verb's lemma

    ``amod`` / ``compound`` children of each noun are prepended to its
    lemma; subject parts are joined with ``_`` and object parts with a
    space.

    Returns:
        ``(subject, predicate, object)`` lowercased, or None when the
        agent or the passive subject is missing (e.g. the agentless
        "The paper was submitted."), when any component is empty, or
        when the subject has more than 5 ``_``-separated parts or the
        object more than 8 whitespace-separated parts.
    """
    verb = sent.root
    patient = None
    agent = None
    for dependent in verb.children:
        label = dependent.dep_
        if label == "nsubjpass" and patient is None:
            # Only the first passive subject is kept.
            patient = dependent
        elif label == "agent":
            # Take the first pobj of this "by" phrase; keep the previous
            # agent (if any) when this phrase has none.
            agent = next(
                (node for node in dependent.children if node.dep_ == "pobj"),
                agent,
            )

    if agent is None or patient is None:
        return None

    def _with_modifiers(head):
        # Modifier surface forms followed by the head noun's lemma.
        words = [
            kid.text for kid in head.children
            if kid.dep_ in ("amod", "compound")
        ]
        words.append(head.lemma_)
        return words

    subject = "_".join(_with_modifiers(agent)).strip()
    object_ = " ".join(_with_modifiers(patient)).strip()
    predicate = verb.lemma_

    if not subject or not predicate or not object_:
        return None

    oversized = len(subject.split("_")) > 5 or len(object_.split()) > 8
    if oversized:
        return None

    return (subject.lower(), predicate.lower(), object_.lower())
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _extract_from_sent(sent) -> Optional[Tuple[str, str, str]]:
    """Extract at most one (subject, predicate, object) triple from a sentence.

    Decision order:

    1. Negated sentences (``_is_negated``) yield None.
    2. Passive sentences (``_is_passive``) are delegated to
       ``_extract_passive``; its result is returned as-is, including None
       for agentless passives. The POS fallback is skipped on this path
       because its left-to-right heuristic would re-emit the inverted
       triple.
    3. Otherwise every token that is the ROOT or is tagged ``VERB`` is
       visited in sentence order. Each such token overwrites the
       predicate, and any matching subject / object child overwrites the
       subject / object, so the last match in the sentence wins.
    4. If the dependency pass did not produce a complete triple within
       the size limits, the result of ``_pos_fallback_triplet`` is
       returned.

    Args:
        sent: A spaCy sentence span (or any object with the same token
            interface).

    Returns:
        A lowercased ``(subject, predicate, object)`` tuple, or None.
    """
    if _is_negated(sent):
        return None

    if _is_passive(sent):
        return _extract_passive(sent)

    subject = None
    predicate = None
    object_ = None

    for token in sent:
        if token.dep_ == "ROOT" or token.pos_ == "VERB":
            predicate = token.lemma_
            # Subject parts are joined with '_' (not a space) so that
            # multi-word subjects form a single whitespace-free token;
            # object parts keep space-joining.
            for child in token.children:
                if child.dep_ in ("nsubj", "nsubjpass", "csubj", "npadvmod"):
                    modifiers = [
                        c.text for c in child.children
                        if c.dep_ in ("amod", "compound")
                    ]
                    subject = "_".join(modifiers + [child.lemma_]).strip()
            for child in token.children:
                if child.dep_ in ("dobj", "pobj", "attr", "acomp"):
                    modifiers = [
                        c.text for c in child.children
                        if c.dep_ in ("amod", "compound")
                    ]
                    object_ = " ".join(modifiers + [child.lemma_]).strip()

    if subject and predicate and object_:
        # The subject is '_'-joined, so its parts must be counted by
        # splitting on '_'. Splitting on whitespace always gave 1 and made
        # the 5-part limit a no-op. This matches _extract_passive.
        if len(subject.split("_")) <= 5 and len(object_.split()) <= 8:
            return (subject.lower(), predicate.lower(), object_.lower())

    return _pos_fallback_triplet(sent)
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _pos_fallback_triplet(sent):
    """Last-resort extraction based purely on part-of-speech tags.

    The heuristic fires only when the sentence holds exactly three tokens
    whose ``pos_`` is in ``_FALLBACK_CONTENT_POS``; those three are read,
    in sentence order, as subject, predicate and object. Any other count
    yields None, which keeps the rule from firing on longer or more
    heavily modified sentences.

    The middle token's lemma must be purely alphabetic and 2 to 20
    characters long to be accepted as a predicate.

    Plural repair: when the first token carries an adjective tag
    (``tag_`` starting with ``JJ``) and its lowercased lemma ends in
    ``s``, is longer than two characters and is alphabetic once the
    final ``s`` is dropped, that trailing ``s`` is removed. This targets
    plural nouns mis-tagged as adjectives (e.g. "Dogs" in
    "Dogs chase cats"), whose lemma keeps the plural form.

    Returns:
        ``(subject_lemma, predicate_lemma, object_lemma)`` lowercased,
        or None if the pattern does not match.
    """
    content_tokens = [tok for tok in sent if tok.pos_ in _FALLBACK_CONTENT_POS]
    if len(content_tokens) != 3:
        return None
    subj_tok, pred_tok, obj_tok = content_tokens

    pred_lemma = pred_tok.lemma_
    if not pred_lemma.isalpha() or len(pred_lemma) <= 1 or len(pred_lemma) > 20:
        return None

    subj_lemma = subj_tok.lemma_.lower()
    mistagged_plural = (
        subj_tok.tag_.startswith("JJ")
        and subj_lemma.endswith("s")
        and len(subj_lemma) > 2
        and subj_lemma[:-1].isalpha()
    )
    if mistagged_plural:
        subj_lemma = subj_lemma[:-1]

    return (subj_lemma, pred_tok.lemma_.lower(), obj_tok.lemma_.lower())
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def detect_hedging(text: str) -> float:
    """Return a linguistic-certainty score for ``text``.

    Every match of every pattern in ``HEDGING_MARKERS`` counts as one
    hit. The score starts at 1.0, loses ``HEDGE_PENALTY_PER_MARKER`` per
    hit, and is clamped so it never drops below ``HEDGE_FLOOR``.

    Args:
        text: Text to score. Empty or falsy input is treated as
            unhedged.

    Returns:
        1.0 when there is no text or no marker matches; otherwise the
        penalised score, bounded below by ``HEDGE_FLOOR``.
    """
    if not text:
        return 1.0

    marker_hits = sum(len(rx.findall(text)) for rx in HEDGING_MARKERS)
    if not marker_hits:
        return 1.0

    penalised = 1.0 - (marker_hits * HEDGE_PENALTY_PER_MARKER)
    return penalised if penalised > HEDGE_FLOOR else HEDGE_FLOOR
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
class DeterministicSieve:
    """
    High-Fidelity Edge NLP.

    Extracts (Subject, Predicate, Object) triplets from text using
    spaCy's dependency parse. All three public extraction methods share
    the per-sentence decision logic in ``_extract_from_sent`` so they
    agree triple-for-triple; they differ only in the metadata attached.
    """

    def __init__(self):
        """Load the ``en_core_web_sm`` spaCy model, downloading it if absent.

        Raises:
            subprocess.CalledProcessError: If the fallback download fails.
        """
        import spacy  # Lazy import: only required when sieve is instantiated

        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            import subprocess
            import sys

            # CRITICAL: route spaCy's download progress to stderr so it does
            # not contaminate the CLI's stdout. `sum attest > bundle.json`
            # must emit nothing but the CanonicalBundle JSON. Announcing the
            # fallback on stderr is also more honest than silent auto-install.
            print(
                "sum: spaCy model 'en_core_web_sm' missing; downloading "
                "(~50 MB, one-time)…",
                file=sys.stderr,
            )
            subprocess.check_call(
                [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
                stdout=sys.stderr,
            )
            self.nlp = spacy.load("en_core_web_sm")

    def extract_triplets(self, text: str) -> List[Tuple[str, str, str]]:
        """
        Parse text into semantic triplets using dependency grammar.

        Each sentence contributes at most one triple (see
        ``_extract_from_sent``).

        Args:
            text: Raw text to parse.

        Returns:
            Deduplicated list of (subject, predicate, object) tuples, in
            order of first appearance in ``text``.
        """
        doc = self.nlp(text)
        triplets = []
        for sent in doc.sents:
            triple = _extract_from_sent(sent)
            if triple is not None:
                triplets.append(triple)
        # dict.fromkeys deduplicates while keeping first-seen order.
        # list(set(...)) ordered the result by string hash, which varies
        # between interpreter runs under hash randomisation.
        return list(dict.fromkeys(triplets))

    def extract_with_provenance(
        self,
        text: str,
        source_uri: Optional[str] = None,
        timestamp: Optional[str] = None,
    ) -> List[Tuple[Tuple[str, str, str], ProvenanceRecord]]:
        """Extract (s, p, o) triples paired with per-sentence ProvenanceRecords.

        Each record carries the originating sentence's byte range within
        the UTF-8 encoding of ``text``, the extractor id, a timestamp and
        a text excerpt of at most ``EXCERPT_MAX_CHARS`` characters.

        Args:
            text:       Input text.
            source_uri: Optional override. When omitted (or empty), the
                        value of ``sha256_uri_for_text(text)`` is used.
            timestamp:  Optional timestamp string. Defaults to
                        ``datetime.now(timezone.utc).isoformat()``; one
                        value is shared by all records of the call.

        Returns:
            List of ``((s, p, o), ProvenanceRecord)`` pairs in sentence
            order — NOT deduplicated. Two sentences producing the same
            triple yield two pairs with different byte ranges.
        """
        src = source_uri or sha256_uri_for_text(text)
        ts = timestamp or datetime.now(timezone.utc).isoformat()
        doc = self.nlp(text)
        out: List[Tuple[Tuple[str, str, str], ProvenanceRecord]] = []
        for sent in doc.sents:
            triple = _extract_from_sent(sent)
            if triple is None:
                continue
            # sent.start_char / end_char are character offsets; re-encode
            # the prefix to get byte offsets in the UTF-8 representation,
            # which differ as soon as the text contains non-ASCII.
            byte_start = len(text[: sent.start_char].encode("utf-8"))
            byte_end = len(text[: sent.end_char].encode("utf-8"))
            excerpt = sent.text[:EXCERPT_MAX_CHARS]
            record = ProvenanceRecord(
                source_uri=src,
                byte_start=byte_start,
                byte_end=byte_end,
                extractor_id=SIEVE_EXTRACTOR_ID,
                timestamp=ts,
                text_excerpt=excerpt,
            )
            out.append((triple, record))
        return out

    def extract_annotated_triplets(
        self, text: str
    ) -> List[Dict[str, object]]:
        """Extract triplets with per-sentence hedging annotation.

        Returns a list of dicts, one per sentence that yields a triple,
        in sentence order (not deduplicated):
            {
                "subject": str,
                "predicate": str,
                "object": str,
                "linguistic_certainty": float,  # 1.0 = definite, <1.0 = hedged
            }

        The triple comes from ``_extract_from_sent`` — the same decision
        path as ``extract_triplets`` — so negated sentences are skipped,
        passive sentences are emitted in active order (or skipped when
        agentless), and the POS fallback applies. The certainty value is
        ``detect_hedging`` applied to the sentence text; it is metadata
        only and does not influence which triples are produced.
        """
        results: List[Dict[str, object]] = []
        for sent in self.nlp(text).sents:
            triple = _extract_from_sent(sent)
            if triple is None:
                continue
            subject, predicate, object_ = triple
            results.append({
                "subject": subject,
                "predicate": predicate,
                "object": object_,
                "linguistic_certainty": detect_hedging(sent.text),
            })
        return results
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Zero-Knowledge Semantic Proofs
|
|
3
|
+
|
|
4
|
+
Implements cryptographic commitments over Gödel-State entailment:
|
|
5
|
+
a node can mathematically prove it knows a specific axiom (= prime
|
|
6
|
+
factor of its global state) **without revealing the full state integer**.
|
|
7
|
+
|
|
8
|
+
Protocol:
|
|
9
|
+
1. Prover computes Q = State // prime (the co-factor).
|
|
10
|
+
2. Prover generates a random salt and publishes
|
|
11
|
+
Commitment = SHA-256( Q || salt ).
|
|
12
|
+
3. Verifier receives (commitment, salt, Q, prime) and re-hashes to
|
|
13
|
+
confirm the prover genuinely held the factor.
|
|
14
|
+
|
|
15
|
+
This is a simplified salted-hash commitment scheme (SHA-256, not a Pedersen commitment) optimised for
|
|
16
|
+
the Gödel integer domain. It is non-interactive.
|
|
17
|
+
|
|
18
|
+
Author: ototao
|
|
19
|
+
License: Apache License 2.0
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
import hashlib
|
|
23
|
+
import os
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ZKSemanticProver:
    """
    Salted-hash commitment proofs for Gödel State Entailment.

    A proof attests that ``State % prime == 0`` by committing to the
    quotient ``State // prime`` with SHA-256 over ``"<quotient>:<salt>"``.

    NOTE(review): the proof dict returned by ``generate_proof`` contains
    both ``prime`` and ``quotient``, so anyone holding the proof can
    recompute ``State = prime * quotient``. Confirm whether callers strip
    fields before publishing if hiding the state is a requirement.
    """

    @staticmethod
    def generate_proof(global_state: int, prime: int) -> dict:
        """
        Generate a proof that ``global_state`` is divisible by ``prime``.

        Args:
            global_state: The full Gödel BigInt.
            prime:        The semantic prime to prove knowledge of.

        Returns:
            A dict with ``commitment`` (hex SHA-256 digest), ``salt``
            (32 hex characters from 16 random bytes), ``prime``, and
            ``quotient`` (as a decimal string for BigInt JSON safety).

        Raises:
            ValueError: If ``prime`` is zero or the state is not
                divisible by ``prime``.
        """
        # Guard the modulo: a zero prime would otherwise surface as
        # ZeroDivisionError instead of the documented ValueError.
        if prime == 0:
            raise ValueError("Prime must be non-zero.")
        if global_state % prime != 0:
            raise ValueError("State does not entail this prime.")

        quotient = global_state // prime
        salt = os.urandom(16).hex()

        # Commitment = Hash(Quotient || Salt), with ':' as the separator.
        commitment = hashlib.sha256(
            f"{quotient}:{salt}".encode()
        ).hexdigest()

        return {
            "commitment": commitment,
            "salt": salt,
            "prime": prime,
            "quotient": str(quotient),
        }

    @staticmethod
    def verify_proof(proof: dict) -> bool:
        """
        Verify a proof by re-computing the commitment.

        Recomputes SHA-256 over ``"<int(quotient)>:<salt>"`` and compares
        it with ``proof["commitment"]``. The ``prime`` field is not read
        here, so this confirms only that the commitment matches the
        supplied quotient and salt.

        Args:
            proof: Dict with ``commitment``, ``salt``, ``quotient``.

        Returns:
            True if the commitment matches. False if it does not, or if
            the proof is malformed (not subscriptable, missing a field,
            or carrying a quotient that cannot be parsed as an integer).
        """
        try:
            q = int(proof["quotient"])
            salt = proof["salt"]
            commitment = proof["commitment"]
            expected = hashlib.sha256(
                f"{q}:{salt}".encode()
            ).hexdigest()
        except (KeyError, TypeError, ValueError):
            # A proof that cannot even be parsed is simply not valid;
            # the verifier should not crash on untrusted input.
            return False

        return expected == commitment
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Tome generation + LLM adapter + orchestration modules."""
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Automated Scientist Daemon — Machine Synthesis
|
|
3
|
+
|
|
4
|
+
Background asyncio task that continuously evaluates the Gödel Integer,
|
|
5
|
+
discovers novel topological relationships via transitive closure,
|
|
6
|
+
batch-mints primes via the Zig C-ABI, and permanently expands the
|
|
7
|
+
system's intelligence.
|
|
8
|
+
|
|
9
|
+
Every discovery is logged to the Akashic Ledger as a ``DEDUCED`` event,
|
|
10
|
+
providing a complete provenance trail of machine-generated knowledge.
|
|
11
|
+
|
|
12
|
+
Author: ototao
|
|
13
|
+
License: Apache License 2.0
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import math
|
|
17
|
+
import asyncio
|
|
18
|
+
import logging
|
|
19
|
+
from typing import Optional
|
|
20
|
+
|
|
21
|
+
from internal.algorithms.causal_discovery import CausalDiscoveryEngine
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger("sum.scientist")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class AutomatedScientistDaemon:
    """
    Horizon V: The Dreaming Machine.

    Periodically sweeps the ``main`` branch state for triples reported by
    the discovery engine, mints a prime for each, folds the primes into
    the state with LCM, and records every addition in the ledger.
    """

    def __init__(self, kos_instance, interval_seconds: int = 15):
        """
        Args:
            kos_instance: Host object. This class reads its ``algebra``,
                ``branches``, ``branch_lock``, ``ledger`` and (when
                present) ``prime_index`` attributes.
            interval_seconds: Pause between deduction sweeps.
        """
        self.kos = kos_instance
        self.interval = interval_seconds
        self.discovery_engine = CausalDiscoveryEngine(self.kos.algebra)
        self.running = False
        self.total_discoveries = 0

    async def _broadcast(self, message: str) -> None:
        """Send ``message`` to telemetry on a best-effort basis.

        Telemetry must never break the deduction loop, so any failure
        (including the import) is logged at debug level and otherwise
        ignored.
        """
        try:
            from internal.ensemble.epistemic_arbiter import kos_telemetry
            await kos_telemetry.broadcast(message)
        except Exception:
            logger.debug("Telemetry broadcast failed", exc_info=True)

    async def start_dreaming(self):
        """Begin the autonomous deduction loop.

        Runs until ``stop`` clears ``self.running``. Each iteration
        sleeps for ``self.interval`` seconds and then performs one
        ``_dream_cycle``; an exception inside a cycle is logged with its
        traceback and the loop continues.
        """
        self.running = True

        # Broadcast startup
        await self._broadcast(
            "🔬 Automated Scientist initialized. Awaiting REM sleep cycles..."
        )

        while self.running:
            await asyncio.sleep(self.interval)
            try:
                await self._dream_cycle()
            except Exception as e:
                # logger.exception keeps the traceback, which a plain
                # error-level message would drop.
                logger.exception("Dream cycle error: %s", e)

    async def stop(self):
        """Graceful shutdown: the loop exits after its current iteration."""
        self.running = False

    async def _dream_cycle(self):
        """
        Single deduction sweep:
            1. Get current state
            2. Discover novel transitive relationships
            3. Batch-mint primes (Zig if available, Python fallback)
            4. LCM into global state
            5. Log to Akashic Ledger
        """
        current_state = self.kos.branches.get("main", 1)
        if current_state == 1:
            return  # Nothing to analyze

        novel_triplets = self.discovery_engine.sweep_for_discoveries(current_state)
        if not novel_triplets:
            return

        # Broadcast discovery
        await self._broadcast(
            f"🧠 EUREKA! Synthesizing {len(novel_triplets)} novel discoveries..."
        )

        axiom_strings = [f"{s}||{p}||{o}" for s, p, o in novel_triplets]

        # Batch mint — Zig bare-metal if available
        try:
            from internal.infrastructure.zig_bridge import zig_engine
            if zig_engine and hasattr(zig_engine, 'batch_mint_primes'):
                new_primes = zig_engine.batch_mint_primes(axiom_strings)
            else:
                new_primes = None
        except ImportError:
            new_primes = None

        if new_primes is not None:
            new_primes = list(new_primes)
            if len(new_primes) != len(axiom_strings):
                # zip() below would silently drop the unmatched tail and
                # pair primes with the wrong axioms; use the per-triple
                # path instead.
                logger.warning(
                    "Batch mint returned %d primes for %d axioms; "
                    "falling back to per-triple minting",
                    len(new_primes), len(axiom_strings),
                )
                new_primes = None

        if new_primes is None:
            new_primes = [
                self.kos.algebra.get_or_mint_prime(s, p, o)
                for s, p, o in novel_triplets
            ]

        async with self.kos.branch_lock("main"):
            # Re-read under the lock: the state may have changed since the
            # sweep at the top of this cycle.
            current_state = self.kos.branches.get("main", 1)
            new_state = current_state
            for prime, axiom_str in zip(new_primes, axiom_strings):
                new_state = math.lcm(new_state, prime)
                await self.kos.ledger.append_event("DEDUCED", prime, axiom_str)
                await self.kos.ledger.append_event("MUL", prime)

            self.kos.branches["main"] = new_state
            # 19D coherence: daemon mutation must update index
            if hasattr(self.kos, 'prime_index'):
                self.kos.prime_index.rebuild("main", new_state, self.kos.algebra)
                self.kos.prime_index.assert_coherent(
                    "main", new_state, self.kos.algebra, context="automated_scientist"
                )
            self.total_discoveries += len(novel_triplets)

        logger.info(
            "Scientist: %d new discoveries (total: %d)",
            len(novel_triplets), self.total_discoveries
        )

        await self._broadcast(
            f"🧬 State multiplied. Total autonomous discoveries: {self.total_discoveries}"
        )
|