wikimoth 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wikimoth/__init__.py ADDED
@@ -0,0 +1,32 @@
1
+ """WikiMoth — deterministic, token-minimal, reproducible memory for Claude/agents.
2
+
3
+ Pipeline: MOTHRAG ``GraphRetriever(source="wikilinks")`` pulls the relevant
4
+ note-chain from a ``[[wikilink]]`` markdown vault → a compaction stage →
5
+ a Claude reader. The public surface is the :class:`MemoryRAG` pipeline plus its
6
+ pluggable stages (compactor, reader).
7
+
8
+ Importing this package is free of network/API and of the MOTHRAG dependency:
9
+ MOTHRAG is imported lazily, only when a vault is indexed with the default
10
+ retriever (or when :class:`ClaudeReader` is constructed).
11
+ """
12
+
13
+ from wikimoth.compaction import Compactor, HeadroomCompactor, NoOpCompactor
14
+ from wikimoth.pipeline import MemoryRAG
15
+ from wikimoth.reader import ClaudeReader, EchoReader, Reader
16
+ from wikimoth.tokens import count_passage_tokens, count_tokens, token_backend
17
+
18
+ __version__ = "0.1.0"
19
+
20
+ __all__ = [
21
+ "MemoryRAG",
22
+ "Compactor",
23
+ "NoOpCompactor",
24
+ "HeadroomCompactor",
25
+ "Reader",
26
+ "EchoReader",
27
+ "ClaudeReader",
28
+ "count_tokens",
29
+ "count_passage_tokens",
30
+ "token_backend",
31
+ "__version__",
32
+ ]
@@ -0,0 +1,32 @@
1
+ """WikiMoth benchmark harness (4-arm efficiency test).
2
+
3
+ Exports the harness surface: the four arms (``dump`` / ``agentic`` (stub) /
4
+ ``deterministic`` / ``deterministic_compacted``), the :class:`Question` /
5
+ :class:`ArmRecord` records, and the :func:`oracle_retrieval_loss` hook.
6
+ """
7
+
8
+ from wikimoth.benchmark.corpus import (
9
+ GoldQuestion,
10
+ generate_corpus,
11
+ generate_realistic_corpus,
12
+ )
13
+ from wikimoth.benchmark.harness import (
14
+ ARMS,
15
+ ArmRecord,
16
+ FourArmHarness,
17
+ Question,
18
+ oracle_retrieval_loss,
19
+ summarize,
20
+ )
21
+
22
+ __all__ = [
23
+ "ARMS",
24
+ "ArmRecord",
25
+ "FourArmHarness",
26
+ "Question",
27
+ "oracle_retrieval_loss",
28
+ "summarize",
29
+ "GoldQuestion",
30
+ "generate_corpus",
31
+ "generate_realistic_corpus",
32
+ ]
@@ -0,0 +1,154 @@
1
+ # Copyright 2026 Julian Geymonat
2
+ # Licensed under the Apache License, Version 2.0
3
+ """Real flat baselines for the benchmark — BM25 (sparse) and dense (semantic).
4
+
5
+ The clean/realistic corpora compare graph traversal against retrieval that has
6
+ no notion of links. To make that comparison credible (not "GraphRetriever with
7
+ traversal disabled"), this module ships two *standard* flat retrievers that
8
+ satisfy MOTHRAG's ``Retriever`` Protocol (``index`` / ``retrieve`` / ``__len__``)
9
+ so they drop straight into :class:`wikimoth.MemoryRAG` / the harness:
10
+
11
+ - :class:`BM25Retriever` — Okapi BM25 (``rank_bm25``). The canonical sparse
12
+ baseline. Like any lexical method it cannot reach a note that shares no terms
13
+ with the query.
14
+ - :class:`STDenseRetriever` — sentence-transformers bi-encoder cosine
15
+ (``all-MiniLM-L6-v2`` by default). The honest *semantic* adversary: it can
16
+ reach a note that is topically related without sharing words. $0 inference,
17
+ no API key (local model); import-guarded so the package needs neither dep
18
+ until a baseline is actually constructed.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import os
24
+ import re
25
+ from typing import Any, Sequence
26
+
27
+ # Set before any torch import: avoids a known OpenMP duplicate-runtime crash on
28
+ # Windows (the same conflict that segfaults the `sentence_transformers` package
29
+ # on this stack — we use transformers+torch directly to sidestep it).
30
+ os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")
31
+
32
+ _WORD = re.compile(r"\w+")
33
+
34
+
35
def _tokenize(text: str) -> list[str]:
    """Lower-case ``text`` and split it into word tokens.

    Tokens are the matches of the module-level ``_WORD`` pattern. A falsy
    ``text`` (``None`` or ``""``) yields an empty list.
    """
    lowered = text.lower() if text else ""
    return _WORD.findall(lowered)
37
+
38
+
39
class BM25Retriever:
    """Okapi BM25 sparse retriever (``rank_bm25``). Real flat sparse baseline.

    Exposes ``index`` / ``retrieve`` / ``__len__``. Chunks are treated as
    opaque objects: only an optional ``text`` attribute is read, and
    ``retrieve`` stores the BM25 score on ``chunk.score`` when the chunk
    accepts attribute assignment.
    """

    name = "bm25"

    def __init__(self) -> None:
        self._chunks: list[Any] = []
        # Built by ``index``; ``None`` means there is nothing to score yet.
        self._bm25 = None

    def index(self, chunks: Sequence[Any]) -> None:
        """Replace the indexed corpus with ``chunks``.

        An empty ``chunks`` clears the index instead of building a scorer:
        ``BM25Okapi`` divides by the corpus size while computing the average
        document length, so constructing it over zero documents raises
        ``ZeroDivisionError``.

        Raises
        ------
        ImportError
            If ``chunks`` is non-empty and ``rank_bm25`` is not installed.
        """
        items = list(chunks)
        if not items:
            # Nothing to score: drop any previous index so ``retrieve``
            # returns [] rather than ranking stale documents.
            self._chunks = []
            self._bm25 = None
            return
        try:
            from rank_bm25 import BM25Okapi
        except ImportError as e:  # pragma: no cover
            raise ImportError("BM25Retriever requires `rank_bm25` (pip install rank_bm25).") from e
        self._chunks = items
        corpus = [_tokenize(getattr(c, "text", "") or "") for c in items]
        # BM25Okapi needs a non-empty doc per row; substitute a sentinel token.
        corpus = [toks or ["\x00empty"] for toks in corpus]
        self._bm25 = BM25Okapi(corpus)

    def retrieve(self, question: str, *, top_k: int = 10) -> list[Any]:
        """Return up to ``top_k`` indexed chunks, highest BM25 score first.

        Ties are broken by original index order, so the ranking is
        deterministic. Returns ``[]`` when nothing has been indexed.
        """
        if not self._chunks or self._bm25 is None:
            return []
        scores = self._bm25.get_scores(_tokenize(question))
        order = sorted(range(len(scores)), key=lambda i: (-scores[i], i))[:top_k]
        out: list[Any] = []
        for i in order:
            c = self._chunks[i]
            try:
                c.score = float(scores[i])
            except (AttributeError, TypeError):
                # Best effort: chunks that reject attribute assignment are
                # still returned, just without a score.
                pass
            out.append(c)
        return out

    def __len__(self) -> int:
        """Number of chunks currently indexed."""
        return len(self._chunks)
76
+
77
+
78
class STDenseRetriever:
    """Dense semantic retriever — MiniLM bi-encoder cosine, via transformers.

    The semantic counterpart to the sparse baseline: it can reach a note that
    is topically related to the query without sharing words. Embeddings are
    computed directly with ``transformers`` + ``torch`` (attention-masked mean
    pooling followed by L2 normalization) rather than through the
    ``sentence_transformers`` package. The tokenizer and model are loaded when
    the retriever is constructed, so ``from_pretrained`` may need to fetch the
    weights the first time.
    """

    name = "dense_st"

    def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
                 *, batch_size: int = 64, max_length: int = 256) -> None:
        """Load the tokenizer and model named ``model_name``.

        ``batch_size`` is the number of texts encoded per forward pass and
        ``max_length`` the tokenizer truncation length.

        Raises ``ImportError`` when ``transformers`` or ``torch`` is missing.
        """
        try:
            import torch
            from transformers import AutoModel, AutoTokenizer
        except ImportError as e:  # pragma: no cover
            raise ImportError(
                "STDenseRetriever requires `transformers` + `torch`."
            ) from e
        # Keep the module handle so ``_encode`` does not re-import torch.
        self._torch = torch
        self.model_name = model_name
        self.batch_size = batch_size
        self.max_length = max_length
        self._tok = AutoTokenizer.from_pretrained(model_name)
        self._model = AutoModel.from_pretrained(model_name)
        self._model.eval()
        self._chunks: list[Any] = []
        self._emb = None

    def _encode(self, texts: list[str]):
        """Embed ``texts`` batch by batch; one unit-length row per text."""
        import numpy as np
        torch = self._torch
        step = self.batch_size
        vectors = []
        for lo in range(0, len(texts), step):
            encoded = self._tok(texts[lo : lo + step], padding=True, truncation=True,
                                max_length=self.max_length, return_tensors="pt")
            with torch.no_grad():
                hidden = self._model(**encoded).last_hidden_state  # (B, T, H)
            # Zero out padding positions before averaging over tokens.
            weights = encoded["attention_mask"].unsqueeze(-1).float()
            pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1e-9)
            unit = torch.nn.functional.normalize(pooled, p=2, dim=1)
            vectors.append(unit.cpu().numpy())
        if not vectors:
            return np.zeros((0, 1))
        return np.vstack(vectors)

    def index(self, chunks: Sequence[Any]) -> None:
        """Store ``chunks`` and embed each chunk's ``text`` attribute."""
        self._chunks = list(chunks)
        self._emb = self._encode(
            [getattr(chunk, "text", "") or "" for chunk in self._chunks]
        )

    def retrieve(self, question: str, *, top_k: int = 10) -> list[Any]:
        """Return up to ``top_k`` chunks ranked by similarity to ``question``.

        Ties are broken by original index order. Returns ``[]`` when nothing
        has been indexed.
        """
        if self._emb is None or not self._chunks:
            return []
        query_vec = self._encode([question])[0]
        sims = self._emb @ query_vec  # cosine (vectors are unit-normalized)
        ranked = sorted(range(len(sims)), key=lambda idx: (-float(sims[idx]), idx))
        hits: list[Any] = []
        for idx in ranked[:top_k]:
            chunk = self._chunks[idx]
            try:
                chunk.score = float(sims[idx])
            except (AttributeError, TypeError):
                # Chunks that reject attribute assignment are returned unscored.
                pass
            hits.append(chunk)
        return hits

    def __len__(self) -> int:
        """Number of chunks currently indexed."""
        return len(self._chunks)
152
+
153
+
154
+ __all__ = ["BM25Retriever", "STDenseRetriever"]
@@ -0,0 +1,273 @@
1
+ # Copyright 2026 Julian Geymonat
2
+ # Licensed under the Apache License, Version 2.0
3
+ """Synthetic gold corpus — a frozen ``[[wikilink]]`` benchmark vault.
4
+
5
+ No public benchmark does QA over a ``[[wikilink]]`` vault (research verdict, see
6
+ ``MEMORY_BENCHMARK_DESIGN.md``), so we build one. This generator plants
7
+ multi-hop chains with **known gold note-chains and hop labels** — exactly the
8
+ ground truth Recall@k / hop-only-recall / oracle metrics need — and is fully
9
+ **deterministic** (seeded), so the benchmark is a re-runnable, shareable artifact
10
+ (no private data, no model in the loop).
11
+
12
+ Each question plants one chain of ``k`` hops::
13
+
14
+ anchor ──[[link]]──▶ relay₁ ──▶ … ──▶ relay_{k-1} ──▶ endpoint
15
+
16
+ - **anchor** is the only note that lexically matches the question (it carries the
17
+ unique ``topicNNNN`` token + the query's content words), so a lexical retriever
18
+ seeds *here*.
19
+ - **relays + endpoint** share **zero** tokens with the question, so they are
20
+ reachable **only by following ``[[wikilinks]]``** — the connect-the-dots
21
+ property a flat retriever misses. The endpoint holds the answer token.
22
+
23
+ So ``gold`` = the whole chain; ``hop_only`` = everything past the anchor (the
24
+ notes a flat retriever cannot reach). Distractor "filing" notes add corpus bulk
25
+ and noise (they never carry a ``topicNNNN`` token, so they never seed a query).
26
+
27
+ The vocabularies of the question, the relays, and the endpoint are kept disjoint
28
+ *by construction* (see ``_RELAY_WORDS`` / ``_ENDPOINT_WORDS`` and the query template)
29
+ so the hop-only property is guaranteed, not hoped for.
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import random
35
+ from dataclasses import dataclass, field
36
+ from pathlib import Path
37
+
38
+ # Disjoint vocabularies — the guarantee behind "hop-only". The question's ONLY
39
+ # token that appears in ANY note is the unique ``topicNNNN`` (in anchor_i), so
40
+ # anchor_i is the SOLE lexical seed. Every other query word ("starting from
41
+ # what is the final destination") is absent from all note bodies; the anchor,
42
+ # relay, and endpoint vocabularies are mutually disjoint and disjoint from the
43
+ # query. This matters: GraphRetriever ranks seeds (proximity 1.0) above
44
+ # BFS-reached notes, so any spurious extra seed would crowd the real chain out
45
+ # of top-k. One seed → clean BFS down the chain → recall≈1, hop-only≈1.
46
+ _RELAY_WORDS = ("Relay junction.", "Forward hop.", "Passthrough segment.",
47
+ "Intermediate waypoint.", "Continue along.")
48
+ _ENDPOINT_WORDS = "Endpoint marker recorded. Archive sealed." # no query word
49
+
50
+
51
@dataclass
class GoldQuestion:
    """One benchmark question together with its planted ground truth.

    Carries the same ``text`` / ``gold_doc_ids`` pair as
    :class:`wikimoth.benchmark.harness.Question`, extended with the hop label
    and the hop-only subset, so an instance can be handed to the harness as-is.
    """

    # The natural-language question.
    text: str
    # Stems of every note on the planted chain, anchor first.
    gold_doc_ids: list[str] = field(default_factory=list)
    # The chain without its anchor: the notes reachable only through links.
    hop_only_doc_ids: list[str] = field(default_factory=list)
    # Hop length of the planted chain.
    hop: int = 0
    # The unique answer token stored in the chain's last note.
    answer: str = ""
65
+
66
+
67
+ def _anchor_stem(i: int) -> str:
68
+ return f"dossier-{i:04d}-anchor"
69
+
70
+
71
+ def _relay_stem(i: int, m: int) -> str:
72
+ return f"dossier-{i:04d}-relay-{m}"
73
+
74
+
75
+ def _endpoint_stem(i: int) -> str:
76
+ return f"dossier-{i:04d}-endpoint"
77
+
78
+
79
+ def _filing_stem(j: int) -> str:
80
+ return f"filing-{j:05d}"
81
+
82
+
83
+ def _write_note(out: Path, stem: str, body: str) -> None:
84
+ fm = f"---\nname: {stem}\ndescription: \"synthetic benchmark note\"\n---\n"
85
+ out.joinpath(f"{stem}.md").write_text(fm + body + "\n", encoding="utf-8")
86
+
87
+
88
def generate_corpus(
    out_dir: str | Path,
    *,
    n_questions: int = 60,
    hops: tuple[int, ...] = (1, 2, 3),
    n_distractors: int = 200,
    seed: int = 7,
) -> list[GoldQuestion]:
    """Write a frozen synthetic vault to ``out_dir`` and return its questions.

    Parameters
    ----------
    out_dir
        Directory the ``.md`` notes are written into (created if missing).
    n_questions
        Number of planted chains (= questions).
    hops
        Hop lengths to cycle through (chain length per question). Must be
        non-empty with every value ``>= 1``. The harness retriever's
        ``max_hops`` must be ``>= max(hops)`` to reach every endpoint (use
        ``GraphRetriever(source="wikilinks", max_hops=max(hops))``).
    n_distractors
        Number of noise "filing" notes (corpus bulk; never seed a query).
    seed
        PRNG seed for distractor link wiring (chain structure is fully
        determined by the question index, so the gold set is seed-independent).

    Returns
    -------
    list[GoldQuestion]
        One question per planted chain, in question-index order.

    Raises
    ------
    ValueError
        If ``hops`` is empty or contains a value below 1. A hop length below
        1 would still plant a one-link chain but label it with the wrong
        ``hop``, so it is rejected before anything is written.
    """
    if not hops or any(h < 1 for h in hops):
        raise ValueError(
            f"hops must be a non-empty sequence of integers >= 1, got {hops!r}"
        )

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    rng = random.Random(seed)

    questions: list[GoldQuestion] = []

    for i in range(n_questions):
        k = hops[i % len(hops)]
        topic = f"topic{i:04d}"
        answer = f"ANSWERTOKEN{i:04d}"

        anchor = _anchor_stem(i)
        relays = [_relay_stem(i, m) for m in range(1, k)]  # k-1 relays
        endpoint = _endpoint_stem(i)
        chain = [anchor, *relays, endpoint]

        # anchor: the ONLY note carrying the unique topic (so the ONLY lexical
        # seed for this query) + the first chain link. Its other words
        # ("dossier reference path begins here") are absent from the query, so
        # they create no spurious seeds across anchors.
        _write_note(
            out, anchor,
            f"Dossier {topic}. Reference path begins here. [[{chain[1]}]]",
        )
        # relays: generic vocab only (no query words), link onward. Relay
        # ``idx`` sits at ``chain[idx + 1]``, so its successor is
        # ``chain[idx + 2]`` — i.e. ``chain[2:]`` lines up with ``relays``.
        for idx, (relay, nxt) in enumerate(zip(relays, chain[2:])):
            phrase = _RELAY_WORDS[idx % len(_RELAY_WORDS)]
            _write_note(out, relay, f"{phrase} Segment {idx + 1}. [[{nxt}]]")
        # endpoint: holds the answer, disjoint vocab, no outgoing chain link.
        _write_note(
            out, endpoint,
            f"{_ENDPOINT_WORDS} Marker value: {answer}.",
        )

        # Query: its ONLY token present in any note is `topic` (→ anchor_i).
        # "starting from what is the final destination" appear in no note body.
        questions.append(
            GoldQuestion(
                text=f"Starting from {topic}, what is the final destination?",
                gold_doc_ids=list(chain),
                hop_only_doc_ids=relays + [endpoint],
                hop=k,
                answer=answer,
            )
        )

    # Distractor "filing" notes: bulk + noise, optionally cross-linked among
    # themselves so the graph is not trivially star-shaped. They never carry a
    # topicNNNN token, so they never lexically seed a question.
    filings = [_filing_stem(j) for j in range(n_distractors)]
    for j, stem in enumerate(filings):
        link = ""
        # One ``random()`` draw per filing, plus one ``choice()`` when it
        # lands under 0.5; a self-link draw is discarded, not retried.
        if rng.random() < 0.5:
            tgt = rng.choice(filings)
            if tgt != stem:
                link = f" [[{tgt}]]"
        _write_note(
            out, stem,
            f"Filing {j:05d}. Routine archived record, no cross-reference of "
            f"note.{link}",
        )

    return questions
178
+
179
+
180
+ # ---------------------------------------------------------------------------
181
+ # Realistic corpus — decoy-rich (the credible differentiator)
182
+ # ---------------------------------------------------------------------------
183
+ # The clean corpus above proves the mechanism but its distractors share NO words
184
+ # with the query (flat retrieval fails by having nothing to match). The realistic
185
+ # corpus is the convincing version: distractor "memo" notes DO share query words
186
+ # (they actively mislead a flat/lexical retriever and bloat a dump), while the
187
+ # answer still lives only at the end of a [[wikilink]] chain. So flat retrieval
188
+ # is tempted toward decoys and misses the answer; the graph walks to it.
189
+ #
190
+ # Ranking guarantee (so the result is robust, not luck): the anchor contains the
191
+ # unique topic token PLUS every phrase word → it is always the single top seed
192
+ # (graph BFS starts there). Decoys contain a SUBSET of the phrase words → they
193
+ # rank below the anchor but above the chain. The chain (links + answer) contains
194
+ # NO phrase word → flat never seeds it; only graph traversal reaches it.
195
+ _PHRASE = ("city", "team", "lead", "responsible") # query words; anchor has all
196
+
197
+
198
def generate_realistic_corpus(
    out_dir: str | Path,
    *,
    n_questions: int = 60,
    hops: tuple[int, ...] = (1, 2, 3),
    n_decoys: int = 200,
    seed: int = 11,
) -> list[GoldQuestion]:
    """Write a frozen *decoy-rich* synthetic vault and return its questions.

    Like :func:`generate_corpus` but with knowledge-base-style notes and
    **lexical decoys** that share query words, so a flat/lexical retriever is
    actively misled (not merely starved). Deterministic; answers are unique
    tokens (EM-gradeable, no LLM-judge needed).

    Parameters
    ----------
    out_dir
        Directory the ``.md`` notes are written into (created if missing).
    n_questions
        Number of planted chains (= questions).
    hops
        Hop lengths to cycle through; must be non-empty with every value
        ``>= 1``.
    n_decoys
        Number of "memo" decoy notes, each carrying two phrase words.
    seed
        PRNG seed for choosing each decoy's phrase words.

    Returns
    -------
    list[GoldQuestion]
        One question per planted chain, in question-index order.

    Raises
    ------
    ValueError
        If ``hops`` is empty or contains a value below 1. A hop length below
        1 would still plant a one-link chain but label it with the wrong
        ``hop``, so it is rejected before anything is written.
    """
    if not hops or any(h < 1 for h in hops):
        raise ValueError(
            f"hops must be a non-empty sequence of integers >= 1, got {hops!r}"
        )

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    rng = random.Random(seed)
    questions: list[GoldQuestion] = []

    for i in range(n_questions):
        k = hops[i % len(hops)]
        topic = f"proj{i:04d}"
        answer = f"CITY{i:04d}"

        anchor = f"record-{i:04d}-anchor"
        relays = [f"record-{i:04d}-link-{m}" for m in range(1, k)]
        ans_note = f"record-{i:04d}-answer"
        chain = [anchor, *relays, ans_note]

        # anchor: unique topic + ALL phrase words → always the top seed.
        _write_note(
            out, anchor,
            f"Project {topic}. The responsible team lead and city for this "
            f"project are on record. See the owning unit. [[{chain[1]}]]",
        )
        # relays: no phrase word → flat never seeds them. ``chain[2:]`` lines
        # up with ``relays``, giving each relay its successor on the chain.
        for idx, (relay, nxt) in enumerate(zip(relays, chain[2:])):
            _write_note(out, relay, f"Owning unit node {idx + 1}. Continue to record. [[{nxt}]]")
        # answer note: no phrase word; holds the unique answer city token.
        _write_note(
            out, ans_note,
            f"Coordinator profile. Stationed permanently at {answer}.",
        )

        # The question shares the unique topic with the anchor and the phrase
        # words with the anchor + decoys (NOT the chain).
        questions.append(
            GoldQuestion(
                text=f"Which city hosts the responsible team lead for {topic}?",
                gold_doc_ids=list(chain),
                hop_only_doc_ids=relays + [ans_note],
                hop=k,
                answer=answer,
            )
        )

    # Decoys: each carries a 2-subset of the phrase words (so it lexically
    # matches the query and ranks above the chain, but below the all-phrase
    # anchor) and never links into a chain.
    for j in range(n_decoys):
        words = rng.sample(_PHRASE, 2)
        _write_note(
            out, f"memo-{j:05d}",
            f"Memo {j:05d}: notes on {words[0]} and {words[1]} matters, "
            f"unrelated to any specific project record.",
        )

    return questions
271
+
272
+
273
+ __all__ = ["GoldQuestion", "generate_corpus", "generate_realistic_corpus"]