wikimoth 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wikimoth/__init__.py +32 -0
- wikimoth/benchmark/__init__.py +32 -0
- wikimoth/benchmark/baselines.py +154 -0
- wikimoth/benchmark/corpus.py +273 -0
- wikimoth/benchmark/harness.py +405 -0
- wikimoth/capture/__init__.py +39 -0
- wikimoth/capture/buffer.py +143 -0
- wikimoth/capture/config.py +128 -0
- wikimoth/capture/hook.py +116 -0
- wikimoth/capture/install.py +168 -0
- wikimoth/capture/links.py +217 -0
- wikimoth/capture/note.py +274 -0
- wikimoth/capture/recall.py +107 -0
- wikimoth/cli.py +166 -0
- wikimoth/compaction.py +148 -0
- wikimoth/hybrid.py +174 -0
- wikimoth/pipeline.py +389 -0
- wikimoth/reader.py +135 -0
- wikimoth/retrieval/__init__.py +32 -0
- wikimoth/retrieval/chunk.py +32 -0
- wikimoth/retrieval/chunking.py +58 -0
- wikimoth/retrieval/graph.py +301 -0
- wikimoth/tokens.py +59 -0
- wikimoth/viewer.py +470 -0
- wikimoth-0.1.0.dist-info/METADATA +223 -0
- wikimoth-0.1.0.dist-info/RECORD +30 -0
- wikimoth-0.1.0.dist-info/WHEEL +5 -0
- wikimoth-0.1.0.dist-info/entry_points.txt +2 -0
- wikimoth-0.1.0.dist-info/licenses/LICENSE +201 -0
- wikimoth-0.1.0.dist-info/top_level.txt +1 -0
wikimoth/__init__.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""WikiMoth — deterministic, token-minimal, reproducible memory for Claude/agents.
|
|
2
|
+
|
|
3
|
+
Pipeline: MOTHRAG ``GraphRetriever(source="wikilinks")`` pulls the relevant
|
|
4
|
+
note-chain from a ``[[wikilink]]`` markdown vault → a compaction stage →
|
|
5
|
+
a Claude reader. The public surface is the :class:`MemoryRAG` pipeline plus its
|
|
6
|
+
pluggable stages (compactor, reader).
|
|
7
|
+
|
|
8
|
+
Importing this package is free of network/API and of the MOTHRAG dependency:
|
|
9
|
+
MOTHRAG is imported lazily, only when a vault is indexed with the default
|
|
10
|
+
retriever (or when :class:`ClaudeReader` is constructed).
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from wikimoth.compaction import Compactor, HeadroomCompactor, NoOpCompactor
|
|
14
|
+
from wikimoth.pipeline import MemoryRAG
|
|
15
|
+
from wikimoth.reader import ClaudeReader, EchoReader, Reader
|
|
16
|
+
from wikimoth.tokens import count_passage_tokens, count_tokens, token_backend
|
|
17
|
+
|
|
18
|
+
__version__ = "0.1.0"
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
"MemoryRAG",
|
|
22
|
+
"Compactor",
|
|
23
|
+
"NoOpCompactor",
|
|
24
|
+
"HeadroomCompactor",
|
|
25
|
+
"Reader",
|
|
26
|
+
"EchoReader",
|
|
27
|
+
"ClaudeReader",
|
|
28
|
+
"count_tokens",
|
|
29
|
+
"count_passage_tokens",
|
|
30
|
+
"token_backend",
|
|
31
|
+
"__version__",
|
|
32
|
+
]
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""WikiMoth benchmark harness (4-arm efficiency test).
|
|
2
|
+
|
|
3
|
+
Exports the harness surface: the four arms (``dump`` / ``agentic`` (stub) /
|
|
4
|
+
``deterministic`` / ``deterministic_compacted``), the :class:`Question` /
|
|
5
|
+
:class:`ArmRecord` records, and the :func:`oracle_retrieval_loss` hook.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from wikimoth.benchmark.corpus import (
|
|
9
|
+
GoldQuestion,
|
|
10
|
+
generate_corpus,
|
|
11
|
+
generate_realistic_corpus,
|
|
12
|
+
)
|
|
13
|
+
from wikimoth.benchmark.harness import (
|
|
14
|
+
ARMS,
|
|
15
|
+
ArmRecord,
|
|
16
|
+
FourArmHarness,
|
|
17
|
+
Question,
|
|
18
|
+
oracle_retrieval_loss,
|
|
19
|
+
summarize,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"ARMS",
|
|
24
|
+
"ArmRecord",
|
|
25
|
+
"FourArmHarness",
|
|
26
|
+
"Question",
|
|
27
|
+
"oracle_retrieval_loss",
|
|
28
|
+
"summarize",
|
|
29
|
+
"GoldQuestion",
|
|
30
|
+
"generate_corpus",
|
|
31
|
+
"generate_realistic_corpus",
|
|
32
|
+
]
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# Copyright 2026 Julian Geymonat
|
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
|
3
|
+
"""Real flat baselines for the benchmark — BM25 (sparse) and dense (semantic).
|
|
4
|
+
|
|
5
|
+
The clean/realistic corpora compare graph traversal against retrieval that has
|
|
6
|
+
no notion of links. To make that comparison credible (not "GraphRetriever with
|
|
7
|
+
traversal disabled"), this module ships two *standard* flat retrievers that
|
|
8
|
+
satisfy MOTHRAG's ``Retriever`` Protocol (``index`` / ``retrieve`` / ``__len__``)
|
|
9
|
+
so they drop straight into :class:`wikimoth.MemoryRAG` / the harness:
|
|
10
|
+
|
|
11
|
+
- :class:`BM25Retriever` — Okapi BM25 (``rank_bm25``). The canonical sparse
|
|
12
|
+
baseline. Like any lexical method it cannot reach a note that shares no terms
|
|
13
|
+
with the query.
|
|
14
|
+
- :class:`STDenseRetriever` — sentence-transformers bi-encoder cosine
|
|
15
|
+
(``all-MiniLM-L6-v2`` by default). The honest *semantic* adversary: it can
|
|
16
|
+
reach a note that is topically related without sharing words. $0 inference,
|
|
17
|
+
no API key (local model); import-guarded so the package needs neither dep
|
|
18
|
+
until a baseline is actually constructed.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import os
|
|
24
|
+
import re
|
|
25
|
+
from typing import Any, Sequence
|
|
26
|
+
|
|
27
|
+
# Set before any torch import: avoids a known OpenMP duplicate-runtime crash on
|
|
28
|
+
# Windows (the same conflict that segfaults the `sentence_transformers` package
|
|
29
|
+
# on this stack — we use transformers+torch directly to sidestep it).
|
|
30
|
+
os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")
|
|
31
|
+
|
|
32
|
+
_WORD = re.compile(r"\w+")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _tokenize(text: str) -> list[str]:
|
|
36
|
+
return _WORD.findall((text or "").lower())
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class BM25Retriever:
    """Okapi BM25 sparse retriever (``rank_bm25``). Real flat sparse baseline.

    Exposes the ``index`` / ``retrieve`` / ``__len__`` surface. ``rank_bm25``
    is imported only when a non-empty corpus is indexed.
    """

    name = "bm25"

    def __init__(self) -> None:
        self._chunks: list[Any] = []
        # BM25Okapi instance; stays None until a non-empty corpus is indexed.
        self._bm25 = None

    def index(self, chunks: Sequence[Any]) -> None:
        """Build the BM25 index over ``chunks``, replacing any previous index.

        Each chunk's ``text`` attribute is tokenized; a missing or falsy
        ``text`` is treated as the empty string. An empty ``chunks`` sequence
        simply clears the index.

        Raises
        ------
        ImportError
            If ``chunks`` is non-empty and ``rank_bm25`` is not installed.
        """
        docs = list(chunks)
        if not docs:
            # BM25Okapi computes an average document length by dividing by the
            # corpus size, so an empty corpus would raise ZeroDivisionError.
            # An empty index is valid: ``retrieve`` then returns [].
            self._chunks = []
            self._bm25 = None
            return
        try:
            from rank_bm25 import BM25Okapi
        except ImportError as e:  # pragma: no cover
            raise ImportError("BM25Retriever requires `rank_bm25` (pip install rank_bm25).") from e
        corpus = [_tokenize(getattr(c, "text", "") or "") for c in docs]
        # BM25Okapi needs a non-empty doc per row; substitute a sentinel token.
        corpus = [toks or ["\x00empty"] for toks in corpus]
        bm25 = BM25Okapi(corpus)
        # Commit chunks and index together so they can never get out of sync
        # if building the index fails.
        self._chunks = docs
        self._bm25 = bm25

    def retrieve(self, question: str, *, top_k: int = 10) -> list[Any]:
        """Return up to ``top_k`` indexed chunks, best BM25 score first.

        Ties are broken by index order, so the ranking is deterministic. Each
        returned chunk gets its score written to ``chunk.score`` when the
        chunk allows attribute assignment. Returns ``[]`` when nothing is
        indexed or ``top_k`` is not positive.
        """
        # A negative ``top_k`` would otherwise slice from the end of the
        # ranking and return nearly every chunk.
        if top_k <= 0 or not self._chunks or self._bm25 is None:
            return []
        scores = self._bm25.get_scores(_tokenize(question))
        order = sorted(range(len(scores)), key=lambda i: (-scores[i], i))[:top_k]
        out: list[Any] = []
        for i in order:
            c = self._chunks[i]
            try:
                c.score = float(scores[i])
            except (AttributeError, TypeError):
                # Best effort: chunks that reject attribute assignment are
                # returned without a score.
                pass
            out.append(c)
        return out

    def __len__(self) -> int:
        """Return the number of indexed chunks."""
        return len(self._chunks)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class STDenseRetriever:
    """Dense semantic retriever — MiniLM bi-encoder cosine, via transformers.

    The honest *semantic* baseline: it can reach a note topically related to the
    query without sharing words. Built directly on ``transformers`` + ``torch``
    (mean pooling + L2 normalize — exactly what sentence-transformers does for
    this model) to sidestep a local segfault in the ``sentence_transformers``
    package. Local model → $0 inference, no API key. ``torch`` and
    ``transformers`` are imported, and the model + tokenizer loaded, when the
    retriever is constructed rather than at module import (first use downloads
    ~80MB from the HF hub).
    """

    name = "dense_st"

    def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
                 *, batch_size: int = 64, max_length: int = 256) -> None:
        """Load the tokenizer and model named ``model_name``.

        ``batch_size`` is the number of texts encoded per forward pass and
        ``max_length`` the tokenizer truncation length.

        Raises
        ------
        ImportError
            If ``transformers`` or ``torch`` is not installed.
        """
        try:
            import torch
            from transformers import AutoModel, AutoTokenizer
        except ImportError as e:  # pragma: no cover
            raise ImportError(
                "STDenseRetriever requires `transformers` + `torch`."
            ) from e
        self._torch = torch
        self.model_name = model_name
        self.batch_size = batch_size
        self.max_length = max_length
        self._tok = AutoTokenizer.from_pretrained(model_name)
        self._model = AutoModel.from_pretrained(model_name)
        self._model.eval()
        self._chunks: list[Any] = []
        # Matrix of unit-normalized chunk embeddings; None until ``index`` runs.
        self._emb = None

    def _encode(self, texts: list[str]):
        """Embed ``texts`` and return the stacked unit-normalized vectors.

        Texts are encoded ``batch_size`` at a time. An empty ``texts`` list
        yields a ``(0, 1)`` zero array.
        """
        import numpy as np
        torch = self._torch
        out_chunks = []
        for start in range(0, len(texts), self.batch_size):
            batch = texts[start : start + self.batch_size]
            enc = self._tok(batch, padding=True, truncation=True,
                            max_length=self.max_length, return_tensors="pt")
            with torch.no_grad():
                model_out = self._model(**enc)
            tok_emb = model_out.last_hidden_state  # (B, T, H)
            # Zero out padding positions so they do not contribute to the mean.
            mask = enc["attention_mask"].unsqueeze(-1).float()
            summed = (tok_emb * mask).sum(dim=1)
            # Clamp guards against division by zero for an all-padding row.
            counts = mask.sum(dim=1).clamp(min=1e-9)
            emb = summed / counts  # mean pooling
            emb = torch.nn.functional.normalize(emb, p=2, dim=1)
            out_chunks.append(emb.cpu().numpy())
        return np.vstack(out_chunks) if out_chunks else np.zeros((0, 1))

    def index(self, chunks: Sequence[Any]) -> None:
        """Embed every chunk's ``text`` and store the result, replacing any previous index.

        A missing or falsy ``text`` attribute is embedded as the empty string.
        """
        self._chunks = list(chunks)
        texts = [getattr(c, "text", "") or "" for c in self._chunks]
        self._emb = self._encode(texts)

    def retrieve(self, question: str, *, top_k: int = 10) -> list[Any]:
        """Return up to ``top_k`` indexed chunks, highest cosine similarity first.

        Ties are broken by index order, so the ranking is deterministic. Each
        returned chunk gets its score written to ``chunk.score`` when the
        chunk allows attribute assignment. Returns ``[]`` when nothing is
        indexed or ``top_k`` is not positive.
        """
        # Checked before encoding the query: a negative ``top_k`` would
        # otherwise slice from the end of the ranking and return nearly every
        # chunk, and a zero ``top_k`` would waste a model forward pass.
        if top_k <= 0 or not self._chunks or self._emb is None:
            return []
        q = self._encode([question])[0]
        scores = self._emb @ q  # cosine (vectors are unit-normalized)
        order = sorted(range(len(scores)), key=lambda i: (-float(scores[i]), i))[:top_k]
        out: list[Any] = []
        for i in order:
            c = self._chunks[i]
            try:
                c.score = float(scores[i])
            except (AttributeError, TypeError):
                # Best effort: chunks that reject attribute assignment are
                # returned without a score.
                pass
            out.append(c)
        return out

    def __len__(self) -> int:
        """Return the number of indexed chunks."""
        return len(self._chunks)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
__all__ = ["BM25Retriever", "STDenseRetriever"]
|
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
# Copyright 2026 Julian Geymonat
|
|
2
|
+
# Licensed under the Apache License, Version 2.0
|
|
3
|
+
"""Synthetic gold corpus — a frozen ``[[wikilink]]`` benchmark vault.
|
|
4
|
+
|
|
5
|
+
No public benchmark does QA over a ``[[wikilink]]`` vault (research verdict, see
|
|
6
|
+
``MEMORY_BENCHMARK_DESIGN.md``), so we build one. This generator plants
|
|
7
|
+
multi-hop chains with **known gold note-chains and hop labels** — exactly the
|
|
8
|
+
ground truth Recall@k / hop-only-recall / oracle metrics need — and is fully
|
|
9
|
+
**deterministic** (seeded), so the benchmark is a re-runnable, shareable artifact
|
|
10
|
+
(no private data, no model in the loop).
|
|
11
|
+
|
|
12
|
+
Each question plants one chain of ``k`` hops::
|
|
13
|
+
|
|
14
|
+
anchor ──[[link]]──▶ relay₁ ──▶ … ──▶ relay_{k-1} ──▶ endpoint
|
|
15
|
+
|
|
16
|
+
- **anchor** is the only note that lexically matches the question (it carries the
|
|
17
|
+
unique ``topicNNNN`` token + the query's content words), so a lexical retriever
|
|
18
|
+
seeds *here*.
|
|
19
|
+
- **relays + endpoint** share **zero** tokens with the question, so they are
|
|
20
|
+
reachable **only by following ``[[wikilinks]]``** — the connect-the-dots
|
|
21
|
+
property a flat retriever misses. The endpoint holds the answer token.
|
|
22
|
+
|
|
23
|
+
So ``gold`` = the whole chain; ``hop_only`` = everything past the anchor (the
|
|
24
|
+
notes a flat retriever cannot reach). Distractor "filing" notes add corpus bulk
|
|
25
|
+
and noise (they never carry a ``topicNNNN`` token, so they never seed a query).
|
|
26
|
+
|
|
27
|
+
The vocabularies of the question, the relays, and the endpoint are kept disjoint
|
|
28
|
+
*by construction* (see the question template and ``_RELAY_WORDS`` / ``_ENDPOINT_WORDS``)
|
|
29
|
+
so the hop-only property is guaranteed, not hoped for.
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
from __future__ import annotations
|
|
33
|
+
|
|
34
|
+
import random
|
|
35
|
+
from dataclasses import dataclass, field
|
|
36
|
+
from pathlib import Path
|
|
37
|
+
|
|
38
|
+
# Disjoint vocabularies — the guarantee behind "hop-only". The question's ONLY
|
|
39
|
+
# token that appears in ANY note is the unique ``topicNNNN`` (in anchor_i), so
|
|
40
|
+
# anchor_i is the SOLE lexical seed. Every other query word ("starting from
|
|
41
|
+
# what is the final destination") is absent from all note bodies; the anchor,
|
|
42
|
+
# relay, and endpoint vocabularies are mutually disjoint and disjoint from the
|
|
43
|
+
# query. This matters: GraphRetriever ranks seeds (proximity 1.0) above
|
|
44
|
+
# BFS-reached notes, so any spurious extra seed would crowd the real chain out
|
|
45
|
+
# of top-k. One seed → clean BFS down the chain → recall≈1, hop-only≈1.
|
|
46
|
+
_RELAY_WORDS = ("Relay junction.", "Forward hop.", "Passthrough segment.",
|
|
47
|
+
"Intermediate waypoint.", "Continue along.")
|
|
48
|
+
_ENDPOINT_WORDS = "Endpoint marker recorded. Archive sealed." # no query word
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class GoldQuestion:
    """One benchmark question together with its gold note-chain and hop label.

    Field-compatible with :class:`wikimoth.benchmark.harness.Question`, with the
    hop-only subset added, so instances can be handed directly to the harness
    (which reads ``text`` / ``gold_doc_ids`` and, if present, ``hop`` /
    ``hop_only_doc_ids``).
    """

    # The question text.
    text: str
    # Note ids of the whole planted chain.
    gold_doc_ids: list[str] = field(default_factory=list)
    # Note ids of everything in the chain past the anchor.
    hop_only_doc_ids: list[str] = field(default_factory=list)
    # Hop length of the planted chain.
    hop: int = 0
    # The unique answer token.
    answer: str = ""
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _anchor_stem(i: int) -> str:
|
|
68
|
+
return f"dossier-{i:04d}-anchor"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _relay_stem(i: int, m: int) -> str:
|
|
72
|
+
return f"dossier-{i:04d}-relay-{m}"
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _endpoint_stem(i: int) -> str:
|
|
76
|
+
return f"dossier-{i:04d}-endpoint"
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _filing_stem(j: int) -> str:
|
|
80
|
+
return f"filing-{j:05d}"
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _write_note(out: Path, stem: str, body: str) -> None:
|
|
84
|
+
fm = f"---\nname: {stem}\ndescription: \"synthetic benchmark note\"\n---\n"
|
|
85
|
+
out.joinpath(f"{stem}.md").write_text(fm + body + "\n", encoding="utf-8")
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def generate_corpus(
    out_dir: str | Path,
    *,
    n_questions: int = 60,
    hops: tuple[int, ...] = (1, 2, 3),
    n_distractors: int = 200,
    seed: int = 7,
) -> list[GoldQuestion]:
    """Write a frozen synthetic vault to ``out_dir`` and return its questions.

    Parameters
    ----------
    out_dir
        Directory the notes are written to (created if missing).
    n_questions
        Number of planted chains (= questions).
    hops
        Hop lengths to cycle through (chain length per question); each must
        be ``>= 1``. The harness retriever's ``max_hops`` must be
        ``>= max(hops)`` to reach every endpoint (use
        ``GraphRetriever(source="wikilinks", max_hops=max(hops))``).
    n_distractors
        Number of noise "filing" notes (corpus bulk; never seed a query).
    seed
        PRNG seed for distractor link wiring (chain structure is fully
        determined by the question index, so the gold set is seed-independent).

    Returns
    -------
    list[GoldQuestion]
        One question per planted chain, in question-index order.

    Raises
    ------
    ValueError
        If questions are requested and ``hops`` is empty or contains a hop
        length below 1.
    """
    if n_questions > 0:
        # Fail up front with a clear message instead of a ZeroDivisionError
        # from ``i % len(hops)``. A hop length below 1 would silently write a
        # one-link chain labelled with the wrong hop count.
        if not hops:
            raise ValueError("hops must contain at least one hop length")
        if any(h < 1 for h in hops):
            raise ValueError(f"every hop length must be >= 1, got {tuple(hops)!r}")

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    rng = random.Random(seed)

    questions: list[GoldQuestion] = []

    for i in range(n_questions):
        k = hops[i % len(hops)]
        topic = f"topic{i:04d}"
        answer = f"ANSWERTOKEN{i:04d}"

        anchor = _anchor_stem(i)
        relays = [_relay_stem(i, m) for m in range(1, k)]  # k-1 relays
        endpoint = _endpoint_stem(i)
        chain = [anchor, *relays, endpoint]

        # anchor: the ONLY note carrying the unique topic (so the ONLY lexical
        # seed for this query) + the first chain link. Its other words
        # ("dossier reference path begins here") are absent from the query, so
        # they create no spurious seeds across the anchors.
        _write_note(
            out, anchor,
            f"Dossier {topic}. Reference path begins here. [[{chain[1]}]]",
        )
        # relays: generic vocab only (no query words), link onward.
        # relays[idx] sits at chain[idx + 1], so its successor is chain[idx + 2].
        for idx, (relay, nxt) in enumerate(zip(relays, chain[2:])):
            phrase = _RELAY_WORDS[idx % len(_RELAY_WORDS)]
            _write_note(out, relay, f"{phrase} Segment {idx + 1}. [[{nxt}]]")
        # endpoint: holds the answer, disjoint vocab, no outgoing chain link.
        _write_note(
            out, endpoint,
            f"{_ENDPOINT_WORDS} Marker value: {answer}.",
        )

        # Query: its ONLY token present in any note is `topic` (→ anchor_i).
        # "starting from what is the final destination" appear in no note body.
        text = f"Starting from {topic}, what is the final destination?"
        questions.append(
            GoldQuestion(
                text=text,
                gold_doc_ids=list(chain),
                hop_only_doc_ids=relays + [endpoint],
                hop=k,
                answer=answer,
            )
        )

    # Distractor "filing" notes: bulk + noise, optionally cross-linked among
    # themselves so the graph is not trivially star-shaped. They never carry a
    # topicNNNN token, so they never lexically seed a question.
    filings = [_filing_stem(j) for j in range(n_distractors)]
    for j, stem in enumerate(filings):
        link = ""
        # The order of PRNG draws (random, then choice) is part of the frozen
        # corpus: changing it changes which filings get linked for a given seed.
        if rng.random() < 0.5:
            tgt = rng.choice(filings)
            if tgt != stem:  # no self-links
                link = f" [[{tgt}]]"
        _write_note(
            out, stem,
            f"Filing {j:05d}. Routine archived record, no cross-reference of "
            f"note.{link}",
        )

    return questions
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
# ---------------------------------------------------------------------------
|
|
181
|
+
# Realistic corpus — decoy-rich (the credible differentiator)
|
|
182
|
+
# ---------------------------------------------------------------------------
|
|
183
|
+
# The clean corpus above proves the mechanism but its distractors share NO words
|
|
184
|
+
# with the query (flat retrieval fails by having nothing to match). The realistic
|
|
185
|
+
# corpus is the convincing version: distractor "memo" notes DO share query words
|
|
186
|
+
# (they actively mislead a flat/lexical retriever and bloat a dump), while the
|
|
187
|
+
# answer still lives only at the end of a [[wikilink]] chain. So flat retrieval
|
|
188
|
+
# is tempted toward decoys and misses the answer; the graph walks to it.
|
|
189
|
+
#
|
|
190
|
+
# Ranking guarantee (so the result is robust, not luck): the anchor contains the
|
|
191
|
+
# unique topic token PLUS every phrase word → it is always the single top seed
|
|
192
|
+
# (graph BFS starts there). Decoys contain a SUBSET of the phrase words → they
|
|
193
|
+
# rank below the anchor but above the chain. The chain (links + answer) contains
|
|
194
|
+
# NO phrase word → flat never seeds it; only graph traversal reaches it.
|
|
195
|
+
_PHRASE = ("city", "team", "lead", "responsible") # query words; anchor has all
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def generate_realistic_corpus(
    out_dir: str | Path,
    *,
    n_questions: int = 60,
    hops: tuple[int, ...] = (1, 2, 3),
    n_decoys: int = 200,
    seed: int = 11,
) -> list[GoldQuestion]:
    """Write a frozen *decoy-rich* synthetic vault and return its questions.

    Like :func:`generate_corpus` but with knowledge-base-style notes and
    **lexical decoys** that share query words, so a flat/lexical retriever is
    actively misled (not merely starved). Deterministic; answers are unique
    tokens (EM-gradeable, no LLM-judge needed).

    Parameters
    ----------
    out_dir
        Directory the notes are written to (created if missing).
    n_questions
        Number of planted chains (= questions).
    hops
        Hop lengths to cycle through (chain length per question); each must
        be ``>= 1``.
    n_decoys
        Number of decoy "memo" notes, each carrying two of the phrase words.
    seed
        PRNG seed for choosing each decoy's phrase words (the chains depend
        only on the question index).

    Returns
    -------
    list[GoldQuestion]
        One question per planted chain, in question-index order.

    Raises
    ------
    ValueError
        If questions are requested and ``hops`` is empty or contains a hop
        length below 1.
    """
    if n_questions > 0:
        # Fail up front with a clear message instead of a ZeroDivisionError
        # from ``i % len(hops)``. A hop length below 1 would silently write a
        # one-link chain labelled with the wrong hop count.
        if not hops:
            raise ValueError("hops must contain at least one hop length")
        if any(h < 1 for h in hops):
            raise ValueError(f"every hop length must be >= 1, got {tuple(hops)!r}")

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    rng = random.Random(seed)
    questions: list[GoldQuestion] = []

    for i in range(n_questions):
        k = hops[i % len(hops)]
        topic = f"proj{i:04d}"
        answer = f"CITY{i:04d}"

        anchor = f"record-{i:04d}-anchor"
        relays = [f"record-{i:04d}-link-{m}" for m in range(1, k)]  # k-1 relays
        ans_note = f"record-{i:04d}-answer"
        chain = [anchor, *relays, ans_note]

        # anchor: unique topic + ALL phrase words → always the top seed.
        _write_note(
            out, anchor,
            f"Project {topic}. The responsible team lead and city for this "
            f"project are on record. See the owning unit. [[{chain[1]}]]",
        )
        # relays: no phrase word → flat never seeds them.
        # relays[idx] sits at chain[idx + 1], so its successor is chain[idx + 2].
        for idx, (relay, nxt) in enumerate(zip(relays, chain[2:])):
            _write_note(out, relay, f"Owning unit node {idx + 1}. Continue to record. [[{nxt}]]")
        # answer note: no phrase word; holds the unique answer city token.
        _write_note(
            out, ans_note,
            f"Coordinator profile. Stationed permanently at {answer}.",
        )

        # The question shares the unique topic with the anchor and the phrase
        # words with the anchor + decoys (NOT the chain).
        text = (
            f"Which city hosts the responsible team lead for {topic}?"
        )
        questions.append(
            GoldQuestion(
                text=text,
                gold_doc_ids=list(chain),
                hop_only_doc_ids=relays + [ans_note],
                hop=k,
                answer=answer,
            )
        )

    # Decoys: each carries a 2-subset of the phrase words (so it lexically
    # matches the query and ranks above the chain, but below the all-phrase
    # anchor) and never links into a chain.
    for j in range(n_decoys):
        words = rng.sample(_PHRASE, 2)
        _write_note(
            out, f"memo-{j:05d}",
            f"Memo {j:05d}: notes on {words[0]} and {words[1]} matters, "
            f"unrelated to any specific project record.",
        )

    return questions
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
__all__ = ["GoldQuestion", "generate_corpus", "generate_realistic_corpus"]
|