ralph-hero-knowledge-index 0.1.31 → 0.1.33
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/.mcp.json +1 -1
- package/benchmark/README.md +90 -1
- package/benchmark/eval-rerank.mjs +241 -0
- package/benchmark/reindex-heap-bench.ts +369 -0
- package/dist/hybrid-search.d.ts +22 -1
- package/dist/hybrid-search.js +100 -2
- package/dist/hybrid-search.js.map +1 -1
- package/dist/index.d.ts +9 -0
- package/dist/index.js +22 -2
- package/dist/index.js.map +1 -1
- package/dist/reranker.d.ts +127 -0
- package/dist/reranker.js +159 -0
- package/dist/reranker.js.map +1 -0
- package/dist/search.d.ts +34 -0
- package/dist/search.js.map +1 -1
- package/package.json +3 -1
- package/src/__tests__/hybrid-search.test.ts +412 -0
- package/src/__tests__/index.test.ts +236 -0
- package/src/__tests__/reranker.test.ts +253 -0
- package/src/hybrid-search.ts +103 -0
- package/src/index.ts +29 -1
- package/src/reranker.ts +240 -0
- package/src/search.ts +34 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "ralph-knowledge",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.33",
|
|
4
4
|
"description": "Knowledge graph for ralph-hero: semantic search, relationship traversal, and document indexing across thoughts/ documents. Optional companion to ralph-hero.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Chad Dubiel",
|
package/.mcp.json
CHANGED
package/benchmark/README.md
CHANGED
|
@@ -11,7 +11,8 @@ will not break the CI matrix on Node 18/20/22.
|
|
|
11
11
|
## Running
|
|
12
12
|
|
|
13
13
|
Each script is a standalone TypeScript file that can be run directly with
|
|
14
|
-
`tsx` (
|
|
14
|
+
`tsx` (declared as a `devDependency` in `package.json`, installed by
|
|
15
|
+
`npm ci`):
|
|
15
16
|
|
|
16
17
|
```bash
|
|
17
18
|
# From repo root or plugin/ralph-knowledge:
|
|
@@ -19,6 +20,9 @@ npx tsx benchmark/reranker-bench.ts
|
|
|
19
20
|
|
|
20
21
|
# Or, equivalently, with the node loader form:
|
|
21
22
|
node --import tsx benchmark/reranker-bench.ts
|
|
23
|
+
|
|
24
|
+
# Or via the npm script (used by CI for the heap bench):
|
|
25
|
+
npm run bench:heap -- --assert
|
|
22
26
|
```
|
|
23
27
|
|
|
24
28
|
Scripts read the same `RALPH_KNOWLEDGE_DB` env var as the MCP server, so by
|
|
@@ -48,3 +52,88 @@ the entire run.
|
|
|
48
52
|
The script is purely additive — it does not modify `hybrid-search.ts` or any
|
|
49
53
|
production source file. Production wiring of a default reranker is a separate
|
|
50
54
|
followup gated on the benchmark findings.
|
|
55
|
+
|
|
56
|
+
### `reindex-heap-bench.ts` (GH-913)
|
|
57
|
+
|
|
58
|
+
Microbenchmark guarding the OOM fix from #907 (#911 embedder tensor disposal,
|
|
59
|
+
#916 chunker forward-progress). Generates a deterministic 50-doc / ~240-chunk
|
|
60
|
+
synthetic corpus in a tmp dir via a seeded `mulberry32` RNG, runs `reindex()`
|
|
61
|
+
against it with `RALPH_CONTEXTUAL_RETRIEVAL=0`, samples
|
|
62
|
+
`process.memoryUsage()` every 100 ms, and writes a TSV row with peak
|
|
63
|
+
`heap_used`, `rss`, `external`, wall clock, and chunk count. (The reranker
|
|
64
|
+
bench measures cold-start; the heap bench does not, because `reindex()`
|
|
65
|
+
exposes no hook to mark the moment when the embedding model finishes loading.)
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
# Run once, write TSV row, no exit-1 behavior:
|
|
69
|
+
npx tsx benchmark/reindex-heap-bench.ts
|
|
70
|
+
|
|
71
|
+
# Same, but exit 1 if peak_heap_used > 600 MB or peak_rss > 800 MB:
|
|
72
|
+
npx tsx benchmark/reindex-heap-bench.ts --assert
|
|
73
|
+
|
|
74
|
+
# Same as above but via the npm script (used by CI in build-and-test-knowledge):
|
|
75
|
+
npm run bench:heap -- --assert
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
Results are appended one row per run to `benchmark/results-YYYY-MM-DD.tsv`
|
|
79
|
+
(history-preserving — re-running the bench during a tuning session adds rows
|
|
80
|
+
under the same header rather than overwriting). The TSV header is:
|
|
81
|
+
|
|
82
|
+
```
|
|
83
|
+
date doc_count chunk_count wall_clock_s peak_heap_used_mb peak_rss_mb peak_external_mb threshold_pass notes
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
Default thresholds (sourced from
|
|
87
|
+
[2026-04-29-reindex-memory-profile.md](../../../thoughts/shared/research/2026-04-29-reindex-memory-profile.md)):
|
|
88
|
+
|
|
89
|
+
| Threshold | Value | Rationale |
|
|
90
|
+
|----------------------|-------|--------------------------------------------------------------------------------------------------------------------------------------------|
|
|
91
|
+
| `peak_heap_used_mb` | 600 | Catches catastrophic regrowth (the original OOM was 4 GB+); ~12x margin over today's typical ~30-50 MB on the 50-doc bench corpus. |
|
|
92
|
+
| `peak_rss_mb` | 800 | Catches transformer-model bloat or external-buffer growth; ~1.6-2x margin over today's typical ~400-450 MB on the 50-doc bench corpus. |
|
|
93
|
+
|
|
94
|
+
**Tuning the thresholds**: open the TSV results history, find the
|
|
95
|
+
95th-percentile `peak_heap_used_mb` across the last ~10 runs on your CI
|
|
96
|
+
hardware, multiply by 2. That yields a regression-detection threshold without
|
|
97
|
+
flakiness from per-run jitter.
|
|
98
|
+
|
|
99
|
+
#### Manually verifying the bench fails on a regression
|
|
100
|
+
|
|
101
|
+
The intuition behind the bench is: **a regression that re-introduces
|
|
102
|
+
unbounded transient allocation will push one of the three peak metrics
|
|
103
|
+
(`heap_used`, `rss`, `external`) far above today's baseline**. The TSV
|
|
104
|
+
records all three so a tuning session can pick the right metric for the
|
|
105
|
+
regression class being guarded.
|
|
106
|
+
|
|
107
|
+
To confirm the bench's `--assert` path works end-to-end, force a synthetic
|
|
108
|
+
breach by temporarily lowering one of the thresholds in
|
|
109
|
+
`benchmark/reindex-heap-bench.ts`:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
# In benchmark/reindex-heap-bench.ts, temporarily set:
|
|
113
|
+
# const HEAP_THRESHOLD_MB = 30; // below today's ~40 MB baseline
|
|
114
|
+
# (or)
|
|
115
|
+
# const RSS_THRESHOLD_MB = 300; // below today's ~450 MB baseline
|
|
116
|
+
|
|
117
|
+
npx tsx benchmark/reindex-heap-bench.ts --assert
|
|
118
|
+
# expected: exit code 1, console line:
|
|
119
|
+
# reindex-heap-bench: ASSERT FAIL — THRESHOLD BREACH: heap_used 41.2 > 30
|
|
120
|
+
|
|
121
|
+
# Restore the threshold (revert benchmark/reindex-heap-bench.ts).
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
Do **NOT** commit the threshold change — it's a one-time confirmation that
|
|
125
|
+
the assertion path works end-to-end. The bench script itself is purely
|
|
126
|
+
additive and never modifies `embedder.ts`/`chunker.ts`/`reindex.ts`.
|
|
127
|
+
|
|
128
|
+
**Note on the dispose() regression**: an earlier draft of this section
|
|
129
|
+
suggested reverting `output.dispose()` in `src/embedder.ts` to verify the
|
|
130
|
+
bench catches the original GH-911 OOM. Empirically, on the 50-doc / ~240-chunk
|
|
131
|
+
synthetic corpus, removing the dispose call leaves `peak_heap_used_mb`
|
|
132
|
+
unchanged (~41 MB) and only adds ~3x to `peak_external_mb` (~21 MB -> ~65 MB).
|
|
133
|
+
The original OOM manifested at the live ~14k-chunk corpus scale, not at this
|
|
134
|
+
bench's scale. The bench therefore guards against **catastrophic
|
|
135
|
+
regressions** (a 10x+ allocation increase that crosses the 600 MB / 800 MB
|
|
136
|
+
margins) rather than the specific dispose() leak — which would need a much
|
|
137
|
+
larger synthetic corpus to be detectable. The `peak_external_mb` column is
|
|
138
|
+
recorded in the TSV for future tuning if a tighter native-buffer guard
|
|
139
|
+
becomes worth the added bench runtime.
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* GH-927 — Re-run the 8-query golden eval with `rerank: true` against the live
|
|
4
|
+
* `~/.ralph-hero/knowledge.db`. Captures per-query rank-of-expected, cold/warm
|
|
5
|
+
* latency, and rerank logits. Output is JSON on stdout for downstream
|
|
6
|
+
* markdown formatting.
|
|
7
|
+
*
|
|
8
|
+
* Usage:
|
|
9
|
+
* node benchmark/eval-rerank.mjs > /tmp/eval-rerank-results.json
|
|
10
|
+
*
|
|
11
|
+
* The script imports the compiled `dist/` modules so it exercises the exact
|
|
12
|
+
* code path `knowledge_search` runs in production.
|
|
13
|
+
*/
|
|
14
|
+
import { homedir } from "node:os";
|
|
15
|
+
import { join } from "node:path";
|
|
16
|
+
import { performance } from "node:perf_hooks";
|
|
17
|
+
import { KnowledgeDB } from "../dist/db.js";
|
|
18
|
+
import { FtsSearch } from "../dist/search.js";
|
|
19
|
+
import { VectorSearch } from "../dist/vector-search.js";
|
|
20
|
+
import { HybridSearch } from "../dist/hybrid-search.js";
|
|
21
|
+
import { embed } from "../dist/embedder.js";
|
|
22
|
+
import { Reranker } from "../dist/reranker.js";
|
|
23
|
+
|
|
24
|
+
// Path to the live knowledge DB. Overridable via RALPH_KNOWLEDGE_DB (so the
// eval can be pointed at a snapshot copy); defaults to the same production DB
// the MCP server reads.
const DB_PATH = process.env.RALPH_KNOWLEDGE_DB
  ?? join(homedir(), ".ralph-hero", "knowledge.db");

/**
 * The 8 golden queries from the 2026-04-29 baseline eval. `expectedSubstrings`
 * lists path-segment substrings any of which counts as a hit (some queries
 * have multiple legitimate primary docs per the baseline).
 *
 * `type` labels the query class ("specific-keyword" vs "mixed"); it is not
 * consulted by the scoring logic here, only carried through to the JSON
 * output for the downstream report.
 */
const QUERIES = [
  {
    n: 1,
    query: "what causes the reindex to OOM in ralph-knowledge",
    expectedSubstrings: ["2026-04-29-reindex-memory-profile"],
    type: "specific-keyword",
  },
  {
    n: 2,
    query: "release transformer tensors after embedding to free memory",
    expectedSubstrings: ["2026-04-29-GH-911-release-embedder-tensors"],
    type: "specific-keyword",
  },
  {
    n: 3,
    query: "chunker forward progress infinite loop fix",
    expectedSubstrings: ["2026-04-29-GH-916-chunker-no-progress-fix"],
    type: "specific-keyword",
  },
  {
    n: 4,
    query: "dream-loop memory consolidation pipeline architecture",
    // Two legitimate primary docs for this query per the baseline eval.
    expectedSubstrings: [
      "2026-04-26-dreaming-research-trail-and-self-containment",
      "2026-04-16-GH-0761",
    ],
    type: "mixed",
  },
  {
    n: 5,
    query: "cross-encoder reranker score calibration",
    expectedSubstrings: ["2026-04-26-softmax-and-rerank-calibration"],
    type: "mixed",
  },
  {
    n: 6,
    query: "wikilink extractor for markdown",
    expectedSubstrings: ["2026-04-26-ralph-knowledge-wikilink-extractor"],
    type: "specific-keyword",
  },
  {
    n: 7,
    query: "context handoff topology between agents",
    expectedSubstrings: ["2026-04-22-context-handoff-topology"],
    type: "mixed",
  },
  {
    n: 8,
    query: "landcrawler permit raw data migration hardening",
    expectedSubstrings: ["2026-04-24-landcrawler-backend-hardening-postmortem"],
    type: "specific-keyword",
  },
];
|
|
85
|
+
|
|
86
|
+
/**
 * Return the 1-based rank of the first result whose path (with its id
 * appended as a fallback key) contains any of the expected substrings,
 * or null when no result matches.
 */
function findRank(results, expectedSubstrings) {
  let rank = 0;
  for (const result of results) {
    rank += 1;
    const haystack = (result.path ?? "") + (result.id ?? "");
    if (expectedSubstrings.some((needle) => haystack.includes(needle))) {
      return rank;
    }
  }
  return null;
}
|
|
96
|
+
|
|
97
|
+
/**
 * Execute a single hybrid search for query spec `q` and time it.
 * Always requests 10 results in diagnostic mode; `withRerank` toggles the
 * rerank pass. Returns the raw result list plus the wall-clock milliseconds
 * the search took.
 */
async function runOne(hybrid, q, withRerank) {
  const searchOptions = {
    limit: 10,
    diagnosticMode: true,
    rerank: withRerank,
  };
  const started = performance.now();
  const results = await hybrid.search(q.query, searchOptions);
  return { results, elapsedMs: performance.now() - started };
}
|
|
107
|
+
|
|
108
|
+
/**
 * Run the 8-query golden eval against the live knowledge DB, with and without
 * reranking, and emit a JSON report on stdout. Progress/log lines go to
 * stderr so stdout stays machine-readable.
 *
 * Per query it records:
 *   - rank of the expected doc with rerank (cold + 3 warm repeats) and without
 *   - latency breakdown (cold-start, warm median/p95, no-rerank baseline)
 *   - the top-10 result lists for both modes, with diagnostic scores
 * followed by hit@k / MRR aggregates across all queries for both modes.
 *
 * Fix over the original: the DB handle is now closed in a `finally` block, so
 * a failure mid-eval (model load error, bad query, etc.) no longer leaks the
 * sqlite connection.
 */
async function main() {
  console.error(`[eval-rerank] DB: ${DB_PATH}`);
  const db = new KnowledgeDB(DB_PATH);
  try {
    const fts = new FtsSearch(db);
    const vec = new VectorSearch(db);
    const reranker = new Reranker();
    const hybrid = new HybridSearch(db, fts, vec, embed, reranker);

    // Warm up the embedder once (it's a separate model from the reranker; we
    // care about cold-start of the *reranker*, not the embedder, so isolate it).
    console.error(`[eval-rerank] warming embedder...`);
    await hybrid.search("warmup", { limit: 1 });

    const out = [];
    let firstRerankCall = true;

    for (const q of QUERIES) {
      console.error(`[eval-rerank] Q${q.n}: ${q.query}`);

      // First rerank invocation is the cold-start one (model load + first
      // batch). Subsequent invocations are warm.
      const cold = await runOne(hybrid, q, true);
      const wasFirstCall = firstRerankCall;
      firstRerankCall = false;

      // Warm runs: 3 repeats to compute median + p95 (sequential on purpose —
      // parallel searches would contend and skew the latency numbers).
      const warmRuns = [];
      for (let i = 0; i < 3; i++) {
        warmRuns.push(await runOne(hybrid, q, true));
      }

      // Also capture the no-rerank baseline order from the same DB so the
      // eval doc can verify the baseline column is reproducible (sanity check).
      const noRerank = await runOne(hybrid, q, false);

      const sortedWarm = warmRuns.map((r) => r.elapsedMs).sort((a, b) => a - b);
      const median = sortedWarm[Math.floor(sortedWarm.length / 2)];
      // With 3 samples this clamps to the max; written generically so the
      // repeat count can be tuned without touching this line.
      const p95Idx = Math.min(sortedWarm.length - 1, Math.floor(sortedWarm.length * 0.95));

      const top10 = cold.results.slice(0, 10).map((r) => ({
        id: r.id,
        path: r.path,
        title: r.title,
        score: r.score,
        rerankScore: r.rerankScore,
        ftsScore: r.ftsScore,
        vecDistance: r.vecDistance,
        hitSources: r.hitSources,
      }));
      const noRerankTop10 = noRerank.results.slice(0, 10).map((r) => ({
        id: r.id,
        path: r.path,
        title: r.title,
        score: r.score,
      }));

      const rank = findRank(cold.results, q.expectedSubstrings);
      const rankNoRerank = findRank(noRerank.results, q.expectedSubstrings);
      const expectedHit = rank !== null ? cold.results[rank - 1] : null;

      out.push({
        n: q.n,
        query: q.query,
        type: q.type,
        expectedSubstrings: q.expectedSubstrings,
        rank,
        rankNoRerank,
        expected: expectedHit
          ? {
              id: expectedHit.id,
              path: expectedHit.path,
              rerankScore: expectedHit.rerankScore,
              rrfScore: expectedHit.score,
            }
          : null,
        latency: {
          // Cold start is only meaningful for the very first rerank call of
          // the whole run; null afterwards.
          coldStartMs: wasFirstCall ? cold.elapsedMs : null,
          firstCallMs: cold.elapsedMs,
          warmMedianMs: median,
          warmP95Ms: sortedWarm[p95Idx],
          warmRunsMs: sortedWarm,
          noRerankMs: noRerank.elapsedMs,
        },
        top10,
        noRerankTop10,
      });
    }

    const ranksWithRerank = out.map((o) => o.rank);
    const ranksNoRerank = out.map((o) => o.rankNoRerank);
    const hitAt = (ranks, k) => ranks.filter((r) => r !== null && r <= k).length;
    // Misses contribute 0 to MRR (reciprocal rank of an absent doc is 0).
    const mrr = (ranks) =>
      ranks.reduce((a, r) => a + (r === null ? 0 : 1 / r), 0) / ranks.length;

    const allCold = out.map((o) => o.latency.coldStartMs).filter((x) => x != null);
    const allWarmMedian = out.map((o) => o.latency.warmMedianMs);
    const allFirst = out.map((o) => o.latency.firstCallMs);

    // Build the aggregate in a single literal (the original declared a const
    // object and mutated a `latency` property onto it afterwards).
    const aggregate = {
      rerank: {
        hitAt1: `${hitAt(ranksWithRerank, 1)}/${ranksWithRerank.length}`,
        hitAt5: `${hitAt(ranksWithRerank, 5)}/${ranksWithRerank.length}`,
        hitAt10: `${hitAt(ranksWithRerank, 10)}/${ranksWithRerank.length}`,
        mrr: mrr(ranksWithRerank),
      },
      noRerank: {
        hitAt1: `${hitAt(ranksNoRerank, 1)}/${ranksNoRerank.length}`,
        hitAt5: `${hitAt(ranksNoRerank, 5)}/${ranksNoRerank.length}`,
        hitAt10: `${hitAt(ranksNoRerank, 10)}/${ranksNoRerank.length}`,
        mrr: mrr(ranksNoRerank),
      },
      latency: {
        coldStartMs: allCold[0] ?? null,
        avgWarmMedianMs:
          allWarmMedian.reduce((a, x) => a + x, 0) / allWarmMedian.length,
        avgFirstCallMs:
          allFirst.reduce((a, x) => a + x, 0) / allFirst.length,
        avgNoRerankMs:
          out.reduce((a, o) => a + o.latency.noRerankMs, 0) / out.length,
      },
    };

    console.log(JSON.stringify({ queries: out, aggregate, dbPath: DB_PATH }, null, 2));
    console.error(`[eval-rerank] done; aggregate:`, aggregate);
  } finally {
    // Always release the sqlite handle, even if a query or model step throws.
    db.close();
  }
}
|
|
237
|
+
|
|
238
|
+
/**
 * Entry point: kick off the eval; on any failure, print the error to stderr
 * and exit with a non-zero status so callers/CI notice.
 */
main().catch((err) => {
  console.error(err);
  process.exit(1);
});
|