ralph-hero-knowledge-index 0.1.27 → 0.1.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "ralph-knowledge",
-   "version": "0.1.27",
+   "version": "0.1.29",
    "description": "Knowledge graph for ralph-hero: semantic search, relationship traversal, and document indexing across thoughts/ documents. Optional companion to ralph-hero.",
    "author": {
      "name": "Chad Dubiel",
package/.mcp.json CHANGED
@@ -2,7 +2,7 @@
    "mcpServers": {
      "ralph-knowledge": {
        "command": "npx",
-       "args": ["-y", "ralph-hero-knowledge-index@0.1.27"]
+       "args": ["-y", "ralph-hero-knowledge-index@0.1.29"]
      }
    }
  }
package/README.md CHANGED
@@ -107,3 +107,35 @@ fast-path before any matcher is consulted.
  | `RALPH_KNOWLEDGE_CONFIG` | Override path to `knowledge.config.json` (tilde expanded). |
  | `RALPH_KNOWLEDGE_DIRS` | Comma-separated list of roots. Beats config, loses to CLI. |
  | `RALPH_KNOWLEDGE_DB` | Override SQLite path. Beats `config.dbPath`, loses to a CLI `.db` positional. |
+
+ ## Benchmarks
+
+ Standalone benchmarks live under [`benchmark/`](./benchmark/) — see
+ [`benchmark/README.md`](./benchmark/README.md) for the directory's conventions
+ (scripts are not part of the published npm package and are not run by
+ `vitest`).
+
+ ### Reranker benchmark (GH-901)
+
+ [`benchmark/reranker-bench.ts`](./benchmark/reranker-bench.ts) compares two
+ ONNX cross-encoder rerankers loaded via the existing `@huggingface/transformers`
+ v3 dependency:
+
+ - `onnx-community/bge-reranker-v2-m3-ONNX` (int8 quantized) — primary candidate
+ - `Xenova/ms-marco-MiniLM-L-6-v2` — speed baseline
+
+ For 44 sample queries spanning the five query intent classes (prior-work
+ topic, plan-by-issue lookup, claim evidence, epic context, hero orientation),
+ the script fetches the top-20 RRF candidates, reranks each candidate set with
+ both models, and writes a TSV table with cold-start latency, p50/p95 per-pair
+ latency, batch-of-20 latency, RSS memory delta, and top-3 agreement vs
+ RRF-only. Results land at `benchmark/results-YYYY-MM-DD.tsv`; the most recent
+ run is checked into the repo.
+
+ ```bash
+ RALPH_KNOWLEDGE_DB=~/.ralph-hero/knowledge.db \
+ npx tsx plugin/ralph-knowledge/benchmark/reranker-bench.ts
+ ```
+
+ The script does not modify `hybrid-search.ts` — production wiring of a
+ default reranker is a separate followup gated on the benchmark's findings.
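
The precedence in the env-var table above is easiest to read as a fallback chain. A minimal TypeScript sketch, not code from this package — `resolveDbPath` and its parameters are hypothetical; only the env-var name and the default path come from the files in this diff:

```ts
// Hypothetical resolver illustrating the documented precedence:
// CLI `.db` positional > RALPH_KNOWLEDGE_DB > config.dbPath > built-in default.
import { homedir } from "node:os";
import { join } from "node:path";

function resolveDbPath(cliPositional?: string, config?: { dbPath?: string }): string {
  return (
    cliPositional ??                    // a CLI `.db` positional beats everything
    process.env.RALPH_KNOWLEDGE_DB ??   // env var beats config.dbPath
    config?.dbPath ??                   // config-file value
    join(homedir(), ".ralph-hero", "knowledge.db") // default the benchmark also uses
  );
}
```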
package/benchmark/README.md ADDED
@@ -0,0 +1,50 @@
+ # ralph-knowledge benchmarks
+
+ Standalone benchmark scripts that exercise the ralph-knowledge runtime against
+ the live `knowledge.db`. They import from `../src/` but are NOT part of the
+ published npm package and are NOT executed by the test suite (`vitest`).
+
+ The `benchmark/` directory is excluded from `tsconfig.json`'s `include`
+ glob, so adding a script here will not change the `npm run build` output and
+ will not break the CI matrix on Node 18/20/22.
+
+ ## Running
+
+ Each script is a standalone TypeScript file that can be run directly with
+ `tsx` (already a transitive devDependency via `vitest` — no install required):
+
+ ```bash
+ # From repo root or plugin/ralph-knowledge:
+ npx tsx benchmark/reranker-bench.ts
+
+ # Or, equivalently, with the node loader form:
+ node --import tsx benchmark/reranker-bench.ts
+ ```
+
+ Scripts read the same `RALPH_KNOWLEDGE_DB` env var as the MCP server, so by
+ default they target `~/.ralph-hero/knowledge.db`.
+
+ ## Scripts
+
+ ### `reranker-bench.ts` (GH-901)
+
+ Benchmarks two ONNX cross-encoder rerankers loaded via `@huggingface/transformers`:
+
+ - `onnx-community/bge-reranker-v2-m3-ONNX` (int8 quantized) — primary candidate
+ - `Xenova/ms-marco-MiniLM-L-6-v2` — speed baseline
+
+ Draws a hard-coded set of 44 sample queries spanning the five query intent
+ classes from the Phase 3 research (prior-work topic, plan-by-issue lookup,
+ claim evidence, epic context, hero orientation), runs `HybridSearch.search()`
+ to fetch the top-20 RRF candidates per query, then reranks the candidates with
+ each loaded model. Captures cold-start latency, p50/p95 per-pair latency,
+ batch-of-20 latency, RSS memory delta, and top-3 agreement vs RRF-only.
+
+ Results are written as a TSV file at `benchmark/results-YYYY-MM-DD.tsv` and
+ echoed to stdout as a human-readable summary table. Models that fail to
+ download or load are reported in the `notes` column rather than aborting
+ the entire run.
+
+ The script is purely additive — it does not modify `hybrid-search.ts` or any
+ production source file. Production wiring of a default reranker is a separate
+ followup gated on the benchmark findings.
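
Both READMEs report "top-3 agreement vs RRF-only" without spelling out the arithmetic; the script below computes it as |top-3 ∩ top-3| / 3. A toy worked example mirroring the script's `topKAgreement` helper (the candidate ids are invented for illustration):

```ts
// Toy illustration of top-3 agreement as |top-3 ∩ top-3| / 3.
// RRF order is the candidate array order; the reranker permutes indices.
const rrfIds = ["a", "b", "c", "d", "e"];   // RRF-sorted candidate ids
const rerankedOrder = [0, 3, 1, 2, 4];      // reranker's index order, best first
const rrfTop = new Set(rrfIds.slice(0, 3)); // {a, b, c}
const rerTop = new Set(rerankedOrder.slice(0, 3).map((i) => rrfIds[i])); // {a, d, b}
let overlap = 0;
for (const id of rerTop) if (rrfTop.has(id)) overlap++;
console.log(overlap / 3); // 0.667 — two of the reranker's top-3 match RRF's
```

On that scale, the checked-in results (0.402 and 0.424) mean each reranker keeps, on average, a bit over one of RRF's top-3 per query.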
package/benchmark/reranker-bench.ts ADDED
@@ -0,0 +1,511 @@
+ /**
+  * Phase 4 (GH-901) — Benchmark local cross-encoder rerankers on M5 Pro.
+  *
+  * Loads two ONNX cross-encoder rerankers via the existing
+  * `@huggingface/transformers` v3 dependency, runs each over the top-20 RRF
+  * candidates from a hard-coded sample-query set, and writes a TSV results
+  * table covering cold-start load, per-pair latency p50/p95, batch latency,
+  * RSS memory delta, and top-3 agreement vs RRF-only.
+  *
+  * NOT wired into `hybrid-search.ts` — production wiring is a separate
+  * followup gated on the table's findings (see plan §"What We're NOT Doing").
+  *
+  * Run with:
+  *   npx tsx plugin/ralph-knowledge/benchmark/reranker-bench.ts
+  */
+ import { homedir } from "node:os";
+ import { join, dirname } from "node:path";
+ import { writeFileSync } from "node:fs";
+ import { fileURLToPath, pathToFileURL } from "node:url";
+ import {
+   AutoTokenizer,
+   AutoModelForSequenceClassification,
+   type PreTrainedTokenizer,
+   type PreTrainedModel,
+ } from "@huggingface/transformers";
+ import { KnowledgeDB } from "../src/db.js";
+ import { FtsSearch } from "../src/search.js";
+ import { VectorSearch } from "../src/vector-search.js";
+ import { HybridSearch } from "../src/hybrid-search.js";
+ import { embed } from "../src/embedder.js";
+ import type { SearchResult } from "../src/search.js";
+
+ const DEFAULT_DB_PATH = join(homedir(), ".ralph-hero", "knowledge.db");
+ const TOP_K_CANDIDATES = 20;
+ const TOP_AGREEMENT_K = 3;
+
37
+ /**
38
+ * One reranker model under test. The HF model id resolves through the
39
+ * transformers.js Hub cache (same backing store as the embedder), so no new
40
+ * npm dependency or network setup is required beyond the first download.
41
+ */
42
+ interface ModelSpec {
43
+ /** Display name used in TSV + console output. */
44
+ label: string;
45
+ /** Hugging Face model id (loaded via `pipeline('text-classification', ...)`). */
46
+ modelId: string;
47
+ /**
48
+ * Optional dtype passed to the pipeline factory. `'q8'` selects the int8
49
+ * quantized ONNX variant when the repo ships one (BGE-Reranker-v2-m3-ONNX
50
+ * does; MiniLM-L6 ships only fp32 + q8).
51
+ */
52
+ dtype?: "fp32" | "fp16" | "q8" | "int8" | "uint8" | "q4" | "bnb4" | "auto";
53
+ }
54
+
55
+ const MODELS: ModelSpec[] = [
56
+ {
57
+ label: "bge-reranker-v2-m3-ONNX-int8",
58
+ modelId: "onnx-community/bge-reranker-v2-m3-ONNX",
59
+ dtype: "q8",
60
+ },
61
+ {
62
+ label: "ms-marco-MiniLM-L-6-v2",
63
+ modelId: "Xenova/ms-marco-MiniLM-L-6-v2",
64
+ // MiniLM ships an fp32 ONNX as the default — no quantization needed (it's
65
+ // already tiny). Letting transformers.js pick the default avoids a load
66
+ // failure if the q8 variant isn't packaged in this revision.
67
+ },
68
+ ];
+
+ /**
+  * Hard-coded sample queries spanning the five query intent classes from
+  * Phase 3 research (GH-900): prior-work topic, plan-by-issue lookup, claim
+  * evidence, epic context, hero orientation. Total = 44.
+  *
+  * Skewed toward generic ralph-knowledge / ralph-hero corpus topics so the
+  * RRF retriever returns non-empty results on a representative dev DB.
+  */
+ const SAMPLE_QUERIES: string[] = [
+   // 12 prior-work topic queries
+   "hybrid search RRF fusion",
+   "MMR diversity reranking",
+   "cross-encoder reranker latency",
+   "calibration of search scores",
+   "platt scaling for retrieval",
+   "softmax temperature in reranking",
+   "BGE reranker BAAI multilingual",
+   "transformers.js ONNX runtime apple silicon",
+   "sqlite-vec cosine distance",
+   "FTS5 BM25 ranking sqlite",
+   "chunked embeddings dream loop",
+   "contextual retrieval anthropic",
+   // 8 plan-by-issue lookups
+   "plan for ralph-knowledge stage-2 reranker",
+   "plan GH-902 MMR diversity",
+   "plan GH-899 RRF observability",
+   "plan GH-901 cross-encoder benchmark",
+   "plan GH-900 labeling effort scope",
+   "plan GH-761 chunked embeddings",
+   "plan epic ralph-hero token resolution",
+   "plan hello skill output budget",
+   // 8 claim evidence queries
+   "evidence MMR demotes near duplicates",
+   "evidence cross-encoder beats RRF",
+   "evidence platt calibration improves NDCG",
+   "evidence isotonic regression sample floor",
+   "evidence LambdaMART labeled data requirement",
+   "evidence Qwen3 reranker MTEB-R score",
+   "evidence transformers.js cpu latency",
+   "evidence sqlite-vec POINT query plan",
+   // 8 epic context queries
+   "ralph-knowledge epic stage-2 capabilities",
+   "ralph-hero workflow state machine epic",
+   "knowledge graph traversal epic",
+   "memory tier dream loop epic",
+   "outcome events search feedback epic",
+   "github projects v2 automation epic",
+   "claude code plugin architecture epic",
+   "stream-based parallel implementation epic",
+   // 8 hero orientation queries
+   "what does ralph-hero do",
+   "how to add a new skill to ralph-hero",
+   "how to run ralph-knowledge tests",
+   "how to debug MCP server stdio",
+   "how to create a new agent for ralph-hero",
+   "what is the ralph workflow state machine",
+   "how to wire a new tool into hybrid search",
+   "how plan agents dispatch impl agents",
+ ];
+
+ /** Result row aggregated per model for TSV output. */
+ interface ModelResult {
+   model: string;
+   cold_start_ms: number;
+   latency_p50_ms: number;
+   latency_p95_ms: number;
+   batch_top20_p50_ms: number;
+   memory_rss_delta_mb: number;
+   top3_agreement_avg: number;
+   notes: string;
+ }
+
+ /**
+  * Per-query, per-pair raw measurements collected before percentiling.
+  */
+ interface PerQueryMeasurement {
+   /** Wall-clock ms for the entire batch of `TOP_K_CANDIDATES` (query, doc) pairs. */
+   batchMs: number;
+   /** `batchMs / TOP_K_CANDIDATES` — the per-pair latency at this batch size. */
+   perPairMs: number;
+   /** Top-K agreement vs the RRF-only ordering of the same candidates. */
+   top3Agreement: number;
+ }
+
+ function percentile(sorted: number[], p: number): number {
+   if (sorted.length === 0) return 0;
+   const idx = Math.min(
+     sorted.length - 1,
+     Math.max(0, Math.floor(sorted.length * p)),
+   );
+   return sorted[idx];
+ }
+
+ function bytesToMb(bytes: number): number {
+   return bytes / (1024 * 1024);
+ }
+
+ /**
+  * Truncate a snippet for cross-encoder consumption. The tokenizer
+  * truncates internally to the model's max_position (typically 512), but
+  * capping the input string here keeps memory and tokenization cost
+  * predictable across models with different max_position.
+  */
+ function truncateForRerank(s: string, maxChars = 1000): string {
+   if (s.length <= maxChars) return s;
+   return s.slice(0, maxChars);
+ }
+
+ /**
+  * Build the parallel `texts[]` and `text_pairs[]` arrays for a candidate set.
+  * The doc text combines title + snippet so the cross-encoder sees the same
+  * anchor that the embedder used (title is the strongest semantic anchor in
+  * this corpus).
+  *
+  * The returned shape matches what `tokenizer(texts, { text_pair, padding,
+  * truncation })` expects — see the AutoTokenizer encode signature in
+  * transformers.js (tokenizers.js `_encode_plus`). This is the only reliable
+  * way to invoke a cross-encoder reranker through the library: the
+  * higher-level `pipeline('text-classification', ...)` callback accepts only
+  * a single text per input and silently coerces `{text, text_pair}` objects
+  * to strings, returning a constant `score=1` for every pair. The direct
+  * tokenizer + model path returns the actual logits.
+  */
+ function buildPairs(
+   query: string,
+   candidates: SearchResult[],
+ ): { texts: string[]; textPairs: string[] } {
+   const texts: string[] = [];
+   const textPairs: string[] = [];
+   for (const c of candidates) {
+     texts.push(query);
+     textPairs.push(truncateForRerank(`${c.title}\n${c.snippet}`));
+   }
+   return { texts, textPairs };
+ }
+
+ /**
+  * Compute top-K agreement: |intersection of top-K id sets| / K.
+  * `rerankedOrder` is the candidate index order the reranker produced (best
+  * first). The RRF baseline order is `[0, 1, ..., n-1]` since
+  * `candidates` is already RRF-sorted.
+  */
+ function topKAgreement(
+   candidates: SearchResult[],
+   rerankedOrder: number[],
+   k: number,
+ ): number {
+   const rrfTop = new Set(candidates.slice(0, k).map((c) => c.id));
+   const rerTop = new Set(
+     rerankedOrder.slice(0, k).map((idx) => candidates[idx].id),
+   );
+   let intersect = 0;
+   for (const id of rerTop) if (rrfTop.has(id)) intersect++;
+   return intersect / k;
+ }
+
+ /**
+  * Run a single reranker model against the per-query candidate sets. Returns
+  * the aggregated `ModelResult` row plus a notes string describing any
+  * partial failures encountered.
+  */
+ async function benchmarkModel(
+   spec: ModelSpec,
+   perQueryCandidates: Array<{ query: string; candidates: SearchResult[] }>,
+ ): Promise<ModelResult> {
+   const notes: string[] = [];
+   const rssBefore = process.memoryUsage().rss;
+
+   // ---- Cold-start (load + first inference) ----
+   let coldStartMs = 0;
+   let tokenizer: PreTrainedTokenizer | null = null;
+   let model: PreTrainedModel | null = null;
+   const loadStart = performance.now();
+   try {
+     tokenizer = await AutoTokenizer.from_pretrained(spec.modelId);
+     model = await AutoModelForSequenceClassification.from_pretrained(
+       spec.modelId,
+       spec.dtype ? { dtype: spec.dtype } : {},
+     );
+   } catch (e) {
+     return {
+       model: spec.label,
+       cold_start_ms: 0,
+       latency_p50_ms: 0,
+       latency_p95_ms: 0,
+       batch_top20_p50_ms: 0,
+       memory_rss_delta_mb: 0,
+       top3_agreement_avg: 0,
+       notes: `model load failed: ${(e as Error).message}`,
+     };
+   }
+   // First-inference penalty (model warmup): use the first query's pairs.
+   const firstNonEmpty = perQueryCandidates.find((q) => q.candidates.length > 0);
+   if (firstNonEmpty && model && tokenizer) {
+     try {
+       const { texts: warmT, textPairs: warmP } = buildPairs(
+         firstNonEmpty.query,
+         firstNonEmpty.candidates,
+       );
+       const inputs = await tokenizer(warmT, {
+         text_pair: warmP,
+         padding: true,
+         truncation: true,
+       });
+       await model(inputs);
+     } catch (e) {
+       notes.push(`warmup-failed: ${(e as Error).message.slice(0, 80)}`);
+     }
+   }
+   coldStartMs = performance.now() - loadStart;
+
+   const rssAfter = process.memoryUsage().rss;
+   const memDeltaMb = bytesToMb(rssAfter - rssBefore);
+
+   // ---- Per-query measurement loop ----
+   const measurements: PerQueryMeasurement[] = [];
+   let queryFailures = 0;
+   for (const { query, candidates } of perQueryCandidates) {
+     if (candidates.length === 0) continue;
+     const { texts, textPairs } = buildPairs(query, candidates);
+     const start = performance.now();
+     let logitsList: number[];
+     try {
+       const inputs = await tokenizer(texts, {
+         text_pair: textPairs,
+         padding: true,
+         truncation: true,
+       });
+       const outputs = await model(inputs);
+       // outputs.logits is a Tensor with shape [batch, num_labels]. Cross-
+       // encoder rerankers ship a single-label head, so logits is [batch, 1].
+       // `.tolist()` yields nested number[][]; flatten by taking the first
+       // (and only) value per row. If a model shipped a multi-label head
+       // (num_labels > 1), this would simply use the first class logit.
+       const logits = outputs.logits as { tolist: () => number[][]; dims?: number[] };
+       const tolist = logits.tolist();
+       logitsList = tolist.map((row) => (row.length > 0 ? row[0] : 0));
+     } catch (e) {
+       queryFailures++;
+       if (queryFailures <= 3) {
+         notes.push(`query-failed: ${(e as Error).message.slice(0, 80)}`);
+       }
+       continue;
+     }
+     const batchMs = performance.now() - start;
+
+     // Map each candidate idx -> logit, sort desc.
+     const scored = logitsList.map((score, idx) => ({ idx, score }));
+     scored.sort((a, b) => b.score - a.score);
+     const rerankedOrder = scored.map((s) => s.idx);
+     const agreement = topKAgreement(
+       candidates,
+       rerankedOrder,
+       TOP_AGREEMENT_K,
+     );
+     measurements.push({
+       batchMs,
+       perPairMs: batchMs / texts.length,
+       top3Agreement: agreement,
+     });
+   }
+
+   if (measurements.length === 0) {
+     return {
+       model: spec.label,
+       cold_start_ms: Math.round(coldStartMs),
+       latency_p50_ms: 0,
+       latency_p95_ms: 0,
+       batch_top20_p50_ms: 0,
+       memory_rss_delta_mb: Number(memDeltaMb.toFixed(1)),
+       top3_agreement_avg: 0,
+       notes:
+         notes.length > 0
+           ? notes.join("; ")
+           : "no successful query measurements",
+     };
+   }
+
+   const perPairSorted = [...measurements.map((m) => m.perPairMs)].sort(
+     (a, b) => a - b,
+   );
+   const batchSorted = [...measurements.map((m) => m.batchMs)].sort(
+     (a, b) => a - b,
+   );
+   const agreementAvg =
+     measurements.reduce((s, m) => s + m.top3Agreement, 0) /
+     measurements.length;
+
+   if (queryFailures > 0) {
+     notes.push(
+       `${queryFailures}/${perQueryCandidates.length} queries failed during rerank`,
+     );
+   }
+
+   return {
+     model: spec.label,
+     cold_start_ms: Math.round(coldStartMs),
+     latency_p50_ms: Number(percentile(perPairSorted, 0.5).toFixed(2)),
+     latency_p95_ms: Number(percentile(perPairSorted, 0.95).toFixed(2)),
+     batch_top20_p50_ms: Number(percentile(batchSorted, 0.5).toFixed(2)),
+     memory_rss_delta_mb: Number(memDeltaMb.toFixed(1)),
+     top3_agreement_avg: Number(agreementAvg.toFixed(3)),
+     notes:
+       notes.length > 0
+         ? notes.join("; ")
+         : `n=${measurements.length} queries`,
+   };
+ }
+
+ function formatTsv(rows: ModelResult[]): string {
+   const headers = [
+     "model",
+     "cold_start_ms",
+     "latency_p50_ms",
+     "latency_p95_ms",
+     "batch_top20_p50_ms",
+     "memory_rss_delta_mb",
+     "top3_agreement_avg",
+     "notes",
+   ];
+   const lines = [headers.join("\t")];
+   for (const r of rows) {
+     lines.push(
+       [
+         r.model,
+         r.cold_start_ms,
+         r.latency_p50_ms,
+         r.latency_p95_ms,
+         r.batch_top20_p50_ms,
+         r.memory_rss_delta_mb,
+         r.top3_agreement_avg,
+         r.notes,
+       ].join("\t"),
+     );
+   }
+   return lines.join("\n") + "\n";
+ }
+
+ function printSummary(rows: ModelResult[]): void {
+   // Console-friendly two-column dump per row (TSV is the machine-readable form).
+   console.log("\n=== Reranker Benchmark Results ===");
+   for (const r of rows) {
+     console.log(`\n[${r.model}]`);
+     console.log(`  cold_start_ms       : ${r.cold_start_ms}`);
+     console.log(`  latency_p50_ms      : ${r.latency_p50_ms}`);
+     console.log(`  latency_p95_ms      : ${r.latency_p95_ms}`);
+     console.log(`  batch_top20_p50_ms  : ${r.batch_top20_p50_ms}`);
+     console.log(`  memory_rss_delta_mb : ${r.memory_rss_delta_mb}`);
+     console.log(`  top3_agreement_avg  : ${r.top3_agreement_avg}`);
+     console.log(`  notes               : ${r.notes}`);
+   }
+   console.log("");
+ }
+
+ function isoDate(): string {
+   return new Date().toISOString().slice(0, 10); // YYYY-MM-DD
+ }
+
+ export async function main(): Promise<void> {
+   const dbPath = process.env.RALPH_KNOWLEDGE_DB ?? DEFAULT_DB_PATH;
+   console.log(`reranker-bench: opening DB at ${dbPath}`);
+   const db = new KnowledgeDB(dbPath);
+   const fts = new FtsSearch(db);
+   const vec = new VectorSearch(db);
+   const hybrid = new HybridSearch(db, fts, vec, embed);
+
+   // Pre-compute the RRF candidate set per query (top-20). Doing this once,
+   // before loading any reranker, ensures all rerankers benchmark against the
+   // identical candidate sets. Empty candidate sets are kept in the array so
+   // the per-query iteration matches between runs.
+   console.log(
+     `reranker-bench: pre-computing RRF candidates for ${SAMPLE_QUERIES.length} queries...`,
+   );
+   const perQueryCandidates: Array<{ query: string; candidates: SearchResult[] }> =
+     [];
+   let nonEmpty = 0;
+   for (const q of SAMPLE_QUERIES) {
+     let candidates: SearchResult[] = [];
+     try {
+       candidates = await hybrid.search(q, { limit: TOP_K_CANDIDATES });
+     } catch (e) {
+       console.warn(`  query failed: "${q}" — ${(e as Error).message}`);
+     }
+     perQueryCandidates.push({ query: q, candidates });
+     if (candidates.length > 0) nonEmpty++;
+   }
+   console.log(
+     `  ${nonEmpty}/${SAMPLE_QUERIES.length} queries returned candidates`,
+   );
+
+   if (nonEmpty === 0) {
+     console.error(
+       "reranker-bench: no queries returned RRF candidates — is the DB indexed?",
+     );
+     process.exit(1);
+   }
+
+   // Run each model serially. Loading two ONNX models in parallel would
+   // confound the cold-start and RSS-delta measurements.
+   const results: ModelResult[] = [];
+   for (const spec of MODELS) {
+     console.log(`\nreranker-bench: loading ${spec.label} (${spec.modelId})...`);
+     const r = await benchmarkModel(spec, perQueryCandidates);
+     results.push(r);
+     if (r.notes.startsWith("model load failed")) {
+       console.warn(`  ${spec.label}: ${r.notes}`);
+     } else {
+       console.log(
+         `  ${spec.label}: cold_start=${r.cold_start_ms}ms, p50=${r.latency_p50_ms}ms/pair, agreement=${r.top3_agreement_avg}`,
+       );
+     }
+   }
+
+   // Write TSV next to this script.
+   const here = dirname(fileURLToPath(import.meta.url));
+   const outPath = join(here, `results-${isoDate()}.tsv`);
+   writeFileSync(outPath, formatTsv(results), "utf8");
+   console.log(`\nreranker-bench: wrote ${outPath}`);
+
+   printSummary(results);
+
+   // Exit non-zero only if ALL models failed to load.
+   const anySucceeded = results.some(
+     (r) => !r.notes.startsWith("model load failed"),
+   );
+   if (!anySucceeded) {
+     console.error("reranker-bench: every model failed to load — exiting 1");
+     process.exit(1);
+   }
+ }
+
+ // Top-level runner — only executes when this file is invoked directly,
+ // not when imported by another script (e.g., a future suite that compares
+ // runs across hardware revisions).
+ const invokedDirectly =
+   import.meta.url === pathToFileURL(process.argv[1] ?? "").href;
+ if (invokedDirectly) {
+   main().catch((e) => {
+     console.error("reranker-bench: fatal error", e);
+     process.exit(1);
+   });
+ }
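
Because `main()` is exported and the direct-invocation guard compares `import.meta.url` against `process.argv[1]`, importing the script does not auto-run it. A minimal sketch of driving it from another file (the wrapper name and the env-var presetting are illustrative, not part of the package):

```ts
// run-bench.ts — hypothetical wrapper; pins the DB path before delegating.
// Importing reranker-bench.ts does NOT trigger its guard, so main() is
// called explicitly here.
import { homedir } from "node:os";
import { join } from "node:path";
import { main } from "./reranker-bench.js";

process.env.RALPH_KNOWLEDGE_DB ??= join(homedir(), ".ralph-hero", "knowledge.db");
await main();
```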
package/benchmark/results-YYYY-MM-DD.tsv ADDED
@@ -0,0 +1,3 @@
+ model	cold_start_ms	latency_p50_ms	latency_p95_ms	batch_top20_p50_ms	memory_rss_delta_mb	top3_agreement_avg	notes
+ bge-reranker-v2-m3-ONNX-int8	1293	40.2	43.66	763.18	1863.3	0.402	n=44 queries
+ ms-marco-MiniLM-L-6-v2	142	4.96	5.56	94.15	226.9	0.424	n=44 queries
package/dist/db.d.ts CHANGED
@@ -85,6 +85,7 @@ export declare class KnowledgeDB {
      private createSchema;
      upsertDocument(doc: Omit<DocumentRow, "isStub"> & {
          isStub?: number;
+         memoryTier?: string | null;
      }): void;
      /**
       * Creates a stub document for an unresolved wikilink target.
package/dist/db.js CHANGED
@@ -127,13 +127,23 @@ export class KnowledgeDB {
          }
      }
      upsertDocument(doc) {
+         // memoryTier is intentionally optional: callers that don't pass it get the
+         // SQL column default ('doc') on insert and preserve the existing value on
+         // update via COALESCE. This keeps existing test fixtures and any future
+         // call sites that don't care about tiers compiling without changes, while
+         // still letting reindex.ts forward the parsed value through.
+         const params = {
+             ...doc,
+             memoryTier: doc.memoryTier ?? null,
+         };
          this.db.prepare(`
-             INSERT INTO documents (id, path, title, date, type, status, github_issue, content, is_stub)
-             VALUES (@id, @path, @title, @date, @type, @status, @githubIssue, @content, 0)
+             INSERT INTO documents (id, path, title, date, type, status, github_issue, content, is_stub, memory_tier)
+             VALUES (@id, @path, @title, @date, @type, @status, @githubIssue, @content, 0, COALESCE(@memoryTier, 'doc'))
              ON CONFLICT(id) DO UPDATE SET
                  path = @path, title = @title, date = @date, type = @type,
-                 status = @status, github_issue = @githubIssue, content = @content, is_stub = 0
-         `).run(doc);
+                 status = @status, github_issue = @githubIssue, content = @content, is_stub = 0,
+                 memory_tier = COALESCE(@memoryTier, memory_tier)
+         `).run(params);
      }
      /**
       * Creates a stub document for an unresolved wikilink target.
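
Taken together, the two `COALESCE` calls give the upsert three distinct behaviors. A sketch, assuming an open `KnowledgeDB` instance `db`; the document field values are invented placeholders:

```ts
// Illustration of the three COALESCE cases in the upsert above.
const base = {
  id: "doc-1", path: "thoughts/plan.md", title: "Plan", date: "2026-01-01",
  type: "plan", status: "active", githubIssue: null, content: "...",
};

db.upsertDocument(base);
// Insert without memoryTier -> COALESCE(@memoryTier, 'doc') stores 'doc'.

db.upsertDocument({ ...base, memoryTier: "episodic" });
// Update with memoryTier -> COALESCE(@memoryTier, memory_tier) overwrites it.

db.upsertDocument(base);
// Update without memoryTier -> COALESCE(NULL, memory_tier) keeps 'episodic'.
```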