ralph-hero-knowledge-index 0.1.27 → 0.1.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/.mcp.json +1 -1
- package/README.md +32 -0
- package/benchmark/.gitkeep +0 -0
- package/benchmark/README.md +50 -0
- package/benchmark/reranker-bench.ts +511 -0
- package/benchmark/results-2026-04-27.tsv +3 -0
- package/dist/db.d.ts +1 -0
- package/dist/db.js +14 -4
- package/dist/db.js.map +1 -1
- package/dist/hybrid-search.d.ts +25 -0
- package/dist/hybrid-search.js +163 -1
- package/dist/hybrid-search.js.map +1 -1
- package/dist/index.js +23 -2
- package/dist/index.js.map +1 -1
- package/dist/parser.d.ts +1 -0
- package/dist/parser.js +18 -1
- package/dist/parser.js.map +1 -1
- package/dist/reindex.js +1 -0
- package/dist/reindex.js.map +1 -1
- package/dist/search.d.ts +35 -0
- package/dist/search.js.map +1 -1
- package/dist/vector-search.d.ts +13 -0
- package/dist/vector-search.js +24 -0
- package/dist/vector-search.js.map +1 -1
- package/package.json +1 -1
- package/src/__tests__/db.test.ts +102 -0
- package/src/__tests__/hybrid-search.test.ts +408 -0
- package/src/__tests__/index.test.ts +196 -0
- package/src/__tests__/parser.test.ts +52 -1
- package/src/__tests__/reindex.test.ts +78 -0
- package/src/__tests__/vector-search.test.ts +41 -0
- package/src/db.ts +17 -5
- package/src/hybrid-search.ts +182 -1
- package/src/index.ts +30 -2
- package/src/parser.ts +22 -1
- package/src/reindex.ts +1 -0
- package/src/search.ts +38 -0
- package/src/vector-search.ts +26 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "ralph-knowledge",
|
|
3
|
-
"version": "0.1.27",
|
|
3
|
+
"version": "0.1.29",
|
|
4
4
|
"description": "Knowledge graph for ralph-hero: semantic search, relationship traversal, and document indexing across thoughts/ documents. Optional companion to ralph-hero.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Chad Dubiel",
|
package/.mcp.json
CHANGED
package/README.md
CHANGED
|
@@ -107,3 +107,35 @@ fast-path before any matcher is consulted.
|
|
|
107
107
|
| `RALPH_KNOWLEDGE_CONFIG` | Override path to `knowledge.config.json` (tilde expanded). |
|
|
108
108
|
| `RALPH_KNOWLEDGE_DIRS` | Comma-separated list of roots. Beats config, loses to CLI. |
|
|
109
109
|
| `RALPH_KNOWLEDGE_DB` | Override SQLite path. Beats `config.dbPath`, loses to a CLI `.db` positional. |
|
|
110
|
+
|
|
111
|
+
## Benchmarks
|
|
112
|
+
|
|
113
|
+
Standalone benchmarks live under [`benchmark/`](./benchmark/) — see
|
|
114
|
+
[`benchmark/README.md`](./benchmark/README.md) for the directory's conventions
|
|
115
|
+
(scripts are not part of the published npm package and are not run by
|
|
116
|
+
`vitest`).
|
|
117
|
+
|
|
118
|
+
### Reranker benchmark (GH-901)
|
|
119
|
+
|
|
120
|
+
[`benchmark/reranker-bench.ts`](./benchmark/reranker-bench.ts) compares two
|
|
121
|
+
ONNX cross-encoder rerankers loaded via the existing `@huggingface/transformers`
|
|
122
|
+
v3 dependency:
|
|
123
|
+
|
|
124
|
+
- `onnx-community/bge-reranker-v2-m3-ONNX` (int8 quantized) — primary candidate
|
|
125
|
+
- `Xenova/ms-marco-MiniLM-L-6-v2` — speed baseline
|
|
126
|
+
|
|
127
|
+
For ~44 sample queries spanning the five query intent classes (prior-work
|
|
128
|
+
topic, plan-by-issue lookup, claim evidence, epic context, hero orientation),
|
|
129
|
+
the script fetches top-20 RRF candidates, reranks each candidate set with both
|
|
130
|
+
models, and writes a TSV table with cold-start latency, p50/p95 per-pair
|
|
131
|
+
latency, batch-of-20 latency, RSS memory delta, and top-3 agreement vs
|
|
132
|
+
RRF-only. Results land at `benchmark/results-YYYY-MM-DD.tsv`; the most recent
|
|
133
|
+
run is checked into the repo.
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
RALPH_KNOWLEDGE_DB=~/.ralph-hero/knowledge.db \
|
|
137
|
+
npx tsx plugin/ralph-knowledge/benchmark/reranker-bench.ts
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
The script does not modify `hybrid-search.ts` — production wiring of a
|
|
141
|
+
default reranker is a separate followup gated on the benchmark's findings.
|
|
File without changes
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# ralph-knowledge benchmarks
|
|
2
|
+
|
|
3
|
+
Standalone benchmark scripts that exercise the ralph-knowledge runtime against
|
|
4
|
+
the live `knowledge.db`. They import from `../src/` but are NOT part of the
|
|
5
|
+
published npm package and are NOT executed by the test suite (`vitest`).
|
|
6
|
+
|
|
7
|
+
The `benchmark/` directory is excluded from `tsconfig.json`'s `include`
|
|
8
|
+
glob, so adding a script here will not change the `npm run build` output and
|
|
9
|
+
will not break the CI matrix on Node 18/20/22.
|
|
10
|
+
|
|
11
|
+
## Running
|
|
12
|
+
|
|
13
|
+
Each script is a standalone TypeScript file that can be run directly with
|
|
14
|
+
`tsx` (already a transitive devDependency via `vitest` — no install required):
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
# From plugin/ralph-knowledge (from the repo root, prefix the path with plugin/ralph-knowledge/):
|
|
18
|
+
npx tsx benchmark/reranker-bench.ts
|
|
19
|
+
|
|
20
|
+
# Or, equivalently, with the node loader form:
|
|
21
|
+
node --import tsx benchmark/reranker-bench.ts
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Scripts read the same `RALPH_KNOWLEDGE_DB` env var as the MCP server, so by
|
|
25
|
+
default they target `~/.ralph-hero/knowledge.db`.
|
|
26
|
+
|
|
27
|
+
## Scripts
|
|
28
|
+
|
|
29
|
+
### `reranker-bench.ts` (GH-901)
|
|
30
|
+
|
|
31
|
+
Benchmarks two ONNX cross-encoder rerankers loaded via `@huggingface/transformers`:
|
|
32
|
+
|
|
33
|
+
- `onnx-community/bge-reranker-v2-m3-ONNX` (int8 quantized) — primary candidate
|
|
34
|
+
- `Xenova/ms-marco-MiniLM-L-6-v2` — speed baseline
|
|
35
|
+
|
|
36
|
+
Draws a hard-coded set of ~44 sample queries spanning the five query intent
|
|
37
|
+
classes from the Phase 3 research (prior-work topic, plan-by-issue lookup,
|
|
38
|
+
claim evidence, epic context, hero orientation), runs `HybridSearch.search()`
|
|
39
|
+
to fetch top-20 RRF candidates per query, then reranks the candidates with
|
|
40
|
+
each loaded model. Captures cold-start latency, p50/p95 per-pair latency,
|
|
41
|
+
batch-of-20 latency, RSS memory delta, and top-3 agreement vs RRF-only.
|
|
42
|
+
|
|
43
|
+
Results are written as a TSV file at `benchmark/results-YYYY-MM-DD.tsv` and
|
|
44
|
+
echoed to stdout as a human-readable summary table. Models that fail to
|
|
45
|
+
download or load are reported with a `notes` column entry rather than aborting
|
|
46
|
+
the entire run.
|
|
47
|
+
|
|
48
|
+
The script is purely additive — it does not modify `hybrid-search.ts` or any
|
|
49
|
+
production source file. Production wiring of a default reranker is a separate
|
|
50
|
+
followup gated on the benchmark findings.
|
|
@@ -0,0 +1,511 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Phase 4 (GH-901) — Benchmark local cross-encoder rerankers on M5 Pro.
|
|
3
|
+
*
|
|
4
|
+
* Loads two ONNX cross-encoder rerankers via the existing
|
|
5
|
+
* `@huggingface/transformers` v3 dependency, runs each over the top-20 RRF
|
|
6
|
+
* candidates from a hard-coded sample-query set, and writes a TSV results
|
|
7
|
+
* table covering cold-start load, per-pair latency p50/p95, batch latency,
|
|
8
|
+
* RSS memory delta, and top-3 agreement vs RRF-only.
|
|
9
|
+
*
|
|
10
|
+
* NOT wired into `hybrid-search.ts` — production wiring is a separate
|
|
11
|
+
* followup gated on the table's findings (see plan §"What We're NOT Doing").
|
|
12
|
+
*
|
|
13
|
+
* Run with:
|
|
14
|
+
* npx tsx plugin/ralph-knowledge/benchmark/reranker-bench.ts
|
|
15
|
+
*/
|
|
16
|
+
import { homedir } from "node:os";
|
|
17
|
+
import { join, dirname } from "node:path";
|
|
18
|
+
import { writeFileSync } from "node:fs";
|
|
19
|
+
import { fileURLToPath, pathToFileURL } from "node:url";
|
|
20
|
+
import {
|
|
21
|
+
AutoTokenizer,
|
|
22
|
+
AutoModelForSequenceClassification,
|
|
23
|
+
type PreTrainedTokenizer,
|
|
24
|
+
type PreTrainedModel,
|
|
25
|
+
} from "@huggingface/transformers";
|
|
26
|
+
import { KnowledgeDB } from "../src/db.js";
|
|
27
|
+
import { FtsSearch } from "../src/search.js";
|
|
28
|
+
import { VectorSearch } from "../src/vector-search.js";
|
|
29
|
+
import { HybridSearch } from "../src/hybrid-search.js";
|
|
30
|
+
import { embed } from "../src/embedder.js";
|
|
31
|
+
import type { SearchResult } from "../src/search.js";
|
|
32
|
+
|
|
33
|
+
/** Directory under the user's home that holds the default knowledge DB. */
const RALPH_HERO_HOME = join(homedir(), ".ralph-hero");

/** Database opened by `main()` when `RALPH_KNOWLEDGE_DB` is not set. */
const DEFAULT_DB_PATH = join(RALPH_HERO_HOME, "knowledge.db");

/** Number of RRF candidates requested per query and handed to each reranker. */
const TOP_K_CANDIDATES = 20;

/** Size of the leading window compared between the RRF and reranked orders. */
const TOP_AGREEMENT_K = 3;
|
|
36
|
+
|
|
37
|
+
/**
 * One reranker model under test. The HF model id resolves through the
 * transformers.js Hub cache (same backing store as the embedder), so no new
 * npm dependency or network setup is required beyond the first download.
 */
interface ModelSpec {
  /** Display name used in TSV + console output. */
  label: string;
  /**
   * Hugging Face model id. It is loaded with `AutoTokenizer.from_pretrained`
   * and `AutoModelForSequenceClassification.from_pretrained` (not through the
   * higher-level `pipeline(...)` factory).
   */
  modelId: string;
  /**
   * Optional dtype forwarded to
   * `AutoModelForSequenceClassification.from_pretrained`. `'q8'` requests the
   * int8 quantized ONNX variant when the repo ships one. Leave it unset to let
   * transformers.js pick its default weights.
   */
  dtype?: "fp32" | "fp16" | "q8" | "int8" | "uint8" | "q4" | "bnb4" | "auto";
}
|
|
54
|
+
|
|
55
|
+
/** Rerankers to benchmark, in the order they are loaded and measured. */
const MODELS: ModelSpec[] = [
  // Primary candidate: the int8-quantized weights are requested explicitly.
  {
    modelId: "onnx-community/bge-reranker-v2-m3-ONNX",
    label: "bge-reranker-v2-m3-ONNX-int8",
    dtype: "q8",
  },
  // Speed baseline: no dtype is set, so transformers.js picks its default
  // weights. The model is already tiny, and not forcing a q8 variant avoids a
  // load failure if that variant isn't packaged in this revision.
  {
    modelId: "Xenova/ms-marco-MiniLM-L-6-v2",
    label: "ms-marco-MiniLM-L-6-v2",
  },
];
|
|
69
|
+
|
|
70
|
+
// The sample set is grouped by the five query intent classes from the
// Phase 3 research (GH-900). The groups skew toward generic ralph-knowledge /
// ralph-hero corpus topics so the RRF retriever returns non-empty results on
// a representative dev DB.

/** 12 prior-work topic queries. */
const PRIOR_WORK_QUERIES: string[] = [
  "hybrid search RRF fusion",
  "MMR diversity reranking",
  "cross-encoder reranker latency",
  "calibration of search scores",
  "platt scaling for retrieval",
  "softmax temperature in reranking",
  "BGE reranker BAAI multilingual",
  "transformers.js ONNX runtime apple silicon",
  "sqlite-vec cosine distance",
  "FTS5 BM25 ranking sqlite",
  "chunked embeddings dream loop",
  "contextual retrieval anthropic",
];

/** 8 plan-by-issue lookups. */
const PLAN_LOOKUP_QUERIES: string[] = [
  "plan for ralph-knowledge stage-2 reranker",
  "plan GH-902 MMR diversity",
  "plan GH-899 RRF observability",
  "plan GH-901 cross-encoder benchmark",
  "plan GH-900 labeling effort scope",
  "plan GH-761 chunked embeddings",
  "plan epic ralph-hero token resolution",
  "plan hello skill output budget",
];

/** 8 claim evidence queries. */
const CLAIM_EVIDENCE_QUERIES: string[] = [
  "evidence MMR demotes near duplicates",
  "evidence cross-encoder beats RRF",
  "evidence platt calibration improves NDCG",
  "evidence isotonic regression sample floor",
  "evidence LambdaMART labeled data requirement",
  "evidence Qwen3 reranker MTEB-R score",
  "evidence transformers.js cpu latency",
  "evidence sqlite-vec POINT query plan",
];

/** 8 epic context queries. */
const EPIC_CONTEXT_QUERIES: string[] = [
  "ralph-knowledge epic stage-2 capabilities",
  "ralph-hero workflow state machine epic",
  "knowledge graph traversal epic",
  "memory tier dream loop epic",
  "outcome events search feedback epic",
  "github projects v2 automation epic",
  "claude code plugin architecture epic",
  "stream-based parallel implementation epic",
];

/** 8 hero orientation queries. */
const HERO_ORIENTATION_QUERIES: string[] = [
  "what does ralph-hero do",
  "how to add a new skill to ralph-hero",
  "how to run ralph-knowledge tests",
  "how to debug MCP server stdio",
  "how to create a new agent for ralph-hero",
  "what is the ralph workflow state machine",
  "how to wire a new tool into hybrid search",
  "how plan agents dispatch impl agents",
];

/**
 * Hard-coded benchmark queries: the five intent groups above concatenated in
 * a fixed order (prior-work, plan lookup, claim evidence, epic context, hero
 * orientation). Total = 44.
 */
const SAMPLE_QUERIES: string[] = [
  ...PRIOR_WORK_QUERIES,
  ...PLAN_LOOKUP_QUERIES,
  ...CLAIM_EVIDENCE_QUERIES,
  ...EPIC_CONTEXT_QUERIES,
  ...HERO_ORIENTATION_QUERIES,
];
|
|
129
|
+
|
|
130
|
+
/**
 * One aggregated output row per model; the field names double as the TSV
 * column headers. Numeric metrics are all 0 when the model failed to load.
 */
type ModelResult = {
  /** Display label of the model (`ModelSpec.label`). */
  model: string;
  /** Rounded wall-clock ms for tokenizer + model load plus the warmup inference. */
  cold_start_ms: number;
  /** Median per-pair latency across successfully reranked queries. */
  latency_p50_ms: number;
  /** 95th-percentile per-pair latency across successfully reranked queries. */
  latency_p95_ms: number;
  /** Median wall-clock ms to rerank one query's whole candidate batch. */
  batch_top20_p50_ms: number;
  /** Process RSS growth (MB) between just before load and just after warmup. */
  memory_rss_delta_mb: number;
  /** Mean top-3 agreement with the RRF-only ordering. */
  top3_agreement_avg: number;
  /** Failure notes joined with "; ", or a sample-count summary when clean. */
  notes: string;
};
|
|
141
|
+
|
|
142
|
+
/**
 * Raw measurements for one successfully reranked query, collected before
 * percentiling.
 */
interface PerQueryMeasurement {
  /**
   * Wall-clock ms to tokenize, score, and read back logits for the query's
   * whole candidate batch (up to `TOP_K_CANDIDATES` (query, doc) pairs).
   */
  batchMs: number;
  /**
   * `batchMs` divided by the number of pairs actually scored for this query.
   * That count can be smaller than `TOP_K_CANDIDATES` when retrieval returned
   * fewer candidates.
   */
  perPairMs: number;
  /** Top-K agreement vs the RRF-only ordering of the same candidates. */
  top3Agreement: number;
}
|
|
153
|
+
|
|
154
|
+
/**
 * Nearest-rank percentile of an ascending-sorted sample.
 *
 * Returns the smallest element such that at least `p * 100`% of the sample is
 * less than or equal to it, i.e. the element at 1-based rank `ceil(p * n)`.
 * The previous `floor(p * n)` indexing was one rank too high whenever `p * n`
 * was an integer (for example, the "median" of two samples was the larger one
 * and the p95 of 20 samples was the maximum).
 *
 * @param sorted Sample values, already sorted ascending. Not modified.
 * @param p Fraction in [0, 1]. Values <= 0 (or NaN) return the minimum and
 *   values >= 1 return the maximum.
 * @returns The selected sample value, or 0 for an empty sample.
 */
function percentile(sorted: number[], p: number): number {
  const n = sorted.length;
  if (n === 0) return 0;
  // `!(p > 0)` also catches NaN, which would otherwise index with NaN.
  if (!(p > 0)) return sorted[0];
  if (p >= 1) return sorted[n - 1];
  // The small epsilon absorbs floating-point noise such as
  // 100 * 0.07 === 7.000000000000001, which would otherwise round up a rank.
  const rank = Math.ceil(n * p - 1e-9);
  const idx = Math.min(n - 1, Math.max(0, rank - 1));
  return sorted[idx];
}
|
|
162
|
+
|
|
163
|
+
/** Convert a byte count to megabytes, using 1 MB = 1024 * 1024 bytes. */
function bytesToMb(bytes: number): number {
  const bytesPerMb = 1024 * 1024;
  return bytes / bytesPerMb;
}
|
|
166
|
+
|
|
167
|
+
/**
 * Cap a document string before it is handed to the tokenizer.
 *
 * The tokenizer is invoked with `truncation: true`, so over-long inputs are
 * truncated there as well. Capping the raw string first keeps memory and
 * tokenization cost predictable across models with different maximum
 * sequence lengths.
 *
 * The cut never lands inside a UTF-16 surrogate pair. If the last kept code
 * unit would be a high surrogate whose low surrogate is being dropped, the
 * high surrogate is dropped too, so the result never ends in half a
 * character.
 *
 * @param s Text to cap.
 * @param maxChars Maximum length in UTF-16 code units (default 1000).
 * @returns `s` unchanged when it already fits, otherwise a prefix of at most
 *   `maxChars` code units.
 */
function truncateForRerank(s: string, maxChars = 1000): string {
  if (s.length <= maxChars) return s;
  let end = maxChars;
  if (end > 0) {
    const lastKept = s.charCodeAt(end - 1);
    const firstDropped = s.charCodeAt(end);
    const splitsPair =
      lastKept >= 0xd800 &&
      lastKept <= 0xdbff &&
      firstDropped >= 0xdc00 &&
      firstDropped <= 0xdfff;
    if (splitsPair) end -= 1;
  }
  return s.slice(0, end);
}
|
|
177
|
+
|
|
178
|
+
/**
 * Produce the two parallel arrays a cross-encoder tokenizer call needs for
 * one query: `texts[i]` is always the query and `textPairs[i]` is the i-th
 * candidate rendered as `title\nsnippet`, capped by `truncateForRerank`.
 * Title and snippet are combined so the cross-encoder sees the same anchor
 * the embedder used (title is the strongest semantic anchor in this corpus).
 *
 * The shape matches `tokenizer(texts, { text_pair, padding, truncation })`
 * (see the AutoTokenizer encode signature in transformers.js, tokenizers.js
 * `_encode_plus`). Per the original author's note, the direct tokenizer +
 * model path is used because the higher-level
 * `pipeline('text-classification', ...)` callback accepts only one text per
 * input and coerces `{text, text_pair}` objects to strings, which yielded a
 * constant `score=1` for every pair.
 *
 * @param query The search query, repeated once per candidate.
 * @param candidates Candidate results, in the order they should be scored.
 * @returns Equal-length `texts` and `textPairs` arrays.
 */
function buildPairs(
  query: string,
  candidates: SearchResult[],
): { texts: string[]; textPairs: string[] } {
  const textPairs: string[] = Array.from(candidates, (candidate) =>
    truncateForRerank(`${candidate.title}\n${candidate.snippet}`),
  );
  const texts: string[] = Array.from(textPairs, () => query);
  return { texts, textPairs };
}
|
|
205
|
+
|
|
206
|
+
/**
 * Compute top-K agreement between the RRF ordering and a reranked ordering:
 * |intersection of the two top-K id sets| / K.
 *
 * `rerankedOrder` is the candidate index order the reranker produced (best
 * first). The RRF baseline order is `[0, 1, ..., n-1]` since `candidates` is
 * already RRF-sorted.
 *
 * When fewer than `k` candidates are available, the comparison window shrinks
 * to the number of positions both orderings actually have. Dividing by the
 * nominal `k` in that case would cap the score below 1 even for identical
 * orderings.
 *
 * @param candidates RRF-sorted candidates.
 * @param rerankedOrder Indices into `candidates`, best first.
 * @param k Nominal size of the top window.
 * @returns Agreement in [0, 1], or 0 when there is nothing to compare.
 */
function topKAgreement(
  candidates: SearchResult[],
  rerankedOrder: number[],
  k: number,
): number {
  const window = Math.min(k, candidates.length, rerankedOrder.length);
  // `!(window > 0)` also rejects a NaN k.
  if (!(window > 0)) return 0;
  const rrfTop = new Set(candidates.slice(0, window).map((c) => c.id));
  const rerTop = new Set(
    rerankedOrder.slice(0, window).map((idx) => candidates[idx].id),
  );
  let intersect = 0;
  for (const id of rerTop) if (rrfTop.has(id)) intersect++;
  return intersect / window;
}
|
|
225
|
+
|
|
226
|
+
/**
 * Turn any thrown value into a single-line message. Non-`Error` throwables
 * (strings, numbers, plain objects) are stringified rather than
 * dereferenced, so building a note can never itself throw inside a `catch`.
 */
function describeRerankError(e: unknown): string {
  const raw = e instanceof Error ? e.message : String(e);
  return raw.replace(/\s+/g, " ").trim();
}

/**
 * Build a `ModelResult` whose latency and agreement metrics are all zero.
 * Used for rows where no per-query measurement is available.
 */
function zeroedModelResult(
  label: string,
  notes: string,
  coldStartMs = 0,
  memDeltaMb = 0,
): ModelResult {
  return {
    model: label,
    cold_start_ms: Math.round(coldStartMs),
    latency_p50_ms: 0,
    latency_p95_ms: 0,
    batch_top20_p50_ms: 0,
    memory_rss_delta_mb: Number(memDeltaMb.toFixed(1)),
    top3_agreement_avg: 0,
    notes,
  };
}

/**
 * Run a single reranker model against the per-query candidate sets.
 *
 * Steps:
 *  1. Load tokenizer + model and run one warmup inference; the elapsed time
 *     is reported as cold start and the RSS growth as the memory delta.
 *  2. For every query with at least one candidate, tokenize the
 *     (query, doc) pairs as one batch, score them, and record batch latency,
 *     per-pair latency, and top-K agreement with the RRF order.
 *  3. Aggregate to p50/p95 per-pair latency, p50 batch latency, and mean
 *     agreement.
 *
 * Failures never throw: a load failure returns a zeroed row whose notes start
 * with "model load failed", and warmup / per-query failures are recorded in
 * `notes` (the first three query failures verbatim, then a count).
 *
 * @param spec Model under test.
 * @param perQueryCandidates RRF-sorted candidates per query; entries with no
 *   candidates are skipped and are not counted as attempted.
 * @returns The aggregated row for this model.
 */
async function benchmarkModel(
  spec: ModelSpec,
  perQueryCandidates: Array<{ query: string; candidates: SearchResult[] }>,
): Promise<ModelResult> {
  const notes: string[] = [];
  const rssBefore = process.memoryUsage().rss;

  // ---- Cold-start (load + first inference) ----
  const loadStart = performance.now();
  let tokenizer: PreTrainedTokenizer;
  let model: PreTrainedModel;
  try {
    tokenizer = await AutoTokenizer.from_pretrained(spec.modelId);
    model = await AutoModelForSequenceClassification.from_pretrained(
      spec.modelId,
      spec.dtype ? { dtype: spec.dtype } : {},
    );
  } catch (e) {
    // Callers detect this case by the "model load failed" prefix.
    return zeroedModelResult(
      spec.label,
      `model load failed: ${describeRerankError(e)}`,
    );
  }

  // First-inference penalty (model warmup): use the first query that has
  // candidates.
  const firstNonEmpty = perQueryCandidates.find((q) => q.candidates.length > 0);
  if (firstNonEmpty) {
    try {
      const { texts: warmT, textPairs: warmP } = buildPairs(
        firstNonEmpty.query,
        firstNonEmpty.candidates,
      );
      const inputs = await tokenizer(warmT, {
        text_pair: warmP,
        padding: true,
        truncation: true,
      });
      await model(inputs);
    } catch (e) {
      notes.push(`warmup-failed: ${describeRerankError(e).slice(0, 80)}`);
    }
  }
  const coldStartMs = performance.now() - loadStart;
  const memDeltaMb = bytesToMb(process.memoryUsage().rss - rssBefore);

  // ---- Per-query measurement loop ----
  const measurements: PerQueryMeasurement[] = [];
  let attempted = 0;
  let queryFailures = 0;
  for (const { query, candidates } of perQueryCandidates) {
    if (candidates.length === 0) continue;
    attempted++;
    const { texts, textPairs } = buildPairs(query, candidates);
    const start = performance.now();
    let logitsList: number[];
    try {
      const inputs = await tokenizer(texts, {
        text_pair: textPairs,
        padding: true,
        truncation: true,
      });
      const outputs = await model(inputs);
      // outputs.logits is a Tensor with shape [batch, num_labels]. Cross-
      // encoder rerankers ship a single-label head, so logits is [batch, 1]
      // and `.tolist()` yields number[][]. Only the first value of each row
      // is used as the relevance score. No softmax is applied, so for a
      // multi-label head this is the raw logit of label 0.
      const logits = outputs.logits as { tolist: () => number[][] };
      logitsList = logits.tolist().map((row) => (row.length > 0 ? row[0] : 0));
    } catch (e) {
      queryFailures++;
      if (queryFailures <= 3) {
        notes.push(`query-failed: ${describeRerankError(e).slice(0, 80)}`);
      }
      continue;
    }
    const batchMs = performance.now() - start;

    // Map each candidate idx -> logit, sort desc.
    const rerankedOrder = logitsList
      .map((score, idx) => ({ idx, score }))
      .sort((a, b) => b.score - a.score)
      .map((s) => s.idx);
    measurements.push({
      batchMs,
      perPairMs: batchMs / texts.length,
      top3Agreement: topKAgreement(candidates, rerankedOrder, TOP_AGREEMENT_K),
    });
  }

  if (measurements.length === 0) {
    return zeroedModelResult(
      spec.label,
      notes.length > 0 ? notes.join("; ") : "no successful query measurements",
      coldStartMs,
      memDeltaMb,
    );
  }

  const perPairSorted = measurements
    .map((m) => m.perPairMs)
    .sort((a, b) => a - b);
  const batchSorted = measurements.map((m) => m.batchMs).sort((a, b) => a - b);
  const agreementAvg =
    measurements.reduce((s, m) => s + m.top3Agreement, 0) /
    measurements.length;

  if (queryFailures > 0) {
    // The denominator is the number of queries actually sent to the
    // reranker; queries without candidates were never attempted.
    notes.push(`${queryFailures}/${attempted} queries failed during rerank`);
  }

  return {
    model: spec.label,
    cold_start_ms: Math.round(coldStartMs),
    latency_p50_ms: Number(percentile(perPairSorted, 0.5).toFixed(2)),
    latency_p95_ms: Number(percentile(perPairSorted, 0.95).toFixed(2)),
    batch_top20_p50_ms: Number(percentile(batchSorted, 0.5).toFixed(2)),
    memory_rss_delta_mb: Number(memDeltaMb.toFixed(1)),
    top3_agreement_avg: Number(agreementAvg.toFixed(3)),
    notes:
      notes.length > 0 ? notes.join("; ") : `n=${measurements.length} queries`,
  };
}
|
|
378
|
+
|
|
379
|
+
/**
 * Serialize result rows as a TSV document: one header line, one line per
 * row, and a trailing newline.
 *
 * Tabs, carriage returns, and newlines inside a cell are collapsed to a
 * single space. `notes` carries raw error messages, which may span several
 * lines or contain tabs and would otherwise split a row or shift columns.
 *
 * @param rows Rows to serialize, in output order.
 * @returns The TSV text.
 */
function formatTsv(rows: ModelResult[]): string {
  const columns = [
    "model",
    "cold_start_ms",
    "latency_p50_ms",
    "latency_p95_ms",
    "batch_top20_p50_ms",
    "memory_rss_delta_mb",
    "top3_agreement_avg",
    "notes",
  ] as const;
  // Mirrors Array.prototype.join for null/undefined (empty cell), then
  // strips the characters that act as TSV delimiters.
  const toCell = (value: string | number | null | undefined): string =>
    value == null ? "" : String(value).replace(/[\t\r\n]+/g, " ");

  const lines = [columns.join("\t")];
  for (const r of rows) {
    lines.push(columns.map((column) => toCell(r[column])).join("\t"));
  }
  return lines.join("\n") + "\n";
}
|
|
407
|
+
|
|
408
|
+
/**
 * Print a human-readable dump of the results to stdout: a banner, then for
 * each model a `[label]` heading followed by one `name : value` line per
 * metric, then a blank line. The TSV file is the machine-readable form.
 *
 * @param rows Result rows, printed in the given order.
 */
function printSummary(rows: ModelResult[]): void {
  const metricFields = [
    "cold_start_ms",
    "latency_p50_ms",
    "latency_p95_ms",
    "batch_top20_p50_ms",
    "memory_rss_delta_mb",
    "top3_agreement_avg",
    "notes",
  ] as const;

  console.log("\n=== Reranker Benchmark Results ===");
  rows.forEach((row) => {
    console.log(`\n[${row.model}]`);
    metricFields.forEach((field) => {
      console.log(` ${field} : ${row[field]}`);
    });
  });
  console.log("");
}
|
|
423
|
+
|
|
424
|
+
/**
 * Today's date as `YYYY-MM-DD`, taken from the first ten characters of the
 * current instant's ISO-8601 (UTC) timestamp.
 */
function isoDate(): string {
  const timestamp = new Date().toISOString();
  return timestamp.substring(0, 10); // YYYY-MM-DD
}
|
|
427
|
+
|
|
428
|
+
/**
 * Entry point of the benchmark.
 *
 * Opens the knowledge DB (`RALPH_KNOWLEDGE_DB`, else the default path),
 * collects the top RRF candidates for every sample query, benchmarks each
 * model in `MODELS` one after another, writes `results-YYYY-MM-DD.tsv` next
 * to this script, and prints a summary.
 *
 * Exits the process with status 1 when no query returns candidates or when
 * every model fails to load.
 */
export async function main(): Promise<void> {
  const dbPath = process.env.RALPH_KNOWLEDGE_DB ?? DEFAULT_DB_PATH;
  console.log(`reranker-bench: opening DB at ${dbPath}`);
  const db = new KnowledgeDB(dbPath);
  const hybrid = new HybridSearch(
    db,
    new FtsSearch(db),
    new VectorSearch(db),
    embed,
  );

  // A failed retrieval is logged and treated as "no candidates".
  const fetchCandidates = async (q: string): Promise<SearchResult[]> => {
    try {
      return await hybrid.search(q, { limit: TOP_K_CANDIDATES });
    } catch (e) {
      console.warn(` query failed: "${q}" — ${(e as Error).message}`);
      return [];
    }
  };

  // The candidate sets are computed once, before any reranker is loaded, so
  // every model is measured against identical inputs. Queries with no
  // candidates stay in the list so per-query iteration lines up across runs.
  console.log(
    `reranker-bench: pre-computing RRF candidates for ${SAMPLE_QUERIES.length} queries...`,
  );
  const perQueryCandidates: Array<{ query: string; candidates: SearchResult[] }> =
    [];
  for (const query of SAMPLE_QUERIES) {
    perQueryCandidates.push({ query, candidates: await fetchCandidates(query) });
  }
  const nonEmpty = perQueryCandidates.filter(
    (entry) => entry.candidates.length > 0,
  ).length;
  console.log(
    ` ${nonEmpty}/${SAMPLE_QUERIES.length} queries returned candidates`,
  );

  if (nonEmpty === 0) {
    console.error(
      "reranker-bench: no queries returned RRF candidates — is the DB indexed?",
    );
    process.exit(1);
  }

  // `benchmarkModel` marks a load failure with this prefix in `notes`.
  const loadFailed = (row: ModelResult): boolean =>
    row.notes.startsWith("model load failed");

  // Models run strictly one at a time: loading two ONNX models in parallel
  // would confound the cold-start and RSS-delta measurements.
  const results: ModelResult[] = [];
  for (const spec of MODELS) {
    console.log(`\nreranker-bench: loading ${spec.label} (${spec.modelId})...`);
    const row = await benchmarkModel(spec, perQueryCandidates);
    results.push(row);
    if (loadFailed(row)) {
      console.warn(` ${spec.label}: ${row.notes}`);
      continue;
    }
    console.log(
      ` ${spec.label}: cold_start=${row.cold_start_ms}ms, p50=${row.latency_p50_ms}ms/pair, agreement=${row.top3_agreement_avg}`,
    );
  }

  // The TSV lands in the same directory as this script.
  const scriptDir = dirname(fileURLToPath(import.meta.url));
  const outPath = join(scriptDir, `results-${isoDate()}.tsv`);
  writeFileSync(outPath, formatTsv(results), "utf8");
  console.log(`\nreranker-bench: wrote ${outPath}`);

  printSummary(results);

  // A partial success still exits 0; only a total load failure is fatal.
  if (results.every(loadFailed)) {
    console.error("reranker-bench: every model failed to load — exiting 1");
    process.exit(1);
  }
}
|
|
500
|
+
|
|
501
|
+
// Run `main()` only when this file is the process entry point. When the
// module is merely imported (e.g., by a future suite that compares runs
// across hardware revisions), nothing executes at import time.
const entryPointHref = pathToFileURL(process.argv[1] ?? "").href;
const invokedDirectly = entryPointHref === import.meta.url;
if (invokedDirectly) {
  const reportFatal = (e: unknown): void => {
    console.error("reranker-bench: fatal error", e);
    process.exit(1);
  };
  main().catch(reportFatal);
}
|
package/dist/db.d.ts
CHANGED
package/dist/db.js
CHANGED
|
@@ -127,13 +127,23 @@ export class KnowledgeDB {
|
|
|
127
127
|
}
|
|
128
128
|
}
|
|
129
129
|
upsertDocument(doc) {
|
|
130
|
+
// memoryTier is intentionally optional: callers that don't pass it get the
|
|
131
|
+
// SQL column default ('doc') on insert and preserve the existing value on
|
|
132
|
+
// update via COALESCE. This keeps existing test fixtures and any future
|
|
133
|
+
// call sites that don't care about tiers compiling without changes, while
|
|
134
|
+
// still letting reindex.ts forward the parsed value through.
|
|
135
|
+
const params = {
|
|
136
|
+
...doc,
|
|
137
|
+
memoryTier: doc.memoryTier ?? null,
|
|
138
|
+
};
|
|
130
139
|
this.db.prepare(`
|
|
131
|
-
INSERT INTO documents (id, path, title, date, type, status, github_issue, content, is_stub)
|
|
132
|
-
VALUES (@id, @path, @title, @date, @type, @status, @githubIssue, @content, 0)
|
|
140
|
+
INSERT INTO documents (id, path, title, date, type, status, github_issue, content, is_stub, memory_tier)
|
|
141
|
+
VALUES (@id, @path, @title, @date, @type, @status, @githubIssue, @content, 0, COALESCE(@memoryTier, 'doc'))
|
|
133
142
|
ON CONFLICT(id) DO UPDATE SET
|
|
134
143
|
path = @path, title = @title, date = @date, type = @type,
|
|
135
|
-
status = @status, github_issue = @githubIssue, content = @content, is_stub = 0
|
|
136
|
-
`).run(doc);
|
144
|
+
status = @status, github_issue = @githubIssue, content = @content, is_stub = 0,
|
|
145
|
+
memory_tier = COALESCE(@memoryTier, memory_tier)
|
|
146
|
+
`).run(params);
|
|
137
147
|
}
|
|
138
148
|
/**
|
|
139
149
|
* Creates a stub document for an unresolved wikilink target.
|