qmdr 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. package/.claude-plugin/marketplace.json +29 -0
  2. package/.env.example +85 -0
  3. package/.gitattributes +3 -0
  4. package/.github/workflows/release.yml +77 -0
  5. package/AI-SETUP.md +466 -0
  6. package/LICENSE +22 -0
  7. package/README.md +78 -0
  8. package/bun.lock +637 -0
  9. package/docs/README-zh.md +78 -0
  10. package/docs/refactor-checklist.md +54 -0
  11. package/docs/setup-openclaw.md +139 -0
  12. package/example-index.yml +33 -0
  13. package/finetune/BALANCED_DISTRIBUTION.md +157 -0
  14. package/finetune/DATA_IMPROVEMENTS.md +218 -0
  15. package/finetune/Justfile +43 -0
  16. package/finetune/Modelfile +16 -0
  17. package/finetune/README.md +299 -0
  18. package/finetune/SCORING.md +286 -0
  19. package/finetune/configs/accelerate_multi_gpu.yaml +17 -0
  20. package/finetune/configs/grpo.yaml +49 -0
  21. package/finetune/configs/sft.yaml +42 -0
  22. package/finetune/configs/sft_local.yaml +40 -0
  23. package/finetune/convert_gguf.py +221 -0
  24. package/finetune/data/best_glm_prompt.txt +17 -0
  25. package/finetune/data/gepa_generated.prompts.json +32 -0
  26. package/finetune/data/qmd_expansion_balanced_deduped.jsonl +413 -0
  27. package/finetune/data/qmd_expansion_diverse_addon.jsonl +386 -0
  28. package/finetune/data/qmd_expansion_handcrafted.jsonl +65 -0
  29. package/finetune/data/qmd_expansion_handcrafted_only.jsonl +336 -0
  30. package/finetune/data/qmd_expansion_locations.jsonl +64 -0
  31. package/finetune/data/qmd_expansion_people.jsonl +46 -0
  32. package/finetune/data/qmd_expansion_short_nontech.jsonl +200 -0
  33. package/finetune/data/qmd_expansion_v2.jsonl +1498 -0
  34. package/finetune/data/qmd_only_sampled.jsonl +399 -0
  35. package/finetune/dataset/analyze_data.py +369 -0
  36. package/finetune/dataset/clean_data.py +906 -0
  37. package/finetune/dataset/generate_balanced.py +823 -0
  38. package/finetune/dataset/generate_data.py +714 -0
  39. package/finetune/dataset/generate_data_offline.py +206 -0
  40. package/finetune/dataset/generate_diverse.py +441 -0
  41. package/finetune/dataset/generate_ollama.py +326 -0
  42. package/finetune/dataset/prepare_data.py +197 -0
  43. package/finetune/dataset/schema.py +73 -0
  44. package/finetune/dataset/score_data.py +115 -0
  45. package/finetune/dataset/validate_schema.py +104 -0
  46. package/finetune/eval.py +196 -0
  47. package/finetune/evals/queries.txt +56 -0
  48. package/finetune/gepa/__init__.py +1 -0
  49. package/finetune/gepa/best_prompt.txt +31 -0
  50. package/finetune/gepa/best_prompt_glm.txt +1 -0
  51. package/finetune/gepa/dspy_gepa.py +204 -0
  52. package/finetune/gepa/example.py +117 -0
  53. package/finetune/gepa/generate.py +129 -0
  54. package/finetune/gepa/gepa_outputs.jsonl +10 -0
  55. package/finetune/gepa/gepa_outputs_glm.jsonl +20 -0
  56. package/finetune/gepa/model.json +19 -0
  57. package/finetune/gepa/optimizer.py +70 -0
  58. package/finetune/gepa/score.py +84 -0
  59. package/finetune/jobs/eval.py +490 -0
  60. package/finetune/jobs/eval_common.py +354 -0
  61. package/finetune/jobs/eval_verbose.py +113 -0
  62. package/finetune/jobs/grpo.py +141 -0
  63. package/finetune/jobs/quantize.py +244 -0
  64. package/finetune/jobs/sft.py +121 -0
  65. package/finetune/pyproject.toml +23 -0
  66. package/finetune/reward.py +610 -0
  67. package/finetune/train.py +611 -0
  68. package/finetune/uv.lock +4070 -0
  69. package/flake.lock +61 -0
  70. package/flake.nix +83 -0
  71. package/migrate-schema.ts +162 -0
  72. package/package.json +56 -0
  73. package/skills/qmdr/SKILL.md +172 -0
  74. package/skills/qmdr/references/mcp-setup.md +88 -0
  75. package/src/app/commands/collection.ts +55 -0
  76. package/src/app/commands/context.ts +82 -0
  77. package/src/app/commands/document.ts +46 -0
  78. package/src/app/commands/maintenance.ts +60 -0
  79. package/src/app/commands/search.ts +45 -0
  80. package/src/app/ports/llm.ts +13 -0
  81. package/src/app/services/llm-service.ts +145 -0
  82. package/src/cli.test.ts +963 -0
  83. package/src/collections.ts +390 -0
  84. package/src/eval.test.ts +412 -0
  85. package/src/formatter.ts +427 -0
  86. package/src/llm.test.ts +559 -0
  87. package/src/llm.ts +1990 -0
  88. package/src/mcp.test.ts +889 -0
  89. package/src/mcp.ts +626 -0
  90. package/src/qmd.ts +3330 -0
  91. package/src/store/collections.ts +7 -0
  92. package/src/store/context.ts +10 -0
  93. package/src/store/db.ts +5 -0
  94. package/src/store/documents.ts +26 -0
  95. package/src/store/maintenance.ts +15 -0
  96. package/src/store/path.ts +13 -0
  97. package/src/store/search.ts +10 -0
  98. package/src/store-paths.test.ts +395 -0
  99. package/src/store.test.ts +2483 -0
  100. package/src/store.ts +2813 -0
  101. package/test/eval-harness.ts +223 -0
  102. package/tsconfig.json +29 -0
@@ -0,0 +1,412 @@
1
+ /**
2
+ * Evaluation Tests for QMD Search Quality
3
+ *
4
+ * Tests search quality against synthetic documents with known-answer queries.
5
+ * Validates that search improvements don't regress quality.
6
+ *
7
+ * Three test suites:
8
+ * 1. BM25 (FTS) - lexical search baseline
9
+ * 2. Vector Search - semantic search with embeddings
10
+ * 3. Hybrid (RRF) - combined lexical + vector with rank fusion
11
+ */
12
+
13
+ import { describe, test, expect, beforeAll, afterAll } from "bun:test";
14
+ import { mkdtempSync, rmSync, readFileSync, readdirSync } from "fs";
15
+ import { join } from "path";
16
+ import { tmpdir } from "os";
17
+ import Database from "bun:sqlite";
18
+
19
+ // Set INDEX_PATH before importing store to prevent using global index
20
+ const tempDir = mkdtempSync(join(tmpdir(), "qmd-eval-"));
21
+ process.env.INDEX_PATH = join(tempDir, "eval.sqlite");
22
+
23
+ import {
24
+ createStore,
25
+ searchFTS,
26
+ searchVec,
27
+ insertDocument,
28
+ insertContent,
29
+ insertEmbedding,
30
+ chunkDocumentByTokens,
31
+ reciprocalRankFusion,
32
+ DEFAULT_EMBED_MODEL,
33
+ type RankedResult,
34
+ } from "./store";
35
+ import { getDefaultLlamaCpp, formatDocForEmbedding, disposeDefaultLlamaCpp } from "./llm";
36
+
37
+ // Eval queries with expected documents
38
+ const evalQueries: {
39
+ query: string;
40
+ expectedDoc: string;
41
+ difficulty: "easy" | "medium" | "hard" | "fusion";
42
+ }[] = [
43
+ // EASY: Exact keyword matches
44
+ { query: "API versioning", expectedDoc: "api-design", difficulty: "easy" },
45
+ { query: "Series A fundraising", expectedDoc: "fundraising", difficulty: "easy" },
46
+ { query: "CAP theorem", expectedDoc: "distributed-systems", difficulty: "easy" },
47
+ { query: "overfitting machine learning", expectedDoc: "machine-learning", difficulty: "easy" },
48
+ { query: "remote work VPN", expectedDoc: "remote-work", difficulty: "easy" },
49
+ { query: "Project Phoenix retrospective", expectedDoc: "product-launch", difficulty: "easy" },
50
+
51
+ // MEDIUM: Semantic/conceptual queries
52
+ { query: "how to structure REST endpoints", expectedDoc: "api-design", difficulty: "medium" },
53
+ { query: "raising money for startup", expectedDoc: "fundraising", difficulty: "medium" },
54
+ { query: "consistency vs availability tradeoffs", expectedDoc: "distributed-systems", difficulty: "medium" },
55
+ { query: "how to prevent models from memorizing data", expectedDoc: "machine-learning", difficulty: "medium" },
56
+ { query: "working from home guidelines", expectedDoc: "remote-work", difficulty: "medium" },
57
+ { query: "what went wrong with the launch", expectedDoc: "product-launch", difficulty: "medium" },
58
+
59
+ // HARD: Vague, partial memory, indirect
60
+ { query: "nouns not verbs", expectedDoc: "api-design", difficulty: "hard" },
61
+ { query: "Sequoia investor pitch", expectedDoc: "fundraising", difficulty: "hard" },
62
+ { query: "Raft algorithm leader election", expectedDoc: "distributed-systems", difficulty: "hard" },
63
+ { query: "F1 score precision recall", expectedDoc: "machine-learning", difficulty: "hard" },
64
+ { query: "quarterly team gathering travel", expectedDoc: "remote-work", difficulty: "hard" },
65
+ { query: "beta program 47 bugs", expectedDoc: "product-launch", difficulty: "hard" },
66
+
67
+ // FUSION: Multi-signal queries that need both lexical AND semantic matching
68
+ // These should have weak individual scores but strong combined RRF scores
69
+ { query: "how much runway before running out of money", expectedDoc: "fundraising", difficulty: "fusion" },
70
+ { query: "datacenter replication sync strategy", expectedDoc: "distributed-systems", difficulty: "fusion" },
71
+ { query: "splitting data for training and testing", expectedDoc: "machine-learning", difficulty: "fusion" },
72
+ { query: "JSON response codes error messages", expectedDoc: "api-design", difficulty: "fusion" },
73
+ { query: "video calls camera async messaging", expectedDoc: "remote-work", difficulty: "fusion" },
74
+ { query: "CI/CD pipeline testing coverage", expectedDoc: "product-launch", difficulty: "fusion" },
75
+ ];
76
+
77
+ // Helper to check if result matches expected doc
78
+ function matchesExpected(filepath: string, expectedDoc: string): boolean {
79
+ return filepath.toLowerCase().includes(expectedDoc);
80
+ }
81
+
82
+ // Helper to calculate hit rate
83
+ function calcHitRate(
84
+ queries: typeof evalQueries,
85
+ searchFn: (query: string) => { filepath: string }[],
86
+ topK: number
87
+ ): number {
88
+ let hits = 0;
89
+ for (const { query, expectedDoc } of queries) {
90
+ const results = searchFn(query).slice(0, topK);
91
+ if (results.some(r => matchesExpected(r.filepath, expectedDoc))) hits++;
92
+ }
93
+ return hits / queries.length;
94
+ }
95
+
96
+ // =============================================================================
97
+ // BM25 (Lexical) Tests - Fast, no model loading needed
98
+ // =============================================================================
99
+
100
+ describe("BM25 Search (FTS)", () => {
101
+ let store: ReturnType<typeof createStore>;
102
+ let db: Database;
103
+
104
+ beforeAll(() => {
105
+ store = createStore();
106
+ db = store.db;
107
+
108
+ // Load and index eval documents
109
+ const evalDocsDir = join(import.meta.dir, "../test/eval-docs");
110
+ const files = readdirSync(evalDocsDir).filter(f => f.endsWith(".md"));
111
+
112
+ for (const file of files) {
113
+ const content = readFileSync(join(evalDocsDir, file), "utf-8");
114
+ const title = content.split("\n")[0]?.replace(/^#\s*/, "") || file;
115
+ const hash = Bun.hash(content).toString(16).slice(0, 12);
116
+ const now = new Date().toISOString();
117
+
118
+ insertContent(db, hash, content, now);
119
+ insertDocument(db, "eval-docs", file, title, hash, now, now);
120
+ }
121
+ });
122
+
123
+ afterAll(() => {
124
+ store.close();
125
+ });
126
+
127
+ test("easy queries: ≥80% Hit@3", () => {
128
+ const easyQueries = evalQueries.filter(q => q.difficulty === "easy");
129
+ const hitRate = calcHitRate(easyQueries, q => searchFTS(db, q, 5), 3);
130
+ expect(hitRate).toBeGreaterThanOrEqual(0.8);
131
+ });
132
+
133
+ test("medium queries: ≥15% Hit@3 (BM25 struggles with semantic)", () => {
134
+ const mediumQueries = evalQueries.filter(q => q.difficulty === "medium");
135
+ const hitRate = calcHitRate(mediumQueries, q => searchFTS(db, q, 5), 3);
136
+ expect(hitRate).toBeGreaterThanOrEqual(0.15);
137
+ });
138
+
139
+ test("hard queries: ≥15% Hit@5 (BM25 baseline)", () => {
140
+ const hardQueries = evalQueries.filter(q => q.difficulty === "hard");
141
+ const hitRate = calcHitRate(hardQueries, q => searchFTS(db, q, 5), 5);
142
+ expect(hitRate).toBeGreaterThanOrEqual(0.15);
143
+ });
144
+
145
+ test("overall Hit@3 ≥40% (BM25 baseline)", () => {
146
+ const hitRate = calcHitRate(evalQueries, q => searchFTS(db, q, 5), 3);
147
+ expect(hitRate).toBeGreaterThanOrEqual(0.4);
148
+ });
149
+ });
150
+
151
+ // =============================================================================
152
+ // Vector Search Tests - Requires embedding model
153
+ // =============================================================================
154
+
155
describe("Vector Search", () => {
  let store: ReturnType<typeof createStore>;
  let db: Database;
  // Set true once embeddings are available; tests no-op (soft-skip) otherwise.
  let hasEmbeddings = false;

  // Builds (or reuses) embeddings for the eval corpus. Embedding generation is
  // expensive, so a populated vectors_vec table from an earlier run is reused.
  beforeAll(async () => {
    store = createStore();
    db = store.db;

    // Check if embeddings already exist (from previous test run)
    const vecTable = db.prepare(
      `SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`
    ).get();

    if (vecTable) {
      const count = db.prepare(`SELECT COUNT(*) as cnt FROM vectors_vec`).get() as { cnt: number };
      if (count.cnt > 0) {
        hasEmbeddings = true;
        return;
      }
    }

    // Generate embeddings for test documents
    const llm = getDefaultLlamaCpp();
    store.ensureVecTable(768); // embeddinggemma uses 768 dimensions

    const evalDocsDir = join(import.meta.dir, "../test/eval-docs");
    const files = readdirSync(evalDocsDir).filter(f => f.endsWith(".md"));

    for (const file of files) {
      const content = readFileSync(join(evalDocsDir, file), "utf-8");
      // Hash must mirror the BM25 suite's hashing so embeddings join up with
      // the documents indexed there (same content -> same 12-hex-char key).
      const hash = Bun.hash(content).toString(16).slice(0, 12);
      const title = content.split("\n")[0]?.replace(/^#\s*/, "") || file;

      // Chunk and embed
      const chunks = await chunkDocumentByTokens(content);
      for (let seq = 0; seq < chunks.length; seq++) {
        const chunk = chunks[seq];
        if (!chunk) continue;
        const formatted = formatDocForEmbedding(chunk.text, title);
        const result = await llm.embed(formatted, { model: DEFAULT_EMBED_MODEL, isQuery: false });
        // Chunks whose embedding fails are silently skipped (best-effort).
        if (result?.embedding) {
          // Convert to Float32Array for sqlite-vec
          const embedding = new Float32Array(result.embedding);
          const now = new Date().toISOString();
          insertEmbedding(db, hash, seq, chunk.pos, embedding, DEFAULT_EMBED_MODEL, now);
        }
      }
    }
    hasEmbeddings = true;
  }, 120000); // 2 minute timeout for embedding generation

  afterAll(() => {
    store.close();
  });

  // Note: Don't dispose here - Hybrid tests also use llama.
  // Dispose happens in the global afterAll.

  test("easy queries: ≥60% Hit@3 (vector should match keywords too)", async () => {
    if (!hasEmbeddings) return; // Skip if embedding failed

    const easyQueries = evalQueries.filter(q => q.difficulty === "easy");
    let hits = 0;
    for (const { query, expectedDoc } of easyQueries) {
      // Fetch 5 candidates, count a hit only within the top 3.
      const results = await searchVec(db, query, DEFAULT_EMBED_MODEL, 5);
      if (results.slice(0, 3).some(r => matchesExpected(r.filepath, expectedDoc))) hits++;
    }
    expect(hits / easyQueries.length).toBeGreaterThanOrEqual(0.6);
  }, 60000);

  test("medium queries: ≥40% Hit@3 (vector excels at semantic)", async () => {
    if (!hasEmbeddings) return;

    const mediumQueries = evalQueries.filter(q => q.difficulty === "medium");
    let hits = 0;
    for (const { query, expectedDoc } of mediumQueries) {
      const results = await searchVec(db, query, DEFAULT_EMBED_MODEL, 5);
      if (results.slice(0, 3).some(r => matchesExpected(r.filepath, expectedDoc))) hits++;
    }
    // Vector search should do better on semantic queries than BM25
    expect(hits / mediumQueries.length).toBeGreaterThanOrEqual(0.4);
  }, 60000);

  test("hard queries: ≥30% Hit@5 (vector helps with vague queries)", async () => {
    if (!hasEmbeddings) return;

    const hardQueries = evalQueries.filter(q => q.difficulty === "hard");
    let hits = 0;
    for (const { query, expectedDoc } of hardQueries) {
      // Hard queries are scored Hit@5: any of the 5 results may count.
      const results = await searchVec(db, query, DEFAULT_EMBED_MODEL, 5);
      if (results.some(r => matchesExpected(r.filepath, expectedDoc))) hits++;
    }
    expect(hits / hardQueries.length).toBeGreaterThanOrEqual(0.3);
  }, 60000);

  test("overall Hit@3 ≥50% (vector baseline)", async () => {
    if (!hasEmbeddings) return;

    // Note: unlike the BM25 "overall" test this includes fusion queries too.
    let hits = 0;
    for (const { query, expectedDoc } of evalQueries) {
      const results = await searchVec(db, query, DEFAULT_EMBED_MODEL, 5);
      if (results.slice(0, 3).some(r => matchesExpected(r.filepath, expectedDoc))) hits++;
    }
    expect(hits / evalQueries.length).toBeGreaterThanOrEqual(0.5);
  }, 60000);
});
262
+
263
+ // =============================================================================
264
+ // Hybrid Search (RRF) Tests - Combines BM25 + Vector
265
+ // =============================================================================
266
+
267
describe("Hybrid Search (RRF)", () => {
  let store: ReturnType<typeof createStore>;
  let db: Database;
  // True when the Vector Search suite left a populated vectors_vec table;
  // thresholds below relax to the BM25-only baseline when false.
  let hasVectors = false;

  beforeAll(() => {
    store = createStore();
    db = store.db;
    // Check if vectors exist
    const vecTable = db.prepare(
      `SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`
    ).get();
    if (vecTable) {
      const count = db.prepare(`SELECT COUNT(*) as cnt FROM vectors_vec`).get() as { cnt: number };
      hasVectors = count.cnt > 0;
    }
  });

  afterAll(() => {
    store.close();
  });

  // Helper: run hybrid search with RRF fusion.
  // Collects up to 20 FTS and 20 vector results, maps both into the shared
  // RankedResult shape, fuses with reciprocal rank fusion, and returns the
  // top `limit` fused results.
  async function hybridSearch(query: string, limit: number = 10): Promise<RankedResult[]> {
    const rankedLists: RankedResult[][] = [];

    // FTS results
    const ftsResults = searchFTS(db, query, 20);
    if (ftsResults.length > 0) {
      rankedLists.push(ftsResults.map(r => ({
        file: r.filepath,
        displayPath: r.displayPath,
        title: r.title,
        body: r.body || "",
        score: r.score
      })));
    }

    // Vector results
    const vecResults = await searchVec(db, query, DEFAULT_EMBED_MODEL, 20);
    if (vecResults.length > 0) {
      rankedLists.push(vecResults.map(r => ({
        file: r.filepath,
        displayPath: r.displayPath,
        title: r.title,
        body: r.body || "",
        score: r.score
      })));
    }

    if (rankedLists.length === 0) return [];

    // Apply RRF fusion
    const fused = reciprocalRankFusion(rankedLists);
    return fused.slice(0, limit);
  }

  test("easy queries: ≥80% Hit@3 (hybrid should match BM25)", async () => {
    const easyQueries = evalQueries.filter(q => q.difficulty === "easy");
    let hits = 0;
    for (const { query, expectedDoc } of easyQueries) {
      const results = await hybridSearch(query);
      // Fused results expose the path as `file`, not `filepath`.
      if (results.slice(0, 3).some(r => matchesExpected(r.file, expectedDoc))) hits++;
    }
    expect(hits / easyQueries.length).toBeGreaterThanOrEqual(0.8);
  }, 60000);

  test("medium queries: ≥50% Hit@3 with vectors, ≥15% without", async () => {
    const mediumQueries = evalQueries.filter(q => q.difficulty === "medium");
    let hits = 0;
    for (const { query, expectedDoc } of mediumQueries) {
      const results = await hybridSearch(query);
      if (results.slice(0, 3).some(r => matchesExpected(r.file, expectedDoc))) hits++;
    }
    // With vectors: hybrid should outperform both BM25 (15%) and vector (40%)
    // Without vectors: hybrid is just BM25, so use BM25 threshold
    const threshold = hasVectors ? 0.5 : 0.15;
    expect(hits / mediumQueries.length).toBeGreaterThanOrEqual(threshold);
  }, 60000);

  test("hard queries: ≥35% Hit@5 with vectors, ≥15% without", async () => {
    const hardQueries = evalQueries.filter(q => q.difficulty === "hard");
    let hits = 0;
    for (const { query, expectedDoc } of hardQueries) {
      const results = await hybridSearch(query);
      // Hard queries are scored Hit@5 over the default 10 fused results.
      if (results.some(r => matchesExpected(r.file, expectedDoc))) hits++;
    }
    const threshold = hasVectors ? 0.35 : 0.15;
    expect(hits / hardQueries.length).toBeGreaterThanOrEqual(threshold);
  }, 60000);

  test("fusion queries: ≥50% Hit@3 (RRF combines weak signals)", async () => {
    if (!hasVectors) return; // Fusion requires both methods

    const fusionQueries = evalQueries.filter(q => q.difficulty === "fusion");
    let hybridHits = 0;
    let bm25Hits = 0;
    let vecHits = 0;

    for (const { query, expectedDoc } of fusionQueries) {
      // Hybrid results
      const hybridResults = await hybridSearch(query);
      if (hybridResults.slice(0, 3).some(r => matchesExpected(r.file, expectedDoc))) hybridHits++;

      // BM25 results for comparison
      const bm25Results = searchFTS(db, query, 5);
      if (bm25Results.slice(0, 3).some(r => matchesExpected(r.filepath, expectedDoc))) bm25Hits++;

      // Vector results for comparison
      const vecResults = await searchVec(db, query, DEFAULT_EMBED_MODEL, 5);
      if (vecResults.slice(0, 3).some(r => matchesExpected(r.filepath, expectedDoc))) vecHits++;
    }

    const hybridRate = hybridHits / fusionQueries.length;
    const bm25Rate = bm25Hits / fusionQueries.length;
    const vecRate = vecHits / fusionQueries.length;

    // Fusion should achieve at least 50% on these multi-signal queries
    expect(hybridRate).toBeGreaterThanOrEqual(0.5);

    // Fusion should outperform or match the best individual method
    expect(hybridRate).toBeGreaterThanOrEqual(Math.max(bm25Rate, vecRate));
  }, 60000);

  test("overall Hit@3 ≥60% with vectors, ≥40% without", async () => {
    // Filter out fusion queries for overall score (they're tested separately)
    const standardQueries = evalQueries.filter(q => q.difficulty !== "fusion");
    let hits = 0;
    for (const { query, expectedDoc } of standardQueries) {
      const results = await hybridSearch(query);
      if (results.slice(0, 3).some(r => matchesExpected(r.file, expectedDoc))) hits++;
    }
    const threshold = hasVectors ? 0.6 : 0.4;
    expect(hits / standardQueries.length).toBeGreaterThanOrEqual(threshold);
  }, 60000);
});
403
+
404
+ // =============================================================================
405
+ // Cleanup
406
+ // =============================================================================
407
+
408
+ afterAll(async () => {
409
+ // Ensure native resources are released to avoid ggml-metal asserts on process exit.
410
+ await disposeDefaultLlamaCpp();
411
+ rmSync(tempDir, { recursive: true, force: true });
412
+ });