qmdr 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +29 -0
- package/.env.example +85 -0
- package/.gitattributes +3 -0
- package/.github/workflows/release.yml +77 -0
- package/AI-SETUP.md +466 -0
- package/LICENSE +22 -0
- package/README.md +78 -0
- package/bun.lock +637 -0
- package/docs/README-zh.md +78 -0
- package/docs/refactor-checklist.md +54 -0
- package/docs/setup-openclaw.md +139 -0
- package/example-index.yml +33 -0
- package/finetune/BALANCED_DISTRIBUTION.md +157 -0
- package/finetune/DATA_IMPROVEMENTS.md +218 -0
- package/finetune/Justfile +43 -0
- package/finetune/Modelfile +16 -0
- package/finetune/README.md +299 -0
- package/finetune/SCORING.md +286 -0
- package/finetune/configs/accelerate_multi_gpu.yaml +17 -0
- package/finetune/configs/grpo.yaml +49 -0
- package/finetune/configs/sft.yaml +42 -0
- package/finetune/configs/sft_local.yaml +40 -0
- package/finetune/convert_gguf.py +221 -0
- package/finetune/data/best_glm_prompt.txt +17 -0
- package/finetune/data/gepa_generated.prompts.json +32 -0
- package/finetune/data/qmd_expansion_balanced_deduped.jsonl +413 -0
- package/finetune/data/qmd_expansion_diverse_addon.jsonl +386 -0
- package/finetune/data/qmd_expansion_handcrafted.jsonl +65 -0
- package/finetune/data/qmd_expansion_handcrafted_only.jsonl +336 -0
- package/finetune/data/qmd_expansion_locations.jsonl +64 -0
- package/finetune/data/qmd_expansion_people.jsonl +46 -0
- package/finetune/data/qmd_expansion_short_nontech.jsonl +200 -0
- package/finetune/data/qmd_expansion_v2.jsonl +1498 -0
- package/finetune/data/qmd_only_sampled.jsonl +399 -0
- package/finetune/dataset/analyze_data.py +369 -0
- package/finetune/dataset/clean_data.py +906 -0
- package/finetune/dataset/generate_balanced.py +823 -0
- package/finetune/dataset/generate_data.py +714 -0
- package/finetune/dataset/generate_data_offline.py +206 -0
- package/finetune/dataset/generate_diverse.py +441 -0
- package/finetune/dataset/generate_ollama.py +326 -0
- package/finetune/dataset/prepare_data.py +197 -0
- package/finetune/dataset/schema.py +73 -0
- package/finetune/dataset/score_data.py +115 -0
- package/finetune/dataset/validate_schema.py +104 -0
- package/finetune/eval.py +196 -0
- package/finetune/evals/queries.txt +56 -0
- package/finetune/gepa/__init__.py +1 -0
- package/finetune/gepa/best_prompt.txt +31 -0
- package/finetune/gepa/best_prompt_glm.txt +1 -0
- package/finetune/gepa/dspy_gepa.py +204 -0
- package/finetune/gepa/example.py +117 -0
- package/finetune/gepa/generate.py +129 -0
- package/finetune/gepa/gepa_outputs.jsonl +10 -0
- package/finetune/gepa/gepa_outputs_glm.jsonl +20 -0
- package/finetune/gepa/model.json +19 -0
- package/finetune/gepa/optimizer.py +70 -0
- package/finetune/gepa/score.py +84 -0
- package/finetune/jobs/eval.py +490 -0
- package/finetune/jobs/eval_common.py +354 -0
- package/finetune/jobs/eval_verbose.py +113 -0
- package/finetune/jobs/grpo.py +141 -0
- package/finetune/jobs/quantize.py +244 -0
- package/finetune/jobs/sft.py +121 -0
- package/finetune/pyproject.toml +23 -0
- package/finetune/reward.py +610 -0
- package/finetune/train.py +611 -0
- package/finetune/uv.lock +4070 -0
- package/flake.lock +61 -0
- package/flake.nix +83 -0
- package/migrate-schema.ts +162 -0
- package/package.json +56 -0
- package/skills/qmdr/SKILL.md +172 -0
- package/skills/qmdr/references/mcp-setup.md +88 -0
- package/src/app/commands/collection.ts +55 -0
- package/src/app/commands/context.ts +82 -0
- package/src/app/commands/document.ts +46 -0
- package/src/app/commands/maintenance.ts +60 -0
- package/src/app/commands/search.ts +45 -0
- package/src/app/ports/llm.ts +13 -0
- package/src/app/services/llm-service.ts +145 -0
- package/src/cli.test.ts +963 -0
- package/src/collections.ts +390 -0
- package/src/eval.test.ts +412 -0
- package/src/formatter.ts +427 -0
- package/src/llm.test.ts +559 -0
- package/src/llm.ts +1990 -0
- package/src/mcp.test.ts +889 -0
- package/src/mcp.ts +626 -0
- package/src/qmd.ts +3330 -0
- package/src/store/collections.ts +7 -0
- package/src/store/context.ts +10 -0
- package/src/store/db.ts +5 -0
- package/src/store/documents.ts +26 -0
- package/src/store/maintenance.ts +15 -0
- package/src/store/path.ts +13 -0
- package/src/store/search.ts +10 -0
- package/src/store-paths.test.ts +395 -0
- package/src/store.test.ts +2483 -0
- package/src/store.ts +2813 -0
- package/test/eval-harness.ts +223 -0
- package/tsconfig.json +29 -0
package/src/eval.test.ts
ADDED
|
@@ -0,0 +1,412 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation Tests for QMD Search Quality
|
|
3
|
+
*
|
|
4
|
+
* Tests search quality against synthetic documents with known-answer queries.
|
|
5
|
+
* Validates that search improvements don't regress quality.
|
|
6
|
+
*
|
|
7
|
+
* Three test suites:
|
|
8
|
+
* 1. BM25 (FTS) - lexical search baseline
|
|
9
|
+
* 2. Vector Search - semantic search with embeddings
|
|
10
|
+
* 3. Hybrid (RRF) - combined lexical + vector with rank fusion
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { describe, test, expect, beforeAll, afterAll } from "bun:test";
|
|
14
|
+
import { mkdtempSync, rmSync, readFileSync, readdirSync } from "fs";
|
|
15
|
+
import { join } from "path";
|
|
16
|
+
import { tmpdir } from "os";
|
|
17
|
+
import Database from "bun:sqlite";
|
|
18
|
+
|
|
19
|
+
// Point INDEX_PATH at a throwaway SQLite file inside a fresh temp directory so
// the eval suites do not touch the global index. The directory is removed in
// the file-level afterAll at the bottom of this file.
// NOTE(review): static `import` declarations are hoisted and evaluated before
// this module's body runs, so this assignment only has an effect if ./store
// reads INDEX_PATH lazily (e.g. when createStore() is called) rather than at
// import time — confirm against ./store.
|
|
20
|
+
const tempDir = mkdtempSync(join(tmpdir(), "qmd-eval-"));
|
|
21
|
+
process.env.INDEX_PATH = join(tempDir, "eval.sqlite");
|
|
22
|
+
|
|
23
|
+
import {
|
|
24
|
+
createStore,
|
|
25
|
+
searchFTS,
|
|
26
|
+
searchVec,
|
|
27
|
+
insertDocument,
|
|
28
|
+
insertContent,
|
|
29
|
+
insertEmbedding,
|
|
30
|
+
chunkDocumentByTokens,
|
|
31
|
+
reciprocalRankFusion,
|
|
32
|
+
DEFAULT_EMBED_MODEL,
|
|
33
|
+
type RankedResult,
|
|
34
|
+
} from "./store";
|
|
35
|
+
import { getDefaultLlamaCpp, formatDocForEmbedding, disposeDefaultLlamaCpp } from "./llm";
|
|
36
|
+
|
|
37
|
+
// Known-answer eval set: every query names the document it is expected to surface,
// grouped by how hard the query is for the search backends.
const evalQueries: {
  query: string;
  expectedDoc: string;
  difficulty: "easy" | "medium" | "hard" | "fusion";
}[] = (() => {
  // Expands [query, expectedDoc] pairs into full entries for one difficulty tier.
  const tier = (
    difficulty: "easy" | "medium" | "hard" | "fusion",
    pairs: [string, string][]
  ) => pairs.map(([query, expectedDoc]) => ({ query, expectedDoc, difficulty }));

  return [
    // EASY: Exact keyword matches
    ...tier("easy", [
      ["API versioning", "api-design"],
      ["Series A fundraising", "fundraising"],
      ["CAP theorem", "distributed-systems"],
      ["overfitting machine learning", "machine-learning"],
      ["remote work VPN", "remote-work"],
      ["Project Phoenix retrospective", "product-launch"],
    ]),

    // MEDIUM: Semantic/conceptual queries
    ...tier("medium", [
      ["how to structure REST endpoints", "api-design"],
      ["raising money for startup", "fundraising"],
      ["consistency vs availability tradeoffs", "distributed-systems"],
      ["how to prevent models from memorizing data", "machine-learning"],
      ["working from home guidelines", "remote-work"],
      ["what went wrong with the launch", "product-launch"],
    ]),

    // HARD: Vague, partial memory, indirect
    ...tier("hard", [
      ["nouns not verbs", "api-design"],
      ["Sequoia investor pitch", "fundraising"],
      ["Raft algorithm leader election", "distributed-systems"],
      ["F1 score precision recall", "machine-learning"],
      ["quarterly team gathering travel", "remote-work"],
      ["beta program 47 bugs", "product-launch"],
    ]),

    // FUSION: Multi-signal queries that need both lexical AND semantic matching.
    // These should have weak individual scores but strong combined RRF scores.
    ...tier("fusion", [
      ["how much runway before running out of money", "fundraising"],
      ["datacenter replication sync strategy", "distributed-systems"],
      ["splitting data for training and testing", "machine-learning"],
      ["JSON response codes error messages", "api-design"],
      ["video calls camera async messaging", "remote-work"],
      ["CI/CD pipeline testing coverage", "product-launch"],
    ]),
  ];
})();
|
|
76
|
+
|
|
77
|
+
/**
 * Reports whether a search result path refers to the expected eval document.
 *
 * The check is a case-insensitive substring match: both the path and the
 * expected document name are lower-cased before comparing, so an expected
 * name containing upper-case letters can still match.
 *
 * @param filepath - Path reported by a search result.
 * @param expectedDoc - Document name that should appear somewhere in the path.
 * @returns true when the lower-cased path contains the lower-cased name.
 */
function matchesExpected(filepath: string, expectedDoc: string): boolean {
  return filepath.toLowerCase().includes(expectedDoc.toLowerCase());
}
|
|
81
|
+
|
|
82
|
+
/**
 * Computes the hit rate of a synchronous search function over a query set.
 *
 * A query counts as a hit when any of the first `topK` results returned by
 * `searchFn` matches the query's expected document (via matchesExpected).
 *
 * @param queries - Queries paired with the document each should surface.
 * @param searchFn - Runs one query and returns results in ranked order.
 * @param topK - Number of leading results considered for a hit.
 * @returns Fraction of queries that hit, in [0, 1]; 0 for an empty query set.
 */
function calcHitRate(
  queries: typeof evalQueries,
  searchFn: (query: string) => { filepath: string }[],
  topK: number
): number {
  // Without this guard an empty query set evaluates 0 / 0 and returns NaN.
  if (queries.length === 0) return 0;

  let hits = 0;
  for (const { query, expectedDoc } of queries) {
    const top = searchFn(query).slice(0, topK);
    if (top.some(r => matchesExpected(r.filepath, expectedDoc))) hits++;
  }
  return hits / queries.length;
}
|
|
95
|
+
|
|
96
|
+
// =============================================================================
|
|
97
|
+
// BM25 (Lexical) Tests - Fast, no model loading needed
|
|
98
|
+
// =============================================================================
|
|
99
|
+
|
|
100
|
+
describe("BM25 Search (FTS)", () => {
  let store: ReturnType<typeof createStore>;
  let db: Database;

  // Every test fetches 5 FTS candidates; calcHitRate trims them to the K under test.
  const ftsTop5 = (q: string) => searchFTS(db, q, 5);

  // Subset of the eval set for one difficulty tier.
  const tier = (difficulty: (typeof evalQueries)[number]["difficulty"]) =>
    evalQueries.filter(q => q.difficulty === difficulty);

  beforeAll(() => {
    store = createStore();
    db = store.db;

    // Index every markdown file under test/eval-docs into the "eval-docs" collection.
    const docsDir = join(import.meta.dir, "../test/eval-docs");
    const markdownFiles = readdirSync(docsDir).filter(name => name.endsWith(".md"));

    for (const name of markdownFiles) {
      const body = readFileSync(join(docsDir, name), "utf-8");
      // Title is the first line with a leading "#" marker stripped; falls back to the file name.
      const heading = body.split("\n")[0]?.replace(/^#\s*/, "") || name;
      const contentHash = Bun.hash(body).toString(16).slice(0, 12);
      const timestamp = new Date().toISOString();

      insertContent(db, contentHash, body, timestamp);
      insertDocument(db, "eval-docs", name, heading, contentHash, timestamp, timestamp);
    }
  });

  afterAll(() => {
    store.close();
  });

  test("easy queries: ≥80% Hit@3", () => {
    expect(calcHitRate(tier("easy"), ftsTop5, 3)).toBeGreaterThanOrEqual(0.8);
  });

  test("medium queries: ≥15% Hit@3 (BM25 struggles with semantic)", () => {
    expect(calcHitRate(tier("medium"), ftsTop5, 3)).toBeGreaterThanOrEqual(0.15);
  });

  test("hard queries: ≥15% Hit@5 (BM25 baseline)", () => {
    expect(calcHitRate(tier("hard"), ftsTop5, 5)).toBeGreaterThanOrEqual(0.15);
  });

  test("overall Hit@3 ≥40% (BM25 baseline)", () => {
    expect(calcHitRate(evalQueries, ftsTop5, 3)).toBeGreaterThanOrEqual(0.4);
  });
});
|
|
150
|
+
|
|
151
|
+
// =============================================================================
|
|
152
|
+
// Vector Search Tests - Requires embedding model
|
|
153
|
+
// =============================================================================
|
|
154
|
+
|
|
155
|
+
describe("Vector Search", () => {
  let store: ReturnType<typeof createStore>;
  let db: Database;
  // True only when the vector table actually holds at least one embedding.
  let hasEmbeddings = false;

  /**
   * Hit@K for vector search: fraction of `queries` whose expected document
   * appears among the first `topK` of the 5 results requested from searchVec.
   */
  async function vecHitRate(queries: typeof evalQueries, topK: number): Promise<number> {
    let hits = 0;
    for (const { query, expectedDoc } of queries) {
      const results = await searchVec(db, query, DEFAULT_EMBED_MODEL, 5);
      if (results.slice(0, topK).some(r => matchesExpected(r.filepath, expectedDoc))) hits++;
    }
    return hits / queries.length;
  }

  // NOTE(review): this hook inserts only embeddings, not document rows. The
  // document rows are inserted by the BM25 suite's beforeAll — presumably both
  // suites open the same INDEX_PATH database; confirm.
  beforeAll(async () => {
    store = createStore();
    db = store.db;

    // Check if embeddings already exist (from previous test run)
    const vecTable = db.prepare(
      `SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`
    ).get();

    if (vecTable) {
      const count = db.prepare(`SELECT COUNT(*) as cnt FROM vectors_vec`).get() as { cnt: number };
      if (count.cnt > 0) {
        hasEmbeddings = true;
        return;
      }
    }

    // Generate embeddings for test documents
    const llm = getDefaultLlamaCpp();
    store.ensureVecTable(768); // embeddinggemma uses 768 dimensions

    const evalDocsDir = join(import.meta.dir, "../test/eval-docs");
    const files = readdirSync(evalDocsDir).filter(f => f.endsWith(".md"));

    let inserted = 0;
    for (const file of files) {
      const content = readFileSync(join(evalDocsDir, file), "utf-8");
      const hash = Bun.hash(content).toString(16).slice(0, 12);
      const title = content.split("\n")[0]?.replace(/^#\s*/, "") || file;

      // Chunk and embed
      const chunks = await chunkDocumentByTokens(content);
      for (let seq = 0; seq < chunks.length; seq++) {
        const chunk = chunks[seq];
        if (!chunk) continue;
        const formatted = formatDocForEmbedding(chunk.text, title);
        const result = await llm.embed(formatted, { model: DEFAULT_EMBED_MODEL, isQuery: false });
        if (result?.embedding) {
          // Convert to Float32Array for sqlite-vec
          const embedding = new Float32Array(result.embedding);
          const now = new Date().toISOString();
          insertEmbedding(db, hash, seq, chunk.pos, embedding, DEFAULT_EMBED_MODEL, now);
          inserted++;
        }
      }
    }

    // Only report embeddings as available when at least one was stored, so the
    // `if (!hasEmbeddings) return` guards below really skip on embedding failure.
    hasEmbeddings = inserted > 0;
    if (!hasEmbeddings) {
      console.warn("Vector Search: no embeddings were generated; vector tests will be skipped");
    }
  }, 120000); // 2 minute timeout for embedding generation

  afterAll(() => {
    store.close();
  });

  // Note: Don't dispose here - Hybrid tests also use llama.
  // Dispose happens in the global afterAll.

  test("easy queries: ≥60% Hit@3 (vector should match keywords too)", async () => {
    if (!hasEmbeddings) return; // Skip if embedding failed

    const easyQueries = evalQueries.filter(q => q.difficulty === "easy");
    expect(await vecHitRate(easyQueries, 3)).toBeGreaterThanOrEqual(0.6);
  }, 60000);

  test("medium queries: ≥40% Hit@3 (vector excels at semantic)", async () => {
    if (!hasEmbeddings) return;

    const mediumQueries = evalQueries.filter(q => q.difficulty === "medium");
    // Vector search should do better on semantic queries than BM25
    expect(await vecHitRate(mediumQueries, 3)).toBeGreaterThanOrEqual(0.4);
  }, 60000);

  test("hard queries: ≥30% Hit@5 (vector helps with vague queries)", async () => {
    if (!hasEmbeddings) return;

    const hardQueries = evalQueries.filter(q => q.difficulty === "hard");
    expect(await vecHitRate(hardQueries, 5)).toBeGreaterThanOrEqual(0.3);
  }, 60000);

  test("overall Hit@3 ≥50% (vector baseline)", async () => {
    if (!hasEmbeddings) return;

    expect(await vecHitRate(evalQueries, 3)).toBeGreaterThanOrEqual(0.5);
  }, 60000);
});
|
|
262
|
+
|
|
263
|
+
// =============================================================================
|
|
264
|
+
// Hybrid Search (RRF) Tests - Combines BM25 + Vector
|
|
265
|
+
// =============================================================================
|
|
266
|
+
|
|
267
|
+
describe("Hybrid Search (RRF)", () => {
  let store: ReturnType<typeof createStore>;
  let db: Database;
  // Set in beforeAll: true when the vectors_vec table exists and is non-empty.
  let hasVectors = false;

  beforeAll(() => {
    store = createStore();
    db = store.db;
    // Check if vectors exist
    const vecTable = db.prepare(
      `SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`
    ).get();
    if (vecTable) {
      const count = db.prepare(`SELECT COUNT(*) as cnt FROM vectors_vec`).get() as { cnt: number };
      hasVectors = count.cnt > 0;
    }
  });

  afterAll(() => {
    store.close();
  });

  /**
   * Runs hybrid search: FTS and (when vectors exist) vector results are each
   * fetched with a limit of 20, fused with reciprocalRankFusion, and the
   * first `limit` fused results are returned.
   */
  async function hybridSearch(query: string, limit: number = 10): Promise<RankedResult[]> {
    const rankedLists: RankedResult[][] = [];

    // FTS results
    const ftsResults = searchFTS(db, query, 20);
    if (ftsResults.length > 0) {
      rankedLists.push(ftsResults.map(r => ({
        file: r.filepath,
        displayPath: r.displayPath,
        title: r.title,
        body: r.body || "",
        score: r.score
      })));
    }

    // Vector results. Skipped when no vectors are stored: there is nothing to
    // match, so hybrid is then just BM25 and the query is not sent to searchVec.
    if (hasVectors) {
      const vecResults = await searchVec(db, query, DEFAULT_EMBED_MODEL, 20);
      if (vecResults.length > 0) {
        rankedLists.push(vecResults.map(r => ({
          file: r.filepath,
          displayPath: r.displayPath,
          title: r.title,
          body: r.body || "",
          score: r.score
        })));
      }
    }

    if (rankedLists.length === 0) return [];

    // Apply RRF fusion
    const fused = reciprocalRankFusion(rankedLists);
    return fused.slice(0, limit);
  }

  /**
   * Hit@K for hybrid search: fraction of `queries` whose expected document
   * appears among the first `topK` fused results.
   */
  async function hybridHitRate(queries: typeof evalQueries, topK: number): Promise<number> {
    let hits = 0;
    for (const { query, expectedDoc } of queries) {
      const results = await hybridSearch(query);
      if (results.slice(0, topK).some(r => matchesExpected(r.file, expectedDoc))) hits++;
    }
    return hits / queries.length;
  }

  test("easy queries: ≥80% Hit@3 (hybrid should match BM25)", async () => {
    const easyQueries = evalQueries.filter(q => q.difficulty === "easy");
    expect(await hybridHitRate(easyQueries, 3)).toBeGreaterThanOrEqual(0.8);
  }, 60000);

  test("medium queries: ≥50% Hit@3 with vectors, ≥15% without", async () => {
    const mediumQueries = evalQueries.filter(q => q.difficulty === "medium");
    // With vectors: hybrid should outperform both BM25 (15%) and vector (40%)
    // Without vectors: hybrid is just BM25, so use BM25 threshold
    const threshold = hasVectors ? 0.5 : 0.15;
    expect(await hybridHitRate(mediumQueries, 3)).toBeGreaterThanOrEqual(threshold);
  }, 60000);

  test("hard queries: ≥35% Hit@5 with vectors, ≥15% without", async () => {
    const hardQueries = evalQueries.filter(q => q.difficulty === "hard");
    // hybridSearch returns up to 10 fused results by default; restrict to the
    // top 5 so this test measures Hit@5 as its name states.
    const threshold = hasVectors ? 0.35 : 0.15;
    expect(await hybridHitRate(hardQueries, 5)).toBeGreaterThanOrEqual(threshold);
  }, 60000);

  test("fusion queries: ≥50% Hit@3 (RRF combines weak signals)", async () => {
    if (!hasVectors) return; // Fusion requires both methods

    const fusionQueries = evalQueries.filter(q => q.difficulty === "fusion");
    let hybridHits = 0;
    let bm25Hits = 0;
    let vecHits = 0;

    for (const { query, expectedDoc } of fusionQueries) {
      // Hybrid results
      const hybridResults = await hybridSearch(query);
      if (hybridResults.slice(0, 3).some(r => matchesExpected(r.file, expectedDoc))) hybridHits++;

      // BM25 results for comparison
      const bm25Results = searchFTS(db, query, 5);
      if (bm25Results.slice(0, 3).some(r => matchesExpected(r.filepath, expectedDoc))) bm25Hits++;

      // Vector results for comparison
      const vecResults = await searchVec(db, query, DEFAULT_EMBED_MODEL, 5);
      if (vecResults.slice(0, 3).some(r => matchesExpected(r.filepath, expectedDoc))) vecHits++;
    }

    const hybridRate = hybridHits / fusionQueries.length;
    const bm25Rate = bm25Hits / fusionQueries.length;
    const vecRate = vecHits / fusionQueries.length;

    // Fusion should achieve at least 50% on these multi-signal queries
    expect(hybridRate).toBeGreaterThanOrEqual(0.5);

    // Fusion should outperform or match the best individual method
    expect(hybridRate).toBeGreaterThanOrEqual(Math.max(bm25Rate, vecRate));
  }, 60000);

  test("overall Hit@3 ≥60% with vectors, ≥40% without", async () => {
    // Filter out fusion queries for overall score (they're tested separately)
    const standardQueries = evalQueries.filter(q => q.difficulty !== "fusion");
    const threshold = hasVectors ? 0.6 : 0.4;
    expect(await hybridHitRate(standardQueries, 3)).toBeGreaterThanOrEqual(threshold);
  }, 60000);
});
|
|
403
|
+
|
|
404
|
+
// =============================================================================
|
|
405
|
+
// Cleanup
|
|
406
|
+
// =============================================================================
|
|
407
|
+
|
|
408
|
+
// File-level teardown: releases the shared llama.cpp instance and deletes the
// temp directory that holds the eval index.
afterAll(async () => {
  try {
    // Ensure native resources are released to avoid ggml-metal asserts on process exit.
    await disposeDefaultLlamaCpp();
  } finally {
    // Remove the temp directory even when disposal throws, so a failed
    // teardown does not leave the eval index behind.
    rmSync(tempDir, { recursive: true, force: true });
  }
});
|