@joycodetech/qmd-ja 2.5.3-ja.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/CHANGELOG.md +821 -0
  2. package/LICENSE +21 -0
  3. package/README.md +1143 -0
  4. package/bin/qmd-ja +162 -0
  5. package/dist/ast.d.ts +65 -0
  6. package/dist/ast.js +334 -0
  7. package/dist/bench/bench.d.ts +23 -0
  8. package/dist/bench/bench.js +280 -0
  9. package/dist/bench/score.d.ts +33 -0
  10. package/dist/bench/score.js +88 -0
  11. package/dist/bench/types.d.ts +80 -0
  12. package/dist/bench/types.js +8 -0
  13. package/dist/cli/formatter.d.ts +120 -0
  14. package/dist/cli/formatter.js +355 -0
  15. package/dist/cli/qmd.d.ts +43 -0
  16. package/dist/cli/qmd.js +4179 -0
  17. package/dist/collections.d.ts +166 -0
  18. package/dist/collections.js +410 -0
  19. package/dist/db.d.ts +44 -0
  20. package/dist/db.js +75 -0
  21. package/dist/index.d.ts +230 -0
  22. package/dist/index.js +242 -0
  23. package/dist/llm.d.ts +500 -0
  24. package/dist/llm.js +1615 -0
  25. package/dist/maintenance.d.ts +23 -0
  26. package/dist/maintenance.js +37 -0
  27. package/dist/mcp/server.d.ts +24 -0
  28. package/dist/mcp/server.js +702 -0
  29. package/dist/paths.d.ts +1 -0
  30. package/dist/paths.js +4 -0
  31. package/dist/store.d.ts +1002 -0
  32. package/dist/store.js +4208 -0
  33. package/models/vaporetto-bccwj.model +0 -0
  34. package/package.json +130 -0
  35. package/scripts/build.mjs +30 -0
  36. package/scripts/check-package-grammars.mjs +29 -0
  37. package/scripts/package-smoke.mjs +65 -0
  38. package/scripts/test-all.mjs +38 -0
  39. package/skills/qmd/SKILL.md +295 -0
  40. package/skills/qmd/references/mcp-setup.md +102 -0
  41. package/skills/release/SKILL.md +139 -0
  42. package/skills/release/scripts/install-hooks.sh +38 -0
  43. package/vendor/vaporetto-node-wasm/LICENSE +22 -0
  44. package/vendor/vaporetto-node-wasm/package.json +11 -0
  45. package/vendor/vaporetto-node-wasm/vaporetto_node_wasm.d.ts +19 -0
  46. package/vendor/vaporetto-node-wasm/vaporetto_node_wasm.js +202 -0
  47. package/vendor/vaporetto-node-wasm/vaporetto_node_wasm_bg.wasm +0 -0
  48. package/vendor/vaporetto-node-wasm/vaporetto_node_wasm_bg.wasm.d.ts +13 -0
package/dist/store.js ADDED
@@ -0,0 +1,4208 @@
1
+ /**
2
+ * QMD Store - Core data access and retrieval functions
3
+ *
4
+ * This module provides all database operations, search functions, and document
5
+ * retrieval for QMD. It returns raw data structures that can be formatted by
6
+ * CLI or MCP consumers.
7
+ *
8
+ * Usage:
9
+ * const store = createStore("/path/to/db.sqlite");
10
+ * // or use default path:
11
+ * const store = createStore();
12
+ */
13
+ import { openDatabase, loadSqliteVec } from "./db.js";
14
+ import picomatch from "picomatch";
15
+ import { createHash } from "crypto";
16
+ import { readFileSync, realpathSync, statSync, mkdirSync, existsSync } from "node:fs";
17
+ import { createRequire } from "node:module";
18
+ import { dirname, join as pathJoin } from "node:path";
19
+ import { fileURLToPath } from "node:url";
20
+ // Note: node:path resolve is not imported — we export our own cross-platform resolve()
21
+ import fastGlob from "fast-glob";
22
+ import { qmdHomedir } from "./paths.js";
23
+ import { LlamaCpp, getDefaultLlamaCpp, formatQueryForEmbedding, formatDocForEmbedding, withLLMSessionForLlm, DEFAULT_EMBED_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, } from "./llm.js";
24
+ // =============================================================================
25
+ // Configuration
26
+ // =============================================================================
27
/** Default embedding model; alias of DEFAULT_EMBED_MODEL_URI from ./llm.js. */
export const DEFAULT_EMBED_MODEL = DEFAULT_EMBED_MODEL_URI;
/** Default rerank model; alias of DEFAULT_RERANK_MODEL_URI from ./llm.js. */
export const DEFAULT_RERANK_MODEL = DEFAULT_RERANK_MODEL_URI;
/** Default query model; alias of DEFAULT_GENERATE_MODEL_URI from ./llm.js. */
export const DEFAULT_QUERY_MODEL = DEFAULT_GENERATE_MODEL_URI;
30
/** Default glob pattern: every Markdown file at any depth. */
export const DEFAULT_GLOB = "**/*.md";
/** Default multi-get byte limit: 10 KiB. */
export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024;
/** Default cap on the number of documents in one embedding batch. */
export const DEFAULT_EMBED_MAX_DOCS_PER_BATCH = 64;
/** Default cap on the size of one embedding batch: 64 MiB. */
export const DEFAULT_EMBED_MAX_BATCH_BYTES = 64 * 1024 * 1024;

// Fixed probe strings that getEmbeddingFingerprint() runs through the
// query/document formatters, so a change in formatting changes the fingerprint.
const EMBED_FINGERPRINT_PROBE_QUERY = "__qmd_embedding_query_probe__";
const EMBED_FINGERPRINT_PROBE_TITLE = "__qmd_embedding_title_probe__";
const EMBED_FINGERPRINT_PROBE_DOC = "__qmd_embedding_document_probe__";

// Chunk geometry. Chunks target 900 tokens (raised from 800 so that smart
// chunking has room to land on natural break points) with a 15% overlap.
export const CHUNK_SIZE_TOKENS = 900;
export const CHUNK_OVERLAP_TOKENS = Math.floor(CHUNK_SIZE_TOKENS * 0.15); // 135 tokens

// Character-based equivalents for synchronous chunking, using the rough
// approximation of 4 characters per token.
export const CHUNK_SIZE_CHARS = CHUNK_SIZE_TOKENS * 4; // 3600 chars
export const CHUNK_OVERLAP_CHARS = CHUNK_OVERLAP_TOKENS * 4; // 540 chars

// How far back from the ideal cut position to look for a good break point.
export const CHUNK_WINDOW_TOKENS = 200;
export const CHUNK_WINDOW_CHARS = CHUNK_WINDOW_TOKENS * 4; // 800 chars
47
/**
 * Compute a short fingerprint of the inputs that shape an embedding: the model
 * name, the output of the query and document formatters on fixed probe
 * strings, and the chunk size / overlap in tokens.
 *
 * @param model - Embedding model identifier (defaults to DEFAULT_EMBED_MODEL)
 * @returns The first 6 hex characters of the SHA-256 digest of those inputs
 */
export function getEmbeddingFingerprint(model = DEFAULT_EMBED_MODEL) {
    const probeQuery = formatQueryForEmbedding(EMBED_FINGERPRINT_PROBE_QUERY, model);
    const probeDoc = formatDocForEmbedding(EMBED_FINGERPRINT_PROBE_DOC, EMBED_FINGERPRINT_PROBE_TITLE, model);
    // One "key:value" entry per line; the line order is part of the hashed input.
    const significant = `model:${model}\n` +
        `query:${probeQuery}\n` +
        `doc:${probeDoc}\n` +
        `chunk_tokens:${CHUNK_SIZE_TOKENS}\n` +
        `chunk_overlap_tokens:${CHUNK_OVERLAP_TOKENS}`;
    const hasher = createHash("sha256");
    hasher.update(significant);
    return hasher.digest("hex").slice(0, 6);
}
57
/**
 * Pick the LlamaCpp instance to use for a store.
 *
 * @param store - Store object; its `llm` property is used when it is neither
 *   null nor undefined
 * @returns `store.llm`, or otherwise the result of getDefaultLlamaCpp()
 */
function getLlm(store) {
    const { llm } = store;
    return llm ?? getDefaultLlamaCpp();
}
64
/**
 * Break-point detectors for Markdown text, as `[pattern, score, type]` tuples.
 *
 * A higher score marks a better place to split. The scores are spread widely
 * so that headings clearly outrank weaker breaks such as blank lines. Every
 * pattern is anchored on the newline that precedes the construct, so a match
 * index is always the position of that newline.
 */
export const BREAK_PATTERNS = [
    // Headings: exactly N '#' characters, not followed by another '#'.
    [/\n#{1}(?!#)/g, 100, 'h1'],
    [/\n#{2}(?!#)/g, 90, 'h2'],
    [/\n#{3}(?!#)/g, 80, 'h3'],
    [/\n#{4}(?!#)/g, 70, 'h4'],
    [/\n#{5}(?!#)/g, 60, 'h5'],
    [/\n#{6}(?!#)/g, 50, 'h6'],
    // Code fence boundary; scored the same as an h3.
    [/\n```/g, 80, 'codeblock'],
    // Horizontal rule: ---, *** or ___ alone on a line.
    [/\n(?:---|\*\*\*|___)\s*\n/g, 60, 'hr'],
    // Paragraph boundary: two or more consecutive newlines.
    [/\n\n+/g, 20, 'blank'],
    // List items, unordered then ordered.
    [/\n[-*]\s/g, 5, 'list'],
    [/\n\d+\.\s/g, 5, 'numlist'],
    // Any line break at all: the weakest possible break.
    [/\n/g, 1, 'newline'],
];
84
/**
 * Collect every candidate break point in `text`.
 *
 * Each pattern in BREAK_PATTERNS is run over the whole text. When several
 * patterns match at the same index, only the highest-scoring one is kept; on
 * equal scores the pattern listed first in BREAK_PATTERNS wins.
 *
 * @param text - Document text to scan
 * @returns Break points `{ pos, score, type }` sorted by ascending `pos`
 */
export function scanBreakPoints(text) {
    const bestByPos = new Map(); // pos -> strongest break point found there
    for (const [pattern, score, type] of BREAK_PATTERNS) {
        for (const { index: pos } of text.matchAll(pattern)) {
            const current = bestByPos.get(pos);
            if (current && current.score >= score) {
                continue; // an equal or stronger break already owns this position
            }
            bestByPos.set(pos, { pos, score, type });
        }
    }
    return [...bestByPos.values()].sort((left, right) => left.pos - right.pos);
}
109
/**
 * Locate every fenced code region in `text`.
 *
 * A fence marker is three backticks at the start of a line. Markers are paired
 * in order of appearance: the first opens a region, the next closes it, and so
 * on. A region that is never closed extends to the end of the text.
 *
 * A marker on the very first line (offset 0, with no preceding newline) is
 * recognised too. If it were missed, the closing marker would be read as an
 * opener and the fenced/unfenced regions would be inverted for the rest of the
 * document.
 *
 * @param text - Document text to scan
 * @returns Regions `{ start, end }`. `start` is the index of the newline that
 *   precedes the opening marker (or 0 when the document opens with a fence);
 *   `end` is the index just past the closing marker, or `text.length` for an
 *   unclosed fence.
 */
export function findCodeFences(text) {
    const regions = [];
    // Without the `m` flag, `^` matches only at offset 0, so this accepts a
    // marker at the start of the document or directly after any newline.
    const fencePattern = /(?:^|\n)```/g;
    let openStart = -1; // -1 while outside a fence
    for (const match of text.matchAll(fencePattern)) {
        if (openStart === -1) {
            openStart = match.index;
        }
        else {
            regions.push({ start: openStart, end: match.index + match[0].length });
            openStart = -1;
        }
    }
    // Unclosed fence: treat everything up to the end of the document as code.
    if (openStart !== -1) {
        regions.push({ start: openStart, end: text.length });
    }
    return regions;
}
134
/**
 * Tell whether `pos` lies strictly inside one of the given fence regions.
 * A position equal to a region's `start` or `end` counts as outside.
 *
 * @param pos - Character offset to test
 * @param fences - Regions `{ start, end }` as produced by findCodeFences()
 * @returns true when some region satisfies `start < pos < end`
 */
export function isInsideCodeFence(pos, fences) {
    for (const { start, end } of fences) {
        if (start < pos && pos < end) {
            return true;
        }
    }
    return false;
}
140
/**
 * Choose where to cut a chunk, trading break quality against distance from
 * the ideal cut position.
 *
 * Each break point inside the window `[targetCharPos - windowChars,
 * targetCharPos]` and outside every code fence is scored as
 * `score * (1 - (distance / windowChars)^2 * decayFactor)`. The squared term
 * keeps the penalty small near the target and steep near the window edge, so
 * a heading far back can still beat a weak break close to the target. With
 * decayFactor = 0.7 the multiplier is 1.0 at the target, about 0.956 a quarter
 * of the way back, 0.825 halfway, about 0.606 three quarters back, and 0.3 at
 * the window edge.
 *
 * @param breakPoints - Break points sorted by ascending `pos` (the scan stops
 *   at the first one past the target)
 * @param targetCharPos - The ideal cut position (e.g. the maxChars boundary)
 * @param windowChars - How far back from the target to search
 * @param decayFactor - Strength of the distance penalty
 * @param codeFences - Fence regions; break points strictly inside are skipped
 * @returns Position of the best-scoring break point, or `targetCharPos` when
 *   no break point qualifies
 */
export function findBestCutoff(breakPoints, targetCharPos, windowChars = CHUNK_WINDOW_CHARS, decayFactor = 0.7, codeFences = []) {
    const earliest = targetCharPos - windowChars;
    let winner = { pos: targetCharPos, score: -1 };
    for (const { pos, score } of breakPoints) {
        if (pos < earliest) {
            continue;
        }
        if (pos > targetCharPos) {
            break; // input is sorted, nothing further can qualify
        }
        if (isInsideCodeFence(pos, codeFences)) {
            continue;
        }
        const ratio = (targetCharPos - pos) / windowChars;
        const weighted = score * (1.0 - (ratio * ratio) * decayFactor);
        // Strict comparison: on a tie the earlier break point is kept.
        if (weighted > winner.score) {
            winner = { pos, score: weighted };
        }
    }
    return winner.pos;
}
182
/**
 * Combine two break-point lists (for example regex-derived and AST-derived)
 * into one. When both lists have an entry at the same position, the one with
 * the higher score is kept; on equal scores the entry seen first (all of `a`
 * before all of `b`) wins.
 *
 * @param a - First list of break points
 * @param b - Second list of break points
 * @returns A new array of the surviving break points, sorted by ascending `pos`
 */
export function mergeBreakPoints(a, b) {
    const bestByPos = new Map();
    for (const candidate of [...a, ...b]) {
        const held = bestByPos.get(candidate.pos);
        if (held === undefined || candidate.score > held.score) {
            bestByPos.set(candidate.pos, candidate);
        }
    }
    return [...bestByPos.values()].sort((left, right) => left.pos - right.pos);
}
202
/**
 * Split `content` into overlapping chunks using precomputed break points and
 * code-fence regions. This is the shared core behind both the regex-only and
 * the AST-aware chunkers.
 *
 * Content that already fits in `maxChars` is returned as a single chunk.
 * Otherwise each chunk ends at the cut chosen by findBestCutoff() (when that
 * cut falls after the chunk start and not past the hard limit) and the next
 * chunk starts `overlapChars` before that cut. If stepping back by the overlap
 * would not move past the current chunk's start, the next chunk starts at the
 * cut instead.
 *
 * @param content - Full document text
 * @param breakPoints - Sorted break points for `content`
 * @param codeFences - Code fence regions for `content`
 * @param maxChars - Maximum chunk length in characters
 * @param overlapChars - Characters shared between consecutive chunks
 * @param windowChars - Search window passed to findBestCutoff()
 * @returns Chunks `{ text, pos }`, where `pos` is the chunk's start offset
 */
export function chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS) {
    const total = content.length;
    if (total <= maxChars) {
        return [{ text: content, pos: 0 }];
    }
    const chunks = [];
    let start = 0;
    while (start < total) {
        const hardEnd = Math.min(start + maxChars, total);
        let end = hardEnd;
        // Only look for a nicer cut when this is not the final chunk.
        if (hardEnd < total) {
            const cut = findBestCutoff(breakPoints, hardEnd, windowChars, 0.7, codeFences);
            if (cut > start && cut <= hardEnd) {
                end = cut;
            }
        }
        if (end <= start) {
            end = hardEnd;
        }
        chunks.push({ text: content.slice(start, end), pos: start });
        if (end >= total) {
            break;
        }
        // Step back by the overlap, unless that would fail to advance past
        // the start of the chunk just emitted.
        const overlapped = end - overlapChars;
        start = overlapped <= start ? end : overlapped;
    }
    return chunks;
}
236
// Hybrid query: thresholds for detecting a strong BM25 signal. The expensive
// LLM expansion is skipped when the top result scores at least
// STRONG_SIGNAL_MIN_SCORE and leads the runner-up by at least
// STRONG_SIGNAL_MIN_GAP.
export const STRONG_SIGNAL_MIN_SCORE = 0.85;
export const STRONG_SIGNAL_MIN_GAP = 0.15;

// Maximum number of candidates handed to the reranker, balancing quality
// against latency. 40 keeps ranks 31-40 visible to the reranker, which matters
// for recall on broad queries.
export const RERANK_CANDIDATE_LIMIT = 40;
243
+ // =============================================================================
244
+ // Path utilities
245
+ // =============================================================================
246
/**
 * Return the home directory used by QMD.
 * Thin wrapper that delegates to qmdHomedir() from ./paths.js.
 */
export function homedir() {
    const home = qmdHomedir();
    return home;
}
249
/**
 * Check whether a path is absolute.
 *
 * Recognised forms:
 * - Anything starting with "/": Unix paths (/path/to/file) and Git Bash drive
 *   paths (/c/path). Both start with a slash, so no separate drive-letter
 *   check is needed to answer this question.
 * - Windows native paths with a drive letter and colon: C:\path or C:/path.
 *
 * @param path - Path to test; empty or otherwise falsy input is not absolute
 * @returns true when the path is absolute
 */
export function isAbsolutePath(path) {
    if (!path) {
        return false;
    }
    if (path.startsWith('/')) {
        return true;
    }
    // Windows native: a single letter (A-Z, either case) followed by ':'.
    const startsWithDrive = path.length >= 2 && /[a-zA-Z]/.test(path[0]) && path[1] === ':';
    return startsWithDrive;
}
282
/**
 * Convert every backslash in a path to a forward slash, so Windows-style
 * paths use the same separator as Unix-style ones.
 *
 * @param path - Path string to normalize
 * @returns The path with all "\" replaced by "/"
 */
export function normalizePathSeparators(path) {
    return path.split('\\').join('/');
}
289
/**
 * Detect whether the process runs inside WSL (Windows Subsystem for Linux),
 * based on the WSL_DISTRO_NAME and WSL_INTEROP environment variables.
 * On WSL, paths like /c/work/... are valid drvfs mount points rather than
 * Git Bash drive paths.
 *
 * @returns true when either variable is set to a non-empty value
 */
function isWSL() {
    const { WSL_DISTRO_NAME: distro, WSL_INTEROP: interop } = process.env;
    return Boolean(distro || interop);
}
296
/**
 * Express `path` relative to `prefix`.
 *
 * Both arguments have their backslashes converted to forward slashes before
 * comparison. The prefix only matches on a whole path component, so "/a/b" is
 * not treated as a prefix of "/a/bc".
 *
 * @param path - Path to relativize
 * @param prefix - Directory prefix; an empty prefix is invalid
 * @returns "" when the path equals the prefix, the remainder after the prefix
 *   and its separator when the path is underneath it, otherwise null
 */
export function getRelativePathFromPrefix(path, prefix) {
    if (!prefix) {
        return null;
    }
    const target = normalizePathSeparators(path);
    const base = normalizePathSeparators(prefix);
    if (target === base) {
        return '';
    }
    // Compare against the prefix with exactly one trailing separator appended
    // (if it lacks one) so only whole components can match.
    const baseDir = base.endsWith('/') ? base : `${base}/`;
    return target.startsWith(baseDir) ? target.slice(baseDir.length) : null;
}
322
/**
 * Cross-platform replacement for node:path resolve().
 *
 * Segments are joined left to right with backslashes converted to forward
 * slashes. An absolute segment discards everything before it. A relative first
 * segment is resolved against `process.env.PWD` (or `process.cwd()` when PWD
 * is unset). "." and ".." components are collapsed, and ".." at the root is a
 * no-op.
 *
 * Drive handling:
 * - "C:..." keeps its drive prefix exactly as written.
 * - Git Bash style "/c/..." becomes "C:/..." for drive letters C-Z, except on
 *   WSL, where /c/ is an ordinary mount point.
 * - Any other absolute segment clears a previously seen drive.
 *
 * @param paths - One or more path segments
 * @returns An absolute path with forward slashes and no trailing slash,
 *   prefixed with a drive ("C:") when one was detected
 * @throws Error when called without arguments
 */
export function resolve(...paths) {
    if (paths.length === 0) {
        throw new Error("resolve: at least one path segment is required");
    }
    const hasDrivePrefix = (p) => p.length >= 2 && /[a-zA-Z]/.test(p[0]) && p[1] === ':';
    // Split an absolute segment into [drive, remainder]; drive is '' when the
    // segment carries no drive information.
    const splitAbsolute = (p) => {
        if (hasDrivePrefix(p)) {
            return [p.slice(0, 2), p.slice(2)];
        }
        if (!isWSL() && p.startsWith('/') && p.length >= 3 && p[2] === '/') {
            // Git Bash style: /c/ -> C: (C-Z drives only, not A or B)
            const letter = p[1];
            if (letter && /[c-zC-Z]/.test(letter)) {
                return [letter.toUpperCase() + ':', p.slice(2)];
            }
        }
        return ['', p];
    };
    const [head, ...tail] = paths.map(normalizePathSeparators);
    let drive = '';
    let joined = '';
    if (isAbsolutePath(head)) {
        [drive, joined] = splitAbsolute(head);
    }
    else {
        // Relative first segment: anchor it at the working directory.
        const cwd = normalizePathSeparators(process.env.PWD || process.cwd());
        if (hasDrivePrefix(cwd)) {
            drive = cwd.slice(0, 2);
            joined = cwd.slice(2) + '/' + head;
        }
        else {
            joined = cwd + '/' + head;
        }
    }
    for (const segment of tail) {
        if (isAbsolutePath(segment)) {
            // An absolute segment replaces everything accumulated so far.
            [drive, joined] = splitAbsolute(segment);
        }
        else {
            joined = joined + '/' + segment;
        }
    }
    // Collapse empty, "." and ".." components.
    const stack = [];
    for (const part of joined.split('/')) {
        if (part === '' || part === '.') {
            continue;
        }
        if (part === '..') {
            stack.pop();
        }
        else {
            stack.push(part);
        }
    }
    const absolute = '/' + stack.join('/');
    return drive ? drive + absolute : absolute;
}
412
// Module-level switch between test mode (false, the initial value) and
// production mode. The original note says it is set by qmd.ts at startup.
let _productionMode = false;

/** Turn production mode on. */
export function enableProductionMode() {
    _productionMode = true;
}

/** Turn production mode back off. Intended for tests only. */
export function _resetProductionModeForTesting() {
    _productionMode = false;
}
421
/**
 * Work out where the SQLite index file lives.
 *
 * Resolution order:
 * 1. The INDEX_PATH environment variable, when set, is returned as-is.
 * 2. Outside production mode an error is thrown, so tests cannot accidentally
 *    write to the global index.
 * 3. Otherwise `<cache>/qmd/<indexName>.sqlite`, where `<cache>` is
 *    XDG_CACHE_HOME or `~/.cache`. The qmd directory is created on a
 *    best-effort basis.
 *
 * @param indexName - Base name of the index file (without extension)
 * @returns Path to the database file
 * @throws Error when INDEX_PATH is unset and production mode is off
 */
export function getDefaultDbPath(indexName = "index") {
    const override = process.env.INDEX_PATH;
    if (override) {
        return override;
    }
    if (!_productionMode) {
        throw new Error("Database path not set. Tests must set INDEX_PATH env var or use createStore() with explicit path. " +
            "This prevents tests from accidentally writing to the global index.");
    }
    const cacheRoot = process.env.XDG_CACHE_HOME || resolve(homedir(), ".cache");
    const qmdCacheDir = resolve(cacheRoot, "qmd");
    try {
        mkdirSync(qmdCacheDir, { recursive: true });
    }
    catch {
        // Best effort: a failure to create the directory is ignored here.
    }
    return resolve(qmdCacheDir, `${indexName}.sqlite`);
}
439
/**
 * Return the current working directory, preferring the PWD environment
 * variable and falling back to process.cwd() when PWD is unset or empty.
 */
export function getPwd() {
    const { PWD: fromEnv } = process.env;
    return fromEnv || process.cwd();
}
442
/**
 * Resolve a path to its canonical form via realpathSync().
 * When that throws (for instance because the path does not exist), the path is
 * normalized with this module's resolve() instead.
 *
 * @param path - Path to canonicalize
 * @returns The real path, or the resolve()d path when realpathSync() fails
 */
export function getRealPath(path) {
    let real;
    try {
        real = realpathSync(path);
    }
    catch {
        real = resolve(path);
    }
    return real;
}
450
/**
 * Bring explicitly virtual paths into the canonical `qmd://...` form.
 *
 * Handled inputs (after trimming surrounding whitespace):
 * - qmd://collection/path.md   -> unchanged
 * - qmd:////collection/path.md -> extra slashes collapsed to exactly two
 * - //collection/path.md       -> "qmd:" prefix added
 *
 * Everything else (filesystem paths, docids, bare collection/path.md) is
 * returned trimmed but otherwise untouched. A ":linenum" suffix is not
 * interpreted here and should be parsed off before calling.
 *
 * @param input - Raw path as typed by the user
 * @returns The normalized path
 */
export function normalizeVirtualPath(input) {
    const trimmed = input.trim();
    const withoutLeadingSlashes = (s) => s.replace(/^\/+/, '');
    if (trimmed.startsWith('qmd:')) {
        // Drop the "qmd:" scheme and any run of slashes, then re-add exactly two.
        return `qmd://${withoutLeadingSlashes(trimmed.slice(4))}`;
    }
    if (trimmed.startsWith('//')) {
        return `qmd://${withoutLeadingSlashes(trimmed)}`;
    }
    return trimmed;
}
479
/**
 * Split a virtual path such as "qmd://collection-name/path/to/file.md" into
 * its parts. The input is normalized first, so "//collection/..." and
 * "qmd:////collection/..." are accepted too. A collection root may be written
 * as "qmd://collection-name/" or "qmd://collection-name".
 *
 * Text after the first "?" is treated as a query string; its "index"
 * parameter, when non-blank, is returned as `indexName`.
 *
 * @param virtualPath - Virtual path to parse
 * @returns `{ collectionName, path, indexName? }` (with `path` empty for a
 *   collection root), or null when the input is not a virtual path
 */
export function parseVirtualPath(virtualPath) {
    const normalized = normalizeVirtualPath(virtualPath);
    const pieces = normalized.split("?");
    const pathPart = pieces[0];
    const queryString = pieces[1] ?? "";
    // qmd://collection-name[/optional-path]
    const match = /^qmd:\/\/([^\/]+)\/?(.*)$/.exec(pathPart);
    if (!match?.[1]) {
        return null;
    }
    const parsed = {
        collectionName: match[1],
        path: match[2] ?? '',
    };
    const indexName = new URLSearchParams(queryString).get("index")?.trim() || undefined;
    if (indexName) {
        parsed.indexName = indexName;
    }
    return parsed;
}
500
/**
 * Assemble a virtual path from a collection name and a relative path.
 *
 * @param collectionName - Name of the collection
 * @param path - Path relative to the collection root
 * @param indexName - Optional index name; when truthy it is appended as a
 *   URI-encoded `?index=` query parameter
 * @returns `qmd://<collectionName>/<path>` with the optional query suffix
 */
export function buildVirtualPath(collectionName, path, indexName) {
    const base = `qmd://${collectionName}/${path}`;
    if (!indexName) {
        return base;
    }
    return `${base}?index=${encodeURIComponent(indexName)}`;
}
507
/**
 * Tell whether a path is written in an explicitly virtual form.
 *
 * Only two spellings qualify (after trimming surrounding whitespace):
 * - anything starting with "qmd:" (any number of slashes may follow)
 * - anything starting with "//" (the form with the "qmd:" prefix omitted)
 *
 * A bare "collection/path.md" is not considered virtual here; callers must
 * check separately whether its first component names a collection.
 *
 * @param path - Path to inspect
 * @returns true for an explicitly virtual path
 */
export function isVirtualPath(path) {
    const candidate = path.trim();
    return candidate.startsWith('qmd:') || candidate.startsWith('//');
}
526
/**
 * Translate a virtual path into an absolute filesystem path.
 *
 * @param db - Database handle, passed through to getCollectionByName()
 * @param virtualPath - Path in qmd:// (or equivalent) form
 * @returns The collection's `pwd` joined with the path inside the collection,
 *   or null when the input is not a virtual path or names an unknown
 *   collection
 */
export function resolveVirtualPath(db, virtualPath) {
    const parsed = parseVirtualPath(virtualPath);
    const coll = parsed ? getCollectionByName(db, parsed.collectionName) : null;
    // NOTE(review): this reads `coll.pwd`, while toVirtualPath() compares
    // against `coll.path`; the two lookups may return differently shaped
    // records - confirm against getCollectionByName().
    return coll ? resolve(coll.pwd, parsed.path) : null;
}
538
/**
 * Map an absolute filesystem path back to its virtual path.
 *
 * Collections from getStoreCollections() are checked in order. A collection
 * matches when the path equals its root or lies below it, and the match only
 * counts if an active document row exists for that collection and relative
 * path.
 *
 * @param db - Database handle exposing `prepare(sql).get(...)`
 * @param absolutePath - Absolute path of the file
 * @returns The virtual path of the first matching indexed document, or null
 *   when the file is not in any indexed collection
 */
export function toVirtualPath(db, absolutePath) {
    for (const { name, path: root } of getStoreCollections(db)) {
        let relativePath;
        if (absolutePath.startsWith(root + '/')) {
            relativePath = absolutePath.slice(root.length + 1);
        }
        else if (absolutePath === root) {
            relativePath = '';
        }
        else {
            continue; // not under this collection
        }
        // Being under the root is not enough: the document must be indexed and active.
        const row = db.prepare(`
      SELECT d.path
      FROM documents d
      WHERE d.collection = ? AND d.path = ? AND d.active = 1
      LIMIT 1
    `).get(name, relativePath);
        if (row) {
            return buildVirtualPath(name, relativePath);
        }
    }
    return null;
}
566
+ // =============================================================================
567
+ // Database initialization
568
+ // =============================================================================
569
/**
 * Build the Error reported when the sqlite-vec extension cannot be used.
 *
 * @param reason - Short description of what went wrong; it is embedded in the
 *   message and followed by installation advice
 * @returns A new Error (not thrown here)
 */
function createSqliteVecUnavailableError(reason) {
    const message = [
        "sqlite-vec extension is unavailable. ",
        `${reason}. `,
        "Install Homebrew SQLite so the sqlite-vec extension can be loaded, ",
        "and set BREW_PREFIX if Homebrew is installed in a non-standard location.",
    ].join("");
    return new Error(message);
}
575
// Message of the most recent sqlite-vec load failure; null until one is recorded.
let _sqliteVecUnavailableReason = null;

/**
 * Extract a printable message from a caught value.
 *
 * @param err - Anything that was thrown
 * @returns `err.message` for Error instances, otherwise `String(err)`
 */
function getErrorMessage(err) {
    if (err instanceof Error) {
        return err.message;
    }
    return String(err);
}
579
/**
 * Confirm that the sqlite-vec extension is usable on this connection by
 * calling its `vec_version()` SQL function.
 *
 * @param db - Database handle exposing `prepare(sql).get()`
 * @throws Error (built by createSqliteVecUnavailableError) when the query
 *   fails or does not return a non-empty string version
 */
export function verifySqliteVecLoaded(db) {
    try {
        const version = db.prepare(`SELECT vec_version() AS version`).get()?.version;
        if (!version || typeof version !== "string") {
            // Caught just below and re-wrapped like any other probe failure.
            throw new Error("vec_version() returned no version");
        }
    }
    catch (err) {
        throw createSqliteVecUnavailableError(`sqlite-vec probe failed (${getErrorMessage(err)})`);
    }
}
591
// Tri-state availability of sqlite-vec: null until a load has been attempted.
let _sqliteVecAvailable = null;

// Matches a single Han, Hiragana, Katakana or Hangul character.
const CJK_CHAR_PATTERN = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]/u;

// Matches a maximal run of CJK characters. On top of the four scripts above it
// accepts U+30FB (middle dot) and U+30FC (prolonged sound mark "ー"). Both
// belong to Script=Common but are integral parts of katakana words such as
// "ナレッジベース"; without them the run would be split at "ー" and the
// tokenizer would receive broken pieces like "ナレッジベ" and "ス".
const CJK_RUN_PATTERN = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}・ー]+/gu;

// Version tag for the CJK normalization applied to the FTS index.
// "3": vaporetto WASM morphological tokenization.
export const FTS_CJK_NORMALIZED_VERSION = "3";
599
// --- Vaporetto WASM Japanese morphological analyzer (lazy singleton) ---
// Both slots start out null and are filled by initializeVaporettoTokenizer().
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let _vaporettoTokenizer = null; // the ready tokenizer instance
let _vaporettoInitPromise = null; // the initialization promise, once started
603
/**
 * Locate the Vaporetto model file relative to this module.
 *
 * The model is looked up in the `models/` directory one level above the
 * directory containing this file. The uncompressed
 * `vaporetto-bccwj.model` is preferred; when it does not exist, the path of
 * `vaporetto-bccwj.model.zst` is returned instead. The fallback path is
 * returned without checking that it exists.
 *
 * @returns Absolute path to the model file to load
 */
export function resolveVaporettoModelPath() {
    // ESM equivalent of __dirname.
    const moduleDir = dirname(fileURLToPath(import.meta.url));
    const modelsDir = pathJoin(moduleDir, "..", "models");
    const rawModel = pathJoin(modelsDir, "vaporetto-bccwj.model");
    if (existsSync(rawModel)) {
        return rawModel;
    }
    return pathJoin(modelsDir, "vaporetto-bccwj.model.zst");
}
617
/**
 * Load the Vaporetto WASM tokenizer once and cache it for the whole process.
 *
 * Call this before indexing or searching CJK text; until it has completed,
 * normalizeCjkForFTS() uses its character-level fallback. Calls made while
 * initialization is in flight await the same promise, and calls made after it
 * has finished return immediately.
 *
 * @returns A promise that resolves (with no value) once the tokenizer is ready
 */
export async function initializeVaporettoTokenizer() {
    if (_vaporettoTokenizer) {
        return;
    }
    _vaporettoInitPromise ??= (async () => {
        const moduleDir = dirname(fileURLToPath(import.meta.url));
        // The vendored binding is loaded through require().
        const load = createRequire(import.meta.url);
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        const { VaporettoTokenizer } = load(pathJoin(moduleDir, "..", "vendor", "vaporetto-node-wasm", "vaporetto_node_wasm.js"));
        return new VaporettoTokenizer(readFileSync(resolveVaporettoModelPath()));
    })();
    _vaporettoTokenizer = await _vaporettoInitPromise;
}
637
/**
 * Backward-compatible alias kept for CLI and MCP code that still calls
 * initializeKuromojiTokenizer(). It simply delegates to
 * initializeVaporettoTokenizer().
 *
 * @returns A promise that settles when the Vaporetto initialization does
 */
export async function initializeKuromojiTokenizer() {
    const ready = initializeVaporettoTokenizer();
    return ready;
}
644
/**
 * Make CJK text searchable through FTS5, whose unicode61 tokenizer does not
 * segment CJK text into words.
 *
 * Every run of CJK characters is replaced by a space-padded, segmented version
 * of itself:
 * - once the Vaporetto tokenizer has been initialized, the run is replaced by
 *   whatever `tokenize(run)` returns (presumably a space-separated string of
 *   morphemes - confirm against the binding's typings);
 * - before that, the run is split into single characters joined by spaces.
 *
 * Text outside CJK runs is left untouched.
 *
 * @param text - Text to normalize
 * @returns The text with CJK runs segmented
 */
export function normalizeCjkForFTS(text) {
    const tokenizer = _vaporettoTokenizer;
    const segment = tokenizer
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        ? (run) => " " + tokenizer.tokenize(run) + " "
        // Fallback: character-level unigrams.
        : (run) => ` ${Array.from(run).join(" ")} `;
    return text.replace(CJK_RUN_PATTERN, segment);
}
661
/**
 * Report whether the text contains at least one Han, Hiragana, Katakana or
 * Hangul character (as defined by CJK_CHAR_PATTERN).
 */
function containsCjk(text) {
    const hasCjkChar = CJK_CHAR_PATTERN.test(text);
    return hasCjkChar;
}
664
/**
 * Prepare a phrase for use in an FTS5 query: CJK runs are segmented first,
 * then the phrase is split on whitespace, each piece is passed through
 * sanitizeFTS5Term(), and pieces that come back empty are dropped.
 *
 * @param phrase - Raw phrase text
 * @returns The surviving terms joined by single spaces
 */
function sanitizeFTS5Phrase(phrase) {
    const terms = [];
    for (const piece of normalizeCjkForFTS(phrase).split(/\s+/)) {
        const term = sanitizeFTS5Term(piece);
        if (term) {
            terms.push(term);
        }
    }
    return terms.join(' ');
}
671
/**
 * Rebuild the `documents_fts` index when it was produced by a different CJK
 * normalization version.
 *
 * Nothing happens when `store_config.fts_cjk_normalized_version` already
 * equals FTS_CJK_NORMALIZED_VERSION. Otherwise the FTS table is emptied (or
 * dropped and recreated if the delete fails), every active document is
 * re-inserted with its filepath, title and body passed through
 * normalizeCjkForFTS(), and the stored version is updated.
 *
 * @param db - Database handle exposing `prepare`, `exec` and `transaction`
 */
function rebuildFTSForCjkNormalization(db) {
    const stored = db.prepare(`SELECT value FROM store_config WHERE key = 'fts_cjk_normalized_version'`).get();
    if (stored?.value === FTS_CJK_NORMALIZED_VERSION) {
        return;
    }
    try {
        db.exec(`DELETE FROM documents_fts WHERE rowid >= 0`);
    }
    catch {
        // Some older/corrupt FTS5 shadow-table states can reject bulk deletes
        // even though reads still work. Recreate the virtual table instead;
        // documents_fts is a derived index, so rebuilding it from
        // documents/content is safe.
        db.exec(`DROP TABLE IF EXISTS documents_fts`);
        db.exec(`
      CREATE VIRTUAL TABLE documents_fts USING fts5(
        filepath, title, body,
        tokenize='porter unicode61'
      )
    `);
    }
    const activeDocs = db.prepare(`
    SELECT d.id, d.collection, d.path, d.title, content.doc as body
    FROM documents d
    JOIN content ON content.hash = d.hash
    WHERE d.active = 1
  `).all();
    const insertRow = db.prepare(`INSERT INTO documents_fts(rowid, filepath, title, body) VALUES (?, ?, ?, ?)`);
    // All inserts run inside one transaction.
    const insertAll = db.transaction(() => {
        for (const { id, collection, path, title, body } of activeDocs) {
            insertRow.run(id, normalizeCjkForFTS(`${collection}/${path}`), normalizeCjkForFTS(title), normalizeCjkForFTS(body));
        }
    });
    insertAll();
    // Record the version only after the rebuild, so a failed rebuild is retried.
    db.prepare(`
    INSERT OR REPLACE INTO store_config(key, value)
    VALUES ('fts_cjk_normalized_version', ?)
  `).run(FTS_CJK_NORMALIZED_VERSION);
}
708
/**
 * Prepare a freshly opened database: try to load sqlite-vec, apply pragmas,
 * create the schema (tables, indexes, FTS table, triggers) and finally run
 * the CJK FTS rebuild check.
 *
 * A failure to load sqlite-vec is recorded in the module-level availability
 * flags and logged with console.warn; it does not abort initialization.
 *
 * @param db - Database handle exposing exec().
 */
function initializeDatabase(db) {
    try {
        loadSqliteVec(db);
        verifySqliteVecLoaded(db);
        _sqliteVecAvailable = true;
        _sqliteVecUnavailableReason = null;
    } catch (err) {
        // sqlite-vec is optional — vector search won't work but FTS is fine
        _sqliteVecAvailable = false;
        _sqliteVecUnavailableReason = getErrorMessage(err);
        console.warn(_sqliteVecUnavailableReason);
    }
    // Executed strictly in order; a failing statement stops the sequence.
    const setupStatements = [
        "PRAGMA journal_mode = WAL",
        "PRAGMA foreign_keys = ON",
        // Drop legacy tables that are now managed in YAML
        `DROP TABLE IF EXISTS path_contexts`,
        `DROP TABLE IF EXISTS collections`,
        // Content-addressable storage - the source of truth for document content
        `
            CREATE TABLE IF NOT EXISTS content (
                hash TEXT PRIMARY KEY,
                doc TEXT NOT NULL,
                created_at TEXT NOT NULL
            )
        `,
        // Documents table - file system layer mapping virtual paths to content hashes
        // Collections are now managed in ~/.config/qmd/index.yml
        `
            CREATE TABLE IF NOT EXISTS documents (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                collection TEXT NOT NULL,
                path TEXT NOT NULL,
                title TEXT NOT NULL,
                hash TEXT NOT NULL,
                created_at TEXT NOT NULL,
                modified_at TEXT NOT NULL,
                active INTEGER NOT NULL DEFAULT 1,
                FOREIGN KEY (hash) REFERENCES content(hash) ON DELETE CASCADE,
                UNIQUE(collection, path)
            )
        `,
        `CREATE INDEX IF NOT EXISTS idx_documents_collection ON documents(collection, active)`,
        `CREATE INDEX IF NOT EXISTS idx_documents_hash ON documents(hash)`,
        `CREATE INDEX IF NOT EXISTS idx_documents_path ON documents(path, active)`,
        // Cache table for LLM API calls
        `
            CREATE TABLE IF NOT EXISTS llm_cache (
                hash TEXT PRIMARY KEY,
                result TEXT NOT NULL,
                created_at TEXT NOT NULL
            )
        `,
        // Content vectors. Avoid PRAGMA schema probes during startup; legacy vector
        // columns are repaired lazily when a vector/embedding query first needs them.
        `
            CREATE TABLE IF NOT EXISTS content_vectors (
                hash TEXT NOT NULL,
                seq INTEGER NOT NULL DEFAULT 0,
                pos INTEGER NOT NULL DEFAULT 0,
                model TEXT NOT NULL,
                embed_fingerprint TEXT NOT NULL DEFAULT '',
                total_chunks INTEGER NOT NULL DEFAULT 1,
                embedded_at TEXT NOT NULL,
                PRIMARY KEY (hash, seq)
            )
        `,
        // Store collections — makes the DB self-contained (no external config needed)
        `
            CREATE TABLE IF NOT EXISTS store_collections (
                name TEXT PRIMARY KEY,
                path TEXT NOT NULL,
                pattern TEXT NOT NULL DEFAULT '**/*.md',
                ignore_patterns TEXT,
                include_by_default INTEGER DEFAULT 1,
                update_command TEXT,
                context TEXT
            )
        `,
        // Store config — key-value metadata (e.g. config_hash for sync optimization)
        `
            CREATE TABLE IF NOT EXISTS store_config (
                key TEXT PRIMARY KEY,
                value TEXT
            )
        `,
        // FTS - index filepath (collection/path), title, and content
        `
            CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
                filepath, title, body,
                tokenize='porter unicode61'
            )
        `,
        // Triggers keep FTS in sync for callers that write directly to documents.
        // Production indexing paths rebuild entries in TypeScript so CJK text can be
        // normalized before it reaches the unicode61 tokenizer.
        `DROP TRIGGER IF EXISTS documents_ai`,
        `
            CREATE TRIGGER documents_ai AFTER INSERT ON documents
            WHEN new.active = 1
            BEGIN
                INSERT INTO documents_fts(rowid, filepath, title, body)
                SELECT
                    new.id,
                    new.collection || '/' || new.path,
                    new.title,
                    (SELECT doc FROM content WHERE hash = new.hash)
                WHERE new.active = 1;
            END
        `,
        `DROP TRIGGER IF EXISTS documents_ad`,
        `
            CREATE TRIGGER documents_ad AFTER DELETE ON documents BEGIN
                DELETE FROM documents_fts WHERE rowid = old.id;
            END
        `,
        `DROP TRIGGER IF EXISTS documents_au`,
        `
            CREATE TRIGGER documents_au AFTER UPDATE ON documents
            BEGIN
                -- Delete from FTS if no longer active
                DELETE FROM documents_fts WHERE rowid = old.id AND new.active = 0;

                -- Update FTS if still/newly active
                INSERT OR REPLACE INTO documents_fts(rowid, filepath, title, body)
                SELECT
                    new.id,
                    new.collection || '/' || new.path,
                    new.title,
                    (SELECT doc FROM content WHERE hash = new.hash)
                WHERE new.active = 1;
            END
        `,
    ];
    for (const statement of setupStatements) {
        db.exec(statement);
    }
    rebuildFTSForCjkNormalization(db);
}
843
/**
 * Convert a store_collections row into a named-collection object.
 *
 * `name`, `path` and `pattern` are always copied. The optional fields are
 * only present when the row carries them: `ignore` and `context` are parsed
 * from their JSON columns, `update` is copied from `update_command`, and
 * `includeByDefault` appears (as false) only when the column is exactly 0.
 *
 * @param {object} row - Row read from store_collections.
 * @returns {object} Collection object built from the row.
 */
function rowToNamedCollection(row) {
    const collection = {
        name: row.name,
        path: row.path,
        pattern: row.pattern,
    };
    if (row.ignore_patterns) {
        collection.ignore = JSON.parse(row.ignore_patterns);
    }
    if (row.include_by_default === 0) {
        collection.includeByDefault = false;
    }
    if (row.update_command) {
        collection.update = row.update_command;
    }
    if (row.context) {
        collection.context = JSON.parse(row.context);
    }
    return collection;
}
854
/**
 * Read every row of store_collections and convert each one with
 * rowToNamedCollection.
 *
 * @param db - Database handle.
 * @returns {object[]} One collection object per stored row.
 */
export function getStoreCollections(db) {
    const selectAll = db.prepare(`SELECT * FROM store_collections`);
    return selectAll.all().map(rowToNamedCollection);
}
858
/**
 * Look up a single collection by name.
 *
 * @param db - Database handle.
 * @param {string} name - Collection name (primary key of store_collections).
 * @returns {object|null} The converted row, or null when no row matches.
 */
export function getStoreCollection(db, name) {
    const selectByName = db.prepare(`SELECT * FROM store_collections WHERE name = ?`);
    const row = selectByName.get(name);
    return row == null ? null : rowToNamedCollection(row);
}
864
/**
 * Read the 'global_context' entry from store_config.
 *
 * @param db - Database handle.
 * @returns {string|undefined} The stored value, or undefined when the row
 *   is missing or its value is falsy (null, empty string).
 */
export function getStoreGlobalContext(db) {
    const entry = db
        .prepare(`SELECT value FROM store_config WHERE key = 'global_context'`)
        .get();
    return entry?.value || undefined;
}
870
/**
 * List every stored context as `{ collection, path, context }` records.
 *
 * The global context (if any) comes first, reported under collection "*"
 * and path "/". It is followed by one record per path in each collection's
 * JSON context map, in row order and then in key order.
 *
 * @param db - Database handle.
 * @returns {Array<{collection: string, path: string, context: *}>}
 */
export function getStoreContexts(db) {
    const globalCtx = getStoreGlobalContext(db);
    const globalEntries = globalCtx
        ? [{ collection: "*", path: "/", context: globalCtx }]
        : [];
    const rows = db
        .prepare(`SELECT name, context FROM store_collections WHERE context IS NOT NULL`)
        .all();
    const collectionEntries = rows.flatMap((row) => {
        const contextByPath = JSON.parse(row.context);
        return Object.entries(contextByPath).map(([path, context]) => ({
            collection: row.name,
            path,
            context,
        }));
    });
    return [...globalEntries, ...collectionEntries];
}
887
/**
 * Insert a collection into store_collections, or overwrite every column of
 * the existing row with the same name.
 *
 * @param db - Database handle.
 * @param {string} name - Collection name (primary key).
 * @param {object} collection - Collection settings. `pattern` falls back to
 *   '**\/*.md' when falsy; `ignore` and `context` are stored as JSON or
 *   NULL; `includeByDefault` is stored as 0 only when it is exactly false;
 *   `update` is stored as NULL when falsy.
 */
export function upsertStoreCollection(db, name, collection) {
    const upsert = db.prepare(`
        INSERT INTO store_collections (name, path, pattern, ignore_patterns, include_by_default, update_command, context)
        VALUES (?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(name) DO UPDATE SET
            path = excluded.path,
            pattern = excluded.pattern,
            ignore_patterns = excluded.ignore_patterns,
            include_by_default = excluded.include_by_default,
            update_command = excluded.update_command,
            context = excluded.context
    `);
    const pattern = collection.pattern || '**/*.md';
    const ignorePatterns = collection.ignore ? JSON.stringify(collection.ignore) : null;
    const includeByDefault = collection.includeByDefault === false ? 0 : 1;
    const updateCommand = collection.update || null;
    const context = collection.context ? JSON.stringify(collection.context) : null;
    upsert.run(name, collection.path, pattern, ignorePatterns, includeByDefault, updateCommand, context);
}
900
/**
 * Delete a collection row by name.
 *
 * @param db - Database handle.
 * @param {string} name - Collection name.
 * @returns {boolean} True when the statement reported at least one change.
 */
export function deleteStoreCollection(db, name) {
    const { changes } = db
        .prepare(`DELETE FROM store_collections WHERE name = ?`)
        .run(name);
    return changes > 0;
}
904
/**
 * Rename a collection row.
 *
 * @param db - Database handle.
 * @param {string} oldName - Current collection name.
 * @param {string} newName - Desired collection name.
 * @returns {boolean} True when a row was renamed, false when `oldName` did
 *   not match any row.
 * @throws {Error} When a collection named `newName` already exists.
 */
export function renameStoreCollection(db, oldName, newName) {
    // Refuse to overwrite an existing collection.
    const clash = db
        .prepare(`SELECT name FROM store_collections WHERE name = ?`)
        .get(newName);
    if (clash != null) {
        throw new Error(`Collection '${newName}' already exists`);
    }
    const { changes } = db
        .prepare(`UPDATE store_collections SET name = ? WHERE name = ?`)
        .run(newName, oldName);
    return changes > 0;
}
913
/**
 * Set the context text for one path inside a collection's JSON context map.
 *
 * @param db - Database handle.
 * @param {string} collectionName - Collection whose context map is edited.
 * @param {string} path - Key inside the context map.
 * @param {string} text - Context text stored under `path`.
 * @returns {boolean} False when the collection does not exist, otherwise
 *   true after the updated map has been written back.
 */
export function updateStoreContext(db, collectionName, path, text) {
    const current = db
        .prepare(`SELECT context FROM store_collections WHERE name = ?`)
        .get(collectionName);
    if (current == null) {
        return false;
    }
    // A NULL/empty context column means "no contexts yet".
    const contextByPath = current.context ? JSON.parse(current.context) : {};
    contextByPath[path] = text;
    const saveContext = db.prepare(`UPDATE store_collections SET context = ? WHERE name = ?`);
    saveContext.run(JSON.stringify(contextByPath), collectionName);
    return true;
}
922
/**
 * Remove the context stored for one path of a collection.
 *
 * @param db - Database handle.
 * @param {string} collectionName - Collection whose context map is edited.
 * @param {string} path - Key to remove from the context map.
 * @returns {boolean} True when an entry was removed and the row rewritten;
 *   false when the collection is missing, has no contexts, or has no entry
 *   for `path`.
 */
export function removeStoreContext(db, collectionName, path) {
    const row = db
        .prepare(`SELECT context FROM store_collections WHERE name = ?`)
        .get(collectionName);
    if (row == null || !row.context) {
        return false;
    }
    const ctxMap = JSON.parse(row.context);
    // Own-property check: the `in` operator would also match inherited names
    // such as "constructor" or "toString" and report a removal that never
    // happened.
    if (!Object.prototype.hasOwnProperty.call(ctxMap, path)) {
        return false;
    }
    delete ctxMap[path];
    // Store NULL rather than "{}" once the last context is gone.
    const newCtx = Object.keys(ctxMap).length > 0 ? JSON.stringify(ctxMap) : null;
    db.prepare(`UPDATE store_collections SET context = ? WHERE name = ?`).run(newCtx, collectionName);
    return true;
}
936
/**
 * Store or clear the 'global_context' entry of store_config.
 *
 * @param db - Database handle.
 * @param {string|undefined} value - New global context. Exactly `undefined`
 *   deletes the entry; any other value is upserted.
 */
export function setStoreGlobalContext(db, value) {
    if (value === undefined) {
        db.prepare(`DELETE FROM store_config WHERE key = 'global_context'`).run();
        return;
    }
    const upsert = db.prepare(`INSERT INTO store_config (key, value) VALUES ('global_context', ?) ON CONFLICT(key) DO UPDATE SET value = excluded.value`);
    upsert.run(value);
}
944
/**
 * Mirror an external config (YAML/inline) into the SQLite store tables.
 *
 * The external config always wins: every configured collection is upserted,
 * stored collections that are absent from the config are deleted, and the
 * global context is set or cleared to match. The work is skipped entirely
 * when the SHA-256 of the serialized config equals the stored 'config_hash'.
 *
 * @param db - Database handle.
 * @param {object} config - Config with a `collections` map and an optional
 *   `global_context`.
 */
export function syncConfigToDb(db, config) {
    const hash = createHash('sha256').update(JSON.stringify(config)).digest('hex');
    const stored = db.prepare(`SELECT value FROM store_config WHERE key = 'config_hash'`).get();
    if (stored?.value === hash) {
        return; // Config unchanged, skip sync
    }
    // Upsert everything the config declares.
    const configNames = new Set(Object.keys(config.collections));
    Object.entries(config.collections).forEach(([name, coll]) => {
        upsertStoreCollection(db, name, coll);
    });
    // Drop stored collections the config no longer mentions.
    const storedRows = db.prepare(`SELECT name FROM store_collections`).all();
    for (const { name } of storedRows) {
        if (configNames.has(name)) {
            continue;
        }
        db.prepare(`DELETE FROM store_collections WHERE name = ?`).run(name);
    }
    // An undefined global_context clears the stored value.
    setStoreGlobalContext(db, config.global_context);
    // Remember the hash so an identical config is skipped next time.
    db.prepare(`INSERT INTO store_config (key, value) VALUES ('config_hash', ?) ON CONFLICT(key) DO UPDATE SET value = excluded.value`).run(hash);
}
978
/**
 * Report whether the module-level sqlite-vec availability flag is set.
 *
 * @returns {boolean} True only when the flag is exactly `true`.
 */
export function isSqliteVecAvailable() {
    const available = _sqliteVecAvailable === true;
    return available;
}
981
/**
 * Make sure the vectors_vec virtual table exists with the expected layout
 * for embeddings of `dimensions` floats.
 *
 * An existing table is kept when its dimension, hash_seq key and cosine
 * metric all match. A table with a different dimension is never dropped;
 * that case throws. Any other layout mismatch drops and recreates the table.
 *
 * @param db - Database handle.
 * @param {number} dimensions - Embedding vector length.
 * @throws {Error} When sqlite-vec is unavailable, or when existing vectors
 *   use a different dimension.
 */
function ensureVecTableInternal(db, dimensions) {
    if (!_sqliteVecAvailable) {
        throw createSqliteVecUnavailableError(_sqliteVecUnavailableReason ?? "vector operations require a SQLite build with extension loading support");
    }
    const existing = db
        .prepare(`SELECT sql FROM sqlite_master WHERE type='table' AND name='vectors_vec'`)
        .get();
    if (existing) {
        const { sql } = existing;
        const dimsText = sql.match(/float\[(\d+)\]/)?.[1];
        const existingDims = dimsText ? parseInt(dimsText, 10) : null;
        const layoutIsCurrent = sql.includes('hash_seq') && sql.includes('distance_metric=cosine');
        if (existingDims === dimensions && layoutIsCurrent) {
            return;
        }
        if (existingDims !== null && existingDims !== dimensions) {
            throw new Error(`Embedding dimension mismatch: existing vectors are ${existingDims}d but the current model produces ${dimensions}d. ` +
                `Run 'qmd embed -f' to re-embed with the new model.`);
        }
        // Same (or unknown) dimension but an outdated layout: rebuild the table.
        db.exec("DROP TABLE IF EXISTS vectors_vec");
    }
    db.exec(`CREATE VIRTUAL TABLE vectors_vec USING vec0(hash_seq TEXT PRIMARY KEY, embedding float[${dimensions}] distance_metric=cosine)`);
}
1001
/**
 * Re-index a single collection by scanning the filesystem and updating the database.
 * Produces no console output and does no db lifecycle management; progress is
 * reported only through `options.onProgress`.
 *
 * Files matching `globPattern` under `collectionPath` are read, hashed and
 * compared with the stored documents: new files are inserted, changed files
 * updated, and stored documents whose path was not seen are deactivated.
 * Unreadable and whitespace-only files are skipped but still count as seen.
 *
 * @param store - Store whose `db` handle is updated.
 * @param {string} collectionPath - Root directory of the collection.
 * @param {string} globPattern - Glob pattern, resolved relative to `collectionPath`.
 * @param {string} collectionName - Collection name recorded on each document.
 * @param {object} [options]
 * @param {string[]} [options.ignorePatterns] - Extra glob patterns to ignore.
 * @param {Function} [options.onProgress] - Called with `{ file, current, total }`
 *   after every file, including skipped ones.
 * @returns {Promise<{indexed: number, updated: number, unchanged: number, removed: number, orphanedCleaned: *}>}
 */
export async function reindexCollection(store, collectionPath, globPattern, collectionName, options) {
    const db = store.db;
    const now = new Date().toISOString();
    const excludeDirs = ["node_modules", ".git", ".cache", "vendor", "dist", "build"];
    const allIgnore = [
        ...excludeDirs.map(d => `**/${d}/**`),
        ...(options?.ignorePatterns || []),
    ];
    const allFiles = await fastGlob(globPattern, {
        cwd: collectionPath,
        onlyFiles: true,
        followSymbolicLinks: false,
        dot: false,
        ignore: allIgnore,
    });
    // Filter hidden files/folders
    const files = allFiles.filter(file => !file.split("/").some(part => part.startsWith(".")));
    const total = files.length;
    let indexed = 0;
    let updated = 0;
    let unchanged = 0;
    let processed = 0;
    // Every file, skipped or not, advances the counter and notifies the caller
    // so that progress always reaches `total`.
    const reportProgress = (file) => {
        processed++;
        options?.onProgress?.({ file, current: processed, total });
    };
    const seenPaths = new Set();
    for (const relativeFile of files) {
        const filepath = getRealPath(resolve(collectionPath, relativeFile));
        // Store the literal relative path so the filesystem path can always be
        // reconstructed as: resolve(collection.path, storedPath).
        // handelize() is NOT applied at index time — it is display-only.
        const path = normalizePathSeparators(relativeFile);
        seenPaths.add(path);
        let content;
        try {
            content = readFileSync(filepath, "utf-8");
        }
        catch {
            // Unreadable file: skip it, but keep it in seenPaths so an existing
            // document is not deactivated below.
            reportProgress(relativeFile);
            continue;
        }
        if (!content.trim()) {
            reportProgress(relativeFile);
            continue;
        }
        const hash = await hashContent(content);
        const title = extractTitle(content, relativeFile);
        const existing = findOrMigrateLegacyDocument(db, collectionName, path);
        if (existing) {
            if (existing.hash === hash) {
                if (existing.title !== title) {
                    updateDocumentTitle(db, existing.id, title, now);
                    updated++;
                }
                else {
                    unchanged++;
                }
            }
            else {
                insertContent(db, hash, content, now);
                // throwIfNoEntry: false yields undefined instead of throwing when
                // the file vanished after it was read; `now` is the fallback.
                const stat = statSync(filepath, { throwIfNoEntry: false });
                updateDocument(db, existing.id, title, hash, stat ? new Date(stat.mtime).toISOString() : now);
                updated++;
            }
        }
        else {
            indexed++;
            insertContent(db, hash, content, now);
            const stat = statSync(filepath, { throwIfNoEntry: false });
            insertDocument(db, collectionName, path, title, hash, stat ? new Date(stat.birthtime).toISOString() : now, stat ? new Date(stat.mtime).toISOString() : now);
        }
        reportProgress(relativeFile);
    }
    // Deactivate documents that no longer exist
    const allActive = getActiveDocumentPaths(db, collectionName);
    let removed = 0;
    for (const path of allActive) {
        if (!seenPaths.has(path)) {
            deactivateDocument(db, collectionName, path);
            removed++;
        }
    }
    const orphanedCleaned = cleanupOrphanedContent(db);
    return { indexed, updated, unchanged, removed, orphanedCleaned };
}
1089
/**
 * Validate an optional numeric option.
 *
 * @param {string} name - Option name, used in the error message.
 * @param {*} value - Supplied value; `undefined` selects the fallback.
 * @param {number} fallback - Value returned when `value` is undefined.
 * @returns {number} `fallback` or the validated `value`.
 * @throws {Error} When `value` is defined but is not an integer >= 1.
 */
function validatePositiveIntegerOption(name, value, fallback) {
    if (value === undefined) {
        return fallback;
    }
    const isPositiveInteger = Number.isInteger(value) && value >= 1;
    if (isPositiveInteger) {
        return value;
    }
    throw new Error(`${name} must be a positive integer`);
}
1097
/**
 * Resolve the batching limits used while embedding, applying the module
 * defaults for anything the caller left undefined.
 *
 * @param {object} [options] - May carry `maxDocsPerBatch` and `maxBatchBytes`.
 * @returns {{maxDocsPerBatch: number, maxBatchBytes: number}}
 * @throws {Error} When a supplied limit is not a positive integer.
 */
function resolveEmbedOptions(options) {
    const requested = options ?? {};
    const maxDocsPerBatch = validatePositiveIntegerOption("maxDocsPerBatch", requested.maxDocsPerBatch, DEFAULT_EMBED_MAX_DOCS_PER_BATCH);
    const maxBatchBytes = validatePositiveIntegerOption("maxBatchBytes", requested.maxBatchBytes, DEFAULT_EMBED_MAX_BATCH_BYTES);
    return { maxDocsPerBatch, maxBatchBytes };
}
1103
// Columns that content_vectors is expected to have, each with the definition
// used when the column has to be added to an existing table via ALTER TABLE.
const CONTENT_VECTOR_DESIRED_COLUMNS = [
    ["seq", "INTEGER NOT NULL DEFAULT 0"],
    ["pos", "INTEGER NOT NULL DEFAULT 0"],
    ["model", "TEXT NOT NULL DEFAULT ''"],
    ["embed_fingerprint", "TEXT NOT NULL DEFAULT ''"],
    ["total_chunks", "INTEGER NOT NULL DEFAULT 1"],
    ["embedded_at", "TEXT NOT NULL DEFAULT ''"],
].map(([name, definition]) => ({ name, definition }));
1111
/**
 * Decide whether an error looks like a missing-column failure involving one
 * of the CONTENT_VECTOR_DESIRED_COLUMNS.
 *
 * @param {*} error - Thrown value; non-Error values are stringified.
 * @returns {boolean} True when the message reports a missing column and
 *   contains the name of at least one desired column.
 */
function isContentVectorColumnError(error) {
    const message = error instanceof Error ? error.message : String(error);
    const reportsMissingColumn = /(no such column|has no column named)/i.test(message);
    return reportsMissingColumn
        && CONTENT_VECTOR_DESIRED_COLUMNS.some(({ name }) => message.includes(name));
}
1118
/**
 * Try to add every desired column to content_vectors.
 *
 * "duplicate column name" failures are ignored so the series can be run
 * repeatedly; any other failure is rethrown.
 *
 * @param db - Database handle exposing exec().
 */
function runContentVectorColumnRepairs(db) {
    for (const { name, definition } of CONTENT_VECTOR_DESIRED_COLUMNS) {
        try {
            db.exec(`ALTER TABLE content_vectors ADD COLUMN ${name} ${definition}`);
        } catch (error) {
            const message = error instanceof Error ? error.message : String(error);
            // The repair series is intentionally idempotent: most columns should
            // already exist, and another caller may have repaired a missing column
            // between the failed query and this ALTER series.
            if (message.includes("duplicate column name")) {
                continue;
            }
            throw error;
        }
    }
}
1134
/**
 * Run `operation`, repairing the content_vectors columns at most once if it
 * fails with a content-vector column error.
 *
 * @param db - Database handle handed to the repair routine.
 * @param {Function} operation - Callback to run; its return value is returned.
 * @returns {*} Whatever `operation` returns.
 * @throws Whatever `operation` throws when the error is not a content-vector
 *   column error, or when it still fails after the repair.
 */
function withLazyContentVectorMigration(db, operation) {
    try {
        return operation();
    } catch (error) {
        if (!isContentVectorColumnError(error)) {
            throw error;
        }
    }
    // First attempt hit a missing column: repair once, then retry. A second
    // failure of any kind propagates to the caller.
    runContentVectorColumnRepairs(db);
    return operation();
}
1149
/**
 * List active documents that still need embeddings for `model`.
 *
 * A content hash is pending when it has no content_vectors rows for the
 * model and its embedding fingerprint, or fewer rows than the recorded
 * total_chunks. The query runs through withLazyContentVectorMigration.
 *
 * @param db - Database handle.
 * @param {string} [collection] - Restrict the result to one collection.
 * @param {string} [model] - Embedding model; defaults to DEFAULT_EMBED_MODEL.
 * @returns {Array<{hash: string, path: string, bytes: number}>} One row per
 *   content hash, ordered by its smallest path.
 */
function getPendingEmbeddingDocs(db, collection, model = DEFAULT_EMBED_MODEL) {
    const fingerprint = getEmbeddingFingerprint(model);
    const params = collection
        ? [model, fingerprint, collection]
        : [model, fingerprint];
    const sql = `
        SELECT d.hash, MIN(d.path) as path, length(CAST(c.doc AS BLOB)) as bytes
        FROM documents d
        JOIN content c ON d.hash = c.hash
        LEFT JOIN (
            SELECT hash, model, COUNT(*) AS chunk_count, MAX(total_chunks) AS expected_chunks
            FROM content_vectors
            WHERE model = ? AND embed_fingerprint = ?
            GROUP BY hash, model, embed_fingerprint
        ) v ON d.hash = v.hash
        WHERE d.active = 1
          AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
          ${collection ? `AND d.collection = ?` : ``}
        GROUP BY d.hash
        ORDER BY MIN(d.path)
    `;
    return withLazyContentVectorMigration(db, () => db.prepare(sql).all(...params));
}
1172
/**
 * Split documents into consecutive batches bounded by a document count and
 * a byte budget.
 *
 * A batch is closed before adding a document when it already holds
 * `maxDocsPerBatch` documents, or when it is non-empty and the document
 * would push it past `maxBatchBytes`. A single document larger than the
 * byte budget therefore still gets a batch of its own.
 *
 * @param {Iterable<{bytes: number}>} docs - Documents in processing order.
 * @param {number} maxDocsPerBatch - Maximum documents per batch.
 * @param {number} maxBatchBytes - Byte budget per batch.
 * @returns {Array<Array<object>>} Batches preserving the input order.
 */
function buildEmbeddingBatches(docs, maxDocsPerBatch, maxBatchBytes) {
    const batches = [];
    let pending = [];
    let pendingBytes = 0;
    const closePending = () => {
        batches.push(pending);
        pending = [];
        pendingBytes = 0;
    };
    for (const doc of docs) {
        // Negative sizes are counted as zero.
        const size = Math.max(0, doc.bytes);
        const isFull = pending.length >= maxDocsPerBatch;
        const overflowsBytes = pending.length > 0 && (pendingBytes + size) > maxBatchBytes;
        if (isFull || overflowsBytes) {
            closePending();
        }
        pending.push(doc);
        pendingBytes += size;
    }
    if (pending.length > 0) {
        closePending();
    }
    return batches;
}
1193
/**
 * Attach the stored document body to every entry of a batch.
 *
 * @param db - Database handle.
 * @param {Array<{hash: string}>} batch - Batch entries identifying content by hash.
 * @returns {Array<object>} Copies of the batch entries with a `body`
 *   property; a hash with no content row (or a nullish body) gets "".
 */
function getEmbeddingDocsForBatch(db, batch) {
    if (batch.length === 0) {
        return [];
    }
    const hashes = batch.map(doc => doc.hash);
    const placeholders = hashes.map(() => "?").join(",");
    const rows = db.prepare(`
        SELECT hash, doc as body
        FROM content
        WHERE hash IN (${placeholders})
    `).all(...hashes);
    const bodyByHash = new Map();
    for (const { hash, body } of rows) {
        bodyByHash.set(hash, body);
    }
    return batch.map(doc => ({ ...doc, body: bodyByHash.get(doc.hash) ?? "" }));
}
1208
/**
 * Generate vector embeddings for documents that need them.
 * Does no db lifecycle management; apart from console.warn messages when the
 * session expires or the error rate is too high, it produces no console output.
 * The LLM instance is obtained through getLlm(store).
 *
 * Pending documents are grouped into batches (document count and byte
 * limits), chunked, and embedded 32 chunks at a time. A failed batch call
 * falls back to per-chunk embedding; failed chunks are retried up to a fixed
 * number of attempts, and embeddings of documents that end up only partially
 * embedded are removed again.
 *
 * @param store - Store providing `db` and `ensureVecTable`.
 * @param {object} [options]
 * @param {string} [options.model] - Embedding model; defaults to the LLM's
 *   embed model name, then DEFAULT_EMBED_MODEL.
 * @param {boolean} [options.force] - Clear existing embeddings first.
 * @param {string} [options.collection] - Restrict work to one collection.
 * @param {number} [options.maxDocsPerBatch] - Documents per batch.
 * @param {number} [options.maxBatchBytes] - Byte budget per batch.
 * @param [options.chunkStrategy] - Forwarded to chunkDocumentByTokens.
 * @param {Function} [options.onProgress] - Receives `{ chunksEmbedded,
 *   totalChunks, bytesProcessed, totalBytes, errors, failures }`.
 * @returns {Promise<{docsProcessed: number, chunksEmbedded: number, errors: number, failures: object[], durationMs: number}>}
 * @throws {Error} When no embedding can be obtained for the first chunk, or
 *   a batching option is not a positive integer.
 */
export async function generateEmbeddings(store, options) {
    const db = store.db;
    const llm = getLlm(store);
    const model = options?.model ?? llm.embedModelName ?? DEFAULT_EMBED_MODEL;
    const fingerprint = getEmbeddingFingerprint(model);
    const now = new Date().toISOString();
    const { maxDocsPerBatch, maxBatchBytes } = resolveEmbedOptions(options);
    const encoder = new TextEncoder();
    if (options?.force) {
        clearAllEmbeddings(db, options?.collection);
    }
    const docsToEmbed = getPendingEmbeddingDocs(db, options?.collection, model);
    if (docsToEmbed.length === 0) {
        // Same shape as the regular result below, including an empty
        // `failures` list, so callers can read it unconditionally.
        return { docsProcessed: 0, chunksEmbedded: 0, errors: 0, failures: [], durationMs: 0 };
    }
    const totalBytes = docsToEmbed.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0);
    const totalDocs = docsToEmbed.length;
    const startTime = Date.now();
    const embedModelUri = model;
    // Run all embedding work inside one LLM session for this llm instance
    const result = await withLLMSessionForLlm(llm, async (session) => {
        let chunksEmbedded = 0;
        let bytesProcessed = 0;
        let totalChunks = 0;
        let vectorTableInitialized = false;
        const BATCH_SIZE = 32;
        const RETRY_AFTER_SUCCESSFUL_CHUNKS = 64;
        const MAX_RETRY_ATTEMPTS = 3;
        // Outstanding failures, keyed by "hash:seq"; cleared when a chunk succeeds.
        const failures = new Map();
        // Chunks eligible for another attempt, keyed like `failures`.
        const retryQueue = new Map();
        let successesSinceRetry = 0;
        const failureList = () => [...failures.values()];
        const activeErrorCount = () => failures.size;
        const chunkKey = (chunk) => `${chunk.hash}:${chunk.seq}`;
        const reasonFromError = (error) => {
            const raw = error instanceof Error ? error.message : String(error);
            // Keep failure reasons short (at most 180 characters).
            return raw.length > 180 ? `${raw.slice(0, 177)}...` : raw;
        };
        const recordFailure = (chunk, reason) => {
            const key = chunkKey(chunk);
            const previous = failures.get(key);
            failures.set(key, {
                path: chunk.path,
                hash: chunk.hash,
                seq: chunk.seq,
                attempts: (previous?.attempts ?? 0) + 1,
                reason,
            });
            retryQueue.set(key, chunk);
        };
        const clearFailure = (chunk) => {
            const key = chunkKey(chunk);
            failures.delete(key);
            retryQueue.delete(key);
        };
        // Embed and store a single chunk; returns true on success and records
        // a failure (never throws) otherwise.
        const tryEmbedChunk = async (chunk) => {
            try {
                const text = formatDocForEmbedding(chunk.text, chunk.title, embedModelUri);
                const result = await session.embed(text, { model });
                if (!result) {
                    recordFailure(chunk, "embedding returned no vector");
                    return false;
                }
                insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now, chunk.expectedTotalChunks, fingerprint);
                chunksEmbedded++;
                successesSinceRetry++;
                clearFailure(chunk);
                return true;
            }
            catch (error) {
                recordFailure(chunk, reasonFromError(error));
                return false;
            }
        };
        const retryFailedChunks = async (force = false) => {
            if (!session.isValid || retryQueue.size === 0)
                return;
            if (!force && successesSinceRetry < RETRY_AFTER_SUCCESSFUL_CHUNKS)
                return;
            successesSinceRetry = 0;
            // Normal mode: one retry pass after enough unrelated chunks succeeded.
            // Force mode: we have run out of other chunks for this batch, so keep
            // retrying outstanding failures until they recover or hit the cap. The
            // cap prevents endless loops on permanently bad chunks.
            do {
                let retried = 0;
                for (const [key, chunk] of [...retryQueue]) {
                    const failure = failures.get(key);
                    if (!failure || failure.attempts >= MAX_RETRY_ATTEMPTS)
                        continue;
                    retried++;
                    await tryEmbedChunk(chunk);
                }
                if (!force || retried === 0)
                    break;
            } while (session.isValid && [...retryQueue].some(([key]) => {
                const failure = failures.get(key);
                return !!failure && failure.attempts < MAX_RETRY_ATTEMPTS;
            }));
        };
        const batches = buildEmbeddingBatches(docsToEmbed, maxDocsPerBatch, maxBatchBytes);
        for (const batchMeta of batches) {
            // Abort early if session has been invalidated
            if (!session.isValid) {
                console.warn(`⚠ Session expired — skipping remaining document batches`);
                break;
            }
            const batchDocs = getEmbeddingDocsForBatch(db, batchMeta);
            const batchChunks = [];
            const expectedChunksByHash = new Map();
            const batchBytes = batchMeta.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0);
            for (const doc of batchDocs) {
                if (!doc.body.trim())
                    continue;
                const title = extractTitle(doc.body, doc.path);
                const chunks = await chunkDocumentByTokens(doc.body, undefined, undefined, undefined, doc.path, options?.chunkStrategy, session.signal);
                for (let seq = 0; seq < chunks.length; seq++) {
                    batchChunks.push({
                        hash: doc.hash,
                        path: doc.path,
                        title,
                        text: chunks[seq].text,
                        seq,
                        pos: chunks[seq].pos,
                        tokens: chunks[seq].tokens,
                        bytes: encoder.encode(chunks[seq].text).length,
                        expectedTotalChunks: chunks.length,
                    });
                }
                expectedChunksByHash.set(doc.hash, chunks.length);
            }
            totalChunks += batchChunks.length;
            if (batchChunks.length === 0) {
                bytesProcessed += batchBytes;
                options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors: activeErrorCount(), failures: failureList() });
                continue;
            }
            if (!vectorTableInitialized) {
                // Embed the first chunk once just to learn the vector length
                // needed to create/verify the vector table.
                const firstChunk = batchChunks[0];
                const firstText = formatDocForEmbedding(firstChunk.text, firstChunk.title, embedModelUri);
                const firstResult = await session.embed(firstText, { model });
                if (!firstResult) {
                    throw new Error("Failed to get embedding dimensions from first chunk");
                }
                store.ensureVecTable(firstResult.embedding.length);
                vectorTableInitialized = true;
            }
            const totalBatchChunkBytes = batchChunks.reduce((sum, chunk) => sum + chunk.bytes, 0);
            let batchChunkBytesProcessed = 0;
            for (let batchStart = 0; batchStart < batchChunks.length; batchStart += BATCH_SIZE) {
                // Abort early if session has been invalidated (e.g. max duration exceeded)
                if (!session.isValid) {
                    const remainingChunks = batchChunks.slice(batchStart);
                    for (const chunk of remainingChunks)
                        recordFailure(chunk, "LLM session expired before embedding chunk");
                    console.warn(`⚠ Session expired — skipping ${remainingChunks.length} remaining chunks`);
                    break;
                }
                // Abort early if active error rate is too high (>80% of attempted chunks failed)
                const processed = chunksEmbedded + activeErrorCount();
                if (processed >= BATCH_SIZE && activeErrorCount() > processed * 0.8) {
                    const remainingChunks = batchChunks.slice(batchStart);
                    for (const chunk of remainingChunks)
                        recordFailure(chunk, "embedding aborted because error rate was too high");
                    console.warn(`⚠ Error rate too high (${activeErrorCount()}/${processed}) — aborting embedding`);
                    break;
                }
                const batchEnd = Math.min(batchStart + BATCH_SIZE, batchChunks.length);
                const chunkBatch = batchChunks.slice(batchStart, batchEnd);
                const texts = chunkBatch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title, embedModelUri));
                try {
                    const embeddings = await session.embedBatch(texts, { model });
                    for (let i = 0; i < chunkBatch.length; i++) {
                        const chunk = chunkBatch[i];
                        const embedding = embeddings[i];
                        if (embedding) {
                            insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now, chunk.expectedTotalChunks, fingerprint);
                            chunksEmbedded++;
                            successesSinceRetry++;
                            clearFailure(chunk);
                        }
                        else {
                            recordFailure(chunk, "batch embedding returned no vector");
                        }
                        batchChunkBytesProcessed += chunk.bytes;
                    }
                    await retryFailedChunks();
                }
                catch (error) {
                    // Batch failed — try individual embeddings as fallback. If an
                    // individual retry succeeds, any prior failure for that chunk is
                    // cleared, so the visible error count reflects outstanding failures.
                    const batchReason = reasonFromError(error);
                    if (!session.isValid) {
                        for (const chunk of chunkBatch)
                            recordFailure(chunk, `batch failed and session expired: ${batchReason}`);
                        batchChunkBytesProcessed += chunkBatch.reduce((sum, c) => sum + c.bytes, 0);
                    }
                    else {
                        for (const chunk of chunkBatch) {
                            await tryEmbedChunk(chunk);
                            batchChunkBytesProcessed += chunk.bytes;
                            await retryFailedChunks();
                        }
                    }
                }
                // Progress is reported in document bytes: scale the share of
                // chunk bytes handled so far onto this batch's document bytes.
                const proportionalBytes = totalBatchChunkBytes === 0
                    ? batchBytes
                    : Math.min(batchBytes, Math.round((batchChunkBytesProcessed / totalBatchChunkBytes) * batchBytes));
                options?.onProgress?.({
                    chunksEmbedded,
                    totalChunks,
                    bytesProcessed: bytesProcessed + proportionalBytes,
                    totalBytes,
                    errors: activeErrorCount(),
                    failures: failureList(),
                });
            }
            await retryFailedChunks(true);
            const removedPartialChunks = removeIncompleteEmbeddings(db, expectedChunksByHash, model);
            if (removedPartialChunks > 0) {
                chunksEmbedded = Math.max(0, chunksEmbedded - removedPartialChunks);
            }
            bytesProcessed += batchBytes;
            options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors: activeErrorCount(), failures: failureList() });
        }
        return { chunksEmbedded, errors: activeErrorCount(), failures: failureList() };
    }, { maxDuration: 30 * 60 * 1000, name: 'generateEmbeddings' });
    return {
        docsProcessed: totalDocs,
        chunksEmbedded: result.chunksEmbedded,
        errors: result.errors,
        failures: result.failures,
        durationMs: Date.now() - startTime,
    };
}
1450
/**
 * Open the SQLite index and wrap it in a store object whose methods are
 * pre-bound to that database handle.
 *
 * @param dbPath - Path to the SQLite database file. When falsy, the path
 *   returned by getDefaultDbPath() is used instead.
 * @returns Store instance exposing `db`, `dbPath` and the bound operations.
 */
export function createStore(dbPath) {
    const resolvedPath = dbPath || getDefaultDbPath();
    const db = openDatabase(resolvedPath);
    initializeDatabase(db);
    // Model names are resolved lazily, at call time, so an `llm` attached to
    // the store after construction is taken into account.
    const embedModelFor = (model) => model ?? store.llm?.embedModelName ?? DEFAULT_EMBED_MODEL;
    const queryModelFor = (model) => model ?? store.llm?.generateModelName ?? DEFAULT_QUERY_MODEL;
    const rerankModelFor = (model) => model ?? store.llm?.rerankModelName ?? DEFAULT_RERANK_MODEL;
    const store = {
        db,
        dbPath: resolvedPath,
        close: () => db.close(),
        ensureVecTable: (dimensions) => ensureVecTableInternal(db, dimensions),
        // Index health
        getHashesNeedingEmbedding: (model) => getHashesNeedingEmbedding(db, undefined, embedModelFor(model)),
        getIndexHealth: (model) => getIndexHealth(db, embedModelFor(model)),
        getStatus: (model) => getStatus(db, embedModelFor(model)),
        // Caching
        getCacheKey,
        getCachedResult: (cacheKey) => getCachedResult(db, cacheKey),
        setCachedResult: (cacheKey, result) => setCachedResult(db, cacheKey, result),
        clearCache: () => clearCache(db),
        // Cleanup and maintenance
        deleteLLMCache: () => deleteLLMCache(db),
        deleteInactiveDocuments: () => deleteInactiveDocuments(db),
        cleanupOrphanedContent: () => cleanupOrphanedContent(db),
        cleanupOrphanedVectors: () => cleanupOrphanedVectors(db),
        vacuumDatabase: () => vacuumDatabase(db),
        // Context
        getContextForFile: (filepath) => getContextForFile(db, filepath),
        getContextForPath: (collectionName, path) => getContextForPath(db, collectionName, path),
        getCollectionByName: (name) => getCollectionByName(db, name),
        getCollectionsWithoutContext: () => getCollectionsWithoutContext(db),
        getTopLevelPathsWithoutContext: (collectionName) => getTopLevelPathsWithoutContext(db, collectionName),
        // Virtual paths
        parseVirtualPath,
        buildVirtualPath,
        isVirtualPath,
        resolveVirtualPath: (virtualPath) => resolveVirtualPath(db, virtualPath),
        toVirtualPath: (absolutePath) => toVirtualPath(db, absolutePath),
        // Search
        searchFTS: (query, limit, collectionName) => searchFTS(db, query, limit, collectionName),
        searchVec: (query, model, limit, collectionName, session, precomputedEmbedding) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding),
        // Query expansion & reranking
        expandQuery: (query, model, intent) => expandQuery(query, queryModelFor(model), db, intent, store.llm),
        rerank: (query, documents, model, intent) => rerank(query, documents, rerankModelFor(model), db, intent, store.llm),
        // Document retrieval
        findDocument: (filename, options) => findDocument(db, filename, options),
        getDocumentBody: (doc, fromLine, maxLines) => getDocumentBody(db, doc, fromLine, maxLines),
        findDocuments: (pattern, options) => findDocuments(db, pattern, options),
        // Fuzzy matching and docid lookup
        findSimilarFiles: (query, maxDistance, limit) => findSimilarFiles(db, query, maxDistance, limit),
        matchFilesByGlob: (pattern) => matchFilesByGlob(db, pattern),
        findDocumentByDocid: (docid) => findDocumentByDocid(db, docid),
        // Document indexing operations
        insertContent: (hash, content, createdAt) => insertContent(db, hash, content, createdAt),
        insertDocument: (collectionName, path, title, hash, createdAt, modifiedAt) => insertDocument(db, collectionName, path, title, hash, createdAt, modifiedAt),
        findActiveDocument: (collectionName, path) => findActiveDocument(db, collectionName, path),
        findOrMigrateLegacyDocument: (collectionName, path) => findOrMigrateLegacyDocument(db, collectionName, path),
        updateDocumentTitle: (documentId, title, modifiedAt) => updateDocumentTitle(db, documentId, title, modifiedAt),
        updateDocument: (documentId, title, hash, modifiedAt) => updateDocument(db, documentId, title, hash, modifiedAt),
        deactivateDocument: (collectionName, path) => deactivateDocument(db, collectionName, path),
        getActiveDocumentPaths: (collectionName) => getActiveDocumentPaths(db, collectionName),
        // Vector/embedding operations
        getHashesForEmbedding: () => getHashesForEmbedding(db),
        clearAllEmbeddings: () => clearAllEmbeddings(db),
        insertEmbedding: (hash, seq, pos, embedding, model, embeddedAt, totalChunks, fingerprint) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt, totalChunks, fingerprint),
    };
    return store;
}
1523
/**
 * Derive the short docid for a content hash.
 *
 * @param hash - Full content hash string
 * @returns The first 6 characters of `hash` (the whole string if shorter)
 */
export function getDocid(hash) {
    const DOCID_LENGTH = 6;
    return hash.substring(0, DOCID_LENGTH);
}
1529
/**
 * Replace each run of emoji/symbol codepoints with their hex codepoints,
 * dash-separated (e.g. 🐘 → 1f418).
 *
 * A run is one or more `\p{So}` characters (each optionally followed by a
 * single `\p{Mn}` mark) or `\p{Sk}` characters; only the So/Sk characters
 * themselves contribute to the output.
 */
function emojiToHex(str) {
    const isSymbol = (ch) => /\p{So}|\p{Sk}/u.test(ch);
    return str.replace(/(?:\p{So}\p{Mn}?|\p{Sk})+/gu, (run) => {
        const hexCodes = [];
        // Iterating a string walks it by code point, so astral emoji stay intact.
        for (const ch of run) {
            if (isSymbol(ch)) {
                hexCodes.push(ch.codePointAt(0).toString(16));
            }
        }
        return hexCodes.join('-');
    });
}
/**
 * Handelize a filename to be more token-friendly.
 * - Convert triple underscore `___` to `/` (folder separator)
 * - Replace sequences of characters other than letters, numbers and `$`
 *   with a single dash
 * - Remove leading/trailing dashes from path segments
 * - Preserve folder structure (a/b/c/d.md stays structured)
 * - Preserve file extension
 * - Preserve original case (important for case-sensitive filesystems)
 *
 * @param path - Relative path to convert
 * @returns The handelized path
 * @throws Error when the path is empty/blank, when the filename has no
 *   letter, number, symbol or `$`, or when nothing is left after cleaning
 */
export function handelize(path) {
    if (!path || path.trim() === '') {
        throw new Error('handelize: path cannot be empty');
    }
    // Allow route-style "$" filenames while still rejecting paths with no usable content.
    // Emoji (\p{So}) counts as valid content — they get converted to hex codepoints below.
    const rawSegments = path.split('/').filter(Boolean);
    const rawFilename = rawSegments[rawSegments.length - 1] || '';
    const rawStem = rawFilename.replace(/\.[^.]+$/, '');
    if (!/[\p{L}\p{N}\p{So}\p{Sk}$]/u.test(rawStem)) {
        throw new Error(`handelize: path "${path}" has no valid filename content`);
    }
    // Keep letters, numbers, "$"; dash-separate the rest (including dots), then trim dashes.
    const slugify = (text) => text
        .replace(/[^\p{L}\p{N}$]+/gu, '-')
        .replace(/^-+|-+$/g, '');
    // Triple underscore becomes folder separator.
    const parts = path.replace(/___/g, '/').split('/');
    const lastIndex = parts.length - 1;
    const cleanedParts = [];
    parts.forEach((part, idx) => {
        // Convert emoji to hex codepoints before cleaning.
        const hexed = emojiToHex(part);
        let cleaned;
        if (idx === lastIndex) {
            // For the filename (last segment), preserve the extension.
            const extMatch = hexed.match(/(\.[a-z0-9]+)$/i);
            const ext = extMatch ? extMatch[1] : '';
            const stem = ext ? hexed.slice(0, -ext.length) : hexed;
            cleaned = slugify(stem) + ext;
        }
        else {
            cleaned = slugify(hexed);
        }
        // Segments that cleaned down to nothing are dropped.
        if (cleaned) {
            cleanedParts.push(cleaned);
        }
    });
    const result = cleanedParts.join('/');
    if (!result) {
        throw new Error(`handelize: path "${path}" resulted in empty string after processing`);
    }
    return result;
}
1590
+ // =============================================================================
1591
+ // Index health
1592
+ // =============================================================================
1593
/**
 * Count distinct content hashes of active documents that still need
 * embedding for the given model: either no vectors exist for the model's
 * current fingerprint, or fewer chunks are stored than `total_chunks` records.
 *
 * @param db - Database handle
 * @param collection - Optional collection name to restrict the count to
 * @param model - Embedding model name
 * @returns Number of hashes needing (re-)embedding
 */
export function getHashesNeedingEmbedding(db, collection, model = DEFAULT_EMBED_MODEL) {
    const fingerprint = getEmbeddingFingerprint(model);
    const collectionFilter = collection ? `AND d.collection = ?` : ``;
    // The collection placeholder only exists in the SQL when a collection was given.
    const params = collection ? [model, fingerprint, collection] : [model, fingerprint];
    return withLazyContentVectorMigration(db, () => {
        const row = db.prepare(`
      SELECT COUNT(DISTINCT d.hash) as count
      FROM documents d
      LEFT JOIN (
        SELECT hash, model, COUNT(*) AS chunk_count, MAX(total_chunks) AS expected_chunks
        FROM content_vectors
        WHERE model = ? AND embed_fingerprint = ?
        GROUP BY hash, model, embed_fingerprint
      ) v ON d.hash = v.hash
      WHERE d.active = 1
        AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
        ${collectionFilter}
    `).get(...params);
        return row.count;
    });
}
1614
/**
 * Try to stamp legacy embeddings (rows whose `embed_fingerprint` is empty)
 * with the current fingerprint for `model`.
 *
 * One legacy chunk belonging to an active document is re-embedded and looked
 * up in `vectors_vec`. Only when the nearest stored vector is that very chunk,
 * within a small distance threshold, are all empty-fingerprint rows for the
 * model updated.
 *
 * @param store - Store whose `db` (and LLM, via getLlm) is used
 * @param model - Embedding model name
 * @returns `{ checked, adopted, reason }`: whether a sample was actually
 *   compared, how many rows were updated, and a human-readable explanation
 */
export async function maybeAdoptLegacyEmbeddingFingerprint(store, model = DEFAULT_EMBED_MODEL) {
    const { db } = store;
    const fingerprint = getEmbeddingFingerprint(model);
    const outcome = (checked, adopted, reason) => ({ checked, adopted, reason });
    const legacyCount = withLazyContentVectorMigration(db, () => {
        const countRow = db.prepare(`SELECT COUNT(DISTINCT hash) AS count FROM content_vectors WHERE model = ? AND embed_fingerprint = ''`).get(model);
        return countRow.count;
    });
    if (legacyCount === 0) {
        return outcome(false, 0, "no legacy empty-fingerprint embeddings");
    }
    // Pick one deterministic legacy chunk that still belongs to an active document.
    const sample = withLazyContentVectorMigration(db, () => db.prepare(`
      SELECT cv.hash, cv.seq, cv.pos, cv.total_chunks, c.doc AS body, MIN(d.path) AS path
      FROM content_vectors cv
      JOIN documents d ON d.hash = cv.hash AND d.active = 1
      JOIN content c ON c.hash = cv.hash
      WHERE cv.model = ? AND cv.embed_fingerprint = ''
      GROUP BY cv.hash, cv.seq, cv.pos, cv.total_chunks, c.doc
      ORDER BY cv.hash, cv.seq
      LIMIT 1
    `).get(model));
    if (!sample) {
        return outcome(false, 0, `${legacyCount} legacy docs have no active sample`);
    }
    const vecTable = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    if (!vecTable) {
        return outcome(false, 0, "vectors_vec table is missing");
    }
    const expectedHashSeq = `${sample.hash}_${sample.seq}`;
    const title = extractTitle(sample.body, sample.path);
    const llm = getLlm(store);
    return await withLLMSessionForLlm(llm, async (session) => {
        // Re-chunk the document and re-embed the same chunk index.
        const chunks = await chunkDocumentByTokens(sample.body, undefined, undefined, undefined, sample.path, undefined, session.signal);
        const chunk = chunks[sample.seq];
        if (!chunk) {
            return outcome(true, 0, `sample chunk ${expectedHashSeq} no longer exists`);
        }
        const embedded = await session.embed(formatDocForEmbedding(chunk.text, title, model), { model });
        if (!embedded) {
            return outcome(true, 0, "failed to embed legacy sample");
        }
        const nearest = db.prepare(`
        SELECT hash_seq, distance
        FROM vectors_vec
        WHERE embedding MATCH ? AND k = 1
      `).get(new Float32Array(embedded.embedding));
        if (!nearest) {
            return outcome(true, 0, "legacy sample vector not found");
        }
        const threshold = 0.0001;
        const isSameChunk = nearest.hash_seq === expectedHashSeq;
        const isCloseEnough = nearest.distance <= threshold;
        if (!(isSameChunk && isCloseEnough)) {
            return outcome(true, 0, `legacy sample differs from current fingerprint (nearest ${nearest.hash_seq}, distance ${nearest.distance.toFixed(6)})`);
        }
        const update = withLazyContentVectorMigration(db, () => db.prepare(`UPDATE content_vectors SET embed_fingerprint = ? WHERE model = ? AND embed_fingerprint = ''`).run(fingerprint, model));
        return outcome(true, update.changes, `sample ${expectedHashSeq} matched current fingerprint at distance ${nearest.distance.toFixed(6)}`);
    });
}
1670
/**
 * Summarise index health.
 *
 * @param db - Database handle
 * @param model - Embedding model name used for the "needs embedding" count
 * @returns `{ needsEmbedding, totalDocs, daysStale }`, where `daysStale` is
 *   the whole number of days since the newest `modified_at` among active
 *   documents, or null when there is no such timestamp
 */
export function getIndexHealth(db, model = DEFAULT_EMBED_MODEL) {
    const MS_PER_DAY = 24 * 60 * 60 * 1000;
    const needsEmbedding = getHashesNeedingEmbedding(db, undefined, model);
    const totalDocs = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get().count;
    const newest = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get();
    if (!newest?.latest) {
        return { needsEmbedding, totalDocs, daysStale: null };
    }
    const lastUpdateMs = new Date(newest.latest).getTime();
    const daysStale = Math.floor((Date.now() - lastUpdateMs) / MS_PER_DAY);
    return { needsEmbedding, totalDocs, daysStale };
}
1681
+ // =============================================================================
1682
+ // Caching
1683
+ // =============================================================================
1684
/**
 * Build the cache key for an LLM request: the hex SHA-256 of the URL followed
 * by the JSON-serialised body.
 *
 * `JSON.stringify` returns `undefined` (not a string) for `undefined`,
 * functions and symbols; feeding that to `hash.update` throws a TypeError.
 * Such bodies are treated as an empty serialisation so body-less requests
 * still get a stable key.
 *
 * @param url - Request URL
 * @param body - Request payload (any JSON-serialisable value, or undefined)
 * @returns 64-character lowercase hex digest
 */
export function getCacheKey(url, body) {
    const serializedBody = JSON.stringify(body) ?? "";
    return createHash("sha256")
        .update(url)
        .update(serializedBody)
        .digest("hex");
}
1690
/**
 * Look up a cached LLM result by cache key.
 *
 * @param db - Database handle
 * @param cacheKey - Key as produced by getCacheKey()
 * @returns The stored result, or null when there is no row or the stored
 *   value is falsy (e.g. an empty string)
 */
export function getCachedResult(db, cacheKey) {
    const cached = db.prepare(`SELECT result FROM llm_cache WHERE hash = ?`).get(cacheKey);
    if (!cached) {
        return null;
    }
    return cached.result || null;
}
1694
/**
 * Store (or overwrite) a cached LLM result, stamped with the current time.
 *
 * On roughly 1% of calls the cache is also pruned down to the 1000 most
 * recently created entries.
 *
 * @param db - Database handle
 * @param cacheKey - Key as produced by getCacheKey()
 * @param result - Value to cache
 */
export function setCachedResult(db, cacheKey, result) {
    const createdAt = new Date().toISOString();
    const insert = db.prepare(`INSERT OR REPLACE INTO llm_cache (hash, result, created_at) VALUES (?, ?, ?)`);
    insert.run(cacheKey, result, createdAt);
    // Probabilistic pruning keeps the table bounded without paying the cost on every write.
    const shouldPrune = Math.random() < 0.01;
    if (!shouldPrune) {
        return;
    }
    db.exec(`DELETE FROM llm_cache WHERE hash NOT IN (SELECT hash FROM llm_cache ORDER BY created_at DESC LIMIT 1000)`);
}
1701
/**
 * Delete every row from the LLM response cache.
 *
 * @param db - Database handle
 */
export function clearCache(db) {
    const sql = `DELETE FROM llm_cache`;
    db.exec(sql);
}
1704
+ // =============================================================================
1705
+ // Cleanup and maintenance operations
1706
+ // =============================================================================
1707
/**
 * Delete cached LLM API responses.
 *
 * @param db - Database handle
 * @returns Number of cache rows deleted, as reported by the statement's `changes`
 */
export function deleteLLMCache(db) {
    const { changes } = db.prepare(`DELETE FROM llm_cache`).run();
    return changes;
}
1715
/**
 * Hard-delete document rows that are marked inactive (active = 0).
 *
 * @param db - Database handle
 * @returns Number of inactive documents deleted
 */
export function deleteInactiveDocuments(db) {
    const { changes } = db.prepare(`DELETE FROM documents WHERE active = 0`).run();
    return changes;
}
1723
/**
 * Delete content rows whose hash is not referenced by any document row,
 * active or inactive.
 *
 * Inactive documents are soft-deleted tombstones, so their content stays
 * referenced until deleteInactiveDocuments() hard-deletes them.
 *
 * @param db - Database handle
 * @returns Number of orphaned content rows deleted
 */
export function cleanupOrphanedContent(db) {
    const deleteOrphans = db.prepare(`
    DELETE FROM content
    WHERE hash NOT IN (SELECT DISTINCT hash FROM documents)
  `);
    const { changes } = deleteOrphans.run();
    return changes;
}
1736
/**
 * Remove vector embeddings whose content hash is not referenced by any
 * active document, from both `vectors_vec` and `content_vectors`.
 *
 * @param db - Database handle
 * @returns Number of orphaned `content_vectors` rows counted before deletion;
 *   0 when sqlite-vec is unavailable or `vectors_vec` cannot be queried
 */
export function cleanupOrphanedVectors(db) {
    // sqlite-vec may not be loaded (e.g. Bun's bun:sqlite lacks loadExtension).
    // The vectors_vec virtual table can appear in sqlite_master from a prior
    // session, but querying it without the vec0 module loaded will crash (#380).
    if (!isSqliteVecAvailable()) {
        return 0;
    }
    // The schema entry can exist even when sqlite-vec itself is unavailable
    // (for example when reopening a DB without vec0 loaded). Touching the
    // virtual table then throws "no such module: vec0", so probe it first and
    // degrade gracefully like the rest of the vector features.
    try {
        db.prepare(`SELECT 1 FROM vectors_vec LIMIT 0`).get();
    }
    catch {
        return 0;
    }
    return withLazyContentVectorMigration(db, () => {
        // Count first: this count is what gets reported to the caller.
        const { c: orphanCount } = db.prepare(`
      SELECT COUNT(*) as c FROM content_vectors cv
      WHERE NOT EXISTS (
        SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
      )
    `).get();
        if (orphanCount === 0) {
            return 0;
        }
        // vectors_vec must go first: its keys are derived from the
        // content_vectors rows that the second statement removes.
        db.exec(`
      DELETE FROM vectors_vec WHERE hash_seq IN (
        SELECT cv.hash || '_' || cv.seq FROM content_vectors cv
        WHERE NOT EXISTS (
          SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
        )
      )
    `);
        db.exec(`
      DELETE FROM content_vectors WHERE hash NOT IN (
        SELECT hash FROM documents WHERE active = 1
      )
    `);
        return orphanCount;
    });
}
1786
/**
 * Run SQLite `VACUUM` on the database to reclaim unused space.
 *
 * @param db - Database handle
 */
export function vacuumDatabase(db) {
    const sql = `VACUUM`;
    db.exec(sql);
}
1793
+ // =============================================================================
1794
+ // Document helpers
1795
+ // =============================================================================
1796
/**
 * Compute the SHA-256 digest of a document's content.
 *
 * @param content - Content to hash
 * @returns Promise resolving to the lowercase hex digest
 */
export async function hashContent(content) {
    return createHash("sha256").update(content).digest("hex");
}
1801
/**
 * Per-extension title extractors. Each takes the document content and returns
 * a title string, or null when the content has no recognisable title.
 */
const titleExtractors = {
    // Markdown: first level-1 or level-2 heading.
    '.md': (content) => {
        const match = content.match(/^##?\s+(.+)$/m);
        if (match) {
            const title = (match[1] ?? "").trim();
            // A generic "Notes" heading is skipped in favour of the first
            // level-2 heading in the document, when there is one.
            if (title === "📝 Notes" || title === "Notes") {
                const nextMatch = content.match(/^##\s+(.+)$/m);
                if (nextMatch?.[1])
                    return nextMatch[1].trim();
            }
            return title;
        }
        return null;
    },
    // Org-mode: `#+TITLE:` property first, otherwise the first headline.
    '.org': (content) => {
        const titleProp = content.match(/^#\+TITLE:\s*(.+)$/im);
        if (titleProp?.[1])
            return titleProp[1].trim();
        const heading = content.match(/^\*+\s+(.+)$/m);
        if (heading?.[1])
            return heading[1].trim();
        return null;
    },
};
/**
 * Derive a display title for a document.
 *
 * Uses the extractor registered for the file's extension when one exists and
 * yields a title; otherwise falls back to the file's basename without its
 * extension.
 *
 * The basename is taken BEFORE the extension is stripped. Stripping first
 * treats a dot in a directory name as the start of the extension, so an
 * extension-less file such as "v1.2/notes" would be titled "v1".
 *
 * @param content - Document body
 * @param filename - File name or slash-separated relative path
 * @returns Extracted title, the extension-less basename, or `filename` itself
 *   when the basename is empty after stripping
 */
export function extractTitle(content, filename) {
    // Last non-empty path segment, so a trailing slash does not hide the name.
    const basename = filename.split("/").filter(Boolean).pop() || "";
    const dotIndex = basename.lastIndexOf('.');
    const ext = dotIndex >= 0 ? basename.slice(dotIndex).toLowerCase() : "";
    const extractor = titleExtractors[ext];
    if (extractor) {
        const title = extractor(content);
        if (title)
            return title;
    }
    return basename.replace(/\.[^.]+$/, "") || filename;
}
1835
+ // =============================================================================
1836
+ // Document indexing operations
1837
+ // =============================================================================
1838
/**
 * Store a document body in the content table, keyed by its hash
 * (content-addressable storage). `INSERT OR IGNORE` means an already-present
 * hash is left untouched.
 *
 * @param db - Database handle
 * @param hash - Content hash
 * @param content - Document body
 * @param createdAt - Creation timestamp to record
 */
export function insertContent(db, hash, content, createdAt) {
    const insert = db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`);
    insert.run(hash, content, createdAt);
}
1846
/**
 * Refresh the full-text-search row for one document.
 *
 * The existing `documents_fts` row is always removed. A new one is inserted
 * only when the document is active and its content row exists, with every
 * indexed field passed through normalizeCjkForFTS().
 *
 * @param db - Database handle
 * @param documentId - `documents.id`, also used as the FTS rowid
 */
function rebuildDocumentFTS(db, documentId) {
    const doc = db.prepare(`
    SELECT d.id, d.collection, d.path, d.title, content.doc as body
    FROM documents d
    JOIN content ON content.hash = d.hash
    WHERE d.id = ? AND d.active = 1
  `).get(documentId);
    db.prepare(`DELETE FROM documents_fts WHERE rowid = ?`).run(documentId);
    if (doc) {
        const filepath = normalizeCjkForFTS(`${doc.collection}/${doc.path}`);
        const title = normalizeCjkForFTS(doc.title);
        const body = normalizeCjkForFTS(doc.body);
        db.prepare(`
    INSERT INTO documents_fts(rowid, filepath, title, body)
    VALUES (?, ?, ?, ?)
  `).run(doc.id, filepath, title, body);
    }
}
1861
/**
 * Insert a document row, or, when (collection, path) already exists, update
 * its title, hash and modified_at and mark it active again. The document's
 * FTS entry is rebuilt afterwards.
 *
 * @param db - Database handle
 * @param collectionName - Collection the document belongs to
 * @param path - Path within the collection
 * @param title - Document title
 * @param hash - Content hash
 * @param createdAt - Creation timestamp (only used for new rows)
 * @param modifiedAt - Modification timestamp
 */
export function insertDocument(db, collectionName, path, title, hash, createdAt, modifiedAt) {
    const upsert = db.prepare(`
    INSERT INTO documents (collection, path, title, hash, created_at, modified_at, active)
    VALUES (?, ?, ?, ?, ?, ?, 1)
    ON CONFLICT(collection, path) DO UPDATE SET
      title = excluded.title,
      hash = excluded.hash,
      modified_at = excluded.modified_at,
      active = 1
  `);
    upsert.run(collectionName, path, title, hash, createdAt, modifiedAt);
    // Look the id up again: the upsert may have hit an existing row.
    const saved = db.prepare(`SELECT id FROM documents WHERE collection = ? AND path = ?`).get(collectionName, path);
    if (!saved) {
        return;
    }
    rebuildDocumentFTS(db, saved.id);
}
1878
/**
 * Look up an active document by exact collection name and path.
 *
 * @param db - Database handle
 * @param collectionName - Collection name
 * @param path - Path within the collection
 * @returns `{ id, hash, title }` row, or null when there is no match
 */
export function findActiveDocument(db, collectionName, path) {
    const lookup = db.prepare(`
    SELECT id, hash, title FROM documents
    WHERE collection = ? AND path = ? AND active = 1
  `);
    const found = lookup.get(collectionName, path);
    if (found === undefined || found === null) {
        return null;
    }
    return found;
}
1888
/**
 * Find an active document, falling back to legacy forms of its path.
 *
 * Tried in order: the exact path, a case-insensitive match, then the
 * handelize()d form of the path. When a legacy row is found it is renamed
 * in place to `path` and its FTS entry rebuilt. Embeddings are keyed by
 * content hash, so the rename needs no re-embedding.
 *
 * @internal Used by reindexCollection and indexFiles during qmd update.
 * @param db - Database handle
 * @param collectionName - Collection name
 * @param path - Current (literal) path within the collection
 * @returns The document row, or null when no form of the path exists or the
 *   rename was skipped because of a conflict
 */
export function findOrMigrateLegacyDocument(db, collectionName, path) {
    const exact = findActiveDocument(db, collectionName, path);
    if (exact) {
        return exact;
    }
    // Case-insensitive match (legacy normalization: e.g. "README.md" → "readme.md").
    const byCase = db.prepare(`
    SELECT id, hash, title FROM documents
    WHERE collection = ? AND path COLLATE NOCASE = ? AND active = 1
    ORDER BY id
    LIMIT 1
  `).get(collectionName, path);
    // Handelized-path match: existing DBs indexed with handelize() stored slugged paths
    // like "Budget-Revenue-Q4-2024.md" for a raw path like "Budget & Revenue (Q4) [2024].md".
    // Match the handelized form of the incoming raw path against the DB so that
    // qmd update on an old index can rename the row to the literal path.
    let bySlug;
    try {
        const slug = handelize(path);
        if (slug !== path) {
            bySlug = db.prepare(`
        SELECT id, hash, title FROM documents
        WHERE collection = ? AND path = ? AND active = 1
        ORDER BY id
        LIMIT 1
      `).get(collectionName, slug);
        }
    }
    catch {
        // handelize throws on invalid paths; just skip
    }
    // The case-insensitive hit takes precedence over the slug hit.
    const legacy = byCase ?? bySlug;
    if (!legacy) {
        return null;
    }
    // Rename + FTS rebuild run in one transaction for atomicity.
    const renameToLiteralPath = db.transaction(() => {
        // OR IGNORE turns a UNIQUE conflict (e.g. both "readme.md" and
        // "README.md" already exist) into a no-op rather than a crash.
        const { changes } = db.prepare(`UPDATE OR IGNORE documents SET path = ? WHERE id = ? AND active = 1`).run(path, legacy.id);
        if (changes === 0) {
            return false;
        }
        rebuildDocumentFTS(db, legacy.id);
        return true;
    });
    const renamed = renameToLiteralPath();
    return renamed ? findActiveDocument(db, collectionName, path) : null;
}
1944
/**
 * Set a document's title and modified_at, then rebuild its FTS entry.
 *
 * @param db - Database handle
 * @param documentId - `documents.id` of the row to update
 * @param title - New title
 * @param modifiedAt - New modification timestamp
 */
export function updateDocumentTitle(db, documentId, title, modifiedAt) {
    const update = db.prepare(`UPDATE documents SET title = ?, modified_at = ? WHERE id = ?`);
    update.run(title, modifiedAt, documentId);
    rebuildDocumentFTS(db, documentId);
}
1952
/**
 * Set a document's title, content hash and modified_at, then rebuild its FTS
 * entry. Used when content changes but the file path stays the same.
 *
 * @param db - Database handle
 * @param documentId - `documents.id` of the row to update
 * @param title - New title
 * @param hash - New content hash
 * @param modifiedAt - New modification timestamp
 */
export function updateDocument(db, documentId, title, hash, modifiedAt) {
    const update = db.prepare(`UPDATE documents SET title = ?, hash = ?, modified_at = ? WHERE id = ?`);
    update.run(title, hash, modifiedAt, documentId);
    rebuildDocumentFTS(db, documentId);
}
1961
/**
 * Soft-delete a document: set `active = 0` on the matching active row and
 * keep the row itself.
 *
 * @param db - Database handle
 * @param collectionName - Collection name
 * @param path - Path within the collection
 */
export function deactivateDocument(db, collectionName, path) {
    const deactivate = db.prepare(`UPDATE documents SET active = 0 WHERE collection = ? AND path = ? AND active = 1`);
    deactivate.run(collectionName, path);
}
1968
/**
 * List the paths of all active documents in a collection.
 *
 * @param db - Database handle
 * @param collectionName - Collection name
 * @returns Array of path strings, in the order the query returns them
 */
export function getActiveDocumentPaths(db, collectionName) {
    const query = db.prepare(`
    SELECT path FROM documents WHERE collection = ? AND active = 1
  `);
    return query.all(collectionName).map(({ path }) => path);
}
1977
+ export { formatQueryForEmbedding, formatDocForEmbedding };
1978
/**
 * Synchronous chunking using regex-detected break points only.
 * This is the backward-compatible API used by tests and legacy callers.
 *
 * @param content - Document text
 * @param maxChars - Maximum chunk size in characters
 * @param overlapChars - Overlap between consecutive chunks, in characters
 * @param windowChars - Break-point search window, in characters
 * @returns Whatever chunkDocumentWithBreakPoints() produces for these inputs
 */
export function chunkDocument(content, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS) {
    return chunkDocumentWithBreakPoints(content, scanBreakPoints(content), findCodeFences(content), maxChars, overlapChars, windowChars);
}
1987
/**
 * Async, optionally AST-aware chunking.
 *
 * With `chunkStrategy === "auto"` and a filepath, AST break points are
 * requested from ./ast.js and, when any are returned, merged with the regex
 * break points. In every other case only the regex break points are used.
 *
 * @param content - Document text
 * @param maxChars - Maximum chunk size in characters
 * @param overlapChars - Overlap between consecutive chunks, in characters
 * @param windowChars - Break-point search window, in characters
 * @param filepath - Optional path, passed to the AST break-point detector
 * @param chunkStrategy - "regex" (default) or "auto"
 * @returns Whatever chunkDocumentWithBreakPoints() produces for these inputs
 */
export async function chunkDocumentAsync(content, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS, filepath, chunkStrategy = "regex") {
    const regexPoints = scanBreakPoints(content);
    const codeFences = findCodeFences(content);
    const chunkWith = (points) => chunkDocumentWithBreakPoints(content, points, codeFences, maxChars, overlapChars, windowChars);
    const wantsAst = chunkStrategy === "auto" && Boolean(filepath);
    if (!wantsAst) {
        return chunkWith(regexPoints);
    }
    // Loaded lazily so the AST module is only imported when actually needed.
    const { getASTBreakPoints } = await import("./ast.js");
    const astPoints = await getASTBreakPoints(content, filepath);
    if (astPoints.length === 0) {
        return chunkWith(regexPoints);
    }
    return chunkWith(mergeBreakPoints(regexPoints, astPoints));
}
2008
/**
 * Chunk a document so that every chunk fits within `maxTokens`, measured with
 * the default LLM tokenizer.
 *
 * The document is first chunked in character space using an estimate of
 * 3 characters per token (via chunkDocumentAsync, so AST-aware break points
 * apply when filepath and chunkStrategy allow). Each chunk is then tokenized,
 * and any chunk still over the limit is recursively re-split using its
 * measured characters-per-token ratio.
 *
 * @param content - Document text
 * @param maxTokens - Maximum tokens per chunk
 * @param overlapTokens - Overlap between chunks, in tokens
 * @param windowTokens - Break-point search window, in tokens
 * @param filepath - Optional path, forwarded to chunkDocumentAsync
 * @param chunkStrategy - "regex" (default) or "auto"
 * @param signal - Optional abort signal; once aborted, remaining chunks are
 *   skipped and the chunks collected so far are returned
 * @returns Array of `{ text, pos, tokens }`
 */
export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKENS, overlapTokens = CHUNK_OVERLAP_TOKENS, windowTokens = CHUNK_WINDOW_TOKENS, filepath, chunkStrategy = "regex", signal) {
    const llm = getDefaultLlamaCpp();
    // Moderate chars/token estimate (prose ~4, code ~2, mixed ~3).
    // Chunks that still exceed the limit are re-split with the measured ratio.
    const avgCharsPerToken = 3;
    const maxChars = maxTokens * avgCharsPerToken;
    const overlapChars = overlapTokens * avgCharsPerToken;
    const windowChars = windowTokens * avgCharsPerToken;
    // First pass in character space.
    const firstPass = await chunkDocumentAsync(content, maxChars, overlapChars, windowChars, filepath, chunkStrategy);
    const results = [];
    // Overlap must stay strictly below the chunk size.
    const clampOverlapChars = (value, limit) => {
        if (limit <= 1) {
            return 0;
        }
        return Math.max(0, Math.min(limit - 1, Math.floor(value)));
    };
    // True when a split did not actually make the text smaller.
    const madeNoProgress = (pieces, source) => pieces.length <= 1 || pieces[0]?.text.length === source.length;
    const pushChunkWithinTokenLimit = async (text, pos) => {
        if (signal?.aborted) {
            return;
        }
        const tokens = await llm.tokenize(text);
        if (tokens.length <= maxTokens || text.length <= 1) {
            results.push({ text, pos, tokens: tokens.length });
            return;
        }
        const actualCharsPerToken = text.length / tokens.length;
        // 5% safety margin under the measured ratio.
        let safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95);
        if (!Number.isFinite(safeMaxChars) || safeMaxChars < 1) {
            safeMaxChars = Math.floor(text.length / 2);
        }
        safeMaxChars = Math.max(1, Math.min(text.length - 1, safeMaxChars));
        const scaledOverlap = clampOverlapChars(overlapChars * actualCharsPerToken / 2, safeMaxChars);
        const scaledWindow = Math.max(0, Math.floor(windowChars * actualCharsPerToken / 2));
        let subChunks = chunkDocument(text, safeMaxChars, scaledOverlap, scaledWindow);
        // Pathological single-line blobs can produce no meaningful breakpoint progress.
        // Fall back to a simple half split so every recursion step strictly shrinks.
        if (madeNoProgress(subChunks, text)) {
            safeMaxChars = Math.max(1, Math.floor(text.length / 2));
            subChunks = chunkDocument(text, safeMaxChars, 0, 0);
        }
        // Last resort: truncate to the token limit instead of recursing forever.
        if (madeNoProgress(subChunks, text)) {
            const fallbackTokens = tokens.slice(0, Math.max(1, maxTokens));
            const truncatedText = await llm.detokenize(fallbackTokens);
            results.push({
                text: truncatedText,
                pos,
                tokens: fallbackTokens.length,
            });
            return;
        }
        for (const piece of subChunks) {
            const pieceText = text.slice(piece.pos, piece.pos + piece.text.length);
            await pushChunkWithinTokenLimit(pieceText, pos + piece.pos);
        }
    };
    for (const chunk of firstPass) {
        await pushChunkWithinTokenLimit(chunk.text, chunk.pos);
    }
    return results;
}
2079
+ // =============================================================================
2080
+ // Fuzzy matching
2081
+ // =============================================================================
2082
/**
 * Levenshtein edit distance between two strings, compared by UTF-16 code unit.
 *
 * Only the previous and current DP rows are kept, so memory is O(n) instead
 * of a full (m+1)×(n+1) matrix. findSimilarFiles() calls this once per
 * indexed file, so the per-call allocation adds up.
 *
 * @param a - First string
 * @param b - Second string
 * @returns Minimum number of single-character insertions, deletions and
 *   substitutions that turn `a` into `b`
 */
function levenshtein(a, b) {
    const m = a.length;
    const n = b.length;
    if (m === 0)
        return n;
    if (n === 0)
        return m;
    // previous[j] = distance between a[0..i-1) and b[0..j)
    let previous = Array.from({ length: n + 1 }, (_, j) => j);
    let current = new Array(n + 1).fill(0);
    for (let i = 1; i <= m; i++) {
        current[0] = i;
        for (let j = 1; j <= n; j++) {
            const cost = a[i - 1] === b[j - 1] ? 0 : 1;
            current[j] = Math.min(previous[j] + 1, // deletion
            current[j - 1] + 1, // insertion
            previous[j - 1] + cost);
        }
        // Reuse the two buffers instead of allocating a new row per iteration.
        [previous, current] = [current, previous];
    }
    return previous[n];
}
2101
/**
 * Normalize a docid input: trim whitespace, strip one pair of matching
 * surrounding quotes, then strip one leading `#`.
 * Handles: "#abc123", 'abc123', "abc123", #abc123, abc123
 *
 * @param docid - Raw user input
 * @returns The bare string (not validated as hex here)
 */
export function normalizeDocid(docid) {
    const trimmed = docid.trim();
    const isWrappedIn = (quote) => trimmed.startsWith(quote) && trimmed.endsWith(quote);
    const unquoted = isWrappedIn('"') || isWrappedIn("'")
        ? trimmed.slice(1, -1)
        : trimmed;
    return unquoted.startsWith('#') ? unquoted.slice(1) : unquoted;
}
2119
/**
 * Check whether a string looks like a docid reference.
 * Accepts: #abc123, abc123, "#abc123", "abc123", '#abc123', 'abc123'
 *
 * @param input - Raw user input
 * @returns true when the normalized form is at least 6 characters long and
 *   consists only of hex digits (case-insensitive)
 */
export function isDocid(input) {
    const MIN_DOCID_LENGTH = 6;
    const candidate = normalizeDocid(input);
    if (candidate.length < MIN_DOCID_LENGTH) {
        return false;
    }
    return /^[a-f0-9]+$/i.test(candidate);
}
2129
/**
 * Find an active document by docid, i.e. a prefix of its content hash
 * (docids are normally the first 6 characters).
 * If several documents share the prefix, the first row returned wins.
 *
 * Accepts lenient input: #abc123, abc123, "#abc123", "abc123"
 *
 * The normalized input is used as a SQL `LIKE` prefix, so it must be plain
 * hex. Otherwise `%` and `_` in user input would act as wildcards and match
 * unrelated documents; such input is treated as "not found".
 *
 * @param db - Database handle
 * @param docid - Raw docid as typed by the user
 * @returns `{ filepath, hash }`, where `filepath` is the `qmd://` virtual
 *   path, or null when the input is not hex or nothing matches
 */
export function findDocumentByDocid(db, docid) {
    const shortHash = normalizeDocid(docid);
    // Rejects the empty string as well as anything containing LIKE wildcards.
    if (!/^[a-f0-9]+$/i.test(shortHash))
        return null;
    // Look up documents where hash starts with the short hash
    const doc = db.prepare(`
    SELECT 'qmd://' || d.collection || '/' || d.path as filepath, d.hash
    FROM documents d
    WHERE d.hash LIKE ? AND d.active = 1
    LIMIT 1
  `).get(`${shortHash}%`);
    // Normalize the miss value: drivers may report "no row" as undefined, but null is the documented result.
    return doc ?? null;
}
2149
/**
 * Suggest active document paths that are close to `query` by edit distance.
 * Comparison is case-insensitive.
 *
 * @param db - Database handle
 * @param query - Path (or fragment) the user typed
 * @param maxDistance - Largest Levenshtein distance still considered similar
 * @param limit - Maximum number of suggestions
 * @returns Paths ordered by increasing distance
 */
export function findSimilarFiles(db, query, maxDistance = 3, limit = 5) {
    const rows = db.prepare(`
    SELECT d.path
    FROM documents d
    WHERE d.active = 1
  `).all();
    const needle = query.toLowerCase();
    const candidates = [];
    for (const { path } of rows) {
        const dist = levenshtein(path.toLowerCase(), needle);
        if (dist <= maxDistance) {
            candidates.push({ path, dist });
        }
    }
    candidates.sort((left, right) => left.dist - right.dist);
    return candidates.slice(0, limit).map((candidate) => candidate.path);
}
2163
/**
 * List active documents matching a glob pattern.
 * A document matches when the pattern fits its qmd:// virtual path, its
 * collection-relative path, or "collection/path".
 *
 * @param db Database instance
 * @param pattern Glob pattern handed to picomatch
 * @returns Array of `{ filepath, displayPath, bodyLength }`
 */
export function matchFilesByGlob(db, pattern) {
    const rows = db.prepare(`
        SELECT
            'qmd://' || d.collection || '/' || d.path as virtual_path,
            LENGTH(content.doc) as body_length,
            d.path,
            d.collection
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE d.active = 1
    `).all();
    const matches = picomatch(pattern);
    const found = [];
    for (const row of rows) {
        const candidates = [row.virtual_path, row.path, row.collection + '/' + row.path];
        if (!candidates.some(candidate => matches(candidate))) {
            continue;
        }
        found.push({
            filepath: row.virtual_path, // Virtual path for precise lookup
            displayPath: row.path, // Relative path for display
            bodyLength: row.body_length
        });
    }
    return found;
}
2183
+ // =============================================================================
2184
+ // Context
2185
+ // =============================================================================
2186
/**
 * Get context for a file path using hierarchical inheritance.
 * Contexts are collection-scoped and inherit from parent directories.
 * For example, context at "/talks" applies to "/talks/2024/keynote.md".
 *
 * A prefix only applies at a path-component boundary: "/talks" covers
 * "/talks" itself and everything below "/talks/", but not "/talks-archive/x.md".
 * The empty prefix and "/" act as the collection's root context.
 *
 * @param db Database instance (used to read the collection and global context)
 * @param collectionName Collection name
 * @param path Relative path within the collection
 * @returns Global context followed by matching path contexts (most general
 *          first), joined by blank lines; null if the collection is unknown
 *          or no context is defined
 */
export function getContextForPath(db, collectionName, path) {
    const coll = getStoreCollection(db, collectionName);
    if (!coll)
        return null;
    // Collect ALL matching contexts (global + all path prefixes)
    const contexts = [];
    const globalCtx = getStoreGlobalContext(db);
    if (globalCtx) {
        contexts.push(globalCtx);
    }
    if (coll.context) {
        const normalizedPath = path.startsWith("/") ? path : `/${path}`;
        const matchingContexts = [];
        for (const [prefix, context] of Object.entries(coll.context)) {
            const normalizedPrefix = prefix.startsWith("/") ? prefix : `/${prefix}`;
            // Compare against the prefix as a directory ("/talks/") so that a
            // sibling sharing the same leading characters does not inherit it.
            const directoryPrefix = normalizedPrefix.endsWith("/") ? normalizedPrefix : `${normalizedPrefix}/`;
            if (normalizedPath === normalizedPrefix || normalizedPath.startsWith(directoryPrefix)) {
                matchingContexts.push({ prefix: normalizedPrefix, context });
            }
        }
        // Shortest prefix first, i.e. most general context first
        matchingContexts.sort((a, b) => a.prefix.length - b.prefix.length);
        for (const match of matchingContexts) {
            contexts.push(match.context);
        }
    }
    // Join all contexts with double newline
    return contexts.length > 0 ? contexts.join('\n\n') : null;
}
2228
/**
 * Get context for a file path (virtual or filesystem).
 *
 * A qmd://collection/path input is parsed directly; any other input is
 * treated as a filesystem path and matched against each collection's root
 * path. Context is only returned for documents that exist and are active in
 * the database; the context text itself is assembled by getContextForPath so
 * both entry points always apply the same inheritance rules.
 *
 * @param db Database instance
 * @param filepath Virtual (qmd://...) or filesystem path; may be null/undefined
 * @returns Context string, or null when the path cannot be resolved to an
 *          active document or no context is defined
 */
export function getContextForFile(db, filepath) {
    // Handle undefined, null or empty filepath
    if (!filepath)
        return null;
    let collectionName = null;
    let relativePath = null;
    const parsedVirtual = filepath.startsWith('qmd://') ? parseVirtualPath(filepath) : null;
    if (parsedVirtual) {
        collectionName = parsedVirtual.collectionName;
        relativePath = parsedVirtual.path;
    }
    else {
        // Filesystem path: the collection list is only needed here, so it is
        // not loaded for virtual paths. The first collection whose root
        // contains the path wins.
        for (const coll of getStoreCollections(db)) {
            // Skip collections with missing paths
            if (!coll || !coll.path)
                continue;
            const insideCollection = filepath.startsWith(coll.path + '/');
            if (insideCollection || filepath === coll.path) {
                collectionName = coll.name;
                relativePath = insideCollection ? filepath.slice(coll.path.length + 1) : '';
                break;
            }
        }
        if (!collectionName || relativePath === null)
            return null;
    }
    // Verify this document exists in the database
    const doc = db.prepare(`
        SELECT d.path
        FROM documents d
        WHERE d.collection = ? AND d.path = ? AND d.active = 1
        LIMIT 1
    `).get(collectionName, relativePath);
    if (!doc)
        return null;
    // getContextForPath returns null for an unknown collection, so no
    // separate collection lookup is needed here.
    return getContextForPath(db, collectionName, relativePath);
}
2305
/**
 * Look up a single collection by name via getStoreCollection.
 *
 * @param db Database instance
 * @param name Collection name
 * @returns `{ name, pwd, glob_pattern }`, or null if the collection is unknown
 */
export function getCollectionByName(db, name) {
    const found = getStoreCollection(db, name);
    return found
        ? { name: found.name, pwd: found.path, glob_pattern: found.pattern }
        : null;
}
2318
/**
 * List every configured collection together with document statistics.
 * Configuration comes from getStoreCollections; counts and the latest
 * modification time are read from the documents table, one query per collection.
 *
 * @param db Database instance
 * @returns Array of `{ name, pwd, glob_pattern, doc_count, active_count,
 *          last_modified, includeByDefault }`
 */
export function listCollections(db) {
    const summaries = [];
    for (const coll of getStoreCollections(db)) {
        const stats = db.prepare(`
            SELECT
                COUNT(d.id) as doc_count,
                SUM(CASE WHEN d.active = 1 THEN 1 ELSE 0 END) as active_count,
                MAX(d.modified_at) as last_modified
            FROM documents d
            WHERE d.collection = ?
        `).get(coll.name);
        summaries.push({
            name: coll.name,
            pwd: coll.path,
            glob_pattern: coll.pattern,
            // SUM/MAX yield NULL for a collection without rows; fall back to 0 / null
            doc_count: stats?.doc_count || 0,
            active_count: stats?.active_count || 0,
            last_modified: stats?.last_modified || null,
            // Only an explicit `false` opts a collection out
            includeByDefault: coll.includeByDefault !== false,
        });
    }
    return summaries;
}
2346
/**
 * Remove a collection and clean up after it.
 * Deletes the collection's rows from documents, then deletes content rows
 * whose hash is no longer referenced by any active document, and finally
 * removes the collection entry via deleteStoreCollection.
 *
 * @param db Database instance
 * @param collectionName Name of the collection to remove
 * @returns `{ deletedDocs, cleanedHashes }` — row counts of the two deletes
 */
export function removeCollection(db, collectionName) {
    const removeDocs = db.prepare(`DELETE FROM documents WHERE collection = ?`);
    const docsOutcome = removeDocs.run(collectionName);
    // Content is shared by hash, so only hashes without a remaining active document go
    const orphanOutcome = db.prepare(`
        DELETE FROM content
        WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
    `).run();
    deleteStoreCollection(db, collectionName);
    return {
        deletedDocs: docsOutcome.changes,
        cleanedHashes: orphanOutcome.changes
    };
}
2365
/**
 * Rename a collection.
 * Re-labels every row of the documents table first, then renames the
 * collection entry via renameStoreCollection.
 *
 * @param db Database instance
 * @param oldName Current collection name
 * @param newName New collection name
 */
export function renameCollection(db, oldName, newName) {
    const relabelDocuments = db.prepare(`UPDATE documents SET collection = ? WHERE collection = ?`);
    relabelDocuments.run(newName, oldName);
    renameStoreCollection(db, oldName, newName);
}
2376
+ // =============================================================================
2377
+ // Context Management Operations
2378
+ // =============================================================================
2379
/**
 * Insert or update a context for a collection (identified by numeric id)
 * and path prefix.
 *
 * NOTE(review): the name is resolved from a `collections` table, while the
 * neighbouring functions go through the store-collection helpers — confirm
 * that table still exists in current schemas.
 *
 * @param db Database instance
 * @param collectionId Row id in the collections table
 * @param pathPrefix Path prefix the context applies to
 * @param context Context text
 * @throws Error when no collection has the given id
 */
export function insertContext(db, collectionId, pathPrefix, context) {
    const row = db.prepare(`SELECT name FROM collections WHERE id = ?`).get(collectionId);
    if (row) {
        updateStoreContext(db, row.name, pathPrefix, context);
        return;
    }
    throw new Error(`Collection with id ${collectionId} not found`);
}
2391
/**
 * Delete the context stored for one collection and path prefix.
 *
 * @param db Database instance
 * @param collectionName Collection name
 * @param pathPrefix Path prefix whose context should be removed
 * @returns 1 when removeStoreContext reports success, otherwise 0
 */
export function deleteContext(db, collectionName, pathPrefix) {
    return removeStoreContext(db, collectionName, pathPrefix) ? 1 : 0;
}
2400
/**
 * Delete the global context and every collection's root context
 * (the context stored under the empty path prefix).
 *
 * @param db Database instance
 * @returns The number of contexts that actually existed and were removed;
 *          0 when there was nothing to delete
 */
export function deleteGlobalContexts(db) {
    // Count the global context only if one was set; clearing an unset value
    // is harmless but must not be reported as a deletion.
    let deletedCount = getStoreGlobalContext(db) ? 1 : 0;
    setStoreGlobalContext(db, undefined);
    // Remove root context (empty string) from all collections
    for (const coll of getStoreCollections(db)) {
        if (removeStoreContext(db, coll.name, '')) {
            deletedCount++;
        }
    }
    return deletedCount;
}
2419
/**
 * List all path contexts reported by getStoreContexts.
 *
 * @param db Database instance
 * @returns Array of `{ collection_name, path_prefix, context }`, ordered by
 *          collection name, then longest path prefix first, then prefix
 *          alphabetically
 */
export function listPathContexts(db) {
    const rows = [];
    for (const ctx of getStoreContexts(db)) {
        rows.push({
            collection_name: ctx.collection,
            path_prefix: ctx.path,
            context: ctx.context,
        });
    }
    return rows.sort((a, b) => a.collection_name !== b.collection_name
        ? a.collection_name.localeCompare(b.collection_name)
        // Equal lengths give 0 here, which falls through to the alphabetical tie-break
        : (b.path_prefix.length - a.path_prefix.length) || a.path_prefix.localeCompare(b.path_prefix));
}
2443
/**
 * Get all collections, reduced to their names.
 *
 * @param db Database instance
 * @returns Array of `{ name }` objects in the order getStoreCollections returns them
 */
export function getAllCollections(db) {
    return Array.from(getStoreCollections(db), ({ name }) => ({ name }));
}
2450
/**
 * Find collections that have no context entries at all (not even a root
 * context) and report how many active documents each one holds.
 *
 * @param db Database instance
 * @returns Array of `{ name, pwd, doc_count }`, sorted by name
 */
export function getCollectionsWithoutContext(db) {
    const lacksContext = coll => !coll.context || Object.keys(coll.context).length === 0;
    return getStoreCollections(db)
        .filter(lacksContext)
        .map(coll => {
            // Active-document count for this collection
            const stats = db.prepare(`
                SELECT COUNT(d.id) as doc_count
                FROM documents d
                WHERE d.collection = ? AND d.active = 1
            `).get(coll.name);
            return {
                name: coll.name,
                pwd: coll.path,
                doc_count: stats?.doc_count || 0,
            };
        })
        .sort((a, b) => a.name.localeCompare(b.name));
}
2477
/**
 * Get top-level directories in a collection that don't have context.
 * Useful for suggesting where context might be needed.
 *
 * Context prefixes are compared with leading and trailing slashes removed, so
 * "talks", "/talks" and "talks/" all cover the directory "talks" — the same
 * leniency getContextForPath applies to a missing leading slash. A root context ("" or "/") covers
 * every directory. Contexts on deeper paths (e.g. "talks/2024") do not count
 * as covering their top-level directory.
 *
 * @param db Database instance
 * @param collectionName Collection name
 * @returns Sorted directory names; empty when the collection is unknown
 */
export function getTopLevelPathsWithoutContext(db, collectionName) {
    // Check the collection first so an unknown name never touches the documents table
    const dbColl = getStoreCollection(db, collectionName);
    if (!dbColl)
        return [];
    const contextPrefixes = new Set(Object.keys(dbColl.context ?? {}).map(prefix => prefix.replace(/^\/+|\/+$/g, '')));
    // Root context applies to everything, so nothing can be missing
    if (contextPrefixes.has(''))
        return [];
    const paths = db.prepare(`
        SELECT DISTINCT path FROM documents
        WHERE collection = ? AND active = 1
    `).all(collectionName);
    // First path component of every document that lives inside a directory
    const topLevelDirs = new Set();
    for (const { path } of paths) {
        const parts = path.split('/').filter(Boolean);
        if (parts.length > 1) {
            topLevelDirs.add(parts[0]);
        }
    }
    return [...topLevelDirs].filter(dir => !contextPrefixes.has(dir)).sort();
}
2524
+ // =============================================================================
2525
+ // FTS Search
2526
+ // =============================================================================
2527
/**
 * Reduce a search term to characters that are safe inside an FTS5 string:
 * Unicode letters, Unicode digits, apostrophes and underscores. Everything
 * else is removed and the result is lower-cased.
 *
 * @param term Raw term
 * @returns Sanitized term (possibly empty)
 */
export function sanitizeFTS5Term(term) {
    const kept = term.replace(/[^\p{L}\p{N}'_]/gu, '');
    return kept.toLowerCase();
}
2530
/**
 * Tell whether a token is a hyphenated compound (e.g. multi-agent, DEC-0054, gpt-4).
 * The token must start with a letter or digit and contain at least one hyphen
 * that is directly followed by a letter or digit; a leading or trailing
 * hyphen does not qualify.
 *
 * @param token Token to test
 * @returns true for hyphenated compounds
 */
function isHyphenatedToken(token) {
    const compoundPattern = /^[\p{L}\p{N}][\p{L}\p{N}'-]*-[\p{L}\p{N}][\p{L}\p{N}'-]*$/u;
    return compoundPattern.test(token);
}
2537
/**
 * Turn a hyphenated term into the body of an FTS5 phrase: split on hyphens,
 * sanitize each piece, drop pieces that end up empty and join the rest with
 * single spaces (e.g. "multi-agent" becomes "multi agent").
 *
 * @param term Hyphenated term
 * @returns Space-separated sanitized words (possibly empty)
 */
function sanitizeHyphenatedTerm(term) {
    const words = [];
    for (const piece of term.split('-')) {
        const clean = sanitizeFTS5Term(piece);
        if (clean) {
            words.push(clean);
        }
    }
    return words.join(' ');
}
2545
/**
 * Tell whether a token is a dotted, version-like string (e.g. 2026.4.10, 3.14.0).
 * Splitting on dots must give at least two parts, and every part must be
 * non-empty and consist only of letters, digits or underscores — so tokens
 * with a leading, trailing or doubled dot are rejected.
 *
 * @param token Token to test
 * @returns true for dotted tokens
 */
function isDottedToken(token) {
    const segments = token.split('.');
    if (segments.length < 2) {
        return false;
    }
    return segments.every(segment => segment.length > 0 && /^[\p{L}\p{N}_]+$/u.test(segment));
}
2555
/**
 * Turn a dotted term into prefix-matching FTS5 terms joined with AND.
 * e.g. "2026.4.10" → '"2026"* AND "4"* AND "10"*'
 * Parts that sanitize to nothing are left out.
 *
 * @param term Dotted term
 * @returns FTS5 expression (possibly empty)
 */
function sanitizeDottedTerm(term) {
    const clauses = [];
    for (const segment of term.split('.')) {
        const clean = sanitizeFTS5Term(segment);
        if (clean) {
            clauses.push(`"${clean}"*`);
        }
    }
    return clauses.join(' AND ');
}
2564
/**
 * Translate lex query syntax into an FTS5 MATCH expression.
 *
 * Recognized forms:
 * - Quoted phrases: "exact phrase" → "exact phrase" (no prefix match)
 * - Negation: -term or -"phrase" → appended with the FTS5 NOT operator
 * - Hyphenated tokens: multi-agent, DEC-0054, gpt-4 → phrase of the parts
 * - Dotted tokens: 2026.4.10 → each part as a prefix term, all required
 * - Terms for which containsCjk is true → phrase
 * - Plain terms: term → "term"* (prefix match)
 *
 * FTS5 NOT is a binary operator, so a query consisting only of negated terms
 * cannot be expressed and yields null.
 *
 * A leading `-` negates the token that follows, while a `-` between word
 * characters (multi-agent) belongs to a hyphenated phrase; `-multi-agent` is
 * therefore a negated phrase.
 *
 * Examples:
 * performance -sports → "performance"* NOT "sports"*
 * "machine learning" → "machine learning"
 * multi-agent memory → "multi agent" AND "memory"*
 * DEC-0054 → "dec 0054"
 * -multi-agent → null (negation only)
 *
 * @param query Raw lex query
 * @returns FTS5 expression, or null when there is no positive term
 */
function buildFTS5Query(query) {
    const positive = [];
    const negative = [];
    // Route an expression to the positive or negative list
    const emit = (expression, isNegated) => {
        (isNegated ? negative : positive).push(expression);
    };
    const s = query.trim();
    let i = 0;
    while (i < s.length) {
        // Skip whitespace between tokens
        while (i < s.length && /\s/.test(s[i]))
            i++;
        if (i >= s.length)
            break;
        // A leading '-' negates whatever token follows
        const negated = s[i] === '-';
        if (negated)
            i++;
        if (s[i] === '"') {
            // Quoted phrase: runs to the next quote, or to the end if unterminated
            const start = i + 1;
            const closing = s.indexOf('"', start);
            const end = closing === -1 ? s.length : closing;
            const phrase = s.slice(start, end).trim();
            i = end + 1; // continue after the closing quote
            if (phrase.length > 0) {
                const sanitized = sanitizeFTS5Phrase(phrase);
                if (sanitized) {
                    emit(`"${sanitized}"`, negated); // Exact phrase, no prefix match
                }
            }
            continue;
        }
        // Plain term: runs until whitespace or a quote
        const start = i;
        while (i < s.length && !/[\s"]/.test(s[i]))
            i++;
        const term = s.slice(start, i);
        if (isHyphenatedToken(term)) {
            // multi-agent, DEC-0054, gpt-4: matched as a phrase of the parts
            const sanitized = sanitizeHyphenatedTerm(term);
            if (sanitized) {
                emit(`"${sanitized}"`, negated);
            }
        }
        else if (isDottedToken(term)) {
            // 2026.4.10, v1.2.3: every part must be present
            const sanitized = sanitizeDottedTerm(term);
            if (sanitized) {
                if (negated) {
                    // Parenthesize so NOT applies to the whole AND expression
                    negative.push(`(${sanitized})`);
                }
                else {
                    // Flatten the parts into the positive list so they join
                    // with the other terms in one AND chain
                    for (const part of sanitized.split(' AND ')) {
                        positive.push(part.trim());
                    }
                }
            }
        }
        else if (containsCjk(term)) {
            const sanitized = sanitizeFTS5Phrase(term);
            if (sanitized) {
                emit(`"${sanitized}"`, negated); // CJK phrase over character tokens
            }
        }
        else {
            const sanitized = sanitizeFTS5Term(term);
            if (sanitized) {
                emit(`"${sanitized}"*`, negated); // Prefix match
            }
        }
    }
    // Nothing positive to anchor the query (this also covers the empty query)
    if (positive.length === 0)
        return null;
    // Positive terms joined with AND, then one NOT clause per negative term
    return negative.reduce((expression, neg) => `${expression} NOT ${neg}`, positive.join(' AND '));
}
2703
/**
 * Validate that a vec/hyde query doesn't use lex-only syntax.
 *
 * @param query Query text
 * @returns Error message if the query uses negation syntax, null if valid
 */
export function validateSemanticQuery(query) {
    // Negation only counts at a token boundary (start of string or after
    // whitespace), so hyphenated words like "real-time" or "write-ahead" pass.
    // Unicode letters/digits are included so a negated non-ASCII term
    // (e.g. "-スポーツ") is caught just like "-sports".
    if (/(^|\s)-[\p{L}\p{N}_"]/u.test(query)) {
        return 'Negation (-term) is not supported in vec/hyde queries. Use lex for exclusions.';
    }
    return null;
}
2715
/**
 * Validate a lex query before it is parsed.
 * The query must be a single line and its double quotes must be balanced.
 *
 * @param query Query text
 * @returns Error message if invalid, null if valid
 */
export function validateLexQuery(query) {
    if (query.includes('\r') || query.includes('\n')) {
        return 'Lex queries must be a single line. Remove newline characters or split into separate lex: lines.';
    }
    let quoteCount = 0;
    for (const ch of query) {
        if (ch === '"') {
            quoteCount++;
        }
    }
    // An odd number of quotes means one of them is never closed
    return quoteCount % 2 === 1
        ? 'Lex query has an unmatched double quote ("). Add the closing quote or remove it.'
        : null;
}
2725
/**
 * Full-text search over documents_fts, ranked by BM25.
 *
 * @param db Database instance
 * @param query Lex query (see buildFTS5Query for the supported syntax)
 * @param limit Maximum number of results
 * @param collectionName Optional collection to restrict results to
 * @returns Array of result objects with `source: "fts"` and a score in [0..1)
 *          where higher is better; empty when the query has no positive terms
 */
export function searchFTS(db, query, limit = 20, collectionName) {
    const ftsQuery = buildFTS5Query(query);
    if (!ftsQuery)
        return [];
    // When filtering by collection, fetch extra candidates from the FTS index
    // since some will be filtered out. Without a collection filter we can
    // fetch exactly the requested limit.
    const ftsLimit = collectionName ? limit * 10 : limit;
    // Both limits are bound as parameters rather than spliced into the SQL
    // text, so a non-numeric `limit` can never alter the statement.
    const params = [ftsQuery, ftsLimit];
    // Use a CTE to force FTS5 to run first, then filter by collection.
    // Without the CTE, SQLite's query planner combines FTS5 MATCH with the
    // collection filter in a single WHERE clause, which can cause it to
    // abandon the FTS5 index and fall back to a full scan — turning an 8ms
    // query into a 17-second query on large collections.
    let sql = `
        WITH fts_matches AS (
            SELECT rowid, bm25(documents_fts, 1.5, 4.0, 1.0) as bm25_score
            FROM documents_fts
            WHERE documents_fts MATCH ?
            ORDER BY bm25_score ASC
            LIMIT ?
        )
        SELECT
            'qmd://' || d.collection || '/' || d.path as filepath,
            d.collection || '/' || d.path as display_path,
            d.title,
            content.doc as body,
            d.hash,
            fm.bm25_score
        FROM fts_matches fm
        JOIN documents d ON d.id = fm.rowid
        JOIN content ON content.hash = d.hash
        WHERE d.active = 1
    `;
    if (collectionName) {
        sql += ` AND d.collection = ?`;
        params.push(String(collectionName));
    }
    // bm25 lower is better; sort ascending.
    sql += ` ORDER BY fm.bm25_score ASC LIMIT ?`;
    params.push(limit);
    const rows = db.prepare(sql).all(...params);
    return rows.map(row => {
        // Collection of this row, taken from its virtual path (named apart
        // from the `collectionName` filter parameter to avoid shadowing it).
        const rowCollection = row.filepath.split('//')[1]?.split('/')[0] || "";
        // Convert bm25 (negative, lower is better) into a stable [0..1) score where higher is better.
        // FTS5 BM25 scores are negative (e.g., -10 is strong, -2 is weak).
        // |x| / (1 + |x|) maps: strong(-10)→0.91, medium(-2)→0.67, weak(-0.5)→0.33, none(0)→0.
        // Monotonic and query-independent — no per-query normalization needed.
        const score = Math.abs(row.bm25_score) / (1 + Math.abs(row.bm25_score));
        return {
            filepath: row.filepath,
            displayPath: row.display_path,
            title: row.title,
            hash: row.hash,
            docid: getDocid(row.hash),
            collectionName: rowCollection,
            modifiedAt: "", // Not available in FTS query
            bodyLength: row.body.length,
            body: row.body,
            context: getContextForFile(db, row.filepath),
            score,
            source: "fts",
        };
    });
}
2790
+ // =============================================================================
2791
+ // Vector Search
2792
+ // =============================================================================
2793
/**
 * Vector similarity search over vectors_vec.
 *
 * @param db Database instance
 * @param query Query text (embedded unless `precomputedEmbedding` is given)
 * @param model Embedding model name
 * @param limit Maximum number of documents to return
 * @param collectionName Optional collection to restrict results to
 * @param session Optional session passed through to getEmbedding
 * @param precomputedEmbedding Optional ready-made query embedding
 * @returns Array of result objects with `source: "vec"`, one per document
 *          (its closest chunk), closest first; empty when the vector table
 *          is missing, no embedding is available, or nothing matches
 */
export async function searchVec(db, query, model, limit = 20, collectionName, session, precomputedEmbedding) {
    const vecTable = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    if (!vecTable)
        return [];
    const embedding = precomputedEmbedding ?? await getEmbedding(query, model, true, session);
    if (!embedding)
        return [];
    // IMPORTANT: We use a two-step query approach here because sqlite-vec virtual tables
    // hang indefinitely when combined with JOINs in the same query. Do NOT try to
    // "optimize" this by combining into a single query with JOINs - it will break.
    // See: https://github.com/tobi/qmd/pull/23
    // Step 1: nearest chunks straight from sqlite-vec (no JOINs allowed).
    // Three times the limit is requested because several chunks may belong
    // to the same document.
    const neighbours = db.prepare(`
        SELECT hash_seq, distance
        FROM vectors_vec
        WHERE embedding MATCH ? AND k = ?
    `).all(new Float32Array(embedding), limit * 3);
    if (neighbours.length === 0)
        return [];
    // Step 2: resolve the chunk keys to documents
    const distanceByKey = new Map();
    const params = [];
    for (const { hash_seq, distance } of neighbours) {
        params.push(hash_seq);
        distanceByKey.set(hash_seq, distance);
    }
    const placeholders = Array(params.length).fill('?').join(',');
    let docSql = `
        SELECT
            cv.hash || '_' || cv.seq as hash_seq,
            cv.hash,
            cv.pos,
            'qmd://' || d.collection || '/' || d.path as filepath,
            d.collection || '/' || d.path as display_path,
            d.title,
            content.doc as body
        FROM content_vectors cv
        JOIN documents d ON d.hash = cv.hash AND d.active = 1
        JOIN content ON content.hash = d.hash
        WHERE cv.hash || '_' || cv.seq IN (${placeholders})
    `;
    if (collectionName) {
        docSql += ` AND d.collection = ?`;
        params.push(collectionName);
    }
    const docRows = withLazyContentVectorMigration(db, () => db.prepare(docSql).all(...params));
    // Keep only the closest chunk per document; unknown distances count as 1
    const bestByFile = new Map();
    for (const row of docRows) {
        const distance = distanceByKey.get(row.hash_seq) ?? 1;
        const current = bestByFile.get(row.filepath);
        if (!current || distance < current.bestDist) {
            bestByFile.set(row.filepath, { row, bestDist: distance });
        }
    }
    const ranked = [...bestByFile.values()].sort((a, b) => a.bestDist - b.bestDist);
    return ranked.slice(0, limit).map(({ row, bestDist }) => {
        const rowCollection = row.filepath.split('//')[1]?.split('/')[0] || "";
        return {
            filepath: row.filepath,
            displayPath: row.display_path,
            title: row.title,
            hash: row.hash,
            docid: getDocid(row.hash),
            collectionName: rowCollection,
            modifiedAt: "", // Not available in vec query
            bodyLength: row.body.length,
            body: row.body,
            context: getContextForFile(db, row.filepath),
            score: 1 - bestDist, // Cosine similarity = 1 - cosine distance
            source: "vec",
            chunkPos: row.pos,
        };
    });
}
2868
+ // =============================================================================
2869
+ // Embeddings
2870
+ // =============================================================================
2871
/**
 * Embed a piece of text.
 * The text is first wrapped in the query or document prompt template for the
 * model, then embedded by the session when one is given, otherwise by
 * `llmOverride` or the default llama.cpp instance.
 *
 * @param text Text to embed
 * @param model Embedding model name
 * @param isQuery true to format as a query, false to format as a document
 * @param session Optional session providing `embed`
 * @param llmOverride Optional embedder used when there is no session
 * @returns The embedding, or null when none was produced
 */
async function getEmbedding(text, model, isQuery, session, llmOverride) {
    const formattedText = isQuery
        ? formatQueryForEmbedding(text, model)
        : formatDocForEmbedding(text, undefined, model);
    const embedder = session ? session : (llmOverride ?? getDefaultLlamaCpp());
    const result = await embedder.embed(formattedText, { model, isQuery });
    return result?.embedding || null;
}
2879
/**
 * Get all unique content hashes of active documents that still need
 * embeddings for the given model and its current embedding fingerprint.
 * A hash qualifies when it has no vectors at all for that model/fingerprint,
 * or fewer chunks than the recorded total.
 *
 * @param db Database instance
 * @param model Embedding model name
 * @returns Rows of `{ hash, body, path }`, where path is one sample path for display
 */
export function getHashesForEmbedding(db, model = DEFAULT_EMBED_MODEL) {
    const fingerprint = getEmbeddingFingerprint(model);
    const pendingHashesSql = `
        SELECT d.hash, c.doc as body, MIN(d.path) as path
        FROM documents d
        JOIN content c ON d.hash = c.hash
        LEFT JOIN (
            SELECT hash, model, COUNT(*) AS chunk_count, MAX(total_chunks) AS expected_chunks
            FROM content_vectors
            WHERE model = ? AND embed_fingerprint = ?
            GROUP BY hash, model, embed_fingerprint
        ) v ON d.hash = v.hash
        WHERE d.active = 1
            AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
        GROUP BY d.hash
    `;
    return withLazyContentVectorMigration(db, () => {
        const statement = db.prepare(pendingHashesSql);
        return statement.all(model, fingerprint);
    });
}
2900
/**
 * Clear embeddings for the whole index, or just for one collection.
 *
 * Without `collection`, content_vectors is emptied and the vectors_vec table
 * is dropped.
 *
 * With `collection`, only vectors whose hash is referenced exclusively by
 * active documents of that collection are removed; hashes that an active
 * document of another collection also uses are left alone. The matching
 * vectors_vec rows are removed when that table exists. If the scoped clear
 * leaves content_vectors empty, vectors_vec is dropped as well.
 *
 * @param db Database instance
 * @param collection Optional collection name to scope the clear to
 */
export function clearAllEmbeddings(db, collection) {
    if (!collection) {
        db.exec(`DELETE FROM content_vectors`);
        db.exec(`DROP TABLE IF EXISTS vectors_vec`);
        return;
    }
    // Hashes used by active documents of this collection and by no active
    // document of any other collection
    const exclusiveHashesQuery = `
        SELECT DISTINCT d.hash
        FROM documents d
        WHERE d.collection = ? AND d.active = 1
            AND NOT EXISTS (
                SELECT 1 FROM documents d2
                WHERE d2.hash = d.hash
                    AND d2.active = 1
                    AND d2.collection != d.collection
            )
    `;
    const vecTableExists = db
        .prepare(`SELECT 1 FROM sqlite_master WHERE type='table' AND name='vectors_vec'`)
        .get();
    withLazyContentVectorMigration(db, () => {
        if (vecTableExists) {
            // The vectors_vec keys are derived from content_vectors rows, so
            // they must be collected before those rows are deleted below
            const chunkRows = db.prepare(`
                SELECT cv.hash, cv.seq
                FROM content_vectors cv
                WHERE cv.hash IN (${exclusiveHashesQuery})
            `).all(collection);
            const deleteVector = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
            chunkRows.forEach(({ hash, seq }) => {
                deleteVector.run(`${hash}_${seq}`);
            });
        }
        db.prepare(`
            DELETE FROM content_vectors
            WHERE hash IN (${exclusiveHashesQuery})
        `).run(collection);
        const remaining = db
            .prepare(`SELECT COUNT(*) AS n FROM content_vectors`)
            .get();
        // Nothing left at all: drop the vector table too
        if (remaining.n === 0) {
            db.exec(`DROP TABLE IF EXISTS vectors_vec`);
        }
    });
}
/**
 * Store one chunk embedding: a metadata row in content_vectors plus the vector
 * itself in vectors_vec, keyed there by the string "<hash>_<seq>".
 *
 * Ordering matters. The content_vectors row is written before the vector
 * statements are even prepared; per the original notes, getHashesForEmbedding
 * only inspects content_vectors, so a crash between the two writes will not
 * cause the hash to be selected again.
 *
 * The vector row is written as DELETE followed by INSERT because, per the
 * original notes, sqlite-vec vec0 virtual tables silently ignore OR REPLACE.
 *
 * @param db - Database handle exposing `prepare()`.
 * @param hash - Content hash of the document.
 * @param seq - Chunk sequence number within the document.
 * @param pos - Chunk position stored alongside the metadata.
 * @param embedding - Vector payload bound to the vectors_vec insert.
 * @param model - Embedding model name.
 * @param embeddedAt - Timestamp value stored in embedded_at.
 * @param totalChunks - Number of chunks the document was split into (default 1).
 * @param fingerprint - Embedding fingerprint; defaults to getEmbeddingFingerprint(model).
 */
export function insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt, totalChunks = 1, fingerprint = getEmbeddingFingerprint(model)) {
    const vectorKey = `${hash}_${seq}`;
    withLazyContentVectorMigration(db, () => {
        // Metadata row first (crash-safe ordering, see above).
        db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embed_fingerprint, total_chunks, embedded_at) VALUES (?, ?, ?, ?, ?, ?, ?)`)
            .run(hash, seq, pos, model, fingerprint, totalChunks, embeddedAt);
        // Emulate an upsert on the vec0 table: delete any old row, then insert.
        const removeOldVector = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
        const addVector = db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`);
        removeOldVector.run(vectorKey);
        addVector.run(vectorKey, embedding);
    });
}
/**
 * Delete partially embedded documents so they can be embedded again.
 *
 * For each [hash, expectedChunks] pair, the rows stored in content_vectors for
 * that hash and model are counted. A hash with no rows, or with exactly the
 * expected number of rows, is left alone. Any other count is treated as an
 * incomplete embedding: its vectors_vec rows and its content_vectors rows are
 * removed.
 *
 * @param db - Database handle exposing `prepare()`.
 * @param expectedChunksByHash - Iterable of [hash, expectedChunkCount] pairs.
 * @param model - Embedding model name the rows must belong to.
 * @returns Number of content_vectors rows that were found and removed.
 */
function removeIncompleteEmbeddings(db, expectedChunksByHash, model) {
    return withLazyContentVectorMigration(db, () => {
        const selectSeqs = db.prepare(`SELECT seq FROM content_vectors WHERE hash = ? AND model = ?`);
        const dropContentRows = db.prepare(`DELETE FROM content_vectors WHERE hash = ? AND model = ?`);
        const dropVectorRow = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
        let removedRows = 0;
        for (const [hash, expectedChunks] of expectedChunksByHash) {
            const stored = selectSeqs.all(hash, model);
            const isIncomplete = stored.length !== 0 && stored.length !== expectedChunks;
            if (isIncomplete) {
                // Vectors first (one per chunk), then the metadata rows in one go.
                stored.forEach(({ seq }) => {
                    dropVectorRow.run(`${hash}_${seq}`);
                });
                dropContentRows.run(hash, model);
                removedRows += stored.length;
            }
        }
        return removedRows;
    });
}
3002
+ // =============================================================================
3003
+ // Query expansion
3004
+ // =============================================================================
/**
 * Expand a search query into typed variants, using the result cache when possible.
 *
 * Cache handling:
 *  - A cached JSON array of `{ type, query }` rows is returned as-is.
 *  - A cached JSON array of the older `{ type, text }` rows is migrated on read.
 *  - Anything else (unparsable text, an empty array, or valid JSON that is not
 *    an array) is treated as a stale entry: the query is expanded again and
 *    the fresh result overwrites it. Previously a non-array JSON value made
 *    this function return `[]` on every call, so such an entry could never heal.
 *
 * @param query - Original query text.
 * @param model - Model name; only used as part of the cache key (the LLM call
 *   below does not receive it).
 * @param db - Database handle passed to the cache helpers.
 * @param intent - Optional intent string; part of the cache key and forwarded to the LLM.
 * @param llmOverride - Optional LLM instance; defaults to getDefaultLlamaCpp().
 * @returns Array of `{ type, query }` entries, excluding any whose text equals `query`.
 */
export async function expandQuery(query, model = DEFAULT_QUERY_MODEL, db, intent, llmOverride) {
    // Check cache first — stored as JSON preserving types
    const cacheKey = getCacheKey("expandQuery", { query, model, ...(intent && { intent }) });
    const cached = getCachedResult(db, cacheKey);
    if (cached) {
        try {
            const parsed = JSON.parse(cached);
            if (Array.isArray(parsed) && parsed.length > 0) {
                const first = parsed[0];
                if (typeof first?.query === "string") {
                    return parsed.map((r) => ({ type: r.type, query: String(r.query) }));
                }
                // Migrate old cache format: { type, text } → { type, query }
                if (typeof first?.text === "string") {
                    return parsed.map((r) => ({ type: r.type, query: String(r.text) }));
                }
            }
            // Unrecognised shape — fall through and re-expand.
        }
        catch {
            // Old cache format (pre-typed, newline-separated text) — re-expand
        }
    }
    const llm = llmOverride ?? getDefaultLlamaCpp();
    // Note: LlamaCpp uses hardcoded model, model parameter is ignored
    const results = await llm.expandQuery(query, { intent });
    // Map the LLM rows to { type, query } and drop entries that merely repeat
    // the original query text.
    const expanded = results
        .filter((r) => r.text !== query)
        .map((r) => ({ type: r.type, query: r.text }));
    // Empty expansions are not cached, so a later call can try again.
    if (expanded.length > 0) {
        setCachedResult(db, cacheKey, JSON.stringify(expanded));
    }
    return expanded;
}
3040
+ // =============================================================================
3041
+ // Reranking
3042
+ // =============================================================================
/**
 * Score documents against a query with the reranker, using a per-chunk cache.
 *
 * The cache key is built from the (intent-prefixed) query, the model and the
 * chunk text. The file path is deliberately not part of the key: the score
 * depends on the chunk content, not on where it came from. A legacy key that
 * also contains the file path is consulted as a fallback.
 *
 * Robustness rules:
 *  - A cached value that does not parse to a finite number is treated as a
 *    cache miss and re-scored, instead of silently becoming score 0.
 *  - A backend result whose `file` is not one of the submitted documents is
 *    ignored, instead of being cached under the empty-chunk key.
 *  - Only finite scores are written to the cache.
 *
 * Known limitation: backend results are matched to chunks by file path, so
 * when two different uncached chunks share one file only one of them can be
 * matched.
 *
 * @param query - Query text.
 * @param documents - Array of `{ file, text }` chunks to score.
 * @param model - Reranker model name (cache key and backend option).
 * @param db - Database handle passed to the cache helpers.
 * @param intent - Optional intent, prepended to the query sent to the reranker.
 * @param llmOverride - Optional LLM instance; defaults to getDefaultLlamaCpp().
 * @returns `{ file, score }` for every input document, highest score first;
 *   documents without a usable score get 0.
 */
export async function rerank(query, documents, model = DEFAULT_RERANK_MODEL, db, intent, llmOverride) {
    // Prepend intent to rerank query so the reranker scores with domain context
    const rerankQuery = intent ? `${intent}\n\n${query}` : query;
    const scoreByChunk = new Map();
    const pendingByChunk = new Map();
    for (const doc of documents) {
        const cacheKey = getCacheKey("rerank", { query: rerankQuery, model, chunk: doc.text });
        const legacyCacheKey = getCacheKey("rerank", { query, file: doc.file, model, chunk: doc.text });
        const cached = getCachedResult(db, cacheKey) ?? getCachedResult(db, legacyCacheKey);
        const cachedScore = cached == null ? Number.NaN : Number.parseFloat(cached);
        if (Number.isFinite(cachedScore)) {
            scoreByChunk.set(doc.text, cachedScore);
        }
        else {
            // Keyed by chunk text so identical chunks are sent to the reranker once.
            pendingByChunk.set(doc.text, { file: doc.file, text: doc.text });
        }
    }
    if (pendingByChunk.size > 0) {
        const llm = llmOverride ?? getDefaultLlamaCpp();
        const pendingDocs = [...pendingByChunk.values()];
        const rerankResult = await llm.rerank(rerankQuery, pendingDocs, { model });
        const textByFile = new Map(pendingDocs.map((d) => [d.file, d.text]));
        for (const result of rerankResult.results) {
            if (!textByFile.has(result.file)) {
                // Not one of ours — never attribute it to some other chunk.
                continue;
            }
            const chunk = textByFile.get(result.file);
            if (Number.isFinite(result.score)) {
                const cacheKey = getCacheKey("rerank", { query: rerankQuery, model, chunk });
                setCachedResult(db, cacheKey, result.score.toString());
            }
            scoreByChunk.set(chunk, result.score);
        }
    }
    // `|| 0` (not `??`) on purpose: it also maps a NaN score to 0 so the sort
    // comparator below never sees NaN.
    return documents
        .map((doc) => ({ file: doc.file, score: scoreByChunk.get(doc.text) || 0 }))
        .sort((a, b) => b.score - a.score);
}
3083
+ // =============================================================================
3084
+ // Reciprocal Rank Fusion
3085
+ // =============================================================================
/**
 * Merge several ranked result lists into one using Reciprocal Rank Fusion.
 *
 * Each appearance of a file contributes `weight / (k + rank + 1)` with a
 * 0-based rank. After fusion, a file whose best rank in any list was 0 gets a
 * +0.05 bonus; a best rank of 1 or 2 gets +0.02.
 *
 * @param resultLists - Array of ranked lists; entries need a `file` property.
 *   Missing lists and missing entries are skipped.
 * @param weights - Per-list weights, matched by index; missing weights count as 1.0.
 * @param k - RRF smoothing constant (default 60).
 * @returns The first-seen result object per file with `score` set to the fused
 *   score, sorted by score descending.
 */
export function reciprocalRankFusion(resultLists, weights = [], k = 60) {
    const fusedByFile = new Map();
    for (const [listIdx, rankedList] of resultLists.entries()) {
        if (!rankedList) {
            continue;
        }
        const listWeight = weights[listIdx] ?? 1.0;
        rankedList.forEach((hit, position) => {
            if (!hit) {
                return;
            }
            const share = listWeight / (k + position + 1);
            const entry = fusedByFile.get(hit.file);
            if (!entry) {
                // First sighting: this result object is the one returned later.
                fusedByFile.set(hit.file, { result: hit, rrfScore: share, topRank: position });
                return;
            }
            entry.rrfScore += share;
            entry.topRank = Math.min(entry.topRank, position);
        });
    }
    // Top-rank bonus
    for (const entry of fusedByFile.values()) {
        if (entry.topRank === 0) {
            entry.rrfScore += 0.05;
        }
        else if (entry.topRank <= 2) {
            entry.rrfScore += 0.02;
        }
    }
    return [...fusedByFile.values()]
        .sort((left, right) => right.rrfScore - left.rrfScore)
        .map(({ result, rrfScore }) => ({ ...result, score: rrfScore }));
}
/**
 * Produce a per-file breakdown of RRF contributions for explain/debug output.
 *
 * Ranks are reported 1-indexed here, so the contribution `weight / (k + rank)`
 * and the bonus thresholds (rank 1 → 0.05, rank 2–3 → 0.02) describe the same
 * quantities as the 0-indexed arithmetic in reciprocalRankFusion.
 *
 * @param resultLists - Ranked lists; entries need `file` and `score`.
 * @param weights - Per-list weights by index; missing weights count as 1.0.
 * @param listMeta - Per-list `{ source, queryType, query }`; a missing entry
 *   defaults to `{ source: "fts", queryType: "original", query: "" }`.
 * @param k - RRF smoothing constant (default 60).
 * @returns Map from file to `{ contributions, baseScore, topRank, topRankBonus, totalScore }`.
 */
export function buildRrfTrace(resultLists, weights = [], listMeta = [], k = 60) {
    const traceByFile = new Map();
    for (const [listIdx, rankedList] of resultLists.entries()) {
        if (!rankedList) {
            continue;
        }
        const weight = weights[listIdx] ?? 1.0;
        const { source, queryType, query } = listMeta[listIdx] ?? {
            source: "fts",
            queryType: "original",
            query: "",
        };
        rankedList.forEach((hit, position) => {
            if (!hit) {
                return;
            }
            const rank = position + 1; // 1-indexed rank for explain output
            const rrfContribution = weight / (k + rank);
            const detail = {
                listIndex: listIdx,
                source,
                queryType,
                query,
                rank,
                weight,
                backendScore: hit.score,
                rrfContribution,
            };
            const trace = traceByFile.get(hit.file);
            if (!trace) {
                traceByFile.set(hit.file, {
                    contributions: [detail],
                    baseScore: rrfContribution,
                    topRank: rank,
                    topRankBonus: 0,
                    totalScore: 0,
                });
                return;
            }
            trace.baseScore += rrfContribution;
            trace.topRank = Math.min(trace.topRank, rank);
            trace.contributions.push(detail);
        });
    }
    // Second pass: the bonus depends on the best rank across all lists.
    for (const trace of traceByFile.values()) {
        const bonus = trace.topRank === 1 ? 0.05 : trace.topRank <= 3 ? 0.02 : 0;
        trace.topRankBonus = bonus;
        trace.totalScore = trace.baseScore + bonus;
    }
    return traceByFile;
}
/**
 * Find a document by filename/path, docid (#hash), or with fuzzy matching.
 * Returns document metadata; the body is included only with `options.includeBody`.
 *
 * Supports:
 * - Virtual paths: qmd://collection/path/to/file.md
 * - Absolute paths: /path/to/file.md
 * - Relative paths: path/to/file.md
 * - Short docid: #abc123 (first 6 chars of hash)
 *
 * A trailing `:<digits>` (line suffix) is stripped before lookup, and a
 * leading `~/` is expanded with homedir().
 *
 * Lookup order: exact virtual path → virtual path ending with the input →
 * collection + relative path. In the "ends with" step the input is matched
 * literally: `%`, `_` and `\` are escaped before being handed to LIKE, so a
 * name such as `my_file.md` no longer matches `myXfile.md`.
 *
 * @param db - Database handle exposing `prepare()`.
 * @param filename - Path, virtual path or docid as typed by the caller.
 * @param options - `{ includeBody?: boolean }`.
 * @returns The document record, or `{ error: "not_found", query, similarFiles }`.
 */
export function findDocument(db, filename, options = {}) {
    let filepath = filename;
    const colonMatch = filepath.match(/:(\d+)$/);
    if (colonMatch) {
        filepath = filepath.slice(0, -colonMatch[0].length);
    }
    // Check if this is a docid lookup (#abc123, abc123, "#abc123", "abc123", etc.)
    if (isDocid(filepath)) {
        const docidMatch = findDocumentByDocid(db, filepath);
        if (!docidMatch) {
            return { error: "not_found", query: filename, similarFiles: [] };
        }
        filepath = docidMatch.filepath;
    }
    if (filepath.startsWith('~/')) {
        filepath = homedir() + filepath.slice(1);
    }
    const bodyCol = options.includeBody ? `, content.doc as body` : ``;
    // Shared SELECT ... FROM prefix for all three lookups below.
    const selectFrom = `
        SELECT
            'qmd://' || d.collection || '/' || d.path as virtual_path,
            d.collection || '/' || d.path as display_path,
            d.title,
            d.hash,
            d.collection,
            d.modified_at,
            LENGTH(content.doc) as body_length
            ${bodyCol}
        FROM documents d
        JOIN content ON content.hash = d.hash
    `;
    // 1. Exact virtual path.
    let doc = db.prepare(`
        ${selectFrom}
        WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
    `).get(filepath);
    // 2. Virtual path ending with the input. LIKE wildcards in the input are
    //    escaped so they only ever match themselves.
    if (!doc) {
        const literalSuffix = filepath.replace(/[\\%_]/g, '\\$&');
        doc = db.prepare(`
            ${selectFrom}
            WHERE 'qmd://' || d.collection || '/' || d.path LIKE ? ESCAPE '\\' AND d.active = 1
            LIMIT 1
        `).get(`%${literalSuffix}`);
    }
    // 3. Absolute path inside a collection root, or a path relative to a collection.
    if (!doc && !filepath.startsWith('qmd://')) {
        let byCollectionPath = null; // prepared once, on first use
        for (const coll of getStoreCollections(db)) {
            let relativePath = null;
            if (filepath.startsWith(coll.path + '/')) {
                relativePath = filepath.slice(coll.path.length + 1);
            }
            else if (!filepath.startsWith('/')) {
                relativePath = filepath;
            }
            if (!relativePath) {
                continue;
            }
            if (byCollectionPath === null) {
                byCollectionPath = db.prepare(`
                    ${selectFrom}
                    WHERE d.collection = ? AND d.path = ? AND d.active = 1
                `);
            }
            doc = byCollectionPath.get(coll.name, relativePath);
            if (doc) {
                break;
            }
        }
    }
    if (!doc) {
        const similar = findSimilarFiles(db, filepath, 5, 5);
        return { error: "not_found", query: filename, similarFiles: similar };
    }
    // Get context using virtual path
    const virtualPath = doc.virtual_path || `qmd://${doc.collection}/${doc.display_path}`;
    const context = getContextForFile(db, virtualPath);
    return {
        filepath: virtualPath,
        displayPath: doc.display_path,
        title: doc.title,
        context,
        hash: doc.hash,
        docid: getDocid(doc.hash),
        collectionName: doc.collection,
        modifiedAt: doc.modified_at,
        bodyLength: doc.body_length,
        ...(options.includeBody && doc.body !== undefined && { body: doc.body }),
    };
}
/**
 * Load the body text of a document, optionally restricted to a line range.
 *
 * `doc.filepath` is resolved as a virtual path (qmd://collection/path) first;
 * if that yields nothing, it is tried as an absolute path under each stored
 * collection root. Only active documents are considered.
 *
 * @param db - Database handle exposing `prepare()`.
 * @param doc - Object with a `filepath` string.
 * @param fromLine - Optional 1-based first line; missing or 0 means line 1.
 * @param maxLines - Optional maximum number of lines to return.
 * @returns The (possibly sliced) body, or null when no document matches.
 */
export function getDocumentBody(db, doc, fromLine, maxLines) {
    const target = doc.filepath;
    let hit = null;
    if (target.startsWith('qmd://')) {
        hit = db.prepare(`
            SELECT content.doc as body
            FROM documents d
            JOIN content ON content.hash = d.hash
            WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
        `).get(target);
    }
    if (!hit) {
        // Fall back to absolute paths: strip a matching collection root and look
        // the remainder up inside that collection.
        for (const { name, path: root } of getStoreCollections(db)) {
            if (!target.startsWith(root + '/')) {
                continue;
            }
            hit = db.prepare(`
                SELECT content.doc as body
                FROM documents d
                JOIN content ON content.hash = d.hash
                WHERE d.collection = ? AND d.path = ? AND d.active = 1
            `).get(name, target.slice(root.length + 1));
            if (hit) {
                break;
            }
        }
    }
    if (!hit) {
        return null;
    }
    const wantsSlice = fromLine !== undefined || maxLines !== undefined;
    if (!wantsSlice) {
        return hit.body;
    }
    const allLines = hit.body.split('\n');
    const first = Math.max(0, (fromLine || 1) - 1);
    const last = maxLines === undefined ? allLines.length : first + maxLines;
    return allLines.slice(first, last).join('\n');
}
/**
 * Find multiple documents by glob pattern or comma-separated list.
 * Bodies are only included with `options.includeBody`.
 *
 * A pattern containing a comma and none of `*`, `?`, `{` is treated as a list
 * of names. Each name is looked up as an exact virtual path first, then as a
 * virtual path ending with the name. In that second step the name is matched
 * literally: `%`, `_` and `\` are escaped before being handed to LIKE, so
 * `a_b.md` no longer matches `aXb.md`. Any other pattern goes through
 * matchFilesByGlob().
 *
 * Documents whose body_length exceeds `options.maxBytes` (default
 * DEFAULT_MULTI_GET_MAX_BYTES) are reported as skipped instead of returned.
 *
 * @param db - Database handle exposing `prepare()`.
 * @param pattern - Glob pattern or comma-separated names.
 * @param options - `{ includeBody?: boolean, maxBytes?: number }`.
 * @returns `{ docs, errors }`; `docs` holds `{ doc, skipped, skipReason? }`
 *   entries and `errors` holds human-readable messages.
 */
export function findDocuments(db, pattern, options = {}) {
    const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?') && !pattern.includes('{');
    const errors = [];
    const maxBytes = options.maxBytes ?? DEFAULT_MULTI_GET_MAX_BYTES;
    const bodyCol = options.includeBody ? `, content.doc as body` : ``;
    // Shared SELECT ... FROM prefix for every lookup below.
    const selectFrom = `
        SELECT
            'qmd://' || d.collection || '/' || d.path as virtual_path,
            d.collection || '/' || d.path as display_path,
            d.title,
            d.hash,
            d.collection,
            d.modified_at,
            LENGTH(content.doc) as body_length
            ${bodyCol}
        FROM documents d
        JOIN content ON content.hash = d.hash
    `;
    let fileRows;
    if (isCommaSeparated) {
        const names = pattern.split(',').map((s) => s.trim()).filter(Boolean);
        // Prepared once and reused for every name in the list.
        const exactStmt = db.prepare(`
            ${selectFrom}
            WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
        `);
        const suffixStmt = db.prepare(`
            ${selectFrom}
            WHERE 'qmd://' || d.collection || '/' || d.path LIKE ? ESCAPE '\\' AND d.active = 1
            LIMIT 1
        `);
        fileRows = [];
        for (const name of names) {
            // Escape LIKE wildcards so the name only ever matches itself.
            const literalSuffix = name.replace(/[\\%_]/g, '\\$&');
            const doc = exactStmt.get(name) || suffixStmt.get(`%${literalSuffix}`);
            if (doc) {
                fileRows.push(doc);
                continue;
            }
            const similar = findSimilarFiles(db, name, 5, 3);
            let msg = `File not found: ${name}`;
            if (similar.length > 0) {
                msg += ` (did you mean: ${similar.join(', ')}?)`;
            }
            errors.push(msg);
        }
    }
    else {
        // Glob pattern match
        const matched = matchFilesByGlob(db, pattern);
        if (matched.length === 0) {
            errors.push(`No files matched pattern: ${pattern}`);
            return { docs: [], errors };
        }
        const virtualPaths = matched.map((m) => m.filepath);
        const placeholders = virtualPaths.map(() => '?').join(',');
        fileRows = db.prepare(`
            ${selectFrom}
            WHERE 'qmd://' || d.collection || '/' || d.path IN (${placeholders}) AND d.active = 1
        `).all(...virtualPaths);
    }
    const results = [];
    for (const row of fileRows) {
        // Get context using virtual path
        const virtualPath = row.virtual_path || `qmd://${row.collection}/${row.display_path}`;
        const context = getContextForFile(db, virtualPath);
        if (row.body_length > maxBytes) {
            results.push({
                doc: { filepath: virtualPath, displayPath: row.display_path },
                skipped: true,
                skipReason: `File too large (${Math.round(row.body_length / 1024)}KB > ${Math.round(maxBytes / 1024)}KB)`,
            });
            continue;
        }
        results.push({
            doc: {
                filepath: virtualPath,
                displayPath: row.display_path,
                // Untitled documents fall back to their file name.
                title: row.title || row.display_path.split('/').pop() || row.display_path,
                context,
                hash: row.hash,
                docid: getDocid(row.hash),
                collectionName: row.collection,
                modifiedAt: row.modified_at,
                bodyLength: row.body_length,
                ...(options.includeBody && row.body !== undefined && { body: row.body }),
            },
            skipped: false,
        });
    }
    return { docs: results, errors };
}
3432
+ // =============================================================================
3433
+ // Status
3434
+ // =============================================================================
/**
 * Summarise the index: document totals, embedding backlog, whether the vector
 * table exists, and per-collection statistics.
 *
 * The documents table is the source of truth for which collections exist;
 * store_collections only supplies `path` / `pattern` metadata, which is null
 * for collections it does not know about.
 *
 * @param db - Database handle exposing `prepare()`.
 * @param model - Embedding model name forwarded to getHashesNeedingEmbedding().
 * @returns `{ totalDocuments, needsEmbedding, hasVectorIndex, collections }`,
 *   with collections ordered by most recent document update first.
 */
export function getStatus(db, model = DEFAULT_EMBED_MODEL) {
    const perCollection = db.prepare(`
        SELECT
            collection as name,
            COUNT(*) as active_count,
            MAX(modified_at) as last_doc_update
        FROM documents
        WHERE active = 1
        GROUP BY collection
    `).all();
    // name → { path, pattern } lookup built from store_collections.
    const metaByName = new Map();
    for (const { name, path, pattern } of getStoreCollections(db)) {
        metaByName.set(name, { path, pattern });
    }
    const toMillis = (stamp) => new Date(stamp).getTime();
    const collections = perCollection
        .map(({ name, active_count: documents, last_doc_update: lastDocUpdate }) => {
            const meta = metaByName.get(name);
            return {
                name,
                path: meta?.path ?? null,
                pattern: meta?.pattern ?? null,
                documents,
                // A collection without a timestamp is reported as updated "now".
                lastUpdated: lastDocUpdate || new Date().toISOString(),
            };
        })
        .sort((left, right) => {
            // Entries lacking a timestamp sink to the end.
            if (!left.lastUpdated) {
                return 1;
            }
            if (!right.lastUpdated) {
                return -1;
            }
            return toMillis(right.lastUpdated) - toMillis(left.lastUpdated);
        });
    const { c: totalDocuments } = db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 1`).get();
    const needsEmbedding = getHashesNeedingEmbedding(db, undefined, model);
    const vecTableRow = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    return {
        totalDocuments,
        needsEmbedding,
        hasVectorIndex: Boolean(vecTableRow),
        collections,
    };
}
/** Snippet scoring: weight of an intent-term hit relative to a query-term hit (1.0). */
export const INTENT_WEIGHT_SNIPPET = 0.3;
/** Chunk selection: weight of an intent-term hit relative to a query-term hit (1.0). */
export const INTENT_WEIGHT_CHUNK = 0.5;
// Stop words removed from intent strings before matching.
// Per the original notes, the list is seeded from finetune/reward.py
// KEY_TERM_STOPWORDS and extended with common 2-3 character function words,
// so the minimum term length can be as low as 2 characters and short domain
// terms (API, SQL, LLM, CPU, CDN, …) still survive.
const INTENT_STOP_WORDS = new Set([
    // 2-char function words
    ..."am an as at be by do he if in is it me my no of on or so to up us we".split(" "),
    // 3-char function words
    ..."all and any are but can did for get has her him his how its let may not our out the too was who why you".split(" "),
    // 4+ char common words
    ..."also does find from have into more need show some tell that them this want what when will with your".split(" "),
    // Search-context noise
    ..."about looking notes search where which".split(" "),
]);
/**
 * Turn an intent string into lowercase terms usable for text matching.
 *
 * The string is lowercased and split on whitespace; each token then loses any
 * leading/trailing characters that are neither Unicode letters nor Unicode
 * digits. Tokens shorter than two characters and tokens found in
 * INTENT_STOP_WORDS are dropped, so short domain terms such as "api" remain.
 *
 * @param intent - Free-form intent text.
 * @returns Array of surviving terms, in input order.
 */
export function extractIntentTerms(intent) {
    const edgePunctuation = /^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu;
    const terms = [];
    for (const token of intent.toLowerCase().split(/\s+/)) {
        const term = token.replace(edgePunctuation, "");
        if (term.length > 1 && !INTENT_STOP_WORDS.has(term)) {
            terms.push(term);
        }
    }
    return terms;
}
/**
 * Pick the most relevant few lines of a document and format them as a snippet.
 *
 * Every line of the search window is scored: +1.0 for each query term it
 * contains and +INTENT_WEIGHT_SNIPPET for each intent term. The earliest line
 * with the highest score wins; the snippet is that line with one line before
 * and up to two lines after, truncated to `maxLen` characters.
 *
 * When `chunkPos` is given, the window is the chunk plus 100 characters of
 * padding on each side. If nothing in the window matches:
 *  - chunkPos === 0 → retry on the whole body (0 may just be a default);
 *  - chunkPos > 0  → anchor on the line where the chunk starts.
 * If a chunk-scoped snippet turns out blank, the whole body is used instead.
 *
 * @param body - Full document text.
 * @param query - Query text; split on whitespace into terms.
 * @param maxLen - Maximum snippet text length (default 500).
 * @param chunkPos - Optional character offset of the chunk within `body`.
 * @param chunkLen - Optional chunk length; falls back to CHUNK_SIZE_CHARS.
 * @param intent - Optional intent text scored at a lower weight.
 * @returns `{ line, snippet, linesBefore, linesAfter, snippetLines }`; `line`
 *   is 1-indexed and `snippet` starts with a diff-style `@@` header.
 */
export function extractSnippet(body, query, maxLen = 500, chunkPos, chunkLen, intent) {
    const wholeDocumentSnippet = () => extractSnippet(body, query, maxLen, undefined, undefined, intent);
    const totalLines = body.split('\n').length;
    const chunkScoped = chunkPos !== undefined && chunkPos >= 0;
    const windowStart = chunkScoped ? Math.max(0, chunkPos - 100) : 0;
    let searchBody = body;
    let lineOffset = 0;
    if (chunkScoped) {
        // Chunk region plus padding; the chunk length falls back to the max chunk size.
        const windowEnd = Math.min(body.length, chunkPos + (chunkLen || CHUNK_SIZE_CHARS) + 100);
        searchBody = body.slice(windowStart, windowEnd);
        if (windowStart > 0) {
            // Number of complete lines that precede the window.
            lineOffset = body.slice(0, windowStart).split('\n').length - 1;
        }
    }
    const lines = searchBody.split('\n');
    const queryTerms = query.toLowerCase().split(/\s+/).filter((t) => t.length > 0);
    const intentTerms = intent ? extractIntentTerms(intent) : [];
    const scoreLine = (text) => {
        const haystack = (text ?? "").toLowerCase();
        let total = 0;
        for (const term of queryTerms) {
            if (haystack.includes(term)) {
                total += 1.0;
            }
        }
        for (const term of intentTerms) {
            if (haystack.includes(term)) {
                total += INTENT_WEIGHT_SNIPPET;
            }
        }
        return total;
    };
    let bestLine = 0;
    let bestScore = -1;
    lines.forEach((text, idx) => {
        const lineScore = scoreLine(text);
        // Strictly greater: on ties the earliest line stays selected.
        if (lineScore > bestScore) {
            bestScore = lineScore;
            bestLine = idx;
        }
    });
    if (chunkScoped && bestScore <= 0) {
        if (chunkPos === 0) {
            // Position 0 may be a "no winner" default from chunk selection, so the
            // real match could be anywhere: search the full body instead.
            return wholeDocumentSnippet();
        }
        // A non-zero position was chosen deliberately upstream; keep it and anchor
        // on the line where the chunk begins even though no term matched literally.
        bestLine = chunkPos > windowStart
            ? searchBody.slice(0, chunkPos - windowStart).split('\n').length - 1
            : 0;
    }
    const start = Math.max(0, bestLine - 1);
    const end = Math.min(lines.length, bestLine + 3);
    const snippetLines = lines.slice(start, end);
    let snippetText = snippetLines.join('\n');
    // A blank chunk-scoped snippet is useless; fall back to the whole document.
    if (chunkPos && chunkPos > 0 && snippetText.trim().length === 0) {
        return wholeDocumentSnippet();
    }
    if (snippetText.length > maxLen) {
        snippetText = snippetText.substring(0, maxLen - 3) + "...";
    }
    const absoluteStart = lineOffset + start + 1; // 1-indexed
    const snippetLineCount = snippetLines.length;
    const linesBefore = absoluteStart - 1;
    const linesAfter = totalLines - (absoluteStart + snippetLineCount - 1);
    // Diff-style header: @@ -start,count @@ (linesBefore before, linesAfter after)
    const header = `@@ -${absoluteStart},${snippetLineCount} @@ (${linesBefore} before, ${linesAfter} after)`;
    return {
        line: lineOffset + bestLine + 1,
        snippet: `${header}\n${snippetText}`,
        linesBefore,
        linesAfter,
        snippetLines: snippetLineCount,
    };
}
3588
+ // =============================================================================
3589
+ // Shared helpers (used by both CLI and MCP)
3590
+ // =============================================================================
/**
 * Prefix every line of `text` with its line number.
 * Each output line has the form "{lineNum}: {content}".
 *
 * @param text - Text to annotate; split on "\n".
 * @param startLine - Number given to the first line (default 1).
 * @returns The annotated text, joined with "\n".
 */
export function addLineNumbers(text, startLine = 1) {
    const source = text.split('\n');
    const numbered = [];
    for (let offset = 0; offset < source.length; offset++) {
        numbered.push(`${startLine + offset}: ${source[offset]}`);
    }
    return numbered.join('\n');
}
/**
 * Compute the RRF weight of each ranked list used by hybridQuery.
 *
 * A list whose `queryType` is "original" (the original query, whether searched
 * via FTS or via vectors) is treated as primary evidence and weighted 2.0.
 * Every other list — the expansion-derived lex/vec/hyde lists — is weighted
 * 1.0. The weight depends only on each list's own metadata, never on its
 * position, so an expansion list inserted ahead of the original vector list
 * cannot take over its boost.
 *
 * @param rankedListMeta - Array of per-list metadata objects with `queryType`.
 * @returns Array of weights, index-aligned with `rankedListMeta`.
 */
export function getHybridRrfWeights(rankedListMeta) {
    const ORIGINAL_WEIGHT = 2.0;
    const EXPANSION_WEIGHT = 1.0;
    return rankedListMeta.map(({ queryType }) => {
        if (queryType === "original") {
            return ORIGINAL_WEIGHT;
        }
        return EXPANSION_WEIGHT;
    });
}
3613
+ /**
3614
+ * Hybrid search: BM25 + vector + query expansion + RRF + chunked reranking.
3615
+ *
3616
+ * Pipeline:
3617
+ * 1. BM25 probe → skip expansion if strong signal
3618
+ * 2. expandQuery() → typed query variants (lex/vec/hyde)
3619
+ * 3. Type-routed search: original→vector, lex→FTS, vec/hyde→vector
3620
+ * 4. RRF fusion → slice to candidateLimit
3621
+ * 5. chunkDocument() + keyword-best-chunk selection
3622
+ * 6. rerank on chunks (NOT full bodies — O(tokens) trap)
3623
+ * 7. Position-aware score blending (RRF rank × reranker score)
3624
+ * 8. Dedup by file, filter by minScore, slice to limit
3625
+ */
3626
+ export async function hybridQuery(store, query, options) {
3627
+ const limit = options?.limit ?? 10;
3628
+ const minScore = options?.minScore ?? 0;
3629
+ const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT;
3630
+ const collection = options?.collection;
3631
+ const explain = options?.explain ?? false;
3632
+ const intent = options?.intent;
3633
+ const skipRerank = options?.skipRerank ?? false;
3634
+ const hooks = options?.hooks;
3635
+ const rankedLists = [];
3636
+ const rankedListMeta = [];
3637
+ const docidMap = new Map(); // filepath -> docid
3638
+ const hasVectors = !!store.db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
3639
+ // Step 1: BM25 probe — strong signal skips expensive LLM expansion
3640
+ // When intent is provided, disable strong-signal bypass — the obvious BM25
3641
+ // match may not be what the caller wants (e.g. "performance" with intent
3642
+ // "web page load times" should NOT shortcut to a sports-performance doc).
3643
+ // Pass collection directly into FTS query (filter at SQL level, not post-hoc)
3644
+ const initialFts = store.searchFTS(query, 20, collection);
3645
+ const topScore = initialFts[0]?.score ?? 0;
3646
+ const secondScore = initialFts[1]?.score ?? 0;
3647
+ const hasStrongSignal = !intent && initialFts.length > 0
3648
+ && topScore >= STRONG_SIGNAL_MIN_SCORE
3649
+ && (topScore - secondScore) >= STRONG_SIGNAL_MIN_GAP;
3650
+ if (hasStrongSignal)
3651
+ hooks?.onStrongSignal?.(topScore);
3652
+ // Step 2: Expand query (or skip if strong signal)
3653
+ hooks?.onExpandStart?.();
3654
+ const expandStart = Date.now();
3655
+ const expanded = hasStrongSignal
3656
+ ? []
3657
+ : await store.expandQuery(query, undefined, intent);
3658
+ hooks?.onExpand?.(query, expanded, Date.now() - expandStart);
3659
+ // Seed with initial FTS results (avoid re-running original query FTS)
3660
+ if (initialFts.length > 0) {
3661
+ for (const r of initialFts)
3662
+ docidMap.set(r.filepath, r.docid);
3663
+ rankedLists.push(initialFts.map(r => ({
3664
+ file: r.filepath, displayPath: r.displayPath,
3665
+ title: r.title, body: r.body || "", score: r.score,
3666
+ })));
3667
+ rankedListMeta.push({ source: "fts", queryType: "original", query });
3668
+ }
3669
+ // Step 3: Route searches by query type
3670
+ //
3671
+ // Strategy: run all FTS queries immediately (they're sync/instant), then
3672
+ // batch-embed all vector queries in one embedBatch() call, then run
3673
+ // sqlite-vec lookups with pre-computed embeddings.
3674
+ // 3a: Run FTS for all lex expansions right away (no LLM needed)
3675
+ for (const q of expanded) {
3676
+ if (q.type === 'lex') {
3677
+ const ftsResults = store.searchFTS(q.query, 20, collection);
3678
+ if (ftsResults.length > 0) {
3679
+ for (const r of ftsResults)
3680
+ docidMap.set(r.filepath, r.docid);
3681
+ rankedLists.push(ftsResults.map(r => ({
3682
+ file: r.filepath, displayPath: r.displayPath,
3683
+ title: r.title, body: r.body || "", score: r.score,
3684
+ })));
3685
+ rankedListMeta.push({ source: "fts", queryType: "lex", query: q.query });
3686
+ }
3687
+ }
3688
+ }
3689
+ // 3b: Collect all texts that need vector search (original query + vec/hyde expansions)
3690
+ if (hasVectors) {
3691
+ const vecQueries = [
3692
+ { text: query, queryType: "original" },
3693
+ ];
3694
+ for (const q of expanded) {
3695
+ if (q.type === 'vec' || q.type === 'hyde') {
3696
+ vecQueries.push({ text: q.query, queryType: q.type });
3697
+ }
3698
+ }
3699
+ // Batch embed all vector queries in a single call
3700
+ const llm = getLlm(store);
3701
+ const embedModel = llm.embedModelName;
3702
+ const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text, embedModel));
3703
+ hooks?.onEmbedStart?.(textsToEmbed.length);
3704
+ const embedStart = Date.now();
3705
+ const embeddings = await llm.embedBatch(textsToEmbed);
3706
+ hooks?.onEmbedDone?.(Date.now() - embedStart);
3707
+ // Run sqlite-vec lookups with pre-computed embeddings
3708
+ for (let i = 0; i < vecQueries.length; i++) {
3709
+ const embedding = embeddings[i]?.embedding;
3710
+ if (!embedding)
3711
+ continue;
3712
+ const vecResults = await store.searchVec(vecQueries[i].text, embedModel, 20, collection, undefined, embedding);
3713
+ if (vecResults.length > 0) {
3714
+ for (const r of vecResults)
3715
+ docidMap.set(r.filepath, r.docid);
3716
+ rankedLists.push(vecResults.map(r => ({
3717
+ file: r.filepath, displayPath: r.displayPath,
3718
+ title: r.title, body: r.body || "", score: r.score,
3719
+ })));
3720
+ rankedListMeta.push({
3721
+ source: "vec",
3722
+ queryType: vecQueries[i].queryType,
3723
+ query: vecQueries[i].text,
3724
+ });
3725
+ }
3726
+ }
3727
+ }
3728
+ // Step 4: RRF fusion — original-query FTS and vector lists get 2x weight;
3729
+ // expansion-derived lists stay at 1x independent of insertion order.
3730
+ const weights = getHybridRrfWeights(rankedListMeta);
3731
+ const fused = reciprocalRankFusion(rankedLists, weights);
3732
+ const rrfTraceByFile = explain ? buildRrfTrace(rankedLists, weights, rankedListMeta) : null;
3733
+ const candidates = fused.slice(0, candidateLimit);
3734
+ if (candidates.length === 0)
3735
+ return [];
3736
+ // Step 5: Chunk documents, pick best chunk per doc for reranking.
3737
+ // Reranking full bodies is O(tokens) — the critical perf lesson that motivated this refactor.
3738
+ const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 2);
3739
+ const intentTerms = intent ? extractIntentTerms(intent) : [];
3740
+ const docChunkMap = new Map();
3741
+ const chunkStrategy = options?.chunkStrategy;
3742
+ for (const cand of candidates) {
3743
+ const chunks = await chunkDocumentAsync(cand.body, undefined, undefined, undefined, cand.file, chunkStrategy);
3744
+ if (chunks.length === 0)
3745
+ continue;
3746
+ // Pick chunk with most keyword overlap (fallback: first chunk)
3747
+ // Intent terms contribute at INTENT_WEIGHT_CHUNK (0.5) relative to query terms (1.0)
3748
+ let bestIdx = 0;
3749
+ let bestScore = -1;
3750
+ for (let i = 0; i < chunks.length; i++) {
3751
+ const chunkLower = chunks[i].text.toLowerCase();
3752
+ let score = queryTerms.reduce((acc, term) => acc + (chunkLower.includes(term) ? 1 : 0), 0);
3753
+ for (const term of intentTerms) {
3754
+ if (chunkLower.includes(term))
3755
+ score += INTENT_WEIGHT_CHUNK;
3756
+ }
3757
+ if (score > bestScore) {
3758
+ bestScore = score;
3759
+ bestIdx = i;
3760
+ }
3761
+ }
3762
+ docChunkMap.set(cand.file, { chunks, bestIdx });
3763
+ }
3764
+ if (skipRerank) {
3765
+ // Skip LLM reranking — return candidates scored by RRF only
3766
+ const seenFiles = new Set();
3767
+ return candidates
3768
+ .map((cand, i) => {
3769
+ const chunkInfo = docChunkMap.get(cand.file);
3770
+ const bestIdx = chunkInfo?.bestIdx ?? 0;
3771
+ const bestChunk = chunkInfo?.chunks[bestIdx]?.text || cand.body || "";
3772
+ const bestChunkPos = chunkInfo?.chunks[bestIdx]?.pos || 0;
3773
+ const rrfRank = i + 1;
3774
+ const rrfScore = 1 / rrfRank;
3775
+ const trace = rrfTraceByFile?.get(cand.file);
3776
+ const explainData = explain ? {
3777
+ ftsScores: trace?.contributions.filter(c => c.source === "fts").map(c => c.backendScore) ?? [],
3778
+ vectorScores: trace?.contributions.filter(c => c.source === "vec").map(c => c.backendScore) ?? [],
3779
+ rrf: {
3780
+ rank: rrfRank,
3781
+ positionScore: rrfScore,
3782
+ weight: 1.0,
3783
+ baseScore: trace?.baseScore ?? 0,
3784
+ topRankBonus: trace?.topRankBonus ?? 0,
3785
+ totalScore: trace?.totalScore ?? 0,
3786
+ contributions: trace?.contributions ?? [],
3787
+ },
3788
+ rerankScore: 0,
3789
+ blendedScore: rrfScore,
3790
+ } : undefined;
3791
+ return {
3792
+ file: cand.file,
3793
+ displayPath: cand.displayPath,
3794
+ title: cand.title,
3795
+ body: cand.body,
3796
+ bestChunk,
3797
+ bestChunkPos,
3798
+ score: rrfScore,
3799
+ context: store.getContextForFile(cand.file),
3800
+ docid: docidMap.get(cand.file) || "",
3801
+ ...(explainData ? { explain: explainData } : {}),
3802
+ };
3803
+ })
3804
+ .filter(r => {
3805
+ if (seenFiles.has(r.file))
3806
+ return false;
3807
+ seenFiles.add(r.file);
3808
+ return true;
3809
+ })
3810
+ .filter(r => r.score >= minScore)
3811
+ .slice(0, limit);
3812
+ }
3813
+ // Step 6: Rerank chunks (NOT full bodies)
3814
+ const chunksToRerank = [];
3815
+ for (const cand of candidates) {
3816
+ const chunkInfo = docChunkMap.get(cand.file);
3817
+ if (chunkInfo) {
3818
+ chunksToRerank.push({ file: cand.file, text: chunkInfo.chunks[chunkInfo.bestIdx].text });
3819
+ }
3820
+ }
3821
+ hooks?.onRerankStart?.(chunksToRerank.length);
3822
+ const rerankStart = Date.now();
3823
+ const reranked = await store.rerank(query, chunksToRerank, undefined, intent);
3824
+ hooks?.onRerankDone?.(Date.now() - rerankStart);
3825
+ // Step 7: Blend RRF position score with reranker score
3826
+ // Position-aware weights: top retrieval results get more protection from reranker disagreement
3827
+ const candidateMap = new Map(candidates.map(c => [c.file, {
3828
+ displayPath: c.displayPath, title: c.title, body: c.body,
3829
+ }]));
3830
+ const rrfRankMap = new Map(candidates.map((c, i) => [c.file, i + 1]));
3831
+ const blended = reranked.map(r => {
3832
+ const rrfRank = rrfRankMap.get(r.file) || candidateLimit;
3833
+ let rrfWeight;
3834
+ if (rrfRank <= 3)
3835
+ rrfWeight = 0.75;
3836
+ else if (rrfRank <= 10)
3837
+ rrfWeight = 0.60;
3838
+ else
3839
+ rrfWeight = 0.40;
3840
+ const rrfScore = 1 / rrfRank;
3841
+ const blendedScore = rrfWeight * rrfScore + (1 - rrfWeight) * r.score;
3842
+ const candidate = candidateMap.get(r.file);
3843
+ const chunkInfo = docChunkMap.get(r.file);
3844
+ const bestIdx = chunkInfo?.bestIdx ?? 0;
3845
+ const bestChunk = chunkInfo?.chunks[bestIdx]?.text || candidate?.body || "";
3846
+ const bestChunkPos = chunkInfo?.chunks[bestIdx]?.pos || 0;
3847
+ const trace = rrfTraceByFile?.get(r.file);
3848
+ const explainData = explain ? {
3849
+ ftsScores: trace?.contributions.filter(c => c.source === "fts").map(c => c.backendScore) ?? [],
3850
+ vectorScores: trace?.contributions.filter(c => c.source === "vec").map(c => c.backendScore) ?? [],
3851
+ rrf: {
3852
+ rank: rrfRank,
3853
+ positionScore: rrfScore,
3854
+ weight: rrfWeight,
3855
+ baseScore: trace?.baseScore ?? 0,
3856
+ topRankBonus: trace?.topRankBonus ?? 0,
3857
+ totalScore: trace?.totalScore ?? 0,
3858
+ contributions: trace?.contributions ?? [],
3859
+ },
3860
+ rerankScore: r.score,
3861
+ blendedScore,
3862
+ } : undefined;
3863
+ return {
3864
+ file: r.file,
3865
+ displayPath: candidate?.displayPath || "",
3866
+ title: candidate?.title || "",
3867
+ body: candidate?.body || "",
3868
+ bestChunk,
3869
+ bestChunkPos,
3870
+ score: blendedScore,
3871
+ context: store.getContextForFile(r.file),
3872
+ docid: docidMap.get(r.file) || "",
3873
+ ...(explainData ? { explain: explainData } : {}),
3874
+ };
3875
+ }).sort((a, b) => b.score - a.score);
3876
+ // Step 8: Dedup by file (safety net — prevents duplicate output)
3877
+ const seenFiles = new Set();
3878
+ return blended
3879
+ .filter(r => {
3880
+ if (seenFiles.has(r.file))
3881
+ return false;
3882
+ seenFiles.add(r.file);
3883
+ return true;
3884
+ })
3885
+ .filter(r => r.score >= minScore)
3886
+ .slice(0, limit);
3887
+ }
3888
/**
 * Semantic (vector-only) search with query expansion.
 *
 * Steps:
 *   1. Return [] immediately when the `vectors_vec` table does not exist.
 *   2. Call store.expandQuery() and drop the `lex` variants (those target FTS).
 *   3. Run store.searchVec() for the original query and then for each remaining
 *      variant, one at a time (the original author notes that concurrent
 *      embed() calls hang).
 *   4. Keep, per filepath, the hit with the highest score.
 *   5. Sort by score descending, drop hits below minScore, cut to limit.
 *
 * @param store   Store exposing db, expandQuery, searchVec and getContextForFile.
 * @param query   The user's query text.
 * @param options Optional settings: limit (default 10), minScore (default 0.3),
 *                collection, intent, and hooks.onExpand(query, variants, elapsedMs).
 * @returns Promise resolving to an array of
 *          { file, displayPath, title, body, score, context, docid }.
 */
export async function vectorSearchQuery(store, query, options) {
    const maxResults = options?.limit ?? 10;
    const scoreFloor = options?.minScore ?? 0.3;
    const collection = options?.collection;
    const intent = options?.intent;

    // Without the vector table there is nothing to search.
    const vectorTable = store.db
        .prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`)
        .get();
    if (!vectorTable) {
        return [];
    }

    // Expand the query; lex variants are meant for FTS, so leave them out.
    const expansionStartedAt = Date.now();
    const variants = await store.expandQuery(query, undefined, intent);
    const semanticVariants = variants.filter((variant) => variant.type !== 'lex');
    options?.hooks?.onExpand?.(query, semanticVariants, Date.now() - expansionStartedAt);

    const embedModel = getLlm(store).embedModelName;

    // Original query first, then every vec/hyde variant.
    const textsToSearch = [query];
    for (const variant of semanticVariants) {
        textsToSearch.push(variant.query);
    }

    // filepath -> best-scoring hit seen so far
    const bestByFile = new Map();
    for (const text of textsToSearch) {
        // Deliberately sequential: one vector lookup at a time.
        const hits = await store.searchVec(text, embedModel, maxResults, collection);
        for (const hit of hits) {
            const previous = bestByFile.get(hit.filepath);
            if (previous && !(hit.score > previous.score)) {
                continue;
            }
            bestByFile.set(hit.filepath, {
                file: hit.filepath,
                displayPath: hit.displayPath,
                title: hit.title,
                body: hit.body || "",
                score: hit.score,
                context: store.getContextForFile(hit.filepath),
                docid: hit.docid,
            });
        }
    }

    const ranked = [...bestByFile.values()];
    ranked.sort((left, right) => right.score - left.score);
    return ranked
        .filter((entry) => entry.score >= scoreFloor)
        .slice(0, maxResults);
}
3936
/**
 * Structured search: execute caller-supplied, pre-expanded queries without
 * running the internal expandQuery() step.
 *
 * Intended for callers (e.g. an LLM behind MCP/HTTP) that write their own
 * query variations.
 *
 * Pipeline:
 *   1. Validate every query (single-line; lex / semantic validators).
 *   2. Route searches: lex -> store.searchFTS, vec/hyde -> one embedBatch()
 *      call followed by store.searchVec per query. Vector searches only run
 *      when the `vectors_vec` table exists. Each search is run once per
 *      entry in options.collections (or once over everything when unset).
 *   3. Reciprocal-rank fusion over all result lists. Lists produced by the
 *      earliest search in `searches` that returned results get weight 2.0,
 *      all other lists 1.0 — so put the most important query first.
 *   4. Chunk each candidate and pick the chunk with the most keyword overlap.
 *   5. Unless skipRerank is set, rerank those chunks and blend the reranker
 *      score with the RRF position score (position-aware weights).
 *   6. Dedup by file, drop results below minScore, cut to limit.
 *
 * @param store    Store exposing db, searchFTS, searchVec, rerank and
 *                 getContextForFile.
 * @param searches Array of { type: 'lex' | 'vec' | 'hyde', query, line? },
 *                 ordered by importance.
 * @param options  Optional settings: limit (default 10), minScore (default 0),
 *                 candidateLimit (default RERANK_CANDIDATE_LIMIT), explain,
 *                 intent, skipRerank, hooks, collections, chunkStrategy.
 * @returns Promise resolving to an array of
 *          { file, displayPath, title, body, bestChunk, bestChunkPos, score,
 *            context, docid, explain? } sorted best-first.
 * @throws Error when a query contains a newline or fails its validator.
 */
export async function structuredSearch(store, searches, options) {
    const limit = options?.limit ?? 10;
    const minScore = options?.minScore ?? 0;
    const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT;
    const explain = options?.explain ?? false;
    const intent = options?.intent;
    const skipRerank = options?.skipRerank ?? false;
    const hooks = options?.hooks;
    const collections = options?.collections;
    if (searches.length === 0)
        return [];
    // Validate queries before executing anything
    for (const search of searches) {
        const location = search.line ? `Line ${search.line}` : 'Structured search';
        if (/[\r\n]/.test(search.query)) {
            throw new Error(`${location} (${search.type}): queries must be single-line. Remove newline characters.`);
        }
        if (search.type === 'lex') {
            const error = validateLexQuery(search.query);
            if (error) {
                throw new Error(`${location} (lex): ${error}`);
            }
        }
        else if (search.type === 'vec' || search.type === 'hyde') {
            const error = validateSemanticQuery(search.query);
            if (error) {
                throw new Error(`${location} (${search.type}): ${error}`);
            }
        }
    }
    const rankedLists = [];
    const rankedListMeta = [];
    // Parallel to rankedLists: index (in `searches`) of the query behind each list.
    const rankedListSearchIdx = [];
    const docidMap = new Map(); // filepath -> docid
    const hasVectors = !!store.db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    // undefined = search across all collections
    const collectionList = collections ?? [undefined];
    // Record one non-empty backend result list together with its metadata.
    const addRankedList = (results, meta, searchIdx) => {
        if (results.length === 0)
            return;
        for (const r of results)
            docidMap.set(r.filepath, r.docid);
        rankedLists.push(results.map(r => ({
            file: r.filepath, displayPath: r.displayPath,
            title: r.title, body: r.body || "", score: r.score,
        })));
        rankedListMeta.push(meta);
        rankedListSearchIdx.push(searchIdx);
    };
    // Step 1: Run FTS for all lex searches (synchronous)
    for (const [searchIdx, search] of searches.entries()) {
        if (search.type !== 'lex')
            continue;
        for (const coll of collectionList) {
            const ftsResults = store.searchFTS(search.query, 20, coll);
            addRankedList(ftsResults, {
                source: "fts",
                queryType: "lex",
                query: search.query,
            }, searchIdx);
        }
    }
    // Step 2: Batch embed and run vector searches for vec/hyde
    if (hasVectors) {
        const vecSearches = [];
        for (const [searchIdx, search] of searches.entries()) {
            if (search.type === 'vec' || search.type === 'hyde')
                vecSearches.push({ search, searchIdx });
        }
        if (vecSearches.length > 0) {
            const llm = getLlm(store);
            const embedModel = llm.embedModelName;
            const textsToEmbed = vecSearches.map(({ search }) => formatQueryForEmbedding(search.query, embedModel));
            hooks?.onEmbedStart?.(textsToEmbed.length);
            const embedStart = Date.now();
            const embeddings = await llm.embedBatch(textsToEmbed);
            hooks?.onEmbedDone?.(Date.now() - embedStart);
            for (let i = 0; i < vecSearches.length; i++) {
                const embedding = embeddings[i]?.embedding;
                if (!embedding)
                    continue; // embedding failed for this query; skip it
                const { search, searchIdx } = vecSearches[i];
                for (const coll of collectionList) {
                    const vecResults = await store.searchVec(search.query, embedModel, 20, coll, undefined, embedding);
                    addRankedList(vecResults, {
                        source: "vec",
                        queryType: search.type,
                        query: search.query,
                    }, searchIdx);
                }
            }
        }
    }
    if (rankedLists.length === 0)
        return [];
    // Step 3: RRF fusion. The caller orders `searches` by importance, but lists
    // are collected FTS-first, so "the first list" is not necessarily the first
    // query. Weight by originating search instead: every list from the earliest
    // search that produced results gets 2x, independent of insertion order.
    const primarySearchIdx = Math.min(...rankedListSearchIdx);
    const weights = rankedListSearchIdx.map(idx => idx === primarySearchIdx ? 2.0 : 1.0);
    const fused = reciprocalRankFusion(rankedLists, weights);
    const rrfTraceByFile = explain ? buildRrfTrace(rankedLists, weights, rankedListMeta) : null;
    const candidates = fused.slice(0, candidateLimit);
    if (candidates.length === 0)
        return [];
    hooks?.onExpand?.("", [], 0); // Signal no expansion (pre-expanded)
    // Step 4: Chunk documents, pick best chunk per doc for reranking
    // Use first lex query as the "query" for keyword matching, or first vec if no lex
    const primaryQuery = searches.find(s => s.type === 'lex')?.query
        || searches.find(s => s.type === 'vec')?.query
        || searches[0]?.query || "";
    const queryTerms = primaryQuery.toLowerCase().split(/\s+/).filter(t => t.length > 2);
    const intentTerms = intent ? extractIntentTerms(intent) : [];
    const docChunkMap = new Map();
    const ssChunkStrategy = options?.chunkStrategy;
    for (const cand of candidates) {
        const chunks = await chunkDocumentAsync(cand.body, undefined, undefined, undefined, cand.file, ssChunkStrategy);
        if (chunks.length === 0)
            continue;
        // Pick chunk with most keyword overlap (first chunk wins ties).
        // Each matching intent term adds INTENT_WEIGHT_CHUNK; each query term adds 1.
        let bestIdx = 0;
        let bestScore = -1;
        for (let i = 0; i < chunks.length; i++) {
            const chunkLower = chunks[i].text.toLowerCase();
            let score = queryTerms.reduce((acc, term) => acc + (chunkLower.includes(term) ? 1 : 0), 0);
            for (const term of intentTerms) {
                if (chunkLower.includes(term))
                    score += INTENT_WEIGHT_CHUNK;
            }
            if (score > bestScore) {
                bestScore = score;
                bestIdx = i;
            }
        }
        docChunkMap.set(cand.file, { chunks, bestIdx });
    }
    // Best chunk text/position for a file, falling back to the whole body.
    const bestChunkFor = (file, fallbackBody) => {
        const chunkInfo = docChunkMap.get(file);
        const best = chunkInfo?.chunks[chunkInfo.bestIdx ?? 0];
        return {
            bestChunk: best?.text || fallbackBody || "",
            bestChunkPos: best?.pos || 0,
        };
    };
    // Explain payload for one result (undefined unless options.explain is set).
    const buildExplain = (file, rrfRank, rrfScore, rrfWeight, rerankScore, blendedScore) => {
        if (!explain)
            return undefined;
        const trace = rrfTraceByFile?.get(file);
        return {
            ftsScores: trace?.contributions.filter(c => c.source === "fts").map(c => c.backendScore) ?? [],
            vectorScores: trace?.contributions.filter(c => c.source === "vec").map(c => c.backendScore) ?? [],
            rrf: {
                rank: rrfRank,
                positionScore: rrfScore,
                weight: rrfWeight,
                baseScore: trace?.baseScore ?? 0,
                topRankBonus: trace?.topRankBonus ?? 0,
                totalScore: trace?.totalScore ?? 0,
                contributions: trace?.contributions ?? [],
            },
            rerankScore,
            blendedScore,
        };
    };
    // Dedup by file (safety net), apply minScore, cut to limit.
    const finalize = (results) => {
        const seenFiles = new Set();
        return results
            .filter(r => {
                if (seenFiles.has(r.file))
                    return false;
                seenFiles.add(r.file);
                return true;
            })
            .filter(r => r.score >= minScore)
            .slice(0, limit);
    };
    if (skipRerank) {
        // Skip LLM reranking — score is the reciprocal of the RRF rank only
        return finalize(candidates.map((cand, i) => {
            const { bestChunk, bestChunkPos } = bestChunkFor(cand.file, cand.body);
            const rrfRank = i + 1;
            const rrfScore = 1 / rrfRank;
            const explainData = buildExplain(cand.file, rrfRank, rrfScore, 1.0, 0, rrfScore);
            return {
                file: cand.file,
                displayPath: cand.displayPath,
                title: cand.title,
                body: cand.body,
                bestChunk,
                bestChunkPos,
                score: rrfScore,
                context: store.getContextForFile(cand.file),
                docid: docidMap.get(cand.file) || "",
                ...(explainData ? { explain: explainData } : {}),
            };
        }));
    }
    // Step 5: Rerank the selected chunks (not full bodies)
    const chunksToRerank = [];
    for (const cand of candidates) {
        const chunkInfo = docChunkMap.get(cand.file);
        if (chunkInfo) {
            chunksToRerank.push({ file: cand.file, text: chunkInfo.chunks[chunkInfo.bestIdx].text });
        }
    }
    hooks?.onRerankStart?.(chunksToRerank.length);
    const rerankStart = Date.now();
    const reranked = await store.rerank(primaryQuery, chunksToRerank, undefined, intent);
    hooks?.onRerankDone?.(Date.now() - rerankStart);
    // Step 6: Blend RRF position score with reranker score.
    // Higher RRF ranks lean more on retrieval position, lower ranks on the reranker.
    const candidateMap = new Map(candidates.map(c => [c.file, {
            displayPath: c.displayPath, title: c.title, body: c.body,
        }]));
    const rrfRankMap = new Map(candidates.map((c, i) => [c.file, i + 1]));
    const blended = reranked.map(r => {
        const rrfRank = rrfRankMap.get(r.file) || candidateLimit;
        let rrfWeight;
        if (rrfRank <= 3)
            rrfWeight = 0.75;
        else if (rrfRank <= 10)
            rrfWeight = 0.60;
        else
            rrfWeight = 0.40;
        const rrfScore = 1 / rrfRank;
        const blendedScore = rrfWeight * rrfScore + (1 - rrfWeight) * r.score;
        const candidate = candidateMap.get(r.file);
        const { bestChunk, bestChunkPos } = bestChunkFor(r.file, candidate?.body);
        const explainData = buildExplain(r.file, rrfRank, rrfScore, rrfWeight, r.score, blendedScore);
        return {
            file: r.file,
            displayPath: candidate?.displayPath || "",
            title: candidate?.title || "",
            body: candidate?.body || "",
            bestChunk,
            bestChunkPos,
            score: blendedScore,
            context: store.getContextForFile(r.file),
            docid: docidMap.get(r.file) || "",
            ...(explainData ? { explain: explainData } : {}),
        };
    }).sort((a, b) => b.score - a.score);
    // Step 7: Dedup, filter, slice
    return finalize(blended);
}