@tobilu/qmd 2.0.1 → 2.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/store.js CHANGED
@@ -16,16 +16,21 @@ import { createHash } from "crypto";
16
16
  import { readFileSync, realpathSync, statSync, mkdirSync } from "node:fs";
17
17
  // Note: node:path resolve is not imported — we export our own cross-platform resolve()
18
18
  import fastGlob from "fast-glob";
19
- import { LlamaCpp, getDefaultLlamaCpp, formatQueryForEmbedding, formatDocForEmbedding, withLLMSessionForLlm, } from "./llm.js";
19
+ import { qmdHomedir } from "./paths.js";
20
+ import { LlamaCpp, getDefaultLlamaCpp, formatQueryForEmbedding, formatDocForEmbedding, withLLMSessionForLlm, DEFAULT_EMBED_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, } from "./llm.js";
20
21
  // =============================================================================
21
22
  // Configuration
22
23
  // =============================================================================
23
- const HOME = process.env.HOME || "/tmp";
24
- export const DEFAULT_EMBED_MODEL = "embeddinggemma";
25
- export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
26
- export const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";
24
+ export const DEFAULT_EMBED_MODEL = DEFAULT_EMBED_MODEL_URI;
25
+ export const DEFAULT_RERANK_MODEL = DEFAULT_RERANK_MODEL_URI;
26
+ export const DEFAULT_QUERY_MODEL = DEFAULT_GENERATE_MODEL_URI;
27
27
  export const DEFAULT_GLOB = "**/*.md";
28
28
  export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024; // 10KB
29
+ export const DEFAULT_EMBED_MAX_DOCS_PER_BATCH = 64;
30
+ export const DEFAULT_EMBED_MAX_BATCH_BYTES = 64 * 1024 * 1024; // 64MB
31
+ const EMBED_FINGERPRINT_PROBE_QUERY = "__qmd_embedding_query_probe__";
32
+ const EMBED_FINGERPRINT_PROBE_TITLE = "__qmd_embedding_title_probe__";
33
+ const EMBED_FINGERPRINT_PROBE_DOC = "__qmd_embedding_document_probe__";
29
34
  // Chunking: 900 tokens per chunk with 15% overlap
30
35
  // Increased from 800 to accommodate smart chunking finding natural break points
31
36
  export const CHUNK_SIZE_TOKENS = 900;
@@ -36,6 +41,16 @@ export const CHUNK_OVERLAP_CHARS = CHUNK_OVERLAP_TOKENS * 4; // 540 chars
36
41
  // Search window for finding optimal break points (in tokens, ~200 tokens)
37
42
  export const CHUNK_WINDOW_TOKENS = 200;
38
43
  export const CHUNK_WINDOW_CHARS = CHUNK_WINDOW_TOKENS * 4; // 800 chars
44
+ export function getEmbeddingFingerprint(model = DEFAULT_EMBED_MODEL) {
45
+ const significant = [
46
+ `model:${model}`,
47
+ `query:${formatQueryForEmbedding(EMBED_FINGERPRINT_PROBE_QUERY, model)}`,
48
+ `doc:${formatDocForEmbedding(EMBED_FINGERPRINT_PROBE_DOC, EMBED_FINGERPRINT_PROBE_TITLE, model)}`,
49
+ `chunk_tokens:${CHUNK_SIZE_TOKENS}`,
50
+ `chunk_overlap_tokens:${CHUNK_OVERLAP_TOKENS}`,
51
+ ].join("\n");
52
+ return createHash("sha256").update(significant).digest("hex").slice(0, 6);
53
+ }
39
54
  /**
40
55
  * Get the LlamaCpp instance for a store — prefers the store's own instance,
41
56
  * falls back to the global singleton.
@@ -161,6 +176,60 @@ export function findBestCutoff(breakPoints, targetCharPos, windowChars = CHUNK_W
161
176
  }
162
177
  return bestPos;
163
178
  }
179
+ /**
180
+ * Merge two sets of break points (e.g. regex + AST), keeping the highest
181
+ * score at each position. Result is sorted by position.
182
+ */
183
+ export function mergeBreakPoints(a, b) {
184
+ const seen = new Map();
185
+ for (const bp of a) {
186
+ const existing = seen.get(bp.pos);
187
+ if (!existing || bp.score > existing.score) {
188
+ seen.set(bp.pos, bp);
189
+ }
190
+ }
191
+ for (const bp of b) {
192
+ const existing = seen.get(bp.pos);
193
+ if (!existing || bp.score > existing.score) {
194
+ seen.set(bp.pos, bp);
195
+ }
196
+ }
197
+ return Array.from(seen.values()).sort((a, b) => a.pos - b.pos);
198
+ }
199
+ /**
200
+ * Core chunk algorithm that operates on precomputed break points and code fences.
201
+ * This is the shared implementation used by both regex-only and AST-aware chunking.
202
+ */
203
+ export function chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS) {
204
+ if (content.length <= maxChars) {
205
+ return [{ text: content, pos: 0 }];
206
+ }
207
+ const chunks = [];
208
+ let charPos = 0;
209
+ while (charPos < content.length) {
210
+ const targetEndPos = Math.min(charPos + maxChars, content.length);
211
+ let endPos = targetEndPos;
212
+ if (endPos < content.length) {
213
+ const bestCutoff = findBestCutoff(breakPoints, targetEndPos, windowChars, 0.7, codeFences);
214
+ if (bestCutoff > charPos && bestCutoff <= targetEndPos) {
215
+ endPos = bestCutoff;
216
+ }
217
+ }
218
+ if (endPos <= charPos) {
219
+ endPos = Math.min(charPos + maxChars, content.length);
220
+ }
221
+ chunks.push({ text: content.slice(charPos, endPos), pos: charPos });
222
+ if (endPos >= content.length) {
223
+ break;
224
+ }
225
+ charPos = endPos - overlapChars;
226
+ const lastChunkPos = chunks.at(-1).pos;
227
+ if (charPos <= lastChunkPos) {
228
+ charPos = endPos;
229
+ }
230
+ }
231
+ return chunks;
232
+ }
164
233
  // Hybrid query: strong BM25 signal detection thresholds
165
234
  // Skip expensive LLM expansion when top result is strong AND clearly separated from runner-up
166
235
  export const STRONG_SIGNAL_MIN_SCORE = 0.85;
@@ -172,7 +241,7 @@ export const RERANK_CANDIDATE_LIMIT = 40;
172
241
  // Path utilities
173
242
  // =============================================================================
174
243
  export function homedir() {
175
- return HOME;
244
+ return qmdHomedir();
176
245
  }
177
246
  /**
178
247
  * Check if a path is absolute.
@@ -191,7 +260,8 @@ export function isAbsolutePath(path) {
191
260
  if (path.startsWith('/')) {
192
261
  // Check if it's a Git Bash style path like /c/ or /c/Users (C-Z only, not A or B)
193
262
  // Requires path[2] === '/' to distinguish from Unix paths like /c or /cache
194
- if (path.length >= 3 && path[2] === '/') {
263
+ // Skipped on WSL where /c/ is a valid drvfs mount point, not a drive letter
264
+ if (!isWSL() && path.length >= 3 && path[2] === '/') {
195
265
  const driveLetter = path[1];
196
266
  if (driveLetter && /[c-zC-Z]/.test(driveLetter)) {
197
267
  return true;
@@ -213,6 +283,13 @@ export function isAbsolutePath(path) {
213
283
  export function normalizePathSeparators(path) {
214
284
  return path.replace(/\\/g, '/');
215
285
  }
286
+ /**
287
+ * Detect if running inside WSL (Windows Subsystem for Linux).
288
+ * On WSL, paths like /c/work/... are valid drvfs mount points, not Git Bash paths.
289
+ */
290
+ function isWSL() {
291
+ return !!(process.env.WSL_DISTRO_NAME || process.env.WSL_INTEROP);
292
+ }
216
293
  /**
217
294
  * Get the relative path from a prefix.
218
295
  * Returns null if path is not under prefix.
@@ -256,8 +333,9 @@ export function resolve(...paths) {
256
333
  windowsDrive = firstPath.slice(0, 2);
257
334
  result = firstPath.slice(2);
258
335
  }
259
- else if (firstPath.startsWith('/') && firstPath.length >= 3 && firstPath[2] === '/') {
336
+ else if (!isWSL() && firstPath.startsWith('/') && firstPath.length >= 3 && firstPath[2] === '/') {
260
337
  // Git Bash style: /c/ -> C: (C-Z drives only, not A or B)
338
+ // Skipped on WSL where /c/ is a valid drvfs mount point, not a drive letter
261
339
  const driveLetter = firstPath[1];
262
340
  if (driveLetter && /[c-zC-Z]/.test(driveLetter)) {
263
341
  windowsDrive = driveLetter.toUpperCase() + ':';
@@ -288,8 +366,9 @@ export function resolve(...paths) {
288
366
  windowsDrive = p.slice(0, 2);
289
367
  result = p.slice(2);
290
368
  }
291
- else if (p.startsWith('/') && p.length >= 3 && p[2] === '/') {
369
+ else if (!isWSL() && p.startsWith('/') && p.length >= 3 && p[2] === '/') {
292
370
  // Git Bash style (C-Z drives only, not A or B)
371
+ // Skipped on WSL where /c/ is a valid drvfs mount point, not a drive letter
293
372
  const driveLetter = p[1];
294
373
  if (driveLetter && /[c-zC-Z]/.test(driveLetter)) {
295
374
  windowsDrive = driveLetter.toUpperCase() + ':';
@@ -332,6 +411,10 @@ let _productionMode = false;
332
411
  export function enableProductionMode() {
333
412
  _productionMode = true;
334
413
  }
414
+ /** Reset production mode flag — only for testing. */
415
+ export function _resetProductionModeForTesting() {
416
+ _productionMode = false;
417
+ }
335
418
  export function getDefaultDbPath(indexName = "index") {
336
419
  // Always allow override via INDEX_PATH (for testing)
337
420
  if (process.env.INDEX_PATH) {
@@ -398,21 +481,25 @@ export function normalizeVirtualPath(input) {
398
481
  export function parseVirtualPath(virtualPath) {
399
482
  // Normalize the path first
400
483
  const normalized = normalizeVirtualPath(virtualPath);
484
+ const [pathPart = normalized, queryString = ""] = normalized.split("?");
401
485
  // Match: qmd://collection-name[/optional-path]
402
486
  // Allows: qmd://name, qmd://name/, qmd://name/path
403
- const match = normalized.match(/^qmd:\/\/([^\/]+)\/?(.*)$/);
487
+ const match = pathPart.match(/^qmd:\/\/([^\/]+)\/?(.*)$/);
404
488
  if (!match?.[1])
405
489
  return null;
490
+ const indexName = new URLSearchParams(queryString).get("index")?.trim() || undefined;
406
491
  return {
407
492
  collectionName: match[1],
408
493
  path: match[2] ?? '', // Empty string for collection root
494
+ ...(indexName ? { indexName } : {}),
409
495
  };
410
496
  }
411
497
  /**
412
498
  * Build a virtual path from collection name and relative path.
413
499
  */
414
- export function buildVirtualPath(collectionName, path) {
415
- return `qmd://${collectionName}/${path}`;
500
+ export function buildVirtualPath(collectionName, path, indexName) {
501
+ const base = `qmd://${collectionName}/${path}`;
502
+ return indexName ? `${base}?index=${encodeURIComponent(indexName)}` : base;
416
503
  }
417
504
  /**
418
505
  * Check if a path is explicitly a virtual path.
@@ -482,6 +569,7 @@ function createSqliteVecUnavailableError(reason) {
482
569
  "Install Homebrew SQLite so the sqlite-vec extension can be loaded, " +
483
570
  "and set BREW_PREFIX if Homebrew is installed in a non-standard location.");
484
571
  }
572
+ let _sqliteVecUnavailableReason = null;
485
573
  function getErrorMessage(err) {
486
574
  return err instanceof Error ? err.message : String(err);
487
575
  }
@@ -498,15 +586,76 @@ export function verifySqliteVecLoaded(db) {
498
586
  }
499
587
  }
500
588
  let _sqliteVecAvailable = null;
589
+ const CJK_CHAR_PATTERN = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]/u;
590
+ const CJK_RUN_PATTERN = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]+/gu;
591
+ const FTS_CJK_NORMALIZED_VERSION = "1";
592
+ /**
593
+ * FTS5's unicode61 tokenizer does not segment CJK text into searchable words.
594
+ * Normalize CJK runs by spacing every character so exact CJK queries can be
595
+ * translated into phrase queries while Latin text keeps the default tokenizer.
596
+ */
597
+ export function normalizeCjkForFTS(text) {
598
+ return text.replace(CJK_RUN_PATTERN, run => ` ${Array.from(run).join(' ')} `);
599
+ }
600
+ function containsCjk(text) {
601
+ return CJK_CHAR_PATTERN.test(text);
602
+ }
603
+ function sanitizeFTS5Phrase(phrase) {
604
+ return normalizeCjkForFTS(phrase)
605
+ .split(/\s+/)
606
+ .map(t => sanitizeFTS5Term(t))
607
+ .filter(t => t)
608
+ .join(' ');
609
+ }
610
+ function rebuildFTSForCjkNormalization(db) {
611
+ const version = db.prepare(`SELECT value FROM store_config WHERE key = 'fts_cjk_normalized_version'`).get();
612
+ if (version?.value === FTS_CJK_NORMALIZED_VERSION)
613
+ return;
614
+ try {
615
+ db.exec(`DELETE FROM documents_fts WHERE rowid >= 0`);
616
+ }
617
+ catch {
618
+ // Some older/corrupt FTS5 shadow-table states can reject bulk deletes even
619
+ // though reads still work. Recreate the virtual table; documents_fts is a
620
+ // derived index, so rebuilding it from documents/content is safe.
621
+ db.exec(`DROP TABLE IF EXISTS documents_fts`);
622
+ db.exec(`
623
+ CREATE VIRTUAL TABLE documents_fts USING fts5(
624
+ filepath, title, body,
625
+ tokenize='porter unicode61'
626
+ )
627
+ `);
628
+ }
629
+ const rows = db.prepare(`
630
+ SELECT d.id, d.collection, d.path, d.title, content.doc as body
631
+ FROM documents d
632
+ JOIN content ON content.hash = d.hash
633
+ WHERE d.active = 1
634
+ `).all();
635
+ const insert = db.prepare(`INSERT INTO documents_fts(rowid, filepath, title, body) VALUES (?, ?, ?, ?)`);
636
+ const rebuild = db.transaction(() => {
637
+ for (const row of rows) {
638
+ insert.run(row.id, normalizeCjkForFTS(`${row.collection}/${row.path}`), normalizeCjkForFTS(row.title), normalizeCjkForFTS(row.body));
639
+ }
640
+ });
641
+ rebuild();
642
+ db.prepare(`
643
+ INSERT OR REPLACE INTO store_config(key, value)
644
+ VALUES ('fts_cjk_normalized_version', ?)
645
+ `).run(FTS_CJK_NORMALIZED_VERSION);
646
+ }
501
647
  function initializeDatabase(db) {
502
648
  try {
503
649
  loadSqliteVec(db);
504
650
  verifySqliteVecLoaded(db);
505
651
  _sqliteVecAvailable = true;
652
+ _sqliteVecUnavailableReason = null;
506
653
  }
507
- catch {
654
+ catch (err) {
508
655
  // sqlite-vec is optional — vector search won't work but FTS is fine
509
656
  _sqliteVecAvailable = false;
657
+ _sqliteVecUnavailableReason = getErrorMessage(err);
658
+ console.warn(_sqliteVecUnavailableReason);
510
659
  }
511
660
  db.exec("PRAGMA journal_mode = WAL");
512
661
  db.exec("PRAGMA foreign_keys = ON");
@@ -548,19 +697,16 @@ function initializeDatabase(db) {
548
697
  created_at TEXT NOT NULL
549
698
  )
550
699
  `);
551
- // Content vectors
552
- const cvInfo = db.prepare(`PRAGMA table_info(content_vectors)`).all();
553
- const hasSeqColumn = cvInfo.some(col => col.name === 'seq');
554
- if (cvInfo.length > 0 && !hasSeqColumn) {
555
- db.exec(`DROP TABLE IF EXISTS content_vectors`);
556
- db.exec(`DROP TABLE IF EXISTS vectors_vec`);
557
- }
700
+ // Content vectors. Avoid PRAGMA schema probes during startup; legacy vector
701
+ // columns are repaired lazily when a vector/embedding query first needs them.
558
702
  db.exec(`
559
703
  CREATE TABLE IF NOT EXISTS content_vectors (
560
704
  hash TEXT NOT NULL,
561
705
  seq INTEGER NOT NULL DEFAULT 0,
562
706
  pos INTEGER NOT NULL DEFAULT 0,
563
707
  model TEXT NOT NULL,
708
+ embed_fingerprint TEXT NOT NULL DEFAULT '',
709
+ total_chunks INTEGER NOT NULL DEFAULT 1,
564
710
  embedded_at TEXT NOT NULL,
565
711
  PRIMARY KEY (hash, seq)
566
712
  )
@@ -591,9 +737,12 @@ function initializeDatabase(db) {
591
737
  tokenize='porter unicode61'
592
738
  )
593
739
  `);
594
- // Triggers to keep FTS in sync
740
+ // Triggers keep FTS in sync for callers that write directly to documents.
741
+ // Production indexing paths rebuild entries in TypeScript so CJK text can be
742
+ // normalized before it reaches the unicode61 tokenizer.
743
+ db.exec(`DROP TRIGGER IF EXISTS documents_ai`);
595
744
  db.exec(`
596
- CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents
745
+ CREATE TRIGGER documents_ai AFTER INSERT ON documents
597
746
  WHEN new.active = 1
598
747
  BEGIN
599
748
  INSERT INTO documents_fts(rowid, filepath, title, body)
@@ -605,13 +754,15 @@ function initializeDatabase(db) {
605
754
  WHERE new.active = 1;
606
755
  END
607
756
  `);
757
+ db.exec(`DROP TRIGGER IF EXISTS documents_ad`);
608
758
  db.exec(`
609
- CREATE TRIGGER IF NOT EXISTS documents_ad AFTER DELETE ON documents BEGIN
759
+ CREATE TRIGGER documents_ad AFTER DELETE ON documents BEGIN
610
760
  DELETE FROM documents_fts WHERE rowid = old.id;
611
761
  END
612
762
  `);
763
+ db.exec(`DROP TRIGGER IF EXISTS documents_au`);
613
764
  db.exec(`
614
- CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents
765
+ CREATE TRIGGER documents_au AFTER UPDATE ON documents
615
766
  BEGIN
616
767
  -- Delete from FTS if no longer active
617
768
  DELETE FROM documents_fts WHERE rowid = old.id AND new.active = 0;
@@ -626,6 +777,7 @@ function initializeDatabase(db) {
626
777
  WHERE new.active = 1;
627
778
  END
628
779
  `);
780
+ rebuildFTSForCjkNormalization(db);
629
781
  }
630
782
  function rowToNamedCollection(row) {
631
783
  return {
@@ -767,7 +919,7 @@ export function isSqliteVecAvailable() {
767
919
  }
768
920
  function ensureVecTableInternal(db, dimensions) {
769
921
  if (!_sqliteVecAvailable) {
770
- throw new Error("sqlite-vec is not available. Vector operations require a SQLite build with extension loading support.");
922
+ throw createSqliteVecUnavailableError(_sqliteVecUnavailableReason ?? "vector operations require a SQLite build with extension loading support");
771
923
  }
772
924
  const tableInfo = db.prepare(`SELECT sql FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
773
925
  if (tableInfo) {
@@ -777,7 +929,10 @@ function ensureVecTableInternal(db, dimensions) {
777
929
  const existingDims = match?.[1] ? parseInt(match[1], 10) : null;
778
930
  if (existingDims === dimensions && hasHashSeq && hasCosine)
779
931
  return;
780
- // Table exists but wrong schema - need to rebuild
932
+ if (existingDims !== null && existingDims !== dimensions) {
933
+ throw new Error(`Embedding dimension mismatch: existing vectors are ${existingDims}d but the current model produces ${dimensions}d. ` +
934
+ `Run 'qmd embed -f' to re-embed with the new model.`);
935
+ }
781
936
  db.exec("DROP TABLE IF EXISTS vectors_vec");
782
937
  }
783
938
  db.exec(`CREATE VIRTUAL TABLE vectors_vec USING vec0(hash_seq TEXT PRIMARY KEY, embedding float[${dimensions}] distance_metric=cosine)`);
@@ -828,7 +983,7 @@ export async function reindexCollection(store, collectionPath, globPattern, coll
828
983
  }
829
984
  const hash = await hashContent(content);
830
985
  const title = extractTitle(content, relativeFile);
831
- const existing = findActiveDocument(db, collectionName, path);
986
+ const existing = findOrMigrateLegacyDocument(db, collectionName, path);
832
987
  if (existing) {
833
988
  if (existing.hash === hash) {
834
989
  if (existing.title !== title) {
@@ -867,6 +1022,125 @@ export async function reindexCollection(store, collectionPath, globPattern, coll
867
1022
  const orphanedCleaned = cleanupOrphanedContent(db);
868
1023
  return { indexed, updated, unchanged, removed, orphanedCleaned };
869
1024
  }
1025
+ function validatePositiveIntegerOption(name, value, fallback) {
1026
+ if (value === undefined)
1027
+ return fallback;
1028
+ if (!Number.isInteger(value) || value < 1) {
1029
+ throw new Error(`${name} must be a positive integer`);
1030
+ }
1031
+ return value;
1032
+ }
1033
+ function resolveEmbedOptions(options) {
1034
+ return {
1035
+ maxDocsPerBatch: validatePositiveIntegerOption("maxDocsPerBatch", options?.maxDocsPerBatch, DEFAULT_EMBED_MAX_DOCS_PER_BATCH),
1036
+ maxBatchBytes: validatePositiveIntegerOption("maxBatchBytes", options?.maxBatchBytes, DEFAULT_EMBED_MAX_BATCH_BYTES),
1037
+ };
1038
+ }
1039
+ const CONTENT_VECTOR_DESIRED_COLUMNS = [
1040
+ { name: "seq", definition: "INTEGER NOT NULL DEFAULT 0" },
1041
+ { name: "pos", definition: "INTEGER NOT NULL DEFAULT 0" },
1042
+ { name: "model", definition: "TEXT NOT NULL DEFAULT ''" },
1043
+ { name: "embed_fingerprint", definition: "TEXT NOT NULL DEFAULT ''" },
1044
+ { name: "total_chunks", definition: "INTEGER NOT NULL DEFAULT 1" },
1045
+ { name: "embedded_at", definition: "TEXT NOT NULL DEFAULT ''" },
1046
+ ];
1047
+ function isContentVectorColumnError(error) {
1048
+ const message = error instanceof Error ? error.message : String(error);
1049
+ if (!/(no such column|has no column named)/i.test(message)) {
1050
+ return false;
1051
+ }
1052
+ return CONTENT_VECTOR_DESIRED_COLUMNS.some(col => message.includes(col.name));
1053
+ }
1054
+ function runContentVectorColumnRepairs(db) {
1055
+ for (const column of CONTENT_VECTOR_DESIRED_COLUMNS) {
1056
+ try {
1057
+ db.exec(`ALTER TABLE content_vectors ADD COLUMN ${column.name} ${column.definition}`);
1058
+ }
1059
+ catch (error) {
1060
+ const message = error instanceof Error ? error.message : String(error);
1061
+ // The repair series is intentionally idempotent: most columns should
1062
+ // already exist, and another caller may have repaired a missing column
1063
+ // between the failed query and this ALTER series.
1064
+ if (!message.includes("duplicate column name")) {
1065
+ throw error;
1066
+ }
1067
+ }
1068
+ }
1069
+ }
1070
+ function withLazyContentVectorMigration(db, operation) {
1071
+ let repaired = false;
1072
+ while (true) {
1073
+ try {
1074
+ return operation();
1075
+ }
1076
+ catch (error) {
1077
+ if (repaired || !isContentVectorColumnError(error)) {
1078
+ throw error;
1079
+ }
1080
+ runContentVectorColumnRepairs(db);
1081
+ repaired = true;
1082
+ }
1083
+ }
1084
+ }
1085
+ function getPendingEmbeddingDocs(db, collection, model = DEFAULT_EMBED_MODEL) {
1086
+ const collectionFilter = collection ? `AND d.collection = ?` : ``;
1087
+ const fingerprint = getEmbeddingFingerprint(model);
1088
+ return withLazyContentVectorMigration(db, () => {
1089
+ const stmt = db.prepare(`
1090
+ SELECT d.hash, MIN(d.path) as path, length(CAST(c.doc AS BLOB)) as bytes
1091
+ FROM documents d
1092
+ JOIN content c ON d.hash = c.hash
1093
+ LEFT JOIN (
1094
+ SELECT hash, model, COUNT(*) AS chunk_count, MAX(total_chunks) AS expected_chunks
1095
+ FROM content_vectors
1096
+ WHERE model = ? AND embed_fingerprint = ?
1097
+ GROUP BY hash, model, embed_fingerprint
1098
+ ) v ON d.hash = v.hash
1099
+ WHERE d.active = 1
1100
+ AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
1101
+ ${collectionFilter}
1102
+ GROUP BY d.hash
1103
+ ORDER BY MIN(d.path)
1104
+ `);
1105
+ return (collection ? stmt.all(model, fingerprint, collection) : stmt.all(model, fingerprint));
1106
+ });
1107
+ }
1108
+ function buildEmbeddingBatches(docs, maxDocsPerBatch, maxBatchBytes) {
1109
+ const batches = [];
1110
+ let currentBatch = [];
1111
+ let currentBytes = 0;
1112
+ for (const doc of docs) {
1113
+ const docBytes = Math.max(0, doc.bytes);
1114
+ const wouldExceedDocs = currentBatch.length >= maxDocsPerBatch;
1115
+ const wouldExceedBytes = currentBatch.length > 0 && (currentBytes + docBytes) > maxBatchBytes;
1116
+ if (wouldExceedDocs || wouldExceedBytes) {
1117
+ batches.push(currentBatch);
1118
+ currentBatch = [];
1119
+ currentBytes = 0;
1120
+ }
1121
+ currentBatch.push(doc);
1122
+ currentBytes += docBytes;
1123
+ }
1124
+ if (currentBatch.length > 0) {
1125
+ batches.push(currentBatch);
1126
+ }
1127
+ return batches;
1128
+ }
1129
+ function getEmbeddingDocsForBatch(db, batch) {
1130
+ if (batch.length === 0)
1131
+ return [];
1132
+ const placeholders = batch.map(() => "?").join(",");
1133
+ const rows = db.prepare(`
1134
+ SELECT hash, doc as body
1135
+ FROM content
1136
+ WHERE hash IN (${placeholders})
1137
+ `).all(...batch.map(doc => doc.hash));
1138
+ const bodyByHash = new Map(rows.map(row => [row.hash, row.body]));
1139
+ return batch.map((doc) => ({
1140
+ ...doc,
1141
+ body: bodyByHash.get(doc.hash) ?? "",
1142
+ }));
1143
+ }
870
1144
  /**
871
1145
  * Generate vector embeddings for documents that need them.
872
1146
  * Pure function — no console output, no db lifecycle management.
@@ -874,104 +1148,238 @@ export async function reindexCollection(store, collectionPath, globPattern, coll
874
1148
  */
875
1149
  export async function generateEmbeddings(store, options) {
876
1150
  const db = store.db;
877
- const model = options?.model ?? DEFAULT_EMBED_MODEL;
1151
+ const llm = getLlm(store);
1152
+ const model = options?.model ?? llm.embedModelName ?? DEFAULT_EMBED_MODEL;
1153
+ const fingerprint = getEmbeddingFingerprint(model);
878
1154
  const now = new Date().toISOString();
1155
+ const { maxDocsPerBatch, maxBatchBytes } = resolveEmbedOptions(options);
1156
+ const encoder = new TextEncoder();
879
1157
  if (options?.force) {
880
- clearAllEmbeddings(db);
1158
+ clearAllEmbeddings(db, options?.collection);
881
1159
  }
882
- const hashesToEmbed = getHashesForEmbedding(db);
883
- if (hashesToEmbed.length === 0) {
1160
+ const docsToEmbed = getPendingEmbeddingDocs(db, options?.collection, model);
1161
+ if (docsToEmbed.length === 0) {
884
1162
  return { docsProcessed: 0, chunksEmbedded: 0, errors: 0, durationMs: 0 };
885
1163
  }
886
- const allChunks = [];
887
- for (const item of hashesToEmbed) {
888
- const encoder = new TextEncoder();
889
- const bodyBytes = encoder.encode(item.body).length;
890
- if (bodyBytes === 0)
891
- continue;
892
- const title = extractTitle(item.body, item.path);
893
- const chunks = await chunkDocumentByTokens(item.body);
894
- for (let seq = 0; seq < chunks.length; seq++) {
895
- allChunks.push({
896
- hash: item.hash,
897
- title,
898
- text: chunks[seq].text,
899
- seq,
900
- pos: chunks[seq].pos,
901
- tokens: chunks[seq].tokens,
902
- bytes: encoder.encode(chunks[seq].text).length,
903
- });
904
- }
905
- }
906
- if (allChunks.length === 0) {
907
- return { docsProcessed: 0, chunksEmbedded: 0, errors: 0, durationMs: 0 };
908
- }
909
- const totalBytes = allChunks.reduce((sum, chk) => sum + chk.bytes, 0);
910
- const totalChunks = allChunks.length;
911
- const totalDocs = hashesToEmbed.length;
1164
+ const totalBytes = docsToEmbed.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0);
1165
+ const totalDocs = docsToEmbed.length;
912
1166
  const startTime = Date.now();
913
1167
  // Use store's LlamaCpp or global singleton, wrapped in a session
914
- const llm = getLlm(store);
915
- const sessionOptions = { maxDuration: 30 * 60 * 1000, name: 'generateEmbeddings' };
1168
+ const embedModelUri = model;
916
1169
  // Create a session manager for this llm instance
917
1170
  const result = await withLLMSessionForLlm(llm, async (session) => {
918
- // Get embedding dimensions from first chunk
919
- const firstChunk = allChunks[0];
920
- const firstText = formatDocForEmbedding(firstChunk.text, firstChunk.title);
921
- const firstResult = await session.embed(firstText);
922
- if (!firstResult) {
923
- throw new Error("Failed to get embedding dimensions from first chunk");
924
- }
925
- store.ensureVecTable(firstResult.embedding.length);
926
- let chunksEmbedded = 0, errors = 0, bytesProcessed = 0;
1171
+ let chunksEmbedded = 0;
1172
+ let bytesProcessed = 0;
1173
+ let totalChunks = 0;
1174
+ let vectorTableInitialized = false;
927
1175
  const BATCH_SIZE = 32;
928
- for (let batchStart = 0; batchStart < allChunks.length; batchStart += BATCH_SIZE) {
929
- const batchEnd = Math.min(batchStart + BATCH_SIZE, allChunks.length);
930
- const batch = allChunks.slice(batchStart, batchEnd);
931
- const texts = batch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title));
1176
+ const RETRY_AFTER_SUCCESSFUL_CHUNKS = 64;
1177
+ const MAX_RETRY_ATTEMPTS = 3;
1178
+ const failures = new Map();
1179
+ const retryQueue = new Map();
1180
+ let successesSinceRetry = 0;
1181
+ const failureList = () => [...failures.values()];
1182
+ const activeErrorCount = () => failures.size;
1183
+ const chunkKey = (chunk) => `${chunk.hash}:${chunk.seq}`;
1184
+ const reasonFromError = (error) => {
1185
+ const raw = error instanceof Error ? error.message : String(error);
1186
+ return raw.length > 180 ? `${raw.slice(0, 177)}...` : raw;
1187
+ };
1188
+ const recordFailure = (chunk, reason) => {
1189
+ const key = chunkKey(chunk);
1190
+ const previous = failures.get(key);
1191
+ failures.set(key, {
1192
+ path: chunk.path,
1193
+ hash: chunk.hash,
1194
+ seq: chunk.seq,
1195
+ attempts: (previous?.attempts ?? 0) + 1,
1196
+ reason,
1197
+ });
1198
+ retryQueue.set(key, chunk);
1199
+ };
1200
+ const clearFailure = (chunk) => {
1201
+ const key = chunkKey(chunk);
1202
+ failures.delete(key);
1203
+ retryQueue.delete(key);
1204
+ };
1205
+ const tryEmbedChunk = async (chunk) => {
932
1206
  try {
933
- const embeddings = await session.embedBatch(texts);
934
- for (let i = 0; i < batch.length; i++) {
935
- const chunk = batch[i];
936
- const embedding = embeddings[i];
937
- if (embedding) {
938
- insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now);
939
- chunksEmbedded++;
940
- }
941
- else {
942
- errors++;
943
- }
944
- bytesProcessed += chunk.bytes;
1207
+ const text = formatDocForEmbedding(chunk.text, chunk.title, embedModelUri);
1208
+ const result = await session.embed(text, { model });
1209
+ if (!result) {
1210
+ recordFailure(chunk, "embedding returned no vector");
1211
+ return false;
945
1212
  }
1213
+ insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now, chunk.expectedTotalChunks, fingerprint);
1214
+ chunksEmbedded++;
1215
+ successesSinceRetry++;
1216
+ clearFailure(chunk);
1217
+ return true;
946
1218
  }
947
- catch {
948
- // Batch failed — try individual embeddings as fallback
949
- for (const chunk of batch) {
950
- try {
951
- const text = formatDocForEmbedding(chunk.text, chunk.title);
952
- const result = await session.embed(text);
953
- if (result) {
954
- insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now);
1219
+ catch (error) {
1220
+ recordFailure(chunk, reasonFromError(error));
1221
+ return false;
1222
+ }
1223
+ };
1224
+ const retryFailedChunks = async (force = false) => {
1225
+ if (!session.isValid || retryQueue.size === 0)
1226
+ return;
1227
+ if (!force && successesSinceRetry < RETRY_AFTER_SUCCESSFUL_CHUNKS)
1228
+ return;
1229
+ successesSinceRetry = 0;
1230
+ // Normal mode: one retry pass after enough unrelated chunks succeeded.
1231
+ // Force mode: we have run out of other chunks for this batch, so keep
1232
+ // retrying outstanding failures until they recover or hit the cap. The
1233
+ // cap prevents endless loops on permanently bad chunks.
1234
+ do {
1235
+ let retried = 0;
1236
+ for (const [key, chunk] of [...retryQueue]) {
1237
+ const failure = failures.get(key);
1238
+ if (!failure || failure.attempts >= MAX_RETRY_ATTEMPTS)
1239
+ continue;
1240
+ retried++;
1241
+ await tryEmbedChunk(chunk);
1242
+ }
1243
+ if (!force || retried === 0)
1244
+ break;
1245
+ } while (session.isValid && [...retryQueue].some(([key]) => {
1246
+ const failure = failures.get(key);
1247
+ return !!failure && failure.attempts < MAX_RETRY_ATTEMPTS;
1248
+ }));
1249
+ };
1250
+ const batches = buildEmbeddingBatches(docsToEmbed, maxDocsPerBatch, maxBatchBytes);
1251
+ for (const batchMeta of batches) {
1252
+ // Abort early if session has been invalidated
1253
+ if (!session.isValid) {
1254
+ console.warn(`⚠ Session expired — skipping remaining document batches`);
1255
+ break;
1256
+ }
1257
+ const batchDocs = getEmbeddingDocsForBatch(db, batchMeta);
1258
+ const batchChunks = [];
1259
+ const expectedChunksByHash = new Map();
1260
+ const batchBytes = batchMeta.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0);
1261
+ for (const doc of batchDocs) {
1262
+ if (!doc.body.trim())
1263
+ continue;
1264
+ const title = extractTitle(doc.body, doc.path);
1265
+ const chunks = await chunkDocumentByTokens(doc.body, undefined, undefined, undefined, doc.path, options?.chunkStrategy, session.signal);
1266
+ for (let seq = 0; seq < chunks.length; seq++) {
1267
+ batchChunks.push({
1268
+ hash: doc.hash,
1269
+ path: doc.path,
1270
+ title,
1271
+ text: chunks[seq].text,
1272
+ seq,
1273
+ pos: chunks[seq].pos,
1274
+ tokens: chunks[seq].tokens,
1275
+ bytes: encoder.encode(chunks[seq].text).length,
1276
+ expectedTotalChunks: chunks.length,
1277
+ });
1278
+ }
1279
+ expectedChunksByHash.set(doc.hash, chunks.length);
1280
+ }
1281
+ totalChunks += batchChunks.length;
1282
+ if (batchChunks.length === 0) {
1283
+ bytesProcessed += batchBytes;
1284
+ options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors: activeErrorCount(), failures: failureList() });
1285
+ continue;
1286
+ }
1287
+ if (!vectorTableInitialized) {
1288
+ const firstChunk = batchChunks[0];
1289
+ const firstText = formatDocForEmbedding(firstChunk.text, firstChunk.title, embedModelUri);
1290
+ const firstResult = await session.embed(firstText, { model });
1291
+ if (!firstResult) {
1292
+ throw new Error("Failed to get embedding dimensions from first chunk");
1293
+ }
1294
+ store.ensureVecTable(firstResult.embedding.length);
1295
+ vectorTableInitialized = true;
1296
+ }
1297
+ const totalBatchChunkBytes = batchChunks.reduce((sum, chunk) => sum + chunk.bytes, 0);
1298
+ let batchChunkBytesProcessed = 0;
1299
+ for (let batchStart = 0; batchStart < batchChunks.length; batchStart += BATCH_SIZE) {
1300
+ // Abort early if session has been invalidated (e.g. max duration exceeded)
1301
+ if (!session.isValid) {
1302
+ const remainingChunks = batchChunks.slice(batchStart);
1303
+ for (const chunk of remainingChunks)
1304
+ recordFailure(chunk, "LLM session expired before embedding chunk");
1305
+ console.warn(`⚠ Session expired — skipping ${remainingChunks.length} remaining chunks`);
1306
+ break;
1307
+ }
1308
+ // Abort early if active error rate is too high (>80% of attempted chunks failed)
1309
+ const processed = chunksEmbedded + activeErrorCount();
1310
+ if (processed >= BATCH_SIZE && activeErrorCount() > processed * 0.8) {
1311
+ const remainingChunks = batchChunks.slice(batchStart);
1312
+ for (const chunk of remainingChunks)
1313
+ recordFailure(chunk, "embedding aborted because error rate was too high");
1314
+ console.warn(`⚠ Error rate too high (${activeErrorCount()}/${processed}) — aborting embedding`);
1315
+ break;
1316
+ }
1317
+ const batchEnd = Math.min(batchStart + BATCH_SIZE, batchChunks.length);
1318
+ const chunkBatch = batchChunks.slice(batchStart, batchEnd);
1319
+ const texts = chunkBatch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title, embedModelUri));
1320
+ try {
1321
+ const embeddings = await session.embedBatch(texts, { model });
1322
+ for (let i = 0; i < chunkBatch.length; i++) {
1323
+ const chunk = chunkBatch[i];
1324
+ const embedding = embeddings[i];
1325
+ if (embedding) {
1326
+ insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now, chunk.expectedTotalChunks, fingerprint);
955
1327
  chunksEmbedded++;
1328
+ successesSinceRetry++;
1329
+ clearFailure(chunk);
956
1330
  }
957
1331
  else {
958
- errors++;
1332
+ recordFailure(chunk, "batch embedding returned no vector");
959
1333
  }
1334
+ batchChunkBytesProcessed += chunk.bytes;
1335
+ }
1336
+ await retryFailedChunks();
1337
+ }
1338
+ catch (error) {
1339
+ // Batch failed — try individual embeddings as fallback. If an
1340
+ // individual retry succeeds, any prior failure for that chunk is
1341
+ // cleared, so the visible error count reflects outstanding failures.
1342
+ const batchReason = reasonFromError(error);
1343
+ if (!session.isValid) {
1344
+ for (const chunk of chunkBatch)
1345
+ recordFailure(chunk, `batch failed and session expired: ${batchReason}`);
1346
+ batchChunkBytesProcessed += chunkBatch.reduce((sum, c) => sum + c.bytes, 0);
960
1347
  }
961
- catch {
962
- errors++;
1348
+ else {
1349
+ for (const chunk of chunkBatch) {
1350
+ await tryEmbedChunk(chunk);
1351
+ batchChunkBytesProcessed += chunk.bytes;
1352
+ await retryFailedChunks();
1353
+ }
963
1354
  }
964
- bytesProcessed += chunk.bytes;
965
1355
  }
1356
+ const proportionalBytes = totalBatchChunkBytes === 0
1357
+ ? batchBytes
1358
+ : Math.min(batchBytes, Math.round((batchChunkBytesProcessed / totalBatchChunkBytes) * batchBytes));
1359
+ options?.onProgress?.({
1360
+ chunksEmbedded,
1361
+ totalChunks,
1362
+ bytesProcessed: bytesProcessed + proportionalBytes,
1363
+ totalBytes,
1364
+ errors: activeErrorCount(),
1365
+ failures: failureList(),
1366
+ });
1367
+ }
1368
+ await retryFailedChunks(true);
1369
+ const removedPartialChunks = removeIncompleteEmbeddings(db, expectedChunksByHash, model);
1370
+ if (removedPartialChunks > 0) {
1371
+ chunksEmbedded = Math.max(0, chunksEmbedded - removedPartialChunks);
966
1372
  }
967
- options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors });
1373
+ bytesProcessed += batchBytes;
1374
+ options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors: activeErrorCount(), failures: failureList() });
968
1375
  }
969
- return { chunksEmbedded, errors };
970
- }, sessionOptions);
1376
+ return { chunksEmbedded, errors: activeErrorCount(), failures: failureList() };
1377
+ }, { maxDuration: 30 * 60 * 1000, name: 'generateEmbeddings' });
971
1378
  return {
972
1379
  docsProcessed: totalDocs,
973
1380
  chunksEmbedded: result.chunksEmbedded,
974
1381
  errors: result.errors,
1382
+ failures: result.failures,
975
1383
  durationMs: Date.now() - startTime,
976
1384
  };
977
1385
  }
@@ -992,9 +1400,9 @@ export function createStore(dbPath) {
992
1400
  close: () => db.close(),
993
1401
  ensureVecTable: (dimensions) => ensureVecTableInternal(db, dimensions),
994
1402
  // Index health
995
- getHashesNeedingEmbedding: () => getHashesNeedingEmbedding(db),
996
- getIndexHealth: () => getIndexHealth(db),
997
- getStatus: () => getStatus(db),
1403
+ getHashesNeedingEmbedding: (model) => getHashesNeedingEmbedding(db, undefined, model ?? store.llm?.embedModelName ?? DEFAULT_EMBED_MODEL),
1404
+ getIndexHealth: (model) => getIndexHealth(db, model ?? store.llm?.embedModelName ?? DEFAULT_EMBED_MODEL),
1405
+ getStatus: (model) => getStatus(db, model ?? store.llm?.embedModelName ?? DEFAULT_EMBED_MODEL),
998
1406
  // Caching
999
1407
  getCacheKey,
1000
1408
  getCachedResult: (cacheKey) => getCachedResult(db, cacheKey),
@@ -1022,8 +1430,8 @@ export function createStore(dbPath) {
1022
1430
  searchFTS: (query, limit, collectionName) => searchFTS(db, query, limit, collectionName),
1023
1431
  searchVec: (query, model, limit, collectionName, session, precomputedEmbedding) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding),
1024
1432
  // Query expansion & reranking
1025
- expandQuery: (query, model, intent) => expandQuery(query, model, db, intent, store.llm),
1026
- rerank: (query, documents, model, intent) => rerank(query, documents, model, db, intent, store.llm),
1433
+ expandQuery: (query, model, intent) => expandQuery(query, model ?? store.llm?.generateModelName ?? DEFAULT_QUERY_MODEL, db, intent, store.llm),
1434
+ rerank: (query, documents, model, intent) => rerank(query, documents, model ?? store.llm?.rerankModelName ?? DEFAULT_RERANK_MODEL, db, intent, store.llm),
1027
1435
  // Document retrieval
1028
1436
  findDocument: (filename, options) => findDocument(db, filename, options),
1029
1437
  getDocumentBody: (doc, fromLine, maxLines) => getDocumentBody(db, doc, fromLine, maxLines),
@@ -1036,6 +1444,7 @@ export function createStore(dbPath) {
1036
1444
  insertContent: (hash, content, createdAt) => insertContent(db, hash, content, createdAt),
1037
1445
  insertDocument: (collectionName, path, title, hash, createdAt, modifiedAt) => insertDocument(db, collectionName, path, title, hash, createdAt, modifiedAt),
1038
1446
  findActiveDocument: (collectionName, path) => findActiveDocument(db, collectionName, path),
1447
+ findOrMigrateLegacyDocument: (collectionName, path) => findOrMigrateLegacyDocument(db, collectionName, path),
1039
1448
  updateDocumentTitle: (documentId, title, modifiedAt) => updateDocumentTitle(db, documentId, title, modifiedAt),
1040
1449
  updateDocument: (documentId, title, hash, modifiedAt) => updateDocument(db, documentId, title, hash, modifiedAt),
1041
1450
  deactivateDocument: (collectionName, path) => deactivateDocument(db, collectionName, path),
@@ -1043,7 +1452,7 @@ export function createStore(dbPath) {
1043
1452
  // Vector/embedding operations
1044
1453
  getHashesForEmbedding: () => getHashesForEmbedding(db),
1045
1454
  clearAllEmbeddings: () => clearAllEmbeddings(db),
1046
- insertEmbedding: (hash, seq, pos, embedding, model, embeddedAt) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt),
1455
+ insertEmbedding: (hash, seq, pos, embedding, model, embeddedAt, totalChunks, fingerprint) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt, totalChunks, fingerprint),
1047
1456
  };
1048
1457
  return store;
1049
1458
  }
@@ -1056,11 +1465,11 @@ export function getDocid(hash) {
1056
1465
  /**
1057
1466
  * Handelize a filename to be more token-friendly.
1058
1467
  * - Convert triple underscore `___` to `/` (folder separator)
1059
- * - Convert to lowercase
1060
1468
  * - Replace sequences of non-word chars (except /) with single dash
1061
1469
  * - Remove leading/trailing dashes from path segments
1062
1470
  * - Preserve folder structure (a/b/c/d.md stays structured)
1063
1471
  * - Preserve file extension
1472
+ * - Preserve original case (important for case-sensitive filesystems)
1064
1473
  */
1065
1474
  /** Replace emoji/symbol codepoints with their hex representation (e.g. 🐘 → 1f418) */
1066
1475
  function emojiToHex(str) {
@@ -1085,7 +1494,6 @@ export function handelize(path) {
1085
1494
  }
1086
1495
  const result = path
1087
1496
  .replace(/___/g, '/') // Triple underscore becomes folder separator
1088
- .toLowerCase()
1089
1497
  .split('/')
1090
1498
  .map((segment, idx, arr) => {
1091
1499
  const isLastSegment = idx === arr.length - 1;
@@ -1097,7 +1505,7 @@ export function handelize(path) {
1097
1505
  const ext = extMatch ? extMatch[1] : '';
1098
1506
  const nameWithoutExt = ext ? segment.slice(0, -ext.length) : segment;
1099
1507
  const cleanedName = nameWithoutExt
1100
- .replace(/[^\p{L}\p{N}$]+/gu, '-') // Keep route marker "$", dash-separate other chars
1508
+ .replace(/[^\p{L}\p{N}$]+/gu, '-') // Keep letters, numbers, "$"; dash-separate rest (including dots)
1101
1509
  .replace(/^-+|-+$/g, ''); // Remove leading/trailing dashes
1102
1510
  return cleanedName + ext;
1103
1511
  }
@@ -1118,17 +1526,85 @@ export function handelize(path) {
1118
1526
  // =============================================================================
1119
1527
  // Index health
1120
1528
  // =============================================================================
/**
 * Count distinct active document hashes that still need embedding for the
 * given model.
 *
 * A hash counts as "needing embedding" when it has no `content_vectors` rows
 * for the model and its current embedding fingerprint, or when it has fewer
 * stored chunks than the largest `total_chunks` value recorded for it
 * (i.e. a partially embedded document).
 *
 * @param {object} db - Open database handle exposing `prepare()`.
 * @param {string} [collection] - When truthy, restrict the count to this collection.
 * @param {string} [model=DEFAULT_EMBED_MODEL] - Embedding model whose vectors are inspected.
 * @returns {number} Number of distinct hashes still to embed.
 */
export function getHashesNeedingEmbedding(db, collection, model = DEFAULT_EMBED_MODEL) {
    // The filter text is a fixed literal; the collection value itself is bound as a parameter.
    const collectionFilter = collection ? `AND d.collection = ?` : ``;
    const fingerprint = getEmbeddingFingerprint(model);
    return withLazyContentVectorMigration(db, () => {
        const stmt = db.prepare(`
      SELECT COUNT(DISTINCT d.hash) as count
      FROM documents d
      LEFT JOIN (
        SELECT hash, model, COUNT(*) AS chunk_count, MAX(total_chunks) AS expected_chunks
        FROM content_vectors
        WHERE model = ? AND embed_fingerprint = ?
        GROUP BY hash, model, embed_fingerprint
      ) v ON d.hash = v.hash
      WHERE d.active = 1
        AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
        ${collectionFilter}
    `);
        const result = (collection ? stmt.get(model, fingerprint, collection) : stmt.get(model, fingerprint));
        return result.count;
    });
}
1130
- export function getIndexHealth(db) {
1131
- const needsEmbedding = getHashesNeedingEmbedding(db);
/**
 * Try to adopt legacy embeddings (rows in `content_vectors` whose
 * `embed_fingerprint` is the empty string) under the current fingerprint for
 * `model`, without re-embedding everything.
 *
 * One legacy chunk belonging to an active document is re-chunked and
 * re-embedded. If the nearest neighbour in `vectors_vec` is that very chunk
 * and the reported distance is within a small threshold, every
 * empty-fingerprint row for the model is stamped with the current fingerprint.
 *
 * @param {object} store - Store object; `store.db` is used for all SQL.
 * @param {string} [model=DEFAULT_EMBED_MODEL] - Embedding model to check.
 * @returns {Promise<{checked: boolean, adopted: number, reason: string}>}
 *   `checked` is true once a sample embedding was attempted, `adopted` is the
 *   number of rows updated, and `reason` is a human-readable explanation.
 */
export async function maybeAdoptLegacyEmbeddingFingerprint(store, model = DEFAULT_EMBED_MODEL) {
    const db = store.db;
    const fingerprint = getEmbeddingFingerprint(model);
    const legacyCount = withLazyContentVectorMigration(db, () => {
        const row = db.prepare(`SELECT COUNT(DISTINCT hash) AS count FROM content_vectors WHERE model = ? AND embed_fingerprint = ''`).get(model);
        return row.count;
    });
    if (legacyCount === 0) {
        return { checked: false, adopted: 0, reason: "no legacy empty-fingerprint embeddings" };
    }
    // Pick one deterministic sample (lowest hash/seq) that still belongs to an active document.
    const sample = withLazyContentVectorMigration(db, () => db.prepare(`
      SELECT cv.hash, cv.seq, cv.pos, cv.total_chunks, c.doc AS body, MIN(d.path) AS path
      FROM content_vectors cv
      JOIN documents d ON d.hash = cv.hash AND d.active = 1
      JOIN content c ON c.hash = cv.hash
      WHERE cv.model = ? AND cv.embed_fingerprint = ''
      GROUP BY cv.hash, cv.seq, cv.pos, cv.total_chunks, c.doc
      ORDER BY cv.hash, cv.seq
      LIMIT 1
    `).get(model));
    if (!sample) {
        return { checked: false, adopted: 0, reason: `${legacyCount} legacy docs have no active sample` };
    }
    const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
    if (!tableExists) {
        return { checked: false, adopted: 0, reason: "vectors_vec table is missing" };
    }
    // The schema entry can exist while the vec0 module is not loaded; touching
    // the virtual table then throws. Probe it before spending an embedding
    // call, and degrade gracefully like the vector cleanup code does.
    try {
        db.prepare(`SELECT 1 FROM vectors_vec LIMIT 0`).get();
    }
    catch {
        return { checked: false, adopted: 0, reason: "vectors_vec table is not queryable" };
    }
    const expectedHashSeq = `${sample.hash}_${sample.seq}`;
    const title = extractTitle(sample.body, sample.path);
    const llm = getLlm(store);
    return await withLLMSessionForLlm(llm, async (session) => {
        const chunks = await chunkDocumentByTokens(sample.body, undefined, undefined, undefined, sample.path, undefined, session.signal);
        const chunk = chunks[sample.seq];
        if (!chunk) {
            return { checked: true, adopted: 0, reason: `sample chunk ${expectedHashSeq} no longer exists` };
        }
        const result = await session.embed(formatDocForEmbedding(chunk.text, title, model), { model });
        if (!result) {
            return { checked: true, adopted: 0, reason: "failed to embed legacy sample" };
        }
        const nearest = db.prepare(`
        SELECT hash_seq, distance
        FROM vectors_vec
        WHERE embedding MATCH ? AND k = 1
      `).get(new Float32Array(result.embedding));
        if (!nearest) {
            return { checked: true, adopted: 0, reason: "legacy sample vector not found" };
        }
        // Adopt only when the freshly computed vector lands on the stored one.
        const threshold = 0.0001;
        if (nearest.hash_seq !== expectedHashSeq || nearest.distance > threshold) {
            return { checked: true, adopted: 0, reason: `legacy sample differs from current fingerprint (nearest ${nearest.hash_seq}, distance ${nearest.distance.toFixed(6)})` };
        }
        const update = withLazyContentVectorMigration(db, () => db.prepare(`UPDATE content_vectors SET embed_fingerprint = ? WHERE model = ? AND embed_fingerprint = ''`).run(fingerprint, model));
        return { checked: true, adopted: update.changes, reason: `sample ${expectedHashSeq} matched current fingerprint at distance ${nearest.distance.toFixed(6)}` };
    });
}
1606
+ export function getIndexHealth(db, model = DEFAULT_EMBED_MODEL) {
1607
+ const needsEmbedding = getHashesNeedingEmbedding(db, undefined, model);
1132
1608
  const totalDocs = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get().count;
1133
1609
  const mostRecent = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get();
1134
1610
  let daysStale = null;
@@ -1181,13 +1657,15 @@ export function deleteInactiveDocuments(db) {
1181
1657
  return result.changes;
1182
1658
  }
/**
 * Remove orphaned content hashes that are not referenced by any document.
 * Inactive documents are soft-deleted tombstones, so their content rows must
 * remain referenced until deleteInactiveDocuments() hard-deletes them.
 *
 * @param {object} db - Open database handle exposing `prepare()`.
 * @returns {number} The number of orphaned content hashes deleted.
 */
export function cleanupOrphanedContent(db) {
    const result = db.prepare(`
    DELETE FROM content
    WHERE hash NOT IN (SELECT DISTINCT hash FROM documents)
  `).run();
    return result.changes;
}
@@ -1196,39 +1674,50 @@ export function cleanupOrphanedContent(db) {
1196
1674
  * Returns the number of orphaned embedding chunks deleted.
1197
1675
  */
export function cleanupOrphanedVectors(db) {
    // sqlite-vec may not be loaded (e.g. Bun's bun:sqlite lacks loadExtension).
    // The vectors_vec virtual table can appear in sqlite_master from a prior
    // session, but querying it without the vec0 module loaded will crash (#380).
    if (!isSqliteVecAvailable()) {
        return 0;
    }
    // Even past the check above, probe the virtual table directly: touching it
    // without vec0 throws "no such module: vec0", and cleanup should degrade
    // gracefully like the rest of the vector features.
    try {
        db.prepare(`SELECT 1 FROM vectors_vec LIMIT 0`).get();
    }
    catch {
        return 0;
    }
    return withLazyContentVectorMigration(db, () => {
        // Count orphaned vectors first; this count is what the caller gets back.
        const countResult = db.prepare(`
      SELECT COUNT(*) as c FROM content_vectors cv
      WHERE NOT EXISTS (
        SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
      )
    `).get();
        if (countResult.c === 0) {
            return 0;
        }
        // Delete from vectors_vec first, while content_vectors still holds the
        // rows needed to derive the hash_seq keys.
        db.exec(`
      DELETE FROM vectors_vec WHERE hash_seq IN (
        SELECT cv.hash || '_' || cv.seq FROM content_vectors cv
        WHERE NOT EXISTS (
          SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
        )
      )
    `);
        // Delete from content_vectors
        db.exec(`
      DELETE FROM content_vectors WHERE hash NOT IN (
        SELECT hash FROM documents WHERE active = 1
      )
    `);
        return countResult.c;
    });
}
1233
1722
  /**
1234
1723
  * Run VACUUM to reclaim unused space in the database.
@@ -1290,6 +1779,21 @@ export function insertContent(db, hash, content, createdAt) {
1290
1779
  db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`)
1291
1780
  .run(hash, content, createdAt);
1292
1781
  }
/**
 * Rebuild the full-text-search row for a single document.
 *
 * Any existing `documents_fts` row for the id is always removed. A fresh row
 * is inserted only when the document is active and its content row exists, so
 * calling this for an inactive or missing document simply clears its entry.
 *
 * @param {object} db - Open database handle exposing `prepare()`.
 * @param {number} documentId - `documents.id`, also used as the FTS rowid.
 * @returns {void}
 */
function rebuildDocumentFTS(db, documentId) {
    const row = db.prepare(`
    SELECT d.id, d.collection, d.path, d.title, content.doc as body
    FROM documents d
    JOIN content ON content.hash = d.hash
    WHERE d.id = ? AND d.active = 1
  `).get(documentId);
    db.prepare(`DELETE FROM documents_fts WHERE rowid = ?`).run(documentId);
    if (!row)
        return;
    // The indexed filepath is "<collection>/<path>"; every text column goes
    // through normalizeCjkForFTS before being stored.
    db.prepare(`
    INSERT INTO documents_fts(rowid, filepath, title, body)
    VALUES (?, ?, ?, ?)
  `).run(row.id, normalizeCjkForFTS(`${row.collection}/${row.path}`), normalizeCjkForFTS(row.title), normalizeCjkForFTS(row.body));
}
1293
1797
  /**
1294
1798
  * Insert a new document into the documents table.
1295
1799
  */
@@ -1303,6 +1807,9 @@ export function insertDocument(db, collectionName, path, title, hash, createdAt,
1303
1807
  modified_at = excluded.modified_at,
1304
1808
  active = 1
1305
1809
  `).run(collectionName, path, title, hash, createdAt, modifiedAt);
1810
+ const row = db.prepare(`SELECT id FROM documents WHERE collection = ? AND path = ?`).get(collectionName, path);
1811
+ if (row)
1812
+ rebuildDocumentFTS(db, row.id);
1306
1813
  }
1307
1814
  /**
1308
1815
  * Find an active document by collection name and path.
@@ -1314,12 +1821,48 @@ export function findActiveDocument(db, collectionName, path) {
1314
1821
  `).get(collectionName, path);
1315
1822
  return row ?? null;
1316
1823
  }
/**
 * Find an active document, falling back to a case-insensitive path match.
 * If found under a different casing, renames it in-place and rebuilds the
 * FTS entry. Embeddings are keyed by content hash, so the rename is
 * safe — no re-embedding required.
 *
 * @internal Used by reindexCollection and indexFiles during qmd update.
 * @param {object} db - Open database handle exposing `prepare()` and `transaction()`.
 * @param {string} collectionName - Collection the document belongs to.
 * @param {string} path - Path in its current (desired) casing.
 * @returns {object|null} The active document row, or null if the document
 *   does not exist under either path or the rename could not be applied.
 */
export function findOrMigrateLegacyDocument(db, collectionName, path) {
    const existing = findActiveDocument(db, collectionName, path);
    if (existing)
        return existing;
    // Lowest id wins when several rows differ only by case.
    const legacy = db.prepare(`
    SELECT id, hash, title FROM documents
    WHERE collection = ? AND path COLLATE NOCASE = ? AND active = 1
    ORDER BY id
    LIMIT 1
  `).get(collectionName, path);
    if (!legacy)
        return null;
    // Wrap rename + FTS rebuild in a transaction for atomicity.
    const migrate = db.transaction(() => {
        // Use OR IGNORE so a UNIQUE conflict (e.g. both "readme.md" and
        // "README.md" already exist) is a no-op rather than crashing.
        const result = db.prepare(`UPDATE OR IGNORE documents SET path = ? WHERE id = ? AND active = 1`).run(path, legacy.id);
        if (result.changes === 0)
            return false;
        rebuildDocumentFTS(db, legacy.id);
        return true;
    });
    if (!migrate())
        return null;
    return findActiveDocument(db, collectionName, path);
}
/**
 * Update the title and modified_at timestamp for a document, then rebuild
 * its full-text-search row so the new title is searchable.
 *
 * @param {object} db - Open database handle exposing `prepare()`.
 * @param {number} documentId - `documents.id` of the row to update.
 * @param {string} title - New title.
 * @param {string} modifiedAt - New `modified_at` value.
 * @returns {void}
 */
export function updateDocumentTitle(db, documentId, title, modifiedAt) {
    db.prepare(`UPDATE documents SET title = ?, modified_at = ? WHERE id = ?`)
        .run(title, modifiedAt, documentId);
    rebuildDocumentFTS(db, documentId);
}
1324
1867
  /**
1325
1868
  * Update an existing document's hash, title, and modified_at timestamp.
@@ -1328,6 +1871,7 @@ export function updateDocumentTitle(db, documentId, title, modifiedAt) {
export function updateDocument(db, documentId, title, hash, modifiedAt) {
    // Point the document at its new content hash and refresh title/timestamp.
    db.prepare(`UPDATE documents SET title = ?, hash = ?, modified_at = ? WHERE id = ?`)
        .run(title, hash, modifiedAt, documentId);
    // Title and body may both have changed, so the FTS row is rebuilt.
    rebuildDocumentFTS(db, documentId);
}
1332
1876
  /**
1333
1877
  * Deactivate a document (mark as inactive but don't delete).
@@ -1346,52 +1890,44 @@ export function getActiveDocumentPaths(db, collectionName) {
1346
1890
  return rows.map(r => r.path);
1347
1891
  }
1348
1892
  export { formatQueryForEmbedding, formatDocForEmbedding };
/**
 * Chunk a document using regex-only break point detection.
 * This is the sync, backward-compatible API used by tests and legacy callers.
 *
 * Break points and code fences are scanned once and handed to the shared
 * chunking routine.
 *
 * @param {string} content - Full document text.
 * @param {number} [maxChars=CHUNK_SIZE_CHARS] - Target maximum chunk size in characters.
 * @param {number} [overlapChars=CHUNK_OVERLAP_CHARS] - Overlap between consecutive chunks.
 * @param {number} [windowChars=CHUNK_WINDOW_CHARS] - Window used when searching for a break point.
 * @returns {Array} Whatever chunkDocumentWithBreakPoints() returns for these inputs.
 */
export function chunkDocument(content, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS) {
    const breakPoints = scanBreakPoints(content);
    const codeFences = findCodeFences(content);
    return chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars, overlapChars, windowChars);
}
/**
 * Async AST-aware chunking. When the strategy is "auto" and a filepath is
 * given, AST break points are requested from ./ast.js and, if any are
 * returned, merged with the regex break points before delegating to the
 * shared chunk algorithm.
 *
 * Falls back to regex-only break points when strategy is "regex" (the
 * default), filepath is absent, or no AST break points are produced.
 *
 * @param {string} content - Full document text.
 * @param {number} [maxChars=CHUNK_SIZE_CHARS] - Target maximum chunk size in characters.
 * @param {number} [overlapChars=CHUNK_OVERLAP_CHARS] - Overlap between consecutive chunks.
 * @param {number} [windowChars=CHUNK_WINDOW_CHARS] - Window used when searching for a break point.
 * @param {string} [filepath] - Path of the document, passed to the AST helper.
 * @param {string} [chunkStrategy="regex"] - "auto" enables AST break points.
 * @returns {Promise<Array>} Whatever chunkDocumentWithBreakPoints() returns for these inputs.
 */
export async function chunkDocumentAsync(content, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS, filepath, chunkStrategy = "regex") {
    const regexPoints = scanBreakPoints(content);
    const codeFences = findCodeFences(content);
    let breakPoints = regexPoints;
    if (chunkStrategy === "auto" && filepath) {
        // Loaded lazily so the AST module is only imported when actually requested.
        const { getASTBreakPoints } = await import("./ast.js");
        const astPoints = await getASTBreakPoints(content, filepath);
        if (astPoints.length > 0) {
            breakPoints = mergeBreakPoints(regexPoints, astPoints);
        }
    }
    return chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars, overlapChars, windowChars);
}
1390
1923
  /**
1391
1924
  * Chunk a document by actual token count using the LLM tokenizer.
1392
1925
  * More accurate than character-based chunking but requires async.
1926
+ *
1927
+ * When filepath and chunkStrategy are provided, uses AST-aware break points
1928
+ * for supported code files.
1393
1929
  */
1394
- export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKENS, overlapTokens = CHUNK_OVERLAP_TOKENS, windowTokens = CHUNK_WINDOW_TOKENS) {
1930
+ export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKENS, overlapTokens = CHUNK_OVERLAP_TOKENS, windowTokens = CHUNK_WINDOW_TOKENS, filepath, chunkStrategy = "regex", signal) {
1395
1931
  const llm = getDefaultLlamaCpp();
1396
1932
  // Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3)
1397
1933
  // If chunks exceed limit, they'll be re-split with actual ratio
@@ -1400,29 +1936,58 @@ export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKE
1400
1936
  const overlapChars = overlapTokens * avgCharsPerToken;
1401
1937
  const windowChars = windowTokens * avgCharsPerToken;
1402
1938
  // Chunk in character space with conservative estimate
1403
- let charChunks = chunkDocument(content, maxChars, overlapChars, windowChars);
1939
+ // Use AST-aware chunking for the first pass when filepath/strategy provided
1940
+ let charChunks = await chunkDocumentAsync(content, maxChars, overlapChars, windowChars, filepath, chunkStrategy);
1404
1941
  // Tokenize and split any chunks that still exceed limit
1405
1942
  const results = [];
1406
- for (const chunk of charChunks) {
1407
- const tokens = await llm.tokenize(chunk.text);
1408
- if (tokens.length <= maxTokens) {
1409
- results.push({ text: chunk.text, pos: chunk.pos, tokens: tokens.length });
1943
+ const clampOverlapChars = (value, maxChars) => {
1944
+ if (maxChars <= 1)
1945
+ return 0;
1946
+ return Math.max(0, Math.min(maxChars - 1, Math.floor(value)));
1947
+ };
1948
+ const pushChunkWithinTokenLimit = async (text, pos) => {
1949
+ if (signal?.aborted)
1950
+ return;
1951
+ const tokens = await llm.tokenize(text);
1952
+ if (tokens.length <= maxTokens || text.length <= 1) {
1953
+ results.push({ text, pos, tokens: tokens.length });
1954
+ return;
1410
1955
  }
1411
- else {
1412
- // Chunk is still too large - split it further
1413
- // Use actual token count to estimate better char limit
1414
- const actualCharsPerToken = chunk.text.length / tokens.length;
1415
- const safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95); // 5% safety margin
1416
- const subChunks = chunkDocument(chunk.text, safeMaxChars, Math.floor(overlapChars * actualCharsPerToken / 2), Math.floor(windowChars * actualCharsPerToken / 2));
1417
- for (const subChunk of subChunks) {
1418
- const subTokens = await llm.tokenize(subChunk.text);
1419
- results.push({
1420
- text: subChunk.text,
1421
- pos: chunk.pos + subChunk.pos,
1422
- tokens: subTokens.length,
1423
- });
1424
- }
1956
+ const actualCharsPerToken = text.length / tokens.length;
1957
+ let safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95);
1958
+ if (!Number.isFinite(safeMaxChars) || safeMaxChars < 1) {
1959
+ safeMaxChars = Math.floor(text.length / 2);
1960
+ }
1961
+ safeMaxChars = Math.max(1, Math.min(text.length - 1, safeMaxChars));
1962
+ let nextOverlapChars = clampOverlapChars(overlapChars * actualCharsPerToken / 2, safeMaxChars);
1963
+ let nextWindowChars = Math.max(0, Math.floor(windowChars * actualCharsPerToken / 2));
1964
+ let subChunks = chunkDocument(text, safeMaxChars, nextOverlapChars, nextWindowChars);
1965
+ // Pathological single-line blobs can produce no meaningful breakpoint progress.
1966
+ // Fall back to a simple half split so every recursion step strictly shrinks.
1967
+ if (subChunks.length <= 1
1968
+ || subChunks[0]?.text.length === text.length) {
1969
+ safeMaxChars = Math.max(1, Math.floor(text.length / 2));
1970
+ nextOverlapChars = 0;
1971
+ nextWindowChars = 0;
1972
+ subChunks = chunkDocument(text, safeMaxChars, nextOverlapChars, nextWindowChars);
1973
+ }
1974
+ if (subChunks.length <= 1
1975
+ || subChunks[0]?.text.length === text.length) {
1976
+ const fallbackTokens = tokens.slice(0, Math.max(1, maxTokens));
1977
+ const truncatedText = await llm.detokenize(fallbackTokens);
1978
+ results.push({
1979
+ text: truncatedText,
1980
+ pos,
1981
+ tokens: fallbackTokens.length,
1982
+ });
1983
+ return;
1425
1984
  }
1985
+ for (const subChunk of subChunks) {
1986
+ await pushChunkWithinTokenLimit(text.slice(subChunk.pos, subChunk.pos + subChunk.text.length), pos + subChunk.pos);
1987
+ }
1988
+ };
1989
+ for (const chunk of charChunks) {
1990
+ await pushChunkWithinTokenLimit(chunk.text, chunk.pos);
1426
1991
  }
1427
1992
  return results;
1428
1993
  }
@@ -1523,7 +2088,7 @@ export function matchFilesByGlob(db, pattern) {
1523
2088
  `).all();
1524
2089
  const isMatch = picomatch(pattern);
1525
2090
  return allFiles
1526
- .filter(f => isMatch(f.virtual_path) || isMatch(f.path))
2091
+ .filter(f => isMatch(f.virtual_path) || isMatch(f.path) || isMatch(f.collection + '/' + f.path))
1527
2092
  .map(f => ({
1528
2093
  filepath: f.virtual_path, // Virtual path for precise lookup
1529
2094
  displayPath: f.path, // Relative path for display
@@ -1874,8 +2439,23 @@ export function getTopLevelPathsWithoutContext(db, collectionName) {
1874
2439
  // =============================================================================
1875
2440
  // FTS Search
1876
2441
  // =============================================================================
1877
/**
 * Reduce a raw query token to characters that are safe to embed in an FTS5
 * string literal.
 *
 * Unicode letters, Unicode digits, apostrophes and underscores are kept;
 * every other character is dropped and the result is lowercased.
 *
 * @param {string} term - Raw token taken from the query text.
 * @returns {string} The sanitized token; an empty string when nothing survives.
 */
export function sanitizeFTS5Term(term) {
    return term.replace(/[^\p{L}\p{N}'_]/gu, '').toLowerCase();
}
2445
/**
 * Check if a token is a hyphenated compound word (e.g., multi-agent, DEC-0054, gpt-4).
 *
 * The token must start with a letter or digit and contain at least one hyphen
 * that is immediately followed by a letter or digit. Apostrophes and further
 * hyphens are allowed after the first character, so a trailing hyphen after a
 * valid compound (`a-b-`) still matches, while a leading hyphen (`-sports`)
 * never does.
 *
 * @param {string} token - A single whitespace-delimited query token.
 * @returns {boolean} True when the token looks like a hyphenated compound.
 */
function isHyphenatedToken(token) {
    return /^[\p{L}\p{N}][\p{L}\p{N}'-]*-[\p{L}\p{N}][\p{L}\p{N}'-]*$/u.test(token);
}
2452
/**
 * Sanitize a hyphenated term into the body of an FTS5 phrase.
 *
 * The term is split on hyphens, each part is passed through
 * sanitizeFTS5Term, parts that sanitize to nothing are discarded, and the
 * rest are joined with single spaces. The caller wraps the result in FTS5
 * quotes, e.g. "multi agent" for the input multi-agent.
 *
 * @param {string} term - Hyphenated token such as `multi-agent` or `DEC-0054`.
 * @returns {string} Space-separated sanitized parts; empty when no part survives.
 */
function sanitizeHyphenatedTerm(term) {
    return term
        .split('-')
        .map(part => sanitizeFTS5Term(part))
        .filter(part => part)
        .join(' ');
}
1880
2460
  /**
1881
2461
  * Parse lex query syntax into FTS5 query.
@@ -1883,14 +2463,23 @@ function sanitizeFTS5Term(term) {
1883
2463
  * Supports:
1884
2464
  * - Quoted phrases: "exact phrase" → "exact phrase" (exact match)
1885
2465
  * - Negation: -term or -"phrase" → uses FTS5 NOT operator
2466
+ * - Hyphenated tokens: multi-agent, DEC-0054, gpt-4 → treated as phrases
1886
2467
  * - Plain terms: term → "term"* (prefix match)
1887
2468
  *
1888
2469
  * FTS5 NOT is a binary operator: `term1 NOT term2` means "match term1 but not term2".
1889
2470
  * So `-term` only works when there are also positive terms.
1890
2471
  *
2472
+ * Hyphen disambiguation: `-sports` at a word boundary is negation, but `multi-agent`
2473
+ * (where `-` is between word characters) is treated as a hyphenated phrase.
2474
+ * When a leading `-` is followed by what looks like a hyphenated compound word
2475
+ * (e.g., `-multi-agent`), the entire token is treated as a negated phrase.
2476
+ *
1891
2477
  * Examples:
1892
2478
  * performance -sports → "performance"* NOT "sports"*
1893
2479
  * "machine learning" → "machine learning"
2480
+ * multi-agent memory → "multi agent" AND "memory"*
2481
+ * DEC-0054 → "dec 0054"
2482
+ * -multi-agent → NOT "multi agent"
1894
2483
  */
1895
2484
  function buildFTS5Query(query) {
1896
2485
  const positive = [];
@@ -1916,7 +2505,7 @@ function buildFTS5Query(query) {
1916
2505
  const phrase = s.slice(start, i).trim();
1917
2506
  i++; // skip closing quote
1918
2507
  if (phrase.length > 0) {
1919
- const sanitized = phrase.split(/\s+/).map(t => sanitizeFTS5Term(t)).filter(t => t).join(' ');
2508
+ const sanitized = sanitizeFTS5Phrase(phrase);
1920
2509
  if (sanitized) {
1921
2510
  const ftsPhrase = `"${sanitized}"`; // Exact phrase, no prefix match
1922
2511
  if (negated) {
@@ -1934,14 +2523,42 @@ function buildFTS5Query(query) {
1934
2523
  while (i < s.length && !/[\s"]/.test(s[i]))
1935
2524
  i++;
1936
2525
  const term = s.slice(start, i);
1937
- const sanitized = sanitizeFTS5Term(term);
1938
- if (sanitized) {
1939
- const ftsTerm = `"${sanitized}"*`; // Prefix match
1940
- if (negated) {
1941
- negative.push(ftsTerm);
2526
+ // Handle hyphenated tokens: multi-agent, DEC-0054, gpt-4
2527
+ // These get split into phrase queries so FTS5 porter tokenizer matches them.
2528
+ if (isHyphenatedToken(term)) {
2529
+ const sanitized = sanitizeHyphenatedTerm(term);
2530
+ if (sanitized) {
2531
+ const ftsPhrase = `"${sanitized}"`; // Phrase match (no prefix)
2532
+ if (negated) {
2533
+ negative.push(ftsPhrase);
2534
+ }
2535
+ else {
2536
+ positive.push(ftsPhrase);
2537
+ }
1942
2538
  }
1943
- else {
1944
- positive.push(ftsTerm);
2539
+ }
2540
+ else if (containsCjk(term)) {
2541
+ const sanitized = sanitizeFTS5Phrase(term);
2542
+ if (sanitized) {
2543
+ const ftsPhrase = `"${sanitized}"`; // CJK phrase over character tokens
2544
+ if (negated) {
2545
+ negative.push(ftsPhrase);
2546
+ }
2547
+ else {
2548
+ positive.push(ftsPhrase);
2549
+ }
2550
+ }
2551
+ }
2552
+ else {
2553
+ const sanitized = sanitizeFTS5Term(term);
2554
+ if (sanitized) {
2555
+ const ftsTerm = `"${sanitized}"*`; // Prefix match
2556
+ if (negated) {
2557
+ negative.push(ftsTerm);
2558
+ }
2559
+ else {
2560
+ positive.push(ftsTerm);
2561
+ }
1945
2562
  }
1946
2563
  }
1947
2564
  }
@@ -1964,8 +2581,9 @@ function buildFTS5Query(query) {
1964
2581
  * Returns error message if invalid, null if valid.
1965
2582
  */
1966
2583
  export function validateSemanticQuery(query) {
1967
- // Check for negation syntax
1968
- if (/-\w/.test(query) || /-"/.test(query)) {
2584
+ // Check for negation syntax — only at token boundaries (start of string or after whitespace).
2585
+ // Hyphenated words like "real-time" or "write-ahead" must not trigger this.
2586
+ if (/(^|\s)-[\w"]/.test(query)) {
1969
2587
  return 'Negation (-term) is not supported in vec/hyde queries. Use lex for exclusions.';
1970
2588
  }
1971
2589
  return null;
@@ -1984,26 +2602,42 @@ export function searchFTS(db, query, limit = 20, collectionName) {
1984
2602
  const ftsQuery = buildFTS5Query(query);
1985
2603
  if (!ftsQuery)
1986
2604
  return [];
2605
+ // Use a CTE to force FTS5 to run first, then filter by collection.
2606
+ // Without the CTE, SQLite's query planner combines FTS5 MATCH with the
2607
+ // collection filter in a single WHERE clause, which can cause it to
2608
+ // abandon the FTS5 index and fall back to a full scan — turning an 8ms
2609
+ // query into a 17-second query on large collections.
2610
+ const params = [ftsQuery];
2611
+ // When filtering by collection, fetch extra candidates from the FTS index
2612
+ // since some will be filtered out. Without a collection filter we can
2613
+ // fetch exactly the requested limit.
2614
+ const ftsLimit = collectionName ? limit * 10 : limit;
1987
2615
  let sql = `
2616
+ WITH fts_matches AS (
2617
+ SELECT rowid, bm25(documents_fts, 1.5, 4.0, 1.0) as bm25_score
2618
+ FROM documents_fts
2619
+ WHERE documents_fts MATCH ?
2620
+ ORDER BY bm25_score ASC
2621
+ LIMIT ${ftsLimit}
2622
+ )
1988
2623
  SELECT
1989
2624
  'qmd://' || d.collection || '/' || d.path as filepath,
1990
2625
  d.collection || '/' || d.path as display_path,
1991
2626
  d.title,
1992
2627
  content.doc as body,
1993
2628
  d.hash,
1994
- bm25(documents_fts, 10.0, 1.0) as bm25_score
1995
- FROM documents_fts f
1996
- JOIN documents d ON d.id = f.rowid
2629
+ fm.bm25_score
2630
+ FROM fts_matches fm
2631
+ JOIN documents d ON d.id = fm.rowid
1997
2632
  JOIN content ON content.hash = d.hash
1998
- WHERE documents_fts MATCH ? AND d.active = 1
2633
+ WHERE d.active = 1
1999
2634
  `;
2000
- const params = [ftsQuery];
2001
2635
  if (collectionName) {
2002
2636
  sql += ` AND d.collection = ?`;
2003
2637
  params.push(String(collectionName));
2004
2638
  }
2005
2639
  // bm25 lower is better; sort ascending.
2006
- sql += ` ORDER BY bm25_score ASC LIMIT ?`;
2640
+ sql += ` ORDER BY fm.bm25_score ASC LIMIT ?`;
2007
2641
  params.push(limit);
2008
2642
  const rows = db.prepare(sql).all(...params);
2009
2643
  return rows.map(row => {
@@ -2075,7 +2709,7 @@ export async function searchVec(db, query, model, limit = 20, collectionName, se
2075
2709
  docSql += ` AND d.collection = ?`;
2076
2710
  params.push(collectionName);
2077
2711
  }
2078
- const docRows = db.prepare(docSql).all(...params);
2712
+ const docRows = withLazyContentVectorMigration(db, () => db.prepare(docSql).all(...params));
2079
2713
  // Combine with distances and dedupe by filepath
2080
2714
  const seen = new Map();
2081
2715
  for (const row of docRows) {
@@ -2122,34 +2756,124 @@ async function getEmbedding(text, model, isQuery, session, llmOverride) {
2122
2756
  * Get all unique content hashes that need embeddings (from active documents).
2123
2757
  * Returns hash, document body, and a sample path for display purposes.
2124
2758
  */
2125
export function getHashesForEmbedding(db, model = DEFAULT_EMBED_MODEL) {
    // Stored vectors only count when they were written for this model with the
    // same embedding fingerprint.
    const fingerprint = getEmbeddingFingerprint(model);
    // The subquery counts stored chunk rows per hash for this model/fingerprint.
    // A hash is returned when it has no such rows at all, or when fewer rows
    // are stored than the largest total_chunks value recorded for it.
    // NOTE(review): withLazyContentVectorMigration presumably upgrades the
    // content_vectors schema on demand before running the callback — confirm
    // against its definition.
    return withLazyContentVectorMigration(db, () => db.prepare(`
    SELECT d.hash, c.doc as body, MIN(d.path) as path
    FROM documents d
    JOIN content c ON d.hash = c.hash
    LEFT JOIN (
      SELECT hash, model, COUNT(*) AS chunk_count, MAX(total_chunks) AS expected_chunks
      FROM content_vectors
      WHERE model = ? AND embed_fingerprint = ?
      GROUP BY hash, model, embed_fingerprint
    ) v ON d.hash = v.hash
    WHERE d.active = 1
      AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
    GROUP BY d.hash
  `).all(model, fingerprint));
}
2135
2776
/**
 * Clear embeddings for the whole index, or just for one collection.
 *
 * When `collection` is omitted the entire content_vectors table is emptied and
 * the vectors_vec virtual table is dropped (it is recreated with the right
 * dimensions on the next embed run).
 *
 * When `collection` is provided, only vectors whose hash is referenced
 * exclusively by active documents in that collection are removed. Hashes
 * shared with active documents in other collections are left in place so
 * vector search keeps working there (content_vectors is keyed globally by
 * content hash; identical document bodies across collections share a row).
 * vectors_vec is preserved so other collections keep working unless the scoped
 * clear empties content_vectors entirely, in which case it is dropped so the
 * next embed can recreate the table with the current dimensions.
 *
 * @param db - Database handle exposing `exec` and `prepare`.
 * @param {string} [collection] - Collection to clear; any falsy value clears
 *   the whole index.
 * @returns {void}
 */
export function clearAllEmbeddings(db, collection) {
    if (!collection) {
        db.exec(`DELETE FROM content_vectors`);
        db.exec(`DROP TABLE IF EXISTS vectors_vec`);
        return;
    }
    // Hashes used by active documents of this collection and by no active
    // document of any other collection. The single `?` placeholder is the
    // collection name; the fragment is embedded in both statements below.
    const exclusiveHashesQuery = `
    SELECT DISTINCT d.hash
    FROM documents d
    WHERE d.collection = ? AND d.active = 1
      AND NOT EXISTS (
        SELECT 1 FROM documents d2
        WHERE d2.hash = d.hash
          AND d2.active = 1
          AND d2.collection != d.collection
      )
  `;
    // vectors_vec may not exist yet (nothing embedded so far), so check before
    // issuing deletes against it.
    const vecTableExists = db
        .prepare(`SELECT 1 FROM sqlite_master WHERE type='table' AND name='vectors_vec'`)
        .get();
    withLazyContentVectorMigration(db, () => {
        if (vecTableExists) {
            // vectors_vec rows are keyed by "<hash>_<seq>", so collect the
            // (hash, seq) pairs before the content_vectors rows disappear.
            const hashSeqRows = db.prepare(`
        SELECT cv.hash, cv.seq
        FROM content_vectors cv
        WHERE cv.hash IN (${exclusiveHashesQuery})
      `).all(collection);
            const delVec = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
            for (const row of hashSeqRows) {
                delVec.run(`${row.hash}_${row.seq}`);
            }
        }
        db.prepare(`
      DELETE FROM content_vectors
      WHERE hash IN (${exclusiveHashesQuery})
    `).run(collection);
        // Nothing left at all: drop the vector table as the unscoped path does.
        const remaining = db
            .prepare(`SELECT COUNT(*) AS n FROM content_vectors`)
            .get();
        if (remaining.n === 0) {
            db.exec(`DROP TABLE IF EXISTS vectors_vec`);
        }
    });
}
2143
2836
/**
 * Insert a single embedding into both content_vectors and vectors_vec tables.
 * The hash_seq key is formatted as "hash_seq" for the vectors_vec table.
 *
 * content_vectors is inserted first so that getHashesForEmbedding (which checks
 * only content_vectors) won't re-select the hash on a crash between the two inserts.
 *
 * vectors_vec uses DELETE + INSERT instead of INSERT OR REPLACE because sqlite-vec's
 * vec0 virtual tables silently ignore the OR REPLACE conflict clause.
 *
 * @param db - Database handle exposing `prepare`.
 * @param hash - Content hash the chunk belongs to.
 * @param seq - Chunk sequence number within that content.
 * @param pos - Chunk position stored in content_vectors.pos.
 * @param embedding - Vector value bound to vectors_vec.embedding.
 * @param model - Embedding model identifier stored with the row.
 * @param embeddedAt - Timestamp value stored in content_vectors.embedded_at.
 * @param [totalChunks=1] - Number of chunks expected for this hash.
 * @param [fingerprint] - Embedding fingerprint; defaults to
 *   getEmbeddingFingerprint(model).
 * @returns {void}
 */
export function insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt, totalChunks = 1, fingerprint = getEmbeddingFingerprint(model)) {
    const hashSeq = `${hash}_${seq}`;
    withLazyContentVectorMigration(db, () => {
        // Insert content_vectors first — crash-safe ordering (see getHashesForEmbedding)
        const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embed_fingerprint, total_chunks, embedded_at) VALUES (?, ?, ?, ?, ?, ?, ?)`);
        insertContentVectorStmt.run(hash, seq, pos, model, fingerprint, totalChunks, embeddedAt);
        // vec0 virtual tables don't support OR REPLACE — use DELETE + INSERT
        const deleteVecStmt = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
        const insertVecStmt = db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`);
        deleteVecStmt.run(hashSeq);
        insertVecStmt.run(hashSeq, embedding);
    });
}
2859
/**
 * Delete partially written embeddings so the affected hashes can be embedded
 * again from scratch.
 *
 * For every (hash, expectedChunks) entry, the rows stored in content_vectors
 * for that hash and model are counted. Hashes with no rows, or with exactly
 * the expected number of rows, are left alone. Otherwise the matching
 * vectors_vec rows ("<hash>_<seq>") and all content_vectors rows for that
 * hash/model are deleted.
 *
 * @param db - Database handle exposing `prepare`.
 * @param expectedChunksByHash - Iterable of [hash, expectedChunks] pairs
 *   (for example a Map).
 * @param model - Embedding model identifier used to select rows.
 * @returns Whatever withLazyContentVectorMigration returns for the callback;
 *   the callback itself yields the number of content_vectors rows removed.
 */
function removeIncompleteEmbeddings(db, expectedChunksByHash, model) {
    return withLazyContentVectorMigration(db, () => {
        const rowsStmt = db.prepare(`SELECT seq FROM content_vectors WHERE hash = ? AND model = ?`);
        const deleteContentStmt = db.prepare(`DELETE FROM content_vectors WHERE hash = ? AND model = ?`);
        const deleteVecStmt = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
        let removed = 0;
        for (const [hash, expectedChunks] of expectedChunksByHash) {
            const rows = rowsStmt.all(hash, model);
            const untouched = rows.length === 0 || rows.length === expectedChunks;
            if (untouched) {
                continue;
            }
            // Vector rows go first, while their seq values are still known.
            for (const row of rows) {
                deleteVecStmt.run(`${hash}_${row.seq}`);
            }
            deleteContentStmt.run(hash, model);
            removed += rows.length;
        }
        return removed;
    });
}
2154
2878
  // =============================================================================
2155
2879
  // Query expansion
@@ -2161,12 +2885,15 @@ export async function expandQuery(query, model = DEFAULT_QUERY_MODEL, db, intent
2161
2885
  if (cached) {
2162
2886
  try {
2163
2887
  const parsed = JSON.parse(cached);
2888
+ if (!Array.isArray(parsed))
2889
+ return [];
2890
+ const rows = parsed;
2164
2891
  // Migrate old cache format: { type, text } → { type, query }
2165
- if (parsed.length > 0 && parsed[0].query) {
2166
- return parsed;
2892
+ if (rows.length > 0 && typeof rows[0]?.query === "string") {
2893
+ return rows.map((r) => ({ type: r.type, query: String(r.query) }));
2167
2894
  }
2168
- else if (parsed.length > 0 && parsed[0].text) {
2169
- return parsed.map((r) => ({ type: r.type, query: r.text }));
2895
+ else if (rows.length > 0 && typeof rows[0]?.text === "string") {
2896
+ return rows.map((r) => ({ type: r.type, query: String(r.text) }));
2170
2897
  }
2171
2898
  }
2172
2899
  catch {
@@ -2473,7 +3200,7 @@ export function getDocumentBody(db, doc, fromLine, maxLines) {
2473
3200
  let body = row.body;
2474
3201
  if (fromLine !== undefined || maxLines !== undefined) {
2475
3202
  const lines = body.split('\n');
2476
- const start = (fromLine || 1) - 1;
3203
+ const start = Math.max(0, (fromLine || 1) - 1);
2477
3204
  const end = maxLines !== undefined ? start + maxLines : lines.length;
2478
3205
  body = lines.slice(start, end).join('\n');
2479
3206
  }
@@ -2484,7 +3211,7 @@ export function getDocumentBody(db, doc, fromLine, maxLines) {
2484
3211
  * Returns documents without body by default (use getDocumentBody to load)
2485
3212
  */
2486
3213
  export function findDocuments(db, pattern, options = {}) {
2487
- const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?');
3214
+ const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?') && !pattern.includes('{');
2488
3215
  const errors = [];
2489
3216
  const maxBytes = options.maxBytes ?? DEFAULT_MULTI_GET_MAX_BYTES;
2490
3217
  const bodyCol = options.includeBody ? `, content.doc as body` : ``;
@@ -2581,7 +3308,7 @@ export function findDocuments(db, pattern, options = {}) {
2581
3308
  // =============================================================================
2582
3309
  // Status
2583
3310
  // =============================================================================
2584
- export function getStatus(db) {
3311
+ export function getStatus(db, model = DEFAULT_EMBED_MODEL) {
2585
3312
  // DB is source of truth for collections — config provides supplementary metadata
2586
3313
  const dbCollections = db.prepare(`
2587
3314
  SELECT
@@ -2614,7 +3341,7 @@ export function getStatus(db) {
2614
3341
  return new Date(b.lastUpdated).getTime() - new Date(a.lastUpdated).getTime();
2615
3342
  });
2616
3343
  const totalDocs = db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 1`).get().c;
2617
- const needsEmbedding = getHashesNeedingEmbedding(db);
3344
+ const needsEmbedding = getHashesNeedingEmbedding(db, undefined, model);
2618
3345
  const hasVectors = !!db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
2619
3346
  return {
2620
3347
  totalDocuments: totalDocs,
@@ -2661,7 +3388,7 @@ export function extractSnippet(body, query, maxLen = 500, chunkPos, chunkLen, in
2661
3388
  const totalLines = body.split('\n').length;
2662
3389
  let searchBody = body;
2663
3390
  let lineOffset = 0;
2664
- if (chunkPos && chunkPos > 0) {
3391
+ if (chunkPos !== undefined && chunkPos >= 0) {
2665
3392
  // Search within the chunk region, with some padding for context
2666
3393
  // Use provided chunkLen or fall back to max chunk size (covers variable-length chunks)
2667
3394
  const searchLen = chunkLen || CHUNK_SIZE_CHARS;
@@ -2692,6 +3419,22 @@ export function extractSnippet(body, query, maxLen = 500, chunkPos, chunkLen, in
2692
3419
  bestLine = i;
2693
3420
  }
2694
3421
  }
3422
+ if (chunkPos !== undefined && chunkPos >= 0 && bestScore <= 0) {
3423
+ if (chunkPos === 0) {
3424
+ // chunkPos=0 may be the chunk selector's initialization default for queries
3425
+ // where lexical chunk scoring found no winner (e.g. tokens filtered to empty
3426
+ // by the length>2 guard). Retry with full body so the real match isn't missed.
3427
+ return extractSnippet(body, query, maxLen, undefined, undefined, intent);
3428
+ }
3429
+ // For chunkPos > 0 the reranker actively picked this chunk. Tokens failing to
3430
+ // match literally is most likely a tokenizer limitation (quoted phrases, FTS5
3431
+ // syntax, HYDE passages, semantic hits), so anchor on the chunk start rather
3432
+ // than disregarding the reranker's pick.
3433
+ const contextStart = Math.max(0, chunkPos - 100);
3434
+ bestLine = chunkPos > contextStart
3435
+ ? searchBody.slice(0, chunkPos - contextStart).split('\n').length - 1
3436
+ : 0;
3437
+ }
2695
3438
  const start = Math.max(0, bestLine - 1);
2696
3439
  const end = Math.min(lines.length, bestLine + 3);
2697
3440
  const snippetLines = lines.slice(start, end);
@@ -2729,6 +3472,20 @@ export function addLineNumbers(text, startLine = 1) {
2729
3472
  const lines = text.split('\n');
2730
3473
  return lines.map((line, i) => `${startLine + i}: ${line}`).join('\n');
2731
3474
  }
3475
/**
 * RRF list weights for hybridQuery.
 *
 * Original-query retrieval paths are the primary evidence and get 2x weight:
 *   - original FTS
 *   - original vector search
 *
 * Expansion-derived lists (lex/vec/hyde) stay at 1x regardless of list order,
 * so a lex expansion inserted before original vector search cannot steal the
 * original vector boost.
 *
 * @param {Array<{queryType: string}>} rankedListMeta - One metadata entry per
 *   ranked list, in the same order as the lists themselves.
 * @returns {number[]} One weight per entry: 2.0 when `queryType` is
 *   "original", otherwise 1.0.
 */
export function getHybridRrfWeights(rankedListMeta) {
    return rankedListMeta.map(meta => (meta.queryType === "original" ? 2.0 : 1.0));
}
2732
3489
  /**
2733
3490
  * Hybrid search: BM25 + vector + query expansion + RRF + chunked reranking.
2734
3491
  *
@@ -2817,7 +3574,8 @@ export async function hybridQuery(store, query, options) {
2817
3574
  }
2818
3575
  // Batch embed all vector queries in a single call
2819
3576
  const llm = getLlm(store);
2820
- const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text));
3577
+ const embedModel = llm.embedModelName;
3578
+ const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text, embedModel));
2821
3579
  hooks?.onEmbedStart?.(textsToEmbed.length);
2822
3580
  const embedStart = Date.now();
2823
3581
  const embeddings = await llm.embedBatch(textsToEmbed);
@@ -2827,7 +3585,7 @@ export async function hybridQuery(store, query, options) {
2827
3585
  const embedding = embeddings[i]?.embedding;
2828
3586
  if (!embedding)
2829
3587
  continue;
2830
- const vecResults = await store.searchVec(vecQueries[i].text, DEFAULT_EMBED_MODEL, 20, collection, undefined, embedding);
3588
+ const vecResults = await store.searchVec(vecQueries[i].text, embedModel, 20, collection, undefined, embedding);
2831
3589
  if (vecResults.length > 0) {
2832
3590
  for (const r of vecResults)
2833
3591
  docidMap.set(r.filepath, r.docid);
@@ -2843,8 +3601,9 @@ export async function hybridQuery(store, query, options) {
2843
3601
  }
2844
3602
  }
2845
3603
  }
2846
- // Step 4: RRF fusion — first 2 lists (original FTS + first vec) get 2x weight
2847
- const weights = rankedLists.map((_, i) => i < 2 ? 2.0 : 1.0);
3604
+ // Step 4: RRF fusion — original-query FTS and vector lists get 2x weight;
3605
+ // expansion-derived lists stay at 1x independent of insertion order.
3606
+ const weights = getHybridRrfWeights(rankedListMeta);
2848
3607
  const fused = reciprocalRankFusion(rankedLists, weights);
2849
3608
  const rrfTraceByFile = explain ? buildRrfTrace(rankedLists, weights, rankedListMeta) : null;
2850
3609
  const candidates = fused.slice(0, candidateLimit);
@@ -2855,8 +3614,9 @@ export async function hybridQuery(store, query, options) {
2855
3614
  const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 2);
2856
3615
  const intentTerms = intent ? extractIntentTerms(intent) : [];
2857
3616
  const docChunkMap = new Map();
3617
+ const chunkStrategy = options?.chunkStrategy;
2858
3618
  for (const cand of candidates) {
2859
- const chunks = chunkDocument(cand.body);
3619
+ const chunks = await chunkDocumentAsync(cand.body, undefined, undefined, undefined, cand.file, chunkStrategy);
2860
3620
  if (chunks.length === 0)
2861
3621
  continue;
2862
3622
  // Pick chunk with most keyword overlap (fallback: first chunk)
@@ -3024,10 +3784,11 @@ export async function vectorSearchQuery(store, query, options) {
3024
3784
  const vecExpanded = allExpanded.filter(q => q.type !== 'lex');
3025
3785
  options?.hooks?.onExpand?.(query, vecExpanded, Date.now() - expandStart);
3026
3786
  // Run original + vec/hyde expanded through vector, sequentially — concurrent embed() hangs
3787
+ const embedModel = getLlm(store).embedModelName;
3027
3788
  const queryTexts = [query, ...vecExpanded.map(q => q.query)];
3028
3789
  const allResults = new Map();
3029
3790
  for (const q of queryTexts) {
3030
- const vecResults = await store.searchVec(q, DEFAULT_EMBED_MODEL, limit, collection);
3791
+ const vecResults = await store.searchVec(q, embedModel, limit, collection);
3031
3792
  for (const r of vecResults) {
3032
3793
  const existing = allResults.get(r.filepath);
3033
3794
  if (!existing || r.score > existing.score) {
@@ -3128,7 +3889,8 @@ export async function structuredSearch(store, searches, options) {
3128
3889
  const vecSearches = searches.filter((s) => s.type === 'vec' || s.type === 'hyde');
3129
3890
  if (vecSearches.length > 0) {
3130
3891
  const llm = getLlm(store);
3131
- const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query));
3892
+ const embedModel = llm.embedModelName;
3893
+ const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query, embedModel));
3132
3894
  hooks?.onEmbedStart?.(textsToEmbed.length);
3133
3895
  const embedStart = Date.now();
3134
3896
  const embeddings = await llm.embedBatch(textsToEmbed);
@@ -3138,7 +3900,7 @@ export async function structuredSearch(store, searches, options) {
3138
3900
  if (!embedding)
3139
3901
  continue;
3140
3902
  for (const coll of collectionList) {
3141
- const vecResults = await store.searchVec(vecSearches[i].query, DEFAULT_EMBED_MODEL, 20, coll, undefined, embedding);
3903
+ const vecResults = await store.searchVec(vecSearches[i].query, embedModel, 20, coll, undefined, embedding);
3142
3904
  if (vecResults.length > 0) {
3143
3905
  for (const r of vecResults)
3144
3906
  docidMap.set(r.filepath, r.docid);
@@ -3174,8 +3936,9 @@ export async function structuredSearch(store, searches, options) {
3174
3936
  const queryTerms = primaryQuery.toLowerCase().split(/\s+/).filter(t => t.length > 2);
3175
3937
  const intentTerms = intent ? extractIntentTerms(intent) : [];
3176
3938
  const docChunkMap = new Map();
3939
+ const ssChunkStrategy = options?.chunkStrategy;
3177
3940
  for (const cand of candidates) {
3178
- const chunks = chunkDocument(cand.body);
3941
+ const chunks = await chunkDocumentAsync(cand.body, undefined, undefined, undefined, cand.file, ssChunkStrategy);
3179
3942
  if (chunks.length === 0)
3180
3943
  continue;
3181
3944
  // Pick chunk with most keyword overlap