@tobilu/qmd 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/store.js CHANGED
@@ -26,6 +26,8 @@ export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
26
26
  export const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";
27
27
  export const DEFAULT_GLOB = "**/*.md";
28
28
  export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024; // 10KB
29
+ export const DEFAULT_EMBED_MAX_DOCS_PER_BATCH = 64;
30
+ export const DEFAULT_EMBED_MAX_BATCH_BYTES = 64 * 1024 * 1024; // 64MB
29
31
  // Chunking: 900 tokens per chunk with 15% overlap
30
32
  // Increased from 800 to accommodate smart chunking finding natural break points
31
33
  export const CHUNK_SIZE_TOKENS = 900;
@@ -161,6 +163,60 @@ export function findBestCutoff(breakPoints, targetCharPos, windowChars = CHUNK_W
161
163
  }
162
164
  return bestPos;
163
165
  }
166
/**
 * Combine two break-point lists (for example regex-derived and AST-derived)
 * into a single list holding one entry per position.
 *
 * When several entries share a `pos`, the entry with the strictly highest
 * `score` is kept; on equal scores the entry seen first wins, and every
 * entry of `a` is visited before any entry of `b`. The returned array is
 * ordered by ascending `pos`.
 */
export function mergeBreakPoints(a, b) {
    const bestByPos = new Map();
    for (const list of [a, b]) {
        for (const candidate of list) {
            const current = bestByPos.get(candidate.pos);
            // Keep the stored entry unless the candidate strictly outranks it.
            if (current && !(candidate.score > current.score)) {
                continue;
            }
            bestByPos.set(candidate.pos, candidate);
        }
    }
    return [...bestByPos.values()].sort((left, right) => left.pos - right.pos);
}
186
/**
 * Core chunk algorithm that operates on precomputed break points and code fences.
 * This is the shared implementation used by both regex-only and AST-aware chunking.
 *
 * Content no longer than `maxChars` is returned as a single chunk. Otherwise the
 * content is walked front to back: each chunk spans at most `maxChars`
 * characters, its end is moved back to the position chosen by `findBestCutoff`
 * when that position lies inside the current chunk, and the next chunk starts
 * `overlapChars` before the previous end (or at the previous end if the
 * overlap would not advance past the previous chunk's start).
 *
 * @param {string} content - Text to split.
 * @param {Array} breakPoints - Candidate break points, passed through to `findBestCutoff`.
 * @param {Array} codeFences - Code-fence regions, passed through to `findBestCutoff`.
 * @param {number} [maxChars] - Maximum chunk length; values below 1 (or NaN) are treated as 1.
 * @param {number} [overlapChars] - Overlap between consecutive chunks; negative or NaN is treated as 0.
 * @param {number} [windowChars] - Search window, passed through to `findBestCutoff`.
 * @returns {Array<{text: string, pos: number}>} Chunks with their start offset in `content`.
 */
export function chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS) {
    if (content.length <= maxChars) {
        return [{ text: content, pos: 0 }];
    }
    // A chunk size below one character can never advance the cursor and would
    // loop forever pushing empty chunks, so always move at least one character.
    const chunkSpan = maxChars >= 1 ? maxChars : 1;
    // A negative (or NaN) overlap would skip or silently drop content.
    const overlap = overlapChars > 0 ? overlapChars : 0;
    const chunks = [];
    let charPos = 0;
    while (charPos < content.length) {
        const targetEndPos = Math.min(charPos + chunkSpan, content.length);
        let endPos = targetEndPos;
        if (endPos < content.length) {
            const bestCutoff = findBestCutoff(breakPoints, targetEndPos, windowChars, 0.7, codeFences);
            // Only accept a cutoff that lies inside the current chunk.
            if (bestCutoff > charPos && bestCutoff <= targetEndPos) {
                endPos = bestCutoff;
            }
        }
        chunks.push({ text: content.slice(charPos, endPos), pos: charPos });
        // The final chunk runs to the end of the content and needs no overlap.
        if (endPos >= content.length) {
            break;
        }
        const chunkStart = charPos;
        charPos = endPos - overlap;
        // If the overlap would not move past this chunk's start, drop it so
        // the next chunk begins where this one ended.
        if (charPos <= chunkStart) {
            charPos = endPos;
        }
    }
    return chunks;
}
164
220
  // Hybrid query: strong BM25 signal detection thresholds
165
221
  // Skip expensive LLM expansion when top result is strong AND clearly separated from runner-up
166
222
  export const STRONG_SIGNAL_MIN_SCORE = 0.85;
@@ -191,7 +247,8 @@ export function isAbsolutePath(path) {
191
247
  if (path.startsWith('/')) {
192
248
  // Check if it's a Git Bash style path like /c/ or /c/Users (C-Z only, not A or B)
193
249
  // Requires path[2] === '/' to distinguish from Unix paths like /c or /cache
194
- if (path.length >= 3 && path[2] === '/') {
250
+ // Skipped on WSL where /c/ is a valid drvfs mount point, not a drive letter
251
+ if (!isWSL() && path.length >= 3 && path[2] === '/') {
195
252
  const driveLetter = path[1];
196
253
  if (driveLetter && /[c-zC-Z]/.test(driveLetter)) {
197
254
  return true;
@@ -213,6 +270,13 @@ export function isAbsolutePath(path) {
213
270
  export function normalizePathSeparators(path) {
214
271
  return path.replace(/\\/g, '/');
215
272
  }
273
/**
 * Report whether the process appears to run inside WSL (Windows Subsystem
 * for Linux), judged solely by a non-empty WSL_DISTRO_NAME or WSL_INTEROP
 * environment variable.
 * On WSL, paths like /c/work/... are valid drvfs mount points, not Git Bash paths.
 */
function isWSL() {
    const { WSL_DISTRO_NAME: distroName, WSL_INTEROP: interop } = process.env;
    return Boolean(distroName) || Boolean(interop);
}
216
280
  /**
217
281
  * Get the relative path from a prefix.
218
282
  * Returns null if path is not under prefix.
@@ -256,8 +320,9 @@ export function resolve(...paths) {
256
320
  windowsDrive = firstPath.slice(0, 2);
257
321
  result = firstPath.slice(2);
258
322
  }
259
- else if (firstPath.startsWith('/') && firstPath.length >= 3 && firstPath[2] === '/') {
323
+ else if (!isWSL() && firstPath.startsWith('/') && firstPath.length >= 3 && firstPath[2] === '/') {
260
324
  // Git Bash style: /c/ -> C: (C-Z drives only, not A or B)
325
+ // Skipped on WSL where /c/ is a valid drvfs mount point, not a drive letter
261
326
  const driveLetter = firstPath[1];
262
327
  if (driveLetter && /[c-zC-Z]/.test(driveLetter)) {
263
328
  windowsDrive = driveLetter.toUpperCase() + ':';
@@ -288,8 +353,9 @@ export function resolve(...paths) {
288
353
  windowsDrive = p.slice(0, 2);
289
354
  result = p.slice(2);
290
355
  }
291
- else if (p.startsWith('/') && p.length >= 3 && p[2] === '/') {
356
+ else if (!isWSL() && p.startsWith('/') && p.length >= 3 && p[2] === '/') {
292
357
  // Git Bash style (C-Z drives only, not A or B)
358
+ // Skipped on WSL where /c/ is a valid drvfs mount point, not a drive letter
293
359
  const driveLetter = p[1];
294
360
  if (driveLetter && /[c-zC-Z]/.test(driveLetter)) {
295
361
  windowsDrive = driveLetter.toUpperCase() + ':';
@@ -332,6 +398,10 @@ let _productionMode = false;
332
398
  export function enableProductionMode() {
333
399
  _productionMode = true;
334
400
  }
401
/**
 * Test-only hook: switches the module-level production-mode flag back off,
 * undoing a previous `enableProductionMode()` call.
 */
export function _resetProductionModeForTesting() {
    _productionMode = false;
}
335
405
  export function getDefaultDbPath(indexName = "index") {
336
406
  // Always allow override via INDEX_PATH (for testing)
337
407
  if (process.env.INDEX_PATH) {
@@ -504,9 +574,10 @@ function initializeDatabase(db) {
504
574
  verifySqliteVecLoaded(db);
505
575
  _sqliteVecAvailable = true;
506
576
  }
507
- catch {
577
+ catch (err) {
508
578
  // sqlite-vec is optional — vector search won't work but FTS is fine
509
579
  _sqliteVecAvailable = false;
580
+ console.warn(getErrorMessage(err));
510
581
  }
511
582
  db.exec("PRAGMA journal_mode = WAL");
512
583
  db.exec("PRAGMA foreign_keys = ON");
@@ -777,7 +848,10 @@ function ensureVecTableInternal(db, dimensions) {
777
848
  const existingDims = match?.[1] ? parseInt(match[1], 10) : null;
778
849
  if (existingDims === dimensions && hasHashSeq && hasCosine)
779
850
  return;
780
- // Table exists but wrong schema - need to rebuild
851
+ if (existingDims !== null && existingDims !== dimensions) {
852
+ throw new Error(`Embedding dimension mismatch: existing vectors are ${existingDims}d but the current model produces ${dimensions}d. ` +
853
+ `Run 'qmd embed -f' to re-embed with the new model.`);
854
+ }
781
855
  db.exec("DROP TABLE IF EXISTS vectors_vec");
782
856
  }
783
857
  db.exec(`CREATE VIRTUAL TABLE vectors_vec USING vec0(hash_seq TEXT PRIMARY KEY, embedding float[${dimensions}] distance_metric=cosine)`);
@@ -867,6 +941,67 @@ export async function reindexCollection(store, collectionPath, globPattern, coll
867
941
  const orphanedCleaned = cleanupOrphanedContent(db);
868
942
  return { indexed, updated, unchanged, removed, orphanedCleaned };
869
943
  }
944
/**
 * Resolve an optional option that must be a positive integer.
 *
 * @param {string} name - Option name, used in the error message.
 * @param {*} value - Caller-supplied value; `undefined` means "not provided".
 * @param {number} fallback - Returned when `value` is `undefined`.
 * @returns {number} `value` when it is an integer >= 1, otherwise `fallback`.
 * @throws {Error} When `value` is provided but is not an integer >= 1.
 */
function validatePositiveIntegerOption(name, value, fallback) {
    if (value === undefined) {
        return fallback;
    }
    const isPositiveInteger = Number.isInteger(value) && value >= 1;
    if (isPositiveInteger) {
        return value;
    }
    throw new Error(`${name} must be a positive integer`);
}
952
/**
 * Read the batching limits from the embed options, applying the module
 * defaults for anything left undefined and validating what was supplied.
 *
 * @param {object} [options] - Embed options; may be undefined.
 * @returns {{maxDocsPerBatch: number, maxBatchBytes: number}}
 * @throws {Error} When a supplied limit is not a positive integer.
 */
function resolveEmbedOptions(options) {
    const maxDocsPerBatch = validatePositiveIntegerOption("maxDocsPerBatch", options?.maxDocsPerBatch, DEFAULT_EMBED_MAX_DOCS_PER_BATCH);
    const maxBatchBytes = validatePositiveIntegerOption("maxBatchBytes", options?.maxBatchBytes, DEFAULT_EMBED_MAX_BATCH_BYTES);
    return { maxDocsPerBatch, maxBatchBytes };
}
958
/**
 * List the documents that still need embeddings.
 *
 * Selects active documents whose content hash has no `content_vectors` row
 * at `seq = 0`, one row per hash, ordered by the smallest path for that hash.
 * Each row carries `hash`, `path` (smallest path sharing the hash) and
 * `bytes` (length of the stored document cast to a BLOB). Document bodies
 * are not loaded here.
 */
function getPendingEmbeddingDocs(db) {
    const pendingDocsStatement = db.prepare(`
        SELECT d.hash, MIN(d.path) as path, length(CAST(c.doc AS BLOB)) as bytes
        FROM documents d
        JOIN content c ON d.hash = c.hash
        LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
        WHERE d.active = 1 AND v.hash IS NULL
        GROUP BY d.hash
        ORDER BY MIN(d.path)
    `);
    return pendingDocsStatement.all();
}
969
/**
 * Group pending documents into batches bounded by document count and by
 * total byte size, preserving input order.
 *
 * A new batch is started when the current one already holds
 * `maxDocsPerBatch` documents, or when adding the next document would push
 * a non-empty batch past `maxBatchBytes`. A single document larger than
 * `maxBatchBytes` therefore still gets a batch of its own.
 *
 * @param {Array<{bytes: number}>} docs - Documents with their byte size.
 * @param {number} maxDocsPerBatch - Maximum documents per batch.
 * @param {number} maxBatchBytes - Byte budget per batch.
 * @returns {Array<Array>} Batches of the original document objects.
 */
function buildEmbeddingBatches(docs, maxDocsPerBatch, maxBatchBytes) {
    const batches = [];
    let currentBatch = [];
    let currentBytes = 0;
    for (const doc of docs) {
        // A missing or non-numeric size must count as 0: a NaN here would
        // poison the running total and silently disable the byte cap.
        const rawBytes = Number(doc.bytes);
        const docBytes = Number.isFinite(rawBytes) && rawBytes > 0 ? rawBytes : 0;
        const wouldExceedDocs = currentBatch.length >= maxDocsPerBatch;
        const wouldExceedBytes = currentBatch.length > 0 && (currentBytes + docBytes) > maxBatchBytes;
        if (wouldExceedDocs || wouldExceedBytes) {
            batches.push(currentBatch);
            currentBatch = [];
            currentBytes = 0;
        }
        currentBatch.push(doc);
        currentBytes += docBytes;
    }
    if (currentBatch.length > 0) {
        batches.push(currentBatch);
    }
    return batches;
}
990
/**
 * Load the document bodies for one batch of pending documents.
 *
 * @param {object} db - Database handle exposing `prepare(sql).all(...params)`.
 * @param {Array<{hash: string}>} batch - Batch entries.
 * @returns {Array} One object per batch entry, in batch order: the entry's
 *   own fields plus `body`, which is the stored document or "" when no
 *   content row (or a null document) was found for the hash.
 */
function getEmbeddingDocsForBatch(db, batch) {
    if (batch.length === 0)
        return [];
    // The batch size is caller-configurable, while SQLite caps the number of
    // bound variables per statement. Query in fixed-size slices so a large
    // batch cannot fail with "too many SQL variables".
    const MAX_HASHES_PER_QUERY = 500;
    const bodyByHash = new Map();
    for (let start = 0; start < batch.length; start += MAX_HASHES_PER_QUERY) {
        const hashes = batch.slice(start, start + MAX_HASHES_PER_QUERY).map(doc => doc.hash);
        const placeholders = hashes.map(() => "?").join(",");
        const rows = db.prepare(`
            SELECT hash, doc as body
            FROM content
            WHERE hash IN (${placeholders})
        `).all(...hashes);
        for (const row of rows) {
            bodyByHash.set(row.hash, row.body);
        }
    }
    return batch.map((doc) => ({
        ...doc,
        body: bodyByHash.get(doc.hash) ?? "",
    }));
}
870
1005
  /**
871
1006
  * Generate vector embeddings for documents that need them.
872
1007
  * Pure function — no console output, no db lifecycle management.
@@ -876,98 +1011,151 @@ export async function generateEmbeddings(store, options) {
876
1011
  const db = store.db;
877
1012
  const model = options?.model ?? DEFAULT_EMBED_MODEL;
878
1013
  const now = new Date().toISOString();
1014
+ const { maxDocsPerBatch, maxBatchBytes } = resolveEmbedOptions(options);
1015
+ const encoder = new TextEncoder();
879
1016
  if (options?.force) {
880
1017
  clearAllEmbeddings(db);
881
1018
  }
882
- const hashesToEmbed = getHashesForEmbedding(db);
883
- if (hashesToEmbed.length === 0) {
884
- return { docsProcessed: 0, chunksEmbedded: 0, errors: 0, durationMs: 0 };
885
- }
886
- const allChunks = [];
887
- for (const item of hashesToEmbed) {
888
- const encoder = new TextEncoder();
889
- const bodyBytes = encoder.encode(item.body).length;
890
- if (bodyBytes === 0)
891
- continue;
892
- const title = extractTitle(item.body, item.path);
893
- const chunks = await chunkDocumentByTokens(item.body);
894
- for (let seq = 0; seq < chunks.length; seq++) {
895
- allChunks.push({
896
- hash: item.hash,
897
- title,
898
- text: chunks[seq].text,
899
- seq,
900
- pos: chunks[seq].pos,
901
- tokens: chunks[seq].tokens,
902
- bytes: encoder.encode(chunks[seq].text).length,
903
- });
904
- }
905
- }
906
- if (allChunks.length === 0) {
1019
+ const docsToEmbed = getPendingEmbeddingDocs(db);
1020
+ if (docsToEmbed.length === 0) {
907
1021
  return { docsProcessed: 0, chunksEmbedded: 0, errors: 0, durationMs: 0 };
908
1022
  }
909
- const totalBytes = allChunks.reduce((sum, chk) => sum + chk.bytes, 0);
910
- const totalChunks = allChunks.length;
911
- const totalDocs = hashesToEmbed.length;
1023
+ const totalBytes = docsToEmbed.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0);
1024
+ const totalDocs = docsToEmbed.length;
912
1025
  const startTime = Date.now();
913
1026
  // Use store's LlamaCpp or global singleton, wrapped in a session
914
1027
  const llm = getLlm(store);
915
- const sessionOptions = { maxDuration: 30 * 60 * 1000, name: 'generateEmbeddings' };
1028
+ const embedModelUri = llm.embedModelName;
916
1029
  // Create a session manager for this llm instance
917
1030
  const result = await withLLMSessionForLlm(llm, async (session) => {
918
- // Get embedding dimensions from first chunk
919
- const firstChunk = allChunks[0];
920
- const firstText = formatDocForEmbedding(firstChunk.text, firstChunk.title);
921
- const firstResult = await session.embed(firstText);
922
- if (!firstResult) {
923
- throw new Error("Failed to get embedding dimensions from first chunk");
924
- }
925
- store.ensureVecTable(firstResult.embedding.length);
926
- let chunksEmbedded = 0, errors = 0, bytesProcessed = 0;
1031
+ let chunksEmbedded = 0;
1032
+ let errors = 0;
1033
+ let bytesProcessed = 0;
1034
+ let totalChunks = 0;
1035
+ let vectorTableInitialized = false;
927
1036
  const BATCH_SIZE = 32;
928
- for (let batchStart = 0; batchStart < allChunks.length; batchStart += BATCH_SIZE) {
929
- const batchEnd = Math.min(batchStart + BATCH_SIZE, allChunks.length);
930
- const batch = allChunks.slice(batchStart, batchEnd);
931
- const texts = batch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title));
932
- try {
933
- const embeddings = await session.embedBatch(texts);
934
- for (let i = 0; i < batch.length; i++) {
935
- const chunk = batch[i];
936
- const embedding = embeddings[i];
937
- if (embedding) {
938
- insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now);
939
- chunksEmbedded++;
940
- }
941
- else {
942
- errors++;
943
- }
944
- bytesProcessed += chunk.bytes;
1037
+ const batches = buildEmbeddingBatches(docsToEmbed, maxDocsPerBatch, maxBatchBytes);
1038
+ for (const batchMeta of batches) {
1039
+ // Abort early if session has been invalidated
1040
+ if (!session.isValid) {
1041
+ console.warn(`⚠ Session expired — skipping remaining document batches`);
1042
+ break;
1043
+ }
1044
+ const batchDocs = getEmbeddingDocsForBatch(db, batchMeta);
1045
+ const batchChunks = [];
1046
+ const batchBytes = batchMeta.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0);
1047
+ for (const doc of batchDocs) {
1048
+ if (!doc.body.trim())
1049
+ continue;
1050
+ const title = extractTitle(doc.body, doc.path);
1051
+ const chunks = await chunkDocumentByTokens(doc.body, undefined, undefined, undefined, doc.path, options?.chunkStrategy, session.signal);
1052
+ for (let seq = 0; seq < chunks.length; seq++) {
1053
+ batchChunks.push({
1054
+ hash: doc.hash,
1055
+ title,
1056
+ text: chunks[seq].text,
1057
+ seq,
1058
+ pos: chunks[seq].pos,
1059
+ tokens: chunks[seq].tokens,
1060
+ bytes: encoder.encode(chunks[seq].text).length,
1061
+ });
945
1062
  }
946
1063
  }
947
- catch {
948
- // Batch failed try individual embeddings as fallback
949
- for (const chunk of batch) {
950
- try {
951
- const text = formatDocForEmbedding(chunk.text, chunk.title);
952
- const result = await session.embed(text);
953
- if (result) {
954
- insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now);
1064
+ totalChunks += batchChunks.length;
1065
+ if (batchChunks.length === 0) {
1066
+ bytesProcessed += batchBytes;
1067
+ options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors });
1068
+ continue;
1069
+ }
1070
+ if (!vectorTableInitialized) {
1071
+ const firstChunk = batchChunks[0];
1072
+ const firstText = formatDocForEmbedding(firstChunk.text, firstChunk.title, embedModelUri);
1073
+ const firstResult = await session.embed(firstText, { model });
1074
+ if (!firstResult) {
1075
+ throw new Error("Failed to get embedding dimensions from first chunk");
1076
+ }
1077
+ store.ensureVecTable(firstResult.embedding.length);
1078
+ vectorTableInitialized = true;
1079
+ }
1080
+ const totalBatchChunkBytes = batchChunks.reduce((sum, chunk) => sum + chunk.bytes, 0);
1081
+ let batchChunkBytesProcessed = 0;
1082
+ for (let batchStart = 0; batchStart < batchChunks.length; batchStart += BATCH_SIZE) {
1083
+ // Abort early if session has been invalidated (e.g. max duration exceeded)
1084
+ if (!session.isValid) {
1085
+ const remaining = batchChunks.length - batchStart;
1086
+ errors += remaining;
1087
+ console.warn(`⚠ Session expired — skipping ${remaining} remaining chunks`);
1088
+ break;
1089
+ }
1090
+ // Abort early if error rate is too high (>80% of processed chunks failed)
1091
+ const processed = chunksEmbedded + errors;
1092
+ if (processed >= BATCH_SIZE && errors > processed * 0.8) {
1093
+ const remaining = batchChunks.length - batchStart;
1094
+ errors += remaining;
1095
+ console.warn(`⚠ Error rate too high (${errors}/${processed}) — aborting embedding`);
1096
+ break;
1097
+ }
1098
+ const batchEnd = Math.min(batchStart + BATCH_SIZE, batchChunks.length);
1099
+ const chunkBatch = batchChunks.slice(batchStart, batchEnd);
1100
+ const texts = chunkBatch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title, embedModelUri));
1101
+ try {
1102
+ const embeddings = await session.embedBatch(texts, { model });
1103
+ for (let i = 0; i < chunkBatch.length; i++) {
1104
+ const chunk = chunkBatch[i];
1105
+ const embedding = embeddings[i];
1106
+ if (embedding) {
1107
+ insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now);
955
1108
  chunksEmbedded++;
956
1109
  }
957
1110
  else {
958
1111
  errors++;
959
1112
  }
1113
+ batchChunkBytesProcessed += chunk.bytes;
1114
+ }
1115
+ }
1116
+ catch {
1117
+ // Batch failed — try individual embeddings as fallback
1118
+ // But skip if session is already invalid (avoids N doomed retries)
1119
+ if (!session.isValid) {
1120
+ errors += chunkBatch.length;
1121
+ batchChunkBytesProcessed += chunkBatch.reduce((sum, c) => sum + c.bytes, 0);
960
1122
  }
961
- catch {
962
- errors++;
1123
+ else {
1124
+ for (const chunk of chunkBatch) {
1125
+ try {
1126
+ const text = formatDocForEmbedding(chunk.text, chunk.title, embedModelUri);
1127
+ const result = await session.embed(text, { model });
1128
+ if (result) {
1129
+ insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now);
1130
+ chunksEmbedded++;
1131
+ }
1132
+ else {
1133
+ errors++;
1134
+ }
1135
+ }
1136
+ catch {
1137
+ errors++;
1138
+ }
1139
+ batchChunkBytesProcessed += chunk.bytes;
1140
+ }
963
1141
  }
964
- bytesProcessed += chunk.bytes;
965
1142
  }
1143
+ const proportionalBytes = totalBatchChunkBytes === 0
1144
+ ? batchBytes
1145
+ : Math.min(batchBytes, Math.round((batchChunkBytesProcessed / totalBatchChunkBytes) * batchBytes));
1146
+ options?.onProgress?.({
1147
+ chunksEmbedded,
1148
+ totalChunks,
1149
+ bytesProcessed: bytesProcessed + proportionalBytes,
1150
+ totalBytes,
1151
+ errors,
1152
+ });
966
1153
  }
1154
+ bytesProcessed += batchBytes;
967
1155
  options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors });
968
1156
  }
969
1157
  return { chunksEmbedded, errors };
970
- }, sessionOptions);
1158
+ }, { maxDuration: 30 * 60 * 1000, name: 'generateEmbeddings' });
971
1159
  return {
972
1160
  docsProcessed: totalDocs,
973
1161
  chunksEmbedded: result.chunksEmbedded,
@@ -1097,7 +1285,7 @@ export function handelize(path) {
1097
1285
  const ext = extMatch ? extMatch[1] : '';
1098
1286
  const nameWithoutExt = ext ? segment.slice(0, -ext.length) : segment;
1099
1287
  const cleanedName = nameWithoutExt
1100
- .replace(/[^\p{L}\p{N}$]+/gu, '-') // Keep route marker "$", dash-separate other chars
1288
+ .replace(/[^\p{L}\p{N}$]+/gu, '-') // Keep letters, numbers, "$"; dash-separate rest (including dots)
1101
1289
  .replace(/^-+|-+$/g, ''); // Remove leading/trailing dashes
1102
1290
  return cleanedName + ext;
1103
1291
  }
@@ -1196,11 +1384,20 @@ export function cleanupOrphanedContent(db) {
1196
1384
  * Returns the number of orphaned embedding chunks deleted.
1197
1385
  */
1198
1386
  export function cleanupOrphanedVectors(db) {
1199
- // Check if vectors_vec table exists
1200
- const tableExists = db.prepare(`
1201
- SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'
1202
- `).get();
1203
- if (!tableExists) {
1387
+ // sqlite-vec may not be loaded (e.g. Bun's bun:sqlite lacks loadExtension).
1388
+ // The vectors_vec virtual table can appear in sqlite_master from a prior
1389
+ // session, but querying it without the vec0 module loaded will crash (#380).
1390
+ if (!isSqliteVecAvailable()) {
1391
+ return 0;
1392
+ }
1393
+ // The schema entry can exist even when sqlite-vec itself is unavailable
1394
+ // (for example when reopening a DB without vec0 loaded). In that case,
1395
+ // touching the virtual table throws "no such module: vec0" and cleanup
1396
+ // should degrade gracefully like the rest of the vector features.
1397
+ try {
1398
+ db.prepare(`SELECT 1 FROM vectors_vec LIMIT 0`).get();
1399
+ }
1400
+ catch {
1204
1401
  return 0;
1205
1402
  }
1206
1403
  // Count orphaned vectors first
@@ -1346,52 +1543,44 @@ export function getActiveDocumentPaths(db, collectionName) {
1346
1543
  return rows.map(r => r.path);
1347
1544
  }
1348
1545
  export { formatQueryForEmbedding, formatDocForEmbedding };
1546
+ /**
1547
+ * Chunk a document using regex-only break point detection.
1548
+ * This is the sync, backward-compatible API used by tests and legacy callers.
1549
+ */
1349
1550
  export function chunkDocument(content, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS) {
1350
- if (content.length <= maxChars) {
1351
- return [{ text: content, pos: 0 }];
1352
- }
1353
- // Pre-scan all break points and code fences once
1354
1551
  const breakPoints = scanBreakPoints(content);
1355
1552
  const codeFences = findCodeFences(content);
1356
- const chunks = [];
1357
- let charPos = 0;
1358
- while (charPos < content.length) {
1359
- // Calculate target end position for this chunk
1360
- const targetEndPos = Math.min(charPos + maxChars, content.length);
1361
- let endPos = targetEndPos;
1362
- // If not at the end, find the best break point
1363
- if (endPos < content.length) {
1364
- // Find best cutoff using scored algorithm
1365
- const bestCutoff = findBestCutoff(breakPoints, targetEndPos, windowChars, 0.7, codeFences);
1366
- // Only use the cutoff if it's within our current chunk
1367
- if (bestCutoff > charPos && bestCutoff <= targetEndPos) {
1368
- endPos = bestCutoff;
1369
- }
1370
- }
1371
- // Ensure we make progress
1372
- if (endPos <= charPos) {
1373
- endPos = Math.min(charPos + maxChars, content.length);
1374
- }
1375
- chunks.push({ text: content.slice(charPos, endPos), pos: charPos });
1376
- // Move forward, but overlap with previous chunk
1377
- // For last chunk, don't overlap (just go to the end)
1378
- if (endPos >= content.length) {
1379
- break;
1380
- }
1381
- charPos = endPos - overlapChars;
1382
- const lastChunkPos = chunks.at(-1).pos;
1383
- if (charPos <= lastChunkPos) {
1384
- // Prevent infinite loop - move forward at least a bit
1385
- charPos = endPos;
1553
+ return chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars, overlapChars, windowChars);
1554
+ }
1555
+ /**
1556
+ * Async AST-aware chunking. Detects language from filepath, computes AST
1557
+ * break points for supported code files, merges with regex break points,
1558
+ * and delegates to the shared chunk algorithm.
1559
+ *
1560
+ * Falls back to regex-only when strategy is "regex", filepath is absent,
1561
+ * or language is unsupported.
1562
+ */
1563
+ export async function chunkDocumentAsync(content, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS, filepath, chunkStrategy = "regex") {
1564
+ const regexPoints = scanBreakPoints(content);
1565
+ const codeFences = findCodeFences(content);
1566
+ let breakPoints = regexPoints;
1567
+ if (chunkStrategy === "auto" && filepath) {
1568
+ const { getASTBreakPoints } = await import("./ast.js");
1569
+ const astPoints = await getASTBreakPoints(content, filepath);
1570
+ if (astPoints.length > 0) {
1571
+ breakPoints = mergeBreakPoints(regexPoints, astPoints);
1386
1572
  }
1387
1573
  }
1388
- return chunks;
1574
+ return chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars, overlapChars, windowChars);
1389
1575
  }
1390
1576
  /**
1391
1577
  * Chunk a document by actual token count using the LLM tokenizer.
1392
1578
  * More accurate than character-based chunking but requires async.
1579
+ *
1580
+ * When filepath and chunkStrategy are provided, uses AST-aware break points
1581
+ * for supported code files.
1393
1582
  */
1394
- export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKENS, overlapTokens = CHUNK_OVERLAP_TOKENS, windowTokens = CHUNK_WINDOW_TOKENS) {
1583
+ export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKENS, overlapTokens = CHUNK_OVERLAP_TOKENS, windowTokens = CHUNK_WINDOW_TOKENS, filepath, chunkStrategy = "regex", signal) {
1395
1584
  const llm = getDefaultLlamaCpp();
1396
1585
  // Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3)
1397
1586
  // If chunks exceed limit, they'll be re-split with actual ratio
@@ -1400,10 +1589,14 @@ export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKE
1400
1589
  const overlapChars = overlapTokens * avgCharsPerToken;
1401
1590
  const windowChars = windowTokens * avgCharsPerToken;
1402
1591
  // Chunk in character space with conservative estimate
1403
- let charChunks = chunkDocument(content, maxChars, overlapChars, windowChars);
1592
+ // Use AST-aware chunking for the first pass when filepath/strategy provided
1593
+ let charChunks = await chunkDocumentAsync(content, maxChars, overlapChars, windowChars, filepath, chunkStrategy);
1404
1594
  // Tokenize and split any chunks that still exceed limit
1405
1595
  const results = [];
1406
1596
  for (const chunk of charChunks) {
1597
+ // Respect abort signal to avoid runaway tokenization
1598
+ if (signal?.aborted)
1599
+ break;
1407
1600
  const tokens = await llm.tokenize(chunk.text);
1408
1601
  if (tokens.length <= maxTokens) {
1409
1602
  results.push({ text: chunk.text, pos: chunk.pos, tokens: tokens.length });
@@ -1415,6 +1608,8 @@ export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKE
1415
1608
  const safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95); // 5% safety margin
1416
1609
  const subChunks = chunkDocument(chunk.text, safeMaxChars, Math.floor(overlapChars * actualCharsPerToken / 2), Math.floor(windowChars * actualCharsPerToken / 2));
1417
1610
  for (const subChunk of subChunks) {
1611
+ if (signal?.aborted)
1612
+ break;
1418
1613
  const subTokens = await llm.tokenize(subChunk.text);
1419
1614
  results.push({
1420
1615
  text: subChunk.text,
@@ -1523,7 +1718,7 @@ export function matchFilesByGlob(db, pattern) {
1523
1718
  `).all();
1524
1719
  const isMatch = picomatch(pattern);
1525
1720
  return allFiles
1526
- .filter(f => isMatch(f.virtual_path) || isMatch(f.path))
1721
+ .filter(f => isMatch(f.virtual_path) || isMatch(f.path) || isMatch(f.collection + '/' + f.path))
1527
1722
  .map(f => ({
1528
1723
  filepath: f.virtual_path, // Virtual path for precise lookup
1529
1724
  displayPath: f.path, // Relative path for display
@@ -1874,8 +2069,23 @@ export function getTopLevelPathsWithoutContext(db, collectionName) {
1874
2069
  // =============================================================================
1875
2070
  // FTS Search
1876
2071
  // =============================================================================
1877
- function sanitizeFTS5Term(term) {
1878
- return term.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase();
2072
/**
 * Reduce a search term to the characters kept for FTS5: Unicode letters,
 * Unicode numbers, apostrophes and underscores. Everything else is removed
 * and the result is lower-cased.
 */
export function sanitizeFTS5Term(term) {
    const keptChars = term.replace(/[^\p{L}\p{N}'_]/gu, '');
    return keptChars.toLowerCase();
}
2075
/**
 * Tell whether a token is a hyphenated compound such as multi-agent,
 * DEC-0054 or gpt-4.
 *
 * The token must start with a letter or digit, consist only of letters,
 * digits, apostrophes and hyphens, and contain at least one hyphen that is
 * directly followed by a letter or digit. A token with only a leading
 * hyphen (e.g. -sports) therefore does not qualify.
 */
function isHyphenatedToken(token) {
    const hyphenatedCompound = /^[\p{L}\p{N}][\p{L}\p{N}'-]*-[\p{L}\p{N}][\p{L}\p{N}'-]*$/u;
    return hyphenatedCompound.test(token);
}
2082
/**
 * Turn a hyphenated term into the body of an FTS5 phrase: the term is split
 * on hyphens, every part is passed through sanitizeFTS5Term, parts that end
 * up empty are dropped, and the rest are joined with single spaces. The
 * caller wraps the result in quotes, e.g. multi-agent becomes "multi agent".
 */
function sanitizeHyphenatedTerm(term) {
    const parts = [];
    for (const piece of term.split('-')) {
        const cleaned = sanitizeFTS5Term(piece);
        if (cleaned) {
            parts.push(cleaned);
        }
    }
    return parts.join(' ');
}
1880
2090
  /**
1881
2091
  * Parse lex query syntax into FTS5 query.
@@ -1883,14 +2093,23 @@ function sanitizeFTS5Term(term) {
1883
2093
  * Supports:
1884
2094
  * - Quoted phrases: "exact phrase" → "exact phrase" (exact match)
1885
2095
  * - Negation: -term or -"phrase" → uses FTS5 NOT operator
2096
+ * - Hyphenated tokens: multi-agent, DEC-0054, gpt-4 → treated as phrases
1886
2097
  * - Plain terms: term → "term"* (prefix match)
1887
2098
  *
1888
2099
  * FTS5 NOT is a binary operator: `term1 NOT term2` means "match term1 but not term2".
1889
2100
  * So `-term` only works when there are also positive terms.
1890
2101
  *
2102
+ * Hyphen disambiguation: `-sports` at a word boundary is negation, but `multi-agent`
2103
+ * (where `-` is between word characters) is treated as a hyphenated phrase.
2104
+ * When a leading `-` is followed by what looks like a hyphenated compound word
2105
+ * (e.g., `-multi-agent`), the entire token is treated as a negated phrase.
2106
+ *
1891
2107
  * Examples:
1892
2108
  * performance -sports → "performance"* NOT "sports"*
1893
2109
  * "machine learning" → "machine learning"
2110
+ * multi-agent memory → "multi agent" AND "memory"*
2111
+ * DEC-0054 → "dec 0054"
2112
+ * -multi-agent → NOT "multi agent"
1894
2113
  */
1895
2114
  function buildFTS5Query(query) {
1896
2115
  const positive = [];
@@ -1934,14 +2153,30 @@ function buildFTS5Query(query) {
1934
2153
  while (i < s.length && !/[\s"]/.test(s[i]))
1935
2154
  i++;
1936
2155
  const term = s.slice(start, i);
1937
- const sanitized = sanitizeFTS5Term(term);
1938
- if (sanitized) {
1939
- const ftsTerm = `"${sanitized}"*`; // Prefix match
1940
- if (negated) {
1941
- negative.push(ftsTerm);
2156
+ // Handle hyphenated tokens: multi-agent, DEC-0054, gpt-4
2157
+ // These get split into phrase queries so FTS5 porter tokenizer matches them.
2158
+ if (isHyphenatedToken(term)) {
2159
+ const sanitized = sanitizeHyphenatedTerm(term);
2160
+ if (sanitized) {
2161
+ const ftsPhrase = `"${sanitized}"`; // Phrase match (no prefix)
2162
+ if (negated) {
2163
+ negative.push(ftsPhrase);
2164
+ }
2165
+ else {
2166
+ positive.push(ftsPhrase);
2167
+ }
1942
2168
  }
1943
- else {
1944
- positive.push(ftsTerm);
2169
+ }
2170
+ else {
2171
+ const sanitized = sanitizeFTS5Term(term);
2172
+ if (sanitized) {
2173
+ const ftsTerm = `"${sanitized}"*`; // Prefix match
2174
+ if (negated) {
2175
+ negative.push(ftsTerm);
2176
+ }
2177
+ else {
2178
+ positive.push(ftsTerm);
2179
+ }
1945
2180
  }
1946
2181
  }
1947
2182
  }
@@ -1984,26 +2219,42 @@ export function searchFTS(db, query, limit = 20, collectionName) {
1984
2219
  const ftsQuery = buildFTS5Query(query);
1985
2220
  if (!ftsQuery)
1986
2221
  return [];
2222
+ // Use a CTE to force FTS5 to run first, then filter by collection.
2223
+ // Without the CTE, SQLite's query planner combines FTS5 MATCH with the
2224
+ // collection filter in a single WHERE clause, which can cause it to
2225
+ // abandon the FTS5 index and fall back to a full scan — turning an 8ms
2226
+ // query into a 17-second query on large collections.
2227
+ const params = [ftsQuery];
2228
+ // When filtering by collection, fetch extra candidates from the FTS index
2229
+ // since some will be filtered out. Without a collection filter we can
2230
+ // fetch exactly the requested limit.
2231
+ const ftsLimit = collectionName ? limit * 10 : limit;
1987
2232
  let sql = `
2233
+ WITH fts_matches AS (
2234
+ SELECT rowid, bm25(documents_fts, 1.5, 4.0, 1.0) as bm25_score
2235
+ FROM documents_fts
2236
+ WHERE documents_fts MATCH ?
2237
+ ORDER BY bm25_score ASC
2238
+ LIMIT ${ftsLimit}
2239
+ )
1988
2240
  SELECT
1989
2241
  'qmd://' || d.collection || '/' || d.path as filepath,
1990
2242
  d.collection || '/' || d.path as display_path,
1991
2243
  d.title,
1992
2244
  content.doc as body,
1993
2245
  d.hash,
1994
- bm25(documents_fts, 10.0, 1.0) as bm25_score
1995
- FROM documents_fts f
1996
- JOIN documents d ON d.id = f.rowid
2246
+ fm.bm25_score
2247
+ FROM fts_matches fm
2248
+ JOIN documents d ON d.id = fm.rowid
1997
2249
  JOIN content ON content.hash = d.hash
1998
- WHERE documents_fts MATCH ? AND d.active = 1
2250
+ WHERE d.active = 1
1999
2251
  `;
2000
- const params = [ftsQuery];
2001
2252
  if (collectionName) {
2002
2253
  sql += ` AND d.collection = ?`;
2003
2254
  params.push(String(collectionName));
2004
2255
  }
2005
2256
  // bm25 lower is better; sort ascending.
2006
- sql += ` ORDER BY bm25_score ASC LIMIT ?`;
2257
+ sql += ` ORDER BY fm.bm25_score ASC LIMIT ?`;
2007
2258
  params.push(limit);
2008
2259
  const rows = db.prepare(sql).all(...params);
2009
2260
  return rows.map(row => {
@@ -2143,13 +2394,23 @@ export function clearAllEmbeddings(db) {
2143
2394
  /**
2144
2395
  * Insert a single embedding into both content_vectors and vectors_vec tables.
2145
2396
  * The vectors_vec key (hash_seq) is formatted as "<hash>_<seq>".
2397
+ *
2398
+ * content_vectors is inserted first so that getHashesForEmbedding (which checks
2399
+ * only content_vectors) won't re-select the hash on a crash between the two inserts.
2400
+ *
2401
+ * vectors_vec uses DELETE + INSERT instead of INSERT OR REPLACE because sqlite-vec's
2402
+ * vec0 virtual tables silently ignore the OR REPLACE conflict clause.
2146
2403
  */
2147
2404
  export function insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt) {
2148
2405
  const hashSeq = `${hash}_${seq}`;
2149
- const insertVecStmt = db.prepare(`INSERT OR REPLACE INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`);
2406
+ // Insert content_vectors first for crash-safe ordering (see getHashesForEmbedding)
2150
2407
  const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`);
2151
- insertVecStmt.run(hashSeq, embedding);
2152
2408
  insertContentVectorStmt.run(hash, seq, pos, model, embeddedAt);
2409
+ // vec0 virtual tables don't support OR REPLACE — use DELETE + INSERT
2410
+ const deleteVecStmt = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
2411
+ const insertVecStmt = db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`);
2412
+ deleteVecStmt.run(hashSeq);
2413
+ insertVecStmt.run(hashSeq, embedding);
2153
2414
  }
2154
2415
  // =============================================================================
2155
2416
  // Query expansion
@@ -2484,7 +2745,7 @@ export function getDocumentBody(db, doc, fromLine, maxLines) {
2484
2745
  * Returns documents without body by default (use getDocumentBody to load)
2485
2746
  */
2486
2747
  export function findDocuments(db, pattern, options = {}) {
2487
- const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?');
2748
+ const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?') && !pattern.includes('{');
2488
2749
  const errors = [];
2489
2750
  const maxBytes = options.maxBytes ?? DEFAULT_MULTI_GET_MAX_BYTES;
2490
2751
  const bodyCol = options.includeBody ? `, content.doc as body` : ``;
@@ -2817,7 +3078,7 @@ export async function hybridQuery(store, query, options) {
2817
3078
  }
2818
3079
  // Batch embed all vector queries in a single call
2819
3080
  const llm = getLlm(store);
2820
- const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text));
3081
+ const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text, llm.embedModelName));
2821
3082
  hooks?.onEmbedStart?.(textsToEmbed.length);
2822
3083
  const embedStart = Date.now();
2823
3084
  const embeddings = await llm.embedBatch(textsToEmbed);
@@ -2855,8 +3116,9 @@ export async function hybridQuery(store, query, options) {
2855
3116
  const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 2);
2856
3117
  const intentTerms = intent ? extractIntentTerms(intent) : [];
2857
3118
  const docChunkMap = new Map();
3119
+ const chunkStrategy = options?.chunkStrategy;
2858
3120
  for (const cand of candidates) {
2859
- const chunks = chunkDocument(cand.body);
3121
+ const chunks = await chunkDocumentAsync(cand.body, undefined, undefined, undefined, cand.file, chunkStrategy);
2860
3122
  if (chunks.length === 0)
2861
3123
  continue;
2862
3124
  // Pick chunk with most keyword overlap (fallback: first chunk)
@@ -3128,7 +3390,7 @@ export async function structuredSearch(store, searches, options) {
3128
3390
  const vecSearches = searches.filter((s) => s.type === 'vec' || s.type === 'hyde');
3129
3391
  if (vecSearches.length > 0) {
3130
3392
  const llm = getLlm(store);
3131
- const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query));
3393
+ const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query, llm.embedModelName));
3132
3394
  hooks?.onEmbedStart?.(textsToEmbed.length);
3133
3395
  const embedStart = Date.now();
3134
3396
  const embeddings = await llm.embedBatch(textsToEmbed);
@@ -3174,8 +3436,9 @@ export async function structuredSearch(store, searches, options) {
3174
3436
  const queryTerms = primaryQuery.toLowerCase().split(/\s+/).filter(t => t.length > 2);
3175
3437
  const intentTerms = intent ? extractIntentTerms(intent) : [];
3176
3438
  const docChunkMap = new Map();
3439
+ const ssChunkStrategy = options?.chunkStrategy;
3177
3440
  for (const cand of candidates) {
3178
- const chunks = chunkDocument(cand.body);
3441
+ const chunks = await chunkDocumentAsync(cand.body, undefined, undefined, undefined, cand.file, ssChunkStrategy);
3179
3442
  if (chunks.length === 0)
3180
3443
  continue;
3181
3444
  // Pick chunk with most keyword overlap