@tobilu/qmd 2.0.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +96 -0
- package/README.md +61 -1
- package/bin/qmd +11 -2
- package/dist/ast.d.ts +64 -0
- package/dist/ast.js +324 -0
- package/dist/bench/bench.d.ts +21 -0
- package/dist/bench/bench.js +185 -0
- package/dist/bench/score.d.ts +26 -0
- package/dist/bench/score.js +67 -0
- package/dist/bench/types.d.ts +67 -0
- package/dist/bench/types.js +8 -0
- package/dist/cli/formatter.js +5 -1
- package/dist/cli/qmd.d.ts +2 -1
- package/dist/cli/qmd.js +171 -9
- package/dist/collections.d.ts +11 -0
- package/dist/db.d.ts +8 -0
- package/dist/db.js +44 -3
- package/dist/index.d.ts +7 -1
- package/dist/index.js +13 -3
- package/dist/llm.d.ts +12 -3
- package/dist/llm.js +94 -24
- package/dist/mcp/server.js +29 -5
- package/dist/store.d.ts +56 -6
- package/dist/store.js +401 -138
- package/package.json +34 -17
package/dist/store.js
CHANGED
|
@@ -26,6 +26,8 @@ export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
|
|
|
26
26
|
export const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";
|
|
27
27
|
export const DEFAULT_GLOB = "**/*.md";
|
|
28
28
|
export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024; // 10KB
|
|
29
|
+
export const DEFAULT_EMBED_MAX_DOCS_PER_BATCH = 64;
|
|
30
|
+
export const DEFAULT_EMBED_MAX_BATCH_BYTES = 64 * 1024 * 1024; // 64MB
|
|
29
31
|
// Chunking: 900 tokens per chunk with 15% overlap
|
|
30
32
|
// Increased from 800 to accommodate smart chunking finding natural break points
|
|
31
33
|
export const CHUNK_SIZE_TOKENS = 900;
|
|
@@ -161,6 +163,60 @@ export function findBestCutoff(breakPoints, targetCharPos, windowChars = CHUNK_W
|
|
|
161
163
|
}
|
|
162
164
|
return bestPos;
|
|
163
165
|
}
|
|
166
|
+
/**
 * Merge two sets of break points (e.g. regex + AST) into a single list.
 *
 * When several break points share the same `pos`, the one with the strictly
 * highest `score` is kept; on a tie the first one encountered wins (all of `a`
 * is visited before `b`). The returned array is newly allocated, sorted by
 * ascending `pos`, and contains the original break point objects (no copies).
 *
 * @param {Array<{pos: number, score: number}>} [a] - First set of break points.
 * @param {Array<{pos: number, score: number}>} [b] - Second set of break points.
 * @returns {Array<{pos: number, score: number}>} One break point per position, sorted by position.
 */
export function mergeBreakPoints(a = [], b = []) {
    const bestByPos = new Map();
    // A missing (null/undefined) input is treated as an empty set instead of throwing.
    for (const points of [a ?? [], b ?? []]) {
        for (const bp of points) {
            const existing = bestByPos.get(bp.pos);
            if (!existing || bp.score > existing.score) {
                bestByPos.set(bp.pos, bp);
            }
        }
    }
    return Array.from(bestByPos.values()).sort((left, right) => left.pos - right.pos);
}
|
|
186
|
+
/**
 * Core chunk algorithm that operates on precomputed break points and code fences.
 * This is the shared implementation used by both regex-only and AST-aware chunking.
 *
 * Content no longer than `maxChars` is returned as a single chunk. Otherwise the
 * content is walked front to back: each chunk ends at the cutoff returned by
 * `findBestCutoff` for the position `charPos + maxChars` (or at that hard limit
 * when the returned cutoff does not fall inside the current chunk), and the next
 * chunk starts `overlapChars` before the previous chunk's end.
 *
 * @param {string} content - Full document text.
 * @param {Array} breakPoints - Precomputed break points, forwarded to `findBestCutoff`.
 * @param {Array} codeFences - Precomputed code fence regions, forwarded to `findBestCutoff`.
 * @param {number} [maxChars] - Maximum chunk length in characters.
 * @param {number} [overlapChars] - Characters shared between consecutive chunks.
 * @param {number} [windowChars] - Search window forwarded to `findBestCutoff`.
 * @returns {Array<{text: string, pos: number}>} Chunks with their start offset in `content`.
 * @throws {RangeError} If `content` is longer than `maxChars` and `maxChars` is not a number >= 1.
 */
export function chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS) {
    if (content.length <= maxChars) {
        return [{ text: content, pos: 0 }];
    }
    // A zero limit would never advance (endless stream of empty chunks), and a
    // negative or NaN limit silently drops content; fail loudly instead.
    if (!(maxChars >= 1)) {
        throw new RangeError(`maxChars must be a positive number, got ${maxChars}`);
    }
    const chunks = [];
    let charPos = 0;
    while (charPos < content.length) {
        // With maxChars >= 1 the hard limit is always past charPos, so every chunk is non-empty.
        const targetEndPos = Math.min(charPos + maxChars, content.length);
        let endPos = targetEndPos;
        if (targetEndPos < content.length) {
            const bestCutoff = findBestCutoff(breakPoints, targetEndPos, windowChars, 0.7, codeFences);
            // Only accept a cutoff that lies inside the current chunk.
            if (bestCutoff > charPos && bestCutoff <= targetEndPos) {
                endPos = bestCutoff;
            }
        }
        chunks.push({ text: content.slice(charPos, endPos), pos: charPos });
        // The last chunk reaches the end of the content; nothing left to overlap.
        if (endPos >= content.length) {
            break;
        }
        // Step back by the overlap, but never to (or before) the start of the
        // chunk just emitted — that would stall the loop.
        const overlappedStart = endPos - overlapChars;
        charPos = overlappedStart > charPos ? overlappedStart : endPos;
    }
    return chunks;
}
|
|
164
220
|
// Hybrid query: strong BM25 signal detection thresholds
|
|
165
221
|
// Skip expensive LLM expansion when top result is strong AND clearly separated from runner-up
|
|
166
222
|
export const STRONG_SIGNAL_MIN_SCORE = 0.85;
|
|
@@ -191,7 +247,8 @@ export function isAbsolutePath(path) {
|
|
|
191
247
|
if (path.startsWith('/')) {
|
|
192
248
|
// Check if it's a Git Bash style path like /c/ or /c/Users (C-Z only, not A or B)
|
|
193
249
|
// Requires path[2] === '/' to distinguish from Unix paths like /c or /cache
|
|
194
|
-
|
|
250
|
+
// Skipped on WSL where /c/ is a valid drvfs mount point, not a drive letter
|
|
251
|
+
if (!isWSL() && path.length >= 3 && path[2] === '/') {
|
|
195
252
|
const driveLetter = path[1];
|
|
196
253
|
if (driveLetter && /[c-zC-Z]/.test(driveLetter)) {
|
|
197
254
|
return true;
|
|
@@ -213,6 +270,13 @@ export function isAbsolutePath(path) {
|
|
|
213
270
|
export function normalizePathSeparators(path) {
|
|
214
271
|
return path.replace(/\\/g, '/');
|
|
215
272
|
}
|
|
273
|
+
/**
 * Report whether the process looks like it is running inside WSL
 * (Windows Subsystem for Linux), judged by a non-empty WSL_DISTRO_NAME or
 * WSL_INTEROP environment variable.
 * On WSL, paths like /c/work/... are valid drvfs mount points, not Git Bash paths.
 *
 * @returns {boolean} True when either environment variable is set to a non-empty value.
 */
function isWSL() {
    const { WSL_DISTRO_NAME: distroName, WSL_INTEROP: interop } = process.env;
    if (distroName) {
        return true;
    }
    return Boolean(interop);
}
|
|
216
280
|
/**
|
|
217
281
|
* Get the relative path from a prefix.
|
|
218
282
|
* Returns null if path is not under prefix.
|
|
@@ -256,8 +320,9 @@ export function resolve(...paths) {
|
|
|
256
320
|
windowsDrive = firstPath.slice(0, 2);
|
|
257
321
|
result = firstPath.slice(2);
|
|
258
322
|
}
|
|
259
|
-
else if (firstPath.startsWith('/') && firstPath.length >= 3 && firstPath[2] === '/') {
|
|
323
|
+
else if (!isWSL() && firstPath.startsWith('/') && firstPath.length >= 3 && firstPath[2] === '/') {
|
|
260
324
|
// Git Bash style: /c/ -> C: (C-Z drives only, not A or B)
|
|
325
|
+
// Skipped on WSL where /c/ is a valid drvfs mount point, not a drive letter
|
|
261
326
|
const driveLetter = firstPath[1];
|
|
262
327
|
if (driveLetter && /[c-zC-Z]/.test(driveLetter)) {
|
|
263
328
|
windowsDrive = driveLetter.toUpperCase() + ':';
|
|
@@ -288,8 +353,9 @@ export function resolve(...paths) {
|
|
|
288
353
|
windowsDrive = p.slice(0, 2);
|
|
289
354
|
result = p.slice(2);
|
|
290
355
|
}
|
|
291
|
-
else if (p.startsWith('/') && p.length >= 3 && p[2] === '/') {
|
|
356
|
+
else if (!isWSL() && p.startsWith('/') && p.length >= 3 && p[2] === '/') {
|
|
292
357
|
// Git Bash style (C-Z drives only, not A or B)
|
|
358
|
+
// Skipped on WSL where /c/ is a valid drvfs mount point, not a drive letter
|
|
293
359
|
const driveLetter = p[1];
|
|
294
360
|
if (driveLetter && /[c-zC-Z]/.test(driveLetter)) {
|
|
295
361
|
windowsDrive = driveLetter.toUpperCase() + ':';
|
|
@@ -332,6 +398,10 @@ let _productionMode = false;
|
|
|
332
398
|
export function enableProductionMode() {
|
|
333
399
|
_productionMode = true;
|
|
334
400
|
}
|
|
401
|
+
/** Reset the module-level production mode flag back to false (the value enableProductionMode() flips to true) — only for testing. */
|
|
402
|
+
export function _resetProductionModeForTesting() {
|
|
403
|
+
_productionMode = false;
|
|
404
|
+
}
|
|
335
405
|
export function getDefaultDbPath(indexName = "index") {
|
|
336
406
|
// Always allow override via INDEX_PATH (for testing)
|
|
337
407
|
if (process.env.INDEX_PATH) {
|
|
@@ -504,9 +574,10 @@ function initializeDatabase(db) {
|
|
|
504
574
|
verifySqliteVecLoaded(db);
|
|
505
575
|
_sqliteVecAvailable = true;
|
|
506
576
|
}
|
|
507
|
-
catch {
|
|
577
|
+
catch (err) {
|
|
508
578
|
// sqlite-vec is optional — vector search won't work but FTS is fine
|
|
509
579
|
_sqliteVecAvailable = false;
|
|
580
|
+
console.warn(getErrorMessage(err));
|
|
510
581
|
}
|
|
511
582
|
db.exec("PRAGMA journal_mode = WAL");
|
|
512
583
|
db.exec("PRAGMA foreign_keys = ON");
|
|
@@ -777,7 +848,10 @@ function ensureVecTableInternal(db, dimensions) {
|
|
|
777
848
|
const existingDims = match?.[1] ? parseInt(match[1], 10) : null;
|
|
778
849
|
if (existingDims === dimensions && hasHashSeq && hasCosine)
|
|
779
850
|
return;
|
|
780
|
-
|
|
851
|
+
if (existingDims !== null && existingDims !== dimensions) {
|
|
852
|
+
throw new Error(`Embedding dimension mismatch: existing vectors are ${existingDims}d but the current model produces ${dimensions}d. ` +
|
|
853
|
+
`Run 'qmd embed -f' to re-embed with the new model.`);
|
|
854
|
+
}
|
|
781
855
|
db.exec("DROP TABLE IF EXISTS vectors_vec");
|
|
782
856
|
}
|
|
783
857
|
db.exec(`CREATE VIRTUAL TABLE vectors_vec USING vec0(hash_seq TEXT PRIMARY KEY, embedding float[${dimensions}] distance_metric=cosine)`);
|
|
@@ -867,6 +941,67 @@ export async function reindexCollection(store, collectionPath, globPattern, coll
|
|
|
867
941
|
const orphanedCleaned = cleanupOrphanedContent(db);
|
|
868
942
|
return { indexed, updated, unchanged, removed, orphanedCleaned };
|
|
869
943
|
}
|
|
944
|
+
/**
 * Validate an optional option that, when supplied, must be a positive integer.
 *
 * @param {string} name - Option name, used as the prefix of the error message.
 * @param {number|null|undefined} value - Caller-supplied value, if any.
 * @param {number} fallback - Returned as-is (not validated) when `value` is absent.
 * @returns {number} `value` when it is an integer >= 1, otherwise `fallback` when `value` is absent.
 * @throws {Error} If `value` is present but is not an integer >= 1.
 */
function validatePositiveIntegerOption(name, value, fallback) {
    // `== null` matches both undefined and null, so an explicit null means
    // "use the default" — the same way `options?.model ?? DEFAULT_EMBED_MODEL`
    // treats the model option.
    if (value == null) {
        return fallback;
    }
    if (!Number.isInteger(value) || value < 1) {
        throw new Error(`${name} must be a positive integer`);
    }
    return value;
}
|
|
952
|
+
/**
 * Resolve the batching limits for an embedding run.
 *
 * Each limit is read from `options` (which may be absent) and passed through
 * validatePositiveIntegerOption together with its module-level default.
 *
 * @param {{maxDocsPerBatch?: number, maxBatchBytes?: number}} [options] - Caller-supplied embed options.
 * @returns {{maxDocsPerBatch: number, maxBatchBytes: number}} The validated limits.
 */
function resolveEmbedOptions(options) {
    const maxDocsPerBatch = validatePositiveIntegerOption("maxDocsPerBatch", options?.maxDocsPerBatch, DEFAULT_EMBED_MAX_DOCS_PER_BATCH);
    const maxBatchBytes = validatePositiveIntegerOption("maxBatchBytes", options?.maxBatchBytes, DEFAULT_EMBED_MAX_BATCH_BYTES);
    return { maxDocsPerBatch, maxBatchBytes };
}
|
|
958
|
+
/**
 * List the documents that still need embeddings.
 *
 * Returns one row per content hash (`GROUP BY d.hash`) for active documents
 * (`d.active = 1`) that have no `content_vectors` row with `seq = 0`. Each row
 * carries the hash, the smallest path among documents sharing that hash, and
 * the length of the stored content cast to a BLOB as `bytes`; rows are ordered by
 * that path. The document body itself is not selected here.
 *
 * @param db - Database handle exposing `prepare(sql).all()`.
 * @returns Array of `{ hash, path, bytes }` rows.
 */
function getPendingEmbeddingDocs(db) {
|
|
959
|
+
return db.prepare(`
|
|
960
|
+
SELECT d.hash, MIN(d.path) as path, length(CAST(c.doc AS BLOB)) as bytes
|
|
961
|
+
FROM documents d
|
|
962
|
+
JOIN content c ON d.hash = c.hash
|
|
963
|
+
LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
|
|
964
|
+
WHERE d.active = 1 AND v.hash IS NULL
|
|
965
|
+
GROUP BY d.hash
|
|
966
|
+
ORDER BY MIN(d.path)
|
|
967
|
+
`).all();
|
|
968
|
+
}
|
|
969
|
+
/**
 * Split pending documents into consecutive batches bounded by document count
 * and by total byte size. Input order is preserved.
 *
 * A batch is closed before adding a document when it already holds
 * `maxDocsPerBatch` documents, or when adding the document would push its byte
 * total above `maxBatchBytes`. A single document larger than `maxBatchBytes`
 * still gets a batch of its own, so no document is ever dropped.
 *
 * @param {Array<{bytes: number}>} docs - Documents with their size in bytes.
 * @param {number} maxDocsPerBatch - Maximum number of documents per batch.
 * @param {number} maxBatchBytes - Maximum summed `bytes` per batch.
 * @returns {Array<Array<object>>} Non-empty batches containing the original doc objects.
 */
function buildEmbeddingBatches(docs, maxDocsPerBatch, maxBatchBytes) {
    const batches = [];
    let currentBatch = [];
    let currentBytes = 0;
    for (const doc of docs) {
        // Negative sizes count as 0. A missing size would turn the running
        // total into NaN and silently disable the byte cap, so count it as 0 too.
        const clampedBytes = Math.max(0, doc.bytes);
        const docBytes = Number.isNaN(clampedBytes) ? 0 : clampedBytes;
        const batchIsFull = currentBatch.length >= maxDocsPerBatch;
        const wouldExceedBytes = (currentBytes + docBytes) > maxBatchBytes;
        // Never close an empty batch: that would emit an empty batch (when the
        // doc limit is below 1) or strand an oversized document.
        if (currentBatch.length > 0 && (batchIsFull || wouldExceedBytes)) {
            batches.push(currentBatch);
            currentBatch = [];
            currentBytes = 0;
        }
        currentBatch.push(doc);
        currentBytes += docBytes;
    }
    if (currentBatch.length > 0) {
        batches.push(currentBatch);
    }
    return batches;
}
|
|
990
|
+
/**
 * Load the stored document bodies for one batch of pending documents.
 *
 * @param db - Database handle exposing `prepare(sql).all(...params)`.
 * @param batch - Batch entries; each must carry a `hash`.
 * @returns A new array in the same order as `batch`, each entry being the
 *          original entry's fields plus `body` (the `content.doc` value for its
 *          hash, or "" when no content row was returned for that hash).
 */
function getEmbeddingDocsForBatch(db, batch) {
|
|
991
|
+
// Nothing to look up; this also avoids building an empty `IN ()` list.
if (batch.length === 0)
|
|
992
|
+
return [];
|
|
993
|
+
// One "?" placeholder per batch entry; the hashes are bound as parameters below.
const placeholders = batch.map(() => "?").join(",");
|
|
994
|
+
const rows = db.prepare(`
|
|
995
|
+
SELECT hash, doc as body
|
|
996
|
+
FROM content
|
|
997
|
+
WHERE hash IN (${placeholders})
|
|
998
|
+
`).all(...batch.map(doc => doc.hash));
|
|
999
|
+
// Index the returned rows by hash so the batch order can be restored.
const bodyByHash = new Map(rows.map(row => [row.hash, row.body]));
|
|
1000
|
+
return batch.map((doc) => ({
|
|
1001
|
+
...doc,
|
|
1002
|
+
body: bodyByHash.get(doc.hash) ?? "",
|
|
1003
|
+
}));
|
|
1004
|
+
}
|
|
870
1005
|
/**
|
|
871
1006
|
* Generate vector embeddings for documents that need them.
|
|
872
1007
|
* Pure function — no console output, no db lifecycle management.
|
|
@@ -876,98 +1011,151 @@ export async function generateEmbeddings(store, options) {
|
|
|
876
1011
|
const db = store.db;
|
|
877
1012
|
const model = options?.model ?? DEFAULT_EMBED_MODEL;
|
|
878
1013
|
const now = new Date().toISOString();
|
|
1014
|
+
const { maxDocsPerBatch, maxBatchBytes } = resolveEmbedOptions(options);
|
|
1015
|
+
const encoder = new TextEncoder();
|
|
879
1016
|
if (options?.force) {
|
|
880
1017
|
clearAllEmbeddings(db);
|
|
881
1018
|
}
|
|
882
|
-
const
|
|
883
|
-
if (
|
|
884
|
-
return { docsProcessed: 0, chunksEmbedded: 0, errors: 0, durationMs: 0 };
|
|
885
|
-
}
|
|
886
|
-
const allChunks = [];
|
|
887
|
-
for (const item of hashesToEmbed) {
|
|
888
|
-
const encoder = new TextEncoder();
|
|
889
|
-
const bodyBytes = encoder.encode(item.body).length;
|
|
890
|
-
if (bodyBytes === 0)
|
|
891
|
-
continue;
|
|
892
|
-
const title = extractTitle(item.body, item.path);
|
|
893
|
-
const chunks = await chunkDocumentByTokens(item.body);
|
|
894
|
-
for (let seq = 0; seq < chunks.length; seq++) {
|
|
895
|
-
allChunks.push({
|
|
896
|
-
hash: item.hash,
|
|
897
|
-
title,
|
|
898
|
-
text: chunks[seq].text,
|
|
899
|
-
seq,
|
|
900
|
-
pos: chunks[seq].pos,
|
|
901
|
-
tokens: chunks[seq].tokens,
|
|
902
|
-
bytes: encoder.encode(chunks[seq].text).length,
|
|
903
|
-
});
|
|
904
|
-
}
|
|
905
|
-
}
|
|
906
|
-
if (allChunks.length === 0) {
|
|
1019
|
+
const docsToEmbed = getPendingEmbeddingDocs(db);
|
|
1020
|
+
if (docsToEmbed.length === 0) {
|
|
907
1021
|
return { docsProcessed: 0, chunksEmbedded: 0, errors: 0, durationMs: 0 };
|
|
908
1022
|
}
|
|
909
|
-
const totalBytes =
|
|
910
|
-
const
|
|
911
|
-
const totalDocs = hashesToEmbed.length;
|
|
1023
|
+
const totalBytes = docsToEmbed.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0);
|
|
1024
|
+
const totalDocs = docsToEmbed.length;
|
|
912
1025
|
const startTime = Date.now();
|
|
913
1026
|
// Use store's LlamaCpp or global singleton, wrapped in a session
|
|
914
1027
|
const llm = getLlm(store);
|
|
915
|
-
const
|
|
1028
|
+
const embedModelUri = llm.embedModelName;
|
|
916
1029
|
// Create a session manager for this llm instance
|
|
917
1030
|
const result = await withLLMSessionForLlm(llm, async (session) => {
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
throw new Error("Failed to get embedding dimensions from first chunk");
|
|
924
|
-
}
|
|
925
|
-
store.ensureVecTable(firstResult.embedding.length);
|
|
926
|
-
let chunksEmbedded = 0, errors = 0, bytesProcessed = 0;
|
|
1031
|
+
let chunksEmbedded = 0;
|
|
1032
|
+
let errors = 0;
|
|
1033
|
+
let bytesProcessed = 0;
|
|
1034
|
+
let totalChunks = 0;
|
|
1035
|
+
let vectorTableInitialized = false;
|
|
927
1036
|
const BATCH_SIZE = 32;
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
1037
|
+
const batches = buildEmbeddingBatches(docsToEmbed, maxDocsPerBatch, maxBatchBytes);
|
|
1038
|
+
for (const batchMeta of batches) {
|
|
1039
|
+
// Abort early if session has been invalidated
|
|
1040
|
+
if (!session.isValid) {
|
|
1041
|
+
console.warn(`⚠ Session expired — skipping remaining document batches`);
|
|
1042
|
+
break;
|
|
1043
|
+
}
|
|
1044
|
+
const batchDocs = getEmbeddingDocsForBatch(db, batchMeta);
|
|
1045
|
+
const batchChunks = [];
|
|
1046
|
+
const batchBytes = batchMeta.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0);
|
|
1047
|
+
for (const doc of batchDocs) {
|
|
1048
|
+
if (!doc.body.trim())
|
|
1049
|
+
continue;
|
|
1050
|
+
const title = extractTitle(doc.body, doc.path);
|
|
1051
|
+
const chunks = await chunkDocumentByTokens(doc.body, undefined, undefined, undefined, doc.path, options?.chunkStrategy, session.signal);
|
|
1052
|
+
for (let seq = 0; seq < chunks.length; seq++) {
|
|
1053
|
+
batchChunks.push({
|
|
1054
|
+
hash: doc.hash,
|
|
1055
|
+
title,
|
|
1056
|
+
text: chunks[seq].text,
|
|
1057
|
+
seq,
|
|
1058
|
+
pos: chunks[seq].pos,
|
|
1059
|
+
tokens: chunks[seq].tokens,
|
|
1060
|
+
bytes: encoder.encode(chunks[seq].text).length,
|
|
1061
|
+
});
|
|
945
1062
|
}
|
|
946
1063
|
}
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
1064
|
+
totalChunks += batchChunks.length;
|
|
1065
|
+
if (batchChunks.length === 0) {
|
|
1066
|
+
bytesProcessed += batchBytes;
|
|
1067
|
+
options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors });
|
|
1068
|
+
continue;
|
|
1069
|
+
}
|
|
1070
|
+
if (!vectorTableInitialized) {
|
|
1071
|
+
const firstChunk = batchChunks[0];
|
|
1072
|
+
const firstText = formatDocForEmbedding(firstChunk.text, firstChunk.title, embedModelUri);
|
|
1073
|
+
const firstResult = await session.embed(firstText, { model });
|
|
1074
|
+
if (!firstResult) {
|
|
1075
|
+
throw new Error("Failed to get embedding dimensions from first chunk");
|
|
1076
|
+
}
|
|
1077
|
+
store.ensureVecTable(firstResult.embedding.length);
|
|
1078
|
+
vectorTableInitialized = true;
|
|
1079
|
+
}
|
|
1080
|
+
const totalBatchChunkBytes = batchChunks.reduce((sum, chunk) => sum + chunk.bytes, 0);
|
|
1081
|
+
let batchChunkBytesProcessed = 0;
|
|
1082
|
+
for (let batchStart = 0; batchStart < batchChunks.length; batchStart += BATCH_SIZE) {
|
|
1083
|
+
// Abort early if session has been invalidated (e.g. max duration exceeded)
|
|
1084
|
+
if (!session.isValid) {
|
|
1085
|
+
const remaining = batchChunks.length - batchStart;
|
|
1086
|
+
errors += remaining;
|
|
1087
|
+
console.warn(`⚠ Session expired — skipping ${remaining} remaining chunks`);
|
|
1088
|
+
break;
|
|
1089
|
+
}
|
|
1090
|
+
// Abort early if error rate is too high (>80% of processed chunks failed)
|
|
1091
|
+
const processed = chunksEmbedded + errors;
|
|
1092
|
+
if (processed >= BATCH_SIZE && errors > processed * 0.8) {
|
|
1093
|
+
const remaining = batchChunks.length - batchStart;
|
|
1094
|
+
errors += remaining;
|
|
1095
|
+
console.warn(`⚠ Error rate too high (${errors}/${processed}) — aborting embedding`);
|
|
1096
|
+
break;
|
|
1097
|
+
}
|
|
1098
|
+
const batchEnd = Math.min(batchStart + BATCH_SIZE, batchChunks.length);
|
|
1099
|
+
const chunkBatch = batchChunks.slice(batchStart, batchEnd);
|
|
1100
|
+
const texts = chunkBatch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title, embedModelUri));
|
|
1101
|
+
try {
|
|
1102
|
+
const embeddings = await session.embedBatch(texts, { model });
|
|
1103
|
+
for (let i = 0; i < chunkBatch.length; i++) {
|
|
1104
|
+
const chunk = chunkBatch[i];
|
|
1105
|
+
const embedding = embeddings[i];
|
|
1106
|
+
if (embedding) {
|
|
1107
|
+
insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now);
|
|
955
1108
|
chunksEmbedded++;
|
|
956
1109
|
}
|
|
957
1110
|
else {
|
|
958
1111
|
errors++;
|
|
959
1112
|
}
|
|
1113
|
+
batchChunkBytesProcessed += chunk.bytes;
|
|
1114
|
+
}
|
|
1115
|
+
}
|
|
1116
|
+
catch {
|
|
1117
|
+
// Batch failed — try individual embeddings as fallback
|
|
1118
|
+
// But skip if session is already invalid (avoids N doomed retries)
|
|
1119
|
+
if (!session.isValid) {
|
|
1120
|
+
errors += chunkBatch.length;
|
|
1121
|
+
batchChunkBytesProcessed += chunkBatch.reduce((sum, c) => sum + c.bytes, 0);
|
|
960
1122
|
}
|
|
961
|
-
|
|
962
|
-
|
|
1123
|
+
else {
|
|
1124
|
+
for (const chunk of chunkBatch) {
|
|
1125
|
+
try {
|
|
1126
|
+
const text = formatDocForEmbedding(chunk.text, chunk.title, embedModelUri);
|
|
1127
|
+
const result = await session.embed(text, { model });
|
|
1128
|
+
if (result) {
|
|
1129
|
+
insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now);
|
|
1130
|
+
chunksEmbedded++;
|
|
1131
|
+
}
|
|
1132
|
+
else {
|
|
1133
|
+
errors++;
|
|
1134
|
+
}
|
|
1135
|
+
}
|
|
1136
|
+
catch {
|
|
1137
|
+
errors++;
|
|
1138
|
+
}
|
|
1139
|
+
batchChunkBytesProcessed += chunk.bytes;
|
|
1140
|
+
}
|
|
963
1141
|
}
|
|
964
|
-
bytesProcessed += chunk.bytes;
|
|
965
1142
|
}
|
|
1143
|
+
const proportionalBytes = totalBatchChunkBytes === 0
|
|
1144
|
+
? batchBytes
|
|
1145
|
+
: Math.min(batchBytes, Math.round((batchChunkBytesProcessed / totalBatchChunkBytes) * batchBytes));
|
|
1146
|
+
options?.onProgress?.({
|
|
1147
|
+
chunksEmbedded,
|
|
1148
|
+
totalChunks,
|
|
1149
|
+
bytesProcessed: bytesProcessed + proportionalBytes,
|
|
1150
|
+
totalBytes,
|
|
1151
|
+
errors,
|
|
1152
|
+
});
|
|
966
1153
|
}
|
|
1154
|
+
bytesProcessed += batchBytes;
|
|
967
1155
|
options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors });
|
|
968
1156
|
}
|
|
969
1157
|
return { chunksEmbedded, errors };
|
|
970
|
-
},
|
|
1158
|
+
}, { maxDuration: 30 * 60 * 1000, name: 'generateEmbeddings' });
|
|
971
1159
|
return {
|
|
972
1160
|
docsProcessed: totalDocs,
|
|
973
1161
|
chunksEmbedded: result.chunksEmbedded,
|
|
@@ -1097,7 +1285,7 @@ export function handelize(path) {
|
|
|
1097
1285
|
const ext = extMatch ? extMatch[1] : '';
|
|
1098
1286
|
const nameWithoutExt = ext ? segment.slice(0, -ext.length) : segment;
|
|
1099
1287
|
const cleanedName = nameWithoutExt
|
|
1100
|
-
.replace(/[^\p{L}\p{N}$]+/gu, '-') // Keep
|
|
1288
|
+
.replace(/[^\p{L}\p{N}$]+/gu, '-') // Keep letters, numbers, "$"; dash-separate rest (including dots)
|
|
1101
1289
|
.replace(/^-+|-+$/g, ''); // Remove leading/trailing dashes
|
|
1102
1290
|
return cleanedName + ext;
|
|
1103
1291
|
}
|
|
@@ -1196,11 +1384,20 @@ export function cleanupOrphanedContent(db) {
|
|
|
1196
1384
|
* Returns the number of orphaned embedding chunks deleted.
|
|
1197
1385
|
*/
|
|
1198
1386
|
export function cleanupOrphanedVectors(db) {
|
|
1199
|
-
//
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1387
|
+
// sqlite-vec may not be loaded (e.g. Bun's bun:sqlite lacks loadExtension).
|
|
1388
|
+
// The vectors_vec virtual table can appear in sqlite_master from a prior
|
|
1389
|
+
// session, but querying it without the vec0 module loaded will crash (#380).
|
|
1390
|
+
if (!isSqliteVecAvailable()) {
|
|
1391
|
+
return 0;
|
|
1392
|
+
}
|
|
1393
|
+
// The schema entry can exist even when sqlite-vec itself is unavailable
|
|
1394
|
+
// (for example when reopening a DB without vec0 loaded). In that case,
|
|
1395
|
+
// touching the virtual table throws "no such module: vec0" and cleanup
|
|
1396
|
+
// should degrade gracefully like the rest of the vector features.
|
|
1397
|
+
try {
|
|
1398
|
+
db.prepare(`SELECT 1 FROM vectors_vec LIMIT 0`).get();
|
|
1399
|
+
}
|
|
1400
|
+
catch {
|
|
1204
1401
|
return 0;
|
|
1205
1402
|
}
|
|
1206
1403
|
// Count orphaned vectors first
|
|
@@ -1346,52 +1543,44 @@ export function getActiveDocumentPaths(db, collectionName) {
|
|
|
1346
1543
|
return rows.map(r => r.path);
|
|
1347
1544
|
}
|
|
1348
1545
|
export { formatQueryForEmbedding, formatDocForEmbedding };
|
|
1546
|
+
/**
|
|
1547
|
+
* Chunk a document using regex-only break point detection.
|
|
1548
|
+
* This is the sync, backward-compatible API used by tests and legacy callers.
|
|
1549
|
+
*/
|
|
1349
1550
|
export function chunkDocument(content, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS) {
|
|
1350
|
-
if (content.length <= maxChars) {
|
|
1351
|
-
return [{ text: content, pos: 0 }];
|
|
1352
|
-
}
|
|
1353
|
-
// Pre-scan all break points and code fences once
|
|
1354
1551
|
const breakPoints = scanBreakPoints(content);
|
|
1355
1552
|
const codeFences = findCodeFences(content);
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
|
|
1361
|
-
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
|
|
1365
|
-
|
|
1366
|
-
|
|
1367
|
-
|
|
1368
|
-
|
|
1369
|
-
|
|
1370
|
-
|
|
1371
|
-
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
|
|
1375
|
-
chunks.push({ text: content.slice(charPos, endPos), pos: charPos });
|
|
1376
|
-
// Move forward, but overlap with previous chunk
|
|
1377
|
-
// For last chunk, don't overlap (just go to the end)
|
|
1378
|
-
if (endPos >= content.length) {
|
|
1379
|
-
break;
|
|
1380
|
-
}
|
|
1381
|
-
charPos = endPos - overlapChars;
|
|
1382
|
-
const lastChunkPos = chunks.at(-1).pos;
|
|
1383
|
-
if (charPos <= lastChunkPos) {
|
|
1384
|
-
// Prevent infinite loop - move forward at least a bit
|
|
1385
|
-
charPos = endPos;
|
|
1553
|
+
return chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars, overlapChars, windowChars);
|
|
1554
|
+
}
|
|
1555
|
+
/**
 * Async chunking entry point with optional AST-aware break points.
 *
 * Regex break points and code fences are always computed. Only when
 * `chunkStrategy` is "auto" and a `filepath` is given is "./ast.js" loaded and
 * asked for AST break points; if it returns any, they are merged with the
 * regex ones. In every other case (including an empty AST result) the regex
 * break points are used alone. The final split is delegated to
 * chunkDocumentWithBreakPoints.
 *
 * @param {string} content - Document text.
 * @param {number} [maxChars] - Maximum chunk length in characters.
 * @param {number} [overlapChars] - Overlap between consecutive chunks.
 * @param {number} [windowChars] - Break point search window.
 * @param {string} [filepath] - Path forwarded to getASTBreakPoints.
 * @param {string} [chunkStrategy="regex"] - "auto" enables the AST lookup.
 * @returns {Promise<Array>} Whatever chunkDocumentWithBreakPoints returns.
 */
export async function chunkDocumentAsync(content, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS, filepath, chunkStrategy = "regex") {
    const regexBreaks = scanBreakPoints(content);
    const fences = findCodeFences(content);
    const wantsAst = chunkStrategy === "auto" && Boolean(filepath);
    if (!wantsAst) {
        return chunkDocumentWithBreakPoints(content, regexBreaks, fences, maxChars, overlapChars, windowChars);
    }
    // Loaded lazily so the AST module is only imported when it is actually used.
    const { getASTBreakPoints } = await import("./ast.js");
    const astBreaks = await getASTBreakPoints(content, filepath);
    const combined = astBreaks.length > 0 ? mergeBreakPoints(regexBreaks, astBreaks) : regexBreaks;
    return chunkDocumentWithBreakPoints(content, combined, fences, maxChars, overlapChars, windowChars);
}
|
|
1390
1576
|
/**
|
|
1391
1577
|
* Chunk a document by actual token count using the LLM tokenizer.
|
|
1392
1578
|
* More accurate than character-based chunking but requires async.
|
|
1579
|
+
*
|
|
1580
|
+
* When filepath and chunkStrategy are provided, uses AST-aware break points
|
|
1581
|
+
* for supported code files.
|
|
1393
1582
|
*/
|
|
1394
|
-
export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKENS, overlapTokens = CHUNK_OVERLAP_TOKENS, windowTokens = CHUNK_WINDOW_TOKENS) {
|
|
1583
|
+
export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKENS, overlapTokens = CHUNK_OVERLAP_TOKENS, windowTokens = CHUNK_WINDOW_TOKENS, filepath, chunkStrategy = "regex", signal) {
|
|
1395
1584
|
const llm = getDefaultLlamaCpp();
|
|
1396
1585
|
// Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3)
|
|
1397
1586
|
// If chunks exceed limit, they'll be re-split with actual ratio
|
|
@@ -1400,10 +1589,14 @@ export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKE
|
|
|
1400
1589
|
const overlapChars = overlapTokens * avgCharsPerToken;
|
|
1401
1590
|
const windowChars = windowTokens * avgCharsPerToken;
|
|
1402
1591
|
// Chunk in character space with conservative estimate
|
|
1403
|
-
|
|
1592
|
+
// Use AST-aware chunking for the first pass when filepath/strategy provided
|
|
1593
|
+
let charChunks = await chunkDocumentAsync(content, maxChars, overlapChars, windowChars, filepath, chunkStrategy);
|
|
1404
1594
|
// Tokenize and split any chunks that still exceed limit
|
|
1405
1595
|
const results = [];
|
|
1406
1596
|
for (const chunk of charChunks) {
|
|
1597
|
+
// Respect abort signal to avoid runaway tokenization
|
|
1598
|
+
if (signal?.aborted)
|
|
1599
|
+
break;
|
|
1407
1600
|
const tokens = await llm.tokenize(chunk.text);
|
|
1408
1601
|
if (tokens.length <= maxTokens) {
|
|
1409
1602
|
results.push({ text: chunk.text, pos: chunk.pos, tokens: tokens.length });
|
|
@@ -1415,6 +1608,8 @@ export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKE
|
|
|
1415
1608
|
const safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95); // 5% safety margin
|
|
1416
1609
|
const subChunks = chunkDocument(chunk.text, safeMaxChars, Math.floor(overlapChars * actualCharsPerToken / 2), Math.floor(windowChars * actualCharsPerToken / 2));
|
|
1417
1610
|
for (const subChunk of subChunks) {
|
|
1611
|
+
if (signal?.aborted)
|
|
1612
|
+
break;
|
|
1418
1613
|
const subTokens = await llm.tokenize(subChunk.text);
|
|
1419
1614
|
results.push({
|
|
1420
1615
|
text: subChunk.text,
|
|
@@ -1523,7 +1718,7 @@ export function matchFilesByGlob(db, pattern) {
|
|
|
1523
1718
|
`).all();
|
|
1524
1719
|
const isMatch = picomatch(pattern);
|
|
1525
1720
|
return allFiles
|
|
1526
|
-
.filter(f => isMatch(f.virtual_path) || isMatch(f.path))
|
|
1721
|
+
.filter(f => isMatch(f.virtual_path) || isMatch(f.path) || isMatch(f.collection + '/' + f.path))
|
|
1527
1722
|
.map(f => ({
|
|
1528
1723
|
filepath: f.virtual_path, // Virtual path for precise lookup
|
|
1529
1724
|
displayPath: f.path, // Relative path for display
|
|
@@ -1874,8 +2069,23 @@ export function getTopLevelPathsWithoutContext(db, collectionName) {
|
|
|
1874
2069
|
// =============================================================================
|
|
1875
2070
|
// FTS Search
|
|
1876
2071
|
// =============================================================================
|
|
1877
|
-
function sanitizeFTS5Term(term) {
|
|
1878
|
-
return term.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase();
|
|
2072
|
+
/**
 * Strip a search term down to Unicode letters, Unicode numbers, apostrophes
 * and underscores, then lower-case what is left.
 *
 * @param {string} term - Raw term.
 * @returns {string} The cleaned, lower-cased term (possibly empty).
 */
export function sanitizeFTS5Term(term) {
    const disallowedChars = /[^\p{L}\p{N}'_]/gu;
    const kept = term.replace(disallowedChars, '');
    return kept.toLowerCase();
}
|
|
2075
|
+
/**
 * Check whether a token is a hyphenated compound word (e.g. multi-agent,
 * DEC-0054, gpt-4).
 *
 * The token must start with a letter or number and contain at least one
 * hyphen that is immediately followed by a letter or number; apart from that
 * only letters, numbers, apostrophes and hyphens are allowed. A token with a
 * leading hyphen (such as -sports) therefore does not match.
 *
 * @param {string} token - Token to test.
 * @returns {boolean} True when the whole token matches the hyphenated pattern.
 */
function isHyphenatedToken(token) {
    const hyphenatedCompound = /^[\p{L}\p{N}][\p{L}\p{N}'-]*-[\p{L}\p{N}][\p{L}\p{N}'-]*$/u;
    return hyphenatedCompound.test(token);
}
|
|
2082
|
+
/**
 * Turn a hyphenated term into the text of an FTS5 phrase: split on hyphens,
 * sanitize each piece with sanitizeFTS5Term, drop pieces that end up empty,
 * and join the rest with single spaces. The result is meant to be placed
 * inside FTS5 quotes, e.g. "multi agent" for multi-agent.
 *
 * @param {string} term - Hyphenated term.
 * @returns {string} Space-separated sanitized pieces ("" when none survive).
 */
function sanitizeHyphenatedTerm(term) {
    const cleanedPieces = [];
    for (const piece of term.split('-')) {
        const cleaned = sanitizeFTS5Term(piece);
        if (cleaned) {
            cleanedPieces.push(cleaned);
        }
    }
    return cleanedPieces.join(' ');
}
|
|
1880
2090
|
/**
|
|
1881
2091
|
* Parse lex query syntax into FTS5 query.
|
|
@@ -1883,14 +2093,23 @@ function sanitizeFTS5Term(term) {
|
|
|
1883
2093
|
* Supports:
|
|
1884
2094
|
* - Quoted phrases: "exact phrase" → "exact phrase" (exact match)
|
|
1885
2095
|
* - Negation: -term or -"phrase" → uses FTS5 NOT operator
|
|
2096
|
+
* - Hyphenated tokens: multi-agent, DEC-0054, gpt-4 → treated as phrases
|
|
1886
2097
|
* - Plain terms: term → "term"* (prefix match)
|
|
1887
2098
|
*
|
|
1888
2099
|
* FTS5 NOT is a binary operator: `term1 NOT term2` means "match term1 but not term2".
|
|
1889
2100
|
* So `-term` only works when there are also positive terms.
|
|
1890
2101
|
*
|
|
2102
|
+
* Hyphen disambiguation: `-sports` at a word boundary is negation, but `multi-agent`
|
|
2103
|
+
* (where `-` is between word characters) is treated as a hyphenated phrase.
|
|
2104
|
+
* When a leading `-` is followed by what looks like a hyphenated compound word
|
|
2105
|
+
* (e.g., `-multi-agent`), the entire token is treated as a negated phrase.
|
|
2106
|
+
*
|
|
1891
2107
|
* Examples:
|
|
1892
2108
|
* performance -sports → "performance"* NOT "sports"*
|
|
1893
2109
|
* "machine learning" → "machine learning"
|
|
2110
|
+
* multi-agent memory → "multi agent" AND "memory"*
|
|
2111
|
+
* DEC-0054 → "dec 0054"
|
|
2112
|
+
* -multi-agent → NOT "multi agent"
|
|
1894
2113
|
*/
|
|
1895
2114
|
function buildFTS5Query(query) {
|
|
1896
2115
|
const positive = [];
|
|
@@ -1934,14 +2153,30 @@ function buildFTS5Query(query) {
|
|
|
1934
2153
|
while (i < s.length && !/[\s"]/.test(s[i]))
|
|
1935
2154
|
i++;
|
|
1936
2155
|
const term = s.slice(start, i);
|
|
1937
|
-
|
|
1938
|
-
|
|
1939
|
-
|
|
1940
|
-
|
|
1941
|
-
|
|
2156
|
+
// Handle hyphenated tokens: multi-agent, DEC-0054, gpt-4
|
|
2157
|
+
// These get split into phrase queries so FTS5 porter tokenizer matches them.
|
|
2158
|
+
if (isHyphenatedToken(term)) {
|
|
2159
|
+
const sanitized = sanitizeHyphenatedTerm(term);
|
|
2160
|
+
if (sanitized) {
|
|
2161
|
+
const ftsPhrase = `"${sanitized}"`; // Phrase match (no prefix)
|
|
2162
|
+
if (negated) {
|
|
2163
|
+
negative.push(ftsPhrase);
|
|
2164
|
+
}
|
|
2165
|
+
else {
|
|
2166
|
+
positive.push(ftsPhrase);
|
|
2167
|
+
}
|
|
1942
2168
|
}
|
|
1943
|
-
|
|
1944
|
-
|
|
2169
|
+
}
|
|
2170
|
+
else {
|
|
2171
|
+
const sanitized = sanitizeFTS5Term(term);
|
|
2172
|
+
if (sanitized) {
|
|
2173
|
+
const ftsTerm = `"${sanitized}"*`; // Prefix match
|
|
2174
|
+
if (negated) {
|
|
2175
|
+
negative.push(ftsTerm);
|
|
2176
|
+
}
|
|
2177
|
+
else {
|
|
2178
|
+
positive.push(ftsTerm);
|
|
2179
|
+
}
|
|
1945
2180
|
}
|
|
1946
2181
|
}
|
|
1947
2182
|
}
|
|
@@ -1984,26 +2219,42 @@ export function searchFTS(db, query, limit = 20, collectionName) {
|
|
|
1984
2219
|
const ftsQuery = buildFTS5Query(query);
|
|
1985
2220
|
if (!ftsQuery)
|
|
1986
2221
|
return [];
|
|
2222
|
+
// Use a CTE to force FTS5 to run first, then filter by collection.
|
|
2223
|
+
// Without the CTE, SQLite's query planner combines FTS5 MATCH with the
|
|
2224
|
+
// collection filter in a single WHERE clause, which can cause it to
|
|
2225
|
+
// abandon the FTS5 index and fall back to a full scan — turning an 8ms
|
|
2226
|
+
// query into a 17-second query on large collections.
|
|
2227
|
+
const params = [ftsQuery];
|
|
2228
|
+
// When filtering by collection, fetch extra candidates from the FTS index
|
|
2229
|
+
// since some will be filtered out. Without a collection filter we can
|
|
2230
|
+
// fetch exactly the requested limit.
|
|
2231
|
+
const ftsLimit = collectionName ? limit * 10 : limit;
|
|
1987
2232
|
let sql = `
|
|
2233
|
+
WITH fts_matches AS (
|
|
2234
|
+
SELECT rowid, bm25(documents_fts, 1.5, 4.0, 1.0) as bm25_score
|
|
2235
|
+
FROM documents_fts
|
|
2236
|
+
WHERE documents_fts MATCH ?
|
|
2237
|
+
ORDER BY bm25_score ASC
|
|
2238
|
+
LIMIT ${ftsLimit}
|
|
2239
|
+
)
|
|
1988
2240
|
SELECT
|
|
1989
2241
|
'qmd://' || d.collection || '/' || d.path as filepath,
|
|
1990
2242
|
d.collection || '/' || d.path as display_path,
|
|
1991
2243
|
d.title,
|
|
1992
2244
|
content.doc as body,
|
|
1993
2245
|
d.hash,
|
|
1994
|
-
|
|
1995
|
-
FROM
|
|
1996
|
-
JOIN documents d ON d.id =
|
|
2246
|
+
fm.bm25_score
|
|
2247
|
+
FROM fts_matches fm
|
|
2248
|
+
JOIN documents d ON d.id = fm.rowid
|
|
1997
2249
|
JOIN content ON content.hash = d.hash
|
|
1998
|
-
WHERE
|
|
2250
|
+
WHERE d.active = 1
|
|
1999
2251
|
`;
|
|
2000
|
-
const params = [ftsQuery];
|
|
2001
2252
|
if (collectionName) {
|
|
2002
2253
|
sql += ` AND d.collection = ?`;
|
|
2003
2254
|
params.push(String(collectionName));
|
|
2004
2255
|
}
|
|
2005
2256
|
// bm25 lower is better; sort ascending.
|
|
2006
|
-
sql += ` ORDER BY bm25_score ASC LIMIT ?`;
|
|
2257
|
+
sql += ` ORDER BY fm.bm25_score ASC LIMIT ?`;
|
|
2007
2258
|
params.push(limit);
|
|
2008
2259
|
const rows = db.prepare(sql).all(...params);
|
|
2009
2260
|
return rows.map(row => {
|
|
@@ -2143,13 +2394,23 @@ export function clearAllEmbeddings(db) {
|
|
|
2143
2394
|
/**
 * Store one chunk embedding: a metadata row in content_vectors and the vector
 * itself in vectors_vec, keyed there by the string "hash_seq" (`${hash}_${seq}`).
 *
 * Write order: the content_vectors row goes in before vectors_vec is touched.
 * getHashesForEmbedding checks only content_vectors, so a crash between the
 * two writes will not cause the hash to be re-selected for embedding.
 *
 * vectors_vec is written as DELETE followed by INSERT rather than
 * INSERT OR REPLACE, because sqlite-vec's vec0 virtual tables silently ignore
 * the OR REPLACE conflict clause.
 *
 * @param db - Database handle; must provide prepare(sql) returning a statement with run(...).
 * @param hash - Content hash; first half of the vectors_vec key.
 * @param seq - Sequence number within the hash; second half of the vectors_vec key.
 * @param pos - Value stored in the content_vectors.pos column.
 * @param embedding - Value bound to the vectors_vec.embedding column.
 * @param model - Value stored in the content_vectors.model column.
 * @param embeddedAt - Value stored in the content_vectors.embedded_at column.
 */
export function insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt) {
    const vecKey = `${hash}_${seq}`;
    // Metadata row first: this is the crash-safe ordering described above.
    db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`)
        .run(hash, seq, pos, model, embeddedAt);
    // Prepare both vectors_vec statements before running either of them.
    const [removeOld, addNew] = [
        `DELETE FROM vectors_vec WHERE hash_seq = ?`,
        `INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`,
    ].map(sql => db.prepare(sql));
    // vec0 has no working OR REPLACE, so clear any existing row, then insert.
    removeOld.run(vecKey);
    addNew.run(vecKey, embedding);
}
|
|
2154
2415
|
// =============================================================================
|
|
2155
2416
|
// Query expansion
|
|
@@ -2484,7 +2745,7 @@ export function getDocumentBody(db, doc, fromLine, maxLines) {
|
|
|
2484
2745
|
* Returns documents without body by default (use getDocumentBody to load)
|
|
2485
2746
|
*/
|
|
2486
2747
|
export function findDocuments(db, pattern, options = {}) {
|
|
2487
|
-
const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?');
|
|
2748
|
+
const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?') && !pattern.includes('{');
|
|
2488
2749
|
const errors = [];
|
|
2489
2750
|
const maxBytes = options.maxBytes ?? DEFAULT_MULTI_GET_MAX_BYTES;
|
|
2490
2751
|
const bodyCol = options.includeBody ? `, content.doc as body` : ``;
|
|
@@ -2817,7 +3078,7 @@ export async function hybridQuery(store, query, options) {
|
|
|
2817
3078
|
}
|
|
2818
3079
|
// Batch embed all vector queries in a single call
|
|
2819
3080
|
const llm = getLlm(store);
|
|
2820
|
-
const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text));
|
|
3081
|
+
const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text, llm.embedModelName));
|
|
2821
3082
|
hooks?.onEmbedStart?.(textsToEmbed.length);
|
|
2822
3083
|
const embedStart = Date.now();
|
|
2823
3084
|
const embeddings = await llm.embedBatch(textsToEmbed);
|
|
@@ -2855,8 +3116,9 @@ export async function hybridQuery(store, query, options) {
|
|
|
2855
3116
|
const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 2);
|
|
2856
3117
|
const intentTerms = intent ? extractIntentTerms(intent) : [];
|
|
2857
3118
|
const docChunkMap = new Map();
|
|
3119
|
+
const chunkStrategy = options?.chunkStrategy;
|
|
2858
3120
|
for (const cand of candidates) {
|
|
2859
|
-
const chunks =
|
|
3121
|
+
const chunks = await chunkDocumentAsync(cand.body, undefined, undefined, undefined, cand.file, chunkStrategy);
|
|
2860
3122
|
if (chunks.length === 0)
|
|
2861
3123
|
continue;
|
|
2862
3124
|
// Pick chunk with most keyword overlap (fallback: first chunk)
|
|
@@ -3128,7 +3390,7 @@ export async function structuredSearch(store, searches, options) {
|
|
|
3128
3390
|
const vecSearches = searches.filter((s) => s.type === 'vec' || s.type === 'hyde');
|
|
3129
3391
|
if (vecSearches.length > 0) {
|
|
3130
3392
|
const llm = getLlm(store);
|
|
3131
|
-
const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query));
|
|
3393
|
+
const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query, llm.embedModelName));
|
|
3132
3394
|
hooks?.onEmbedStart?.(textsToEmbed.length);
|
|
3133
3395
|
const embedStart = Date.now();
|
|
3134
3396
|
const embeddings = await llm.embedBatch(textsToEmbed);
|
|
@@ -3174,8 +3436,9 @@ export async function structuredSearch(store, searches, options) {
|
|
|
3174
3436
|
const queryTerms = primaryQuery.toLowerCase().split(/\s+/).filter(t => t.length > 2);
|
|
3175
3437
|
const intentTerms = intent ? extractIntentTerms(intent) : [];
|
|
3176
3438
|
const docChunkMap = new Map();
|
|
3439
|
+
const ssChunkStrategy = options?.chunkStrategy;
|
|
3177
3440
|
for (const cand of candidates) {
|
|
3178
|
-
const chunks =
|
|
3441
|
+
const chunks = await chunkDocumentAsync(cand.body, undefined, undefined, undefined, cand.file, ssChunkStrategy);
|
|
3179
3442
|
if (chunks.length === 0)
|
|
3180
3443
|
continue;
|
|
3181
3444
|
// Pick chunk with most keyword overlap
|