@tobilu/qmd 2.0.1 → 2.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +177 -0
- package/README.md +64 -1
- package/bin/qmd +49 -4
- package/dist/ast.d.ts +65 -0
- package/dist/ast.js +334 -0
- package/dist/bench/bench.d.ts +23 -0
- package/dist/bench/bench.js +280 -0
- package/dist/bench/score.d.ts +33 -0
- package/dist/bench/score.js +88 -0
- package/dist/bench/types.d.ts +80 -0
- package/dist/bench/types.js +8 -0
- package/dist/cli/formatter.js +5 -1
- package/dist/cli/qmd.d.ts +27 -0
- package/dist/cli/qmd.js +1328 -115
- package/dist/collections.d.ts +20 -0
- package/dist/collections.js +32 -7
- package/dist/db.d.ts +14 -3
- package/dist/db.js +45 -4
- package/dist/index.d.ts +11 -1
- package/dist/index.js +18 -5
- package/dist/llm.d.ts +77 -6
- package/dist/llm.js +445 -62
- package/dist/mcp/server.d.ts +6 -3
- package/dist/mcp/server.js +68 -29
- package/dist/paths.d.ts +1 -0
- package/dist/paths.js +4 -0
- package/dist/store.d.ts +148 -23
- package/dist/store.js +1018 -255
- package/package.json +48 -20
- package/scripts/build.mjs +29 -0
- package/scripts/check-package-grammars.mjs +29 -0
- package/scripts/package-smoke.mjs +65 -0
- package/scripts/test-all.mjs +27 -0
- package/skills/qmd/SKILL.md +203 -0
- package/skills/qmd/references/mcp-setup.md +102 -0
- package/skills/release/SKILL.md +139 -0
- package/skills/release/scripts/install-hooks.sh +38 -0
- package/dist/embedded-skills.d.ts +0 -6
- package/dist/embedded-skills.js +0 -14
package/dist/store.js
CHANGED
|
@@ -16,16 +16,21 @@ import { createHash } from "crypto";
|
|
|
16
16
|
import { readFileSync, realpathSync, statSync, mkdirSync } from "node:fs";
|
|
17
17
|
// Note: node:path resolve is not imported — we export our own cross-platform resolve()
|
|
18
18
|
import fastGlob from "fast-glob";
|
|
19
|
-
import {
|
|
19
|
+
import { qmdHomedir } from "./paths.js";
|
|
20
|
+
import { LlamaCpp, getDefaultLlamaCpp, formatQueryForEmbedding, formatDocForEmbedding, withLLMSessionForLlm, DEFAULT_EMBED_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, } from "./llm.js";
|
|
20
21
|
// =============================================================================
|
|
21
22
|
// Configuration
|
|
22
23
|
// =============================================================================
|
|
23
|
-
const
|
|
24
|
-
export const
|
|
25
|
-
export const
|
|
26
|
-
export const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";
|
|
24
|
+
export const DEFAULT_EMBED_MODEL = DEFAULT_EMBED_MODEL_URI;
|
|
25
|
+
export const DEFAULT_RERANK_MODEL = DEFAULT_RERANK_MODEL_URI;
|
|
26
|
+
export const DEFAULT_QUERY_MODEL = DEFAULT_GENERATE_MODEL_URI;
|
|
27
27
|
export const DEFAULT_GLOB = "**/*.md";
|
|
28
28
|
export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024; // 10KB
|
|
29
|
+
export const DEFAULT_EMBED_MAX_DOCS_PER_BATCH = 64;
|
|
30
|
+
export const DEFAULT_EMBED_MAX_BATCH_BYTES = 64 * 1024 * 1024; // 64MB
|
|
31
|
+
const EMBED_FINGERPRINT_PROBE_QUERY = "__qmd_embedding_query_probe__";
|
|
32
|
+
const EMBED_FINGERPRINT_PROBE_TITLE = "__qmd_embedding_title_probe__";
|
|
33
|
+
const EMBED_FINGERPRINT_PROBE_DOC = "__qmd_embedding_document_probe__";
|
|
29
34
|
// Chunking: 900 tokens per chunk with 15% overlap
|
|
30
35
|
// Increased from 800 to accommodate smart chunking finding natural break points
|
|
31
36
|
export const CHUNK_SIZE_TOKENS = 900;
|
|
@@ -36,6 +41,16 @@ export const CHUNK_OVERLAP_CHARS = CHUNK_OVERLAP_TOKENS * 4; // 540 chars
|
|
|
36
41
|
// Search window for finding optimal break points (in tokens, ~200 tokens)
|
|
37
42
|
export const CHUNK_WINDOW_TOKENS = 200;
|
|
38
43
|
export const CHUNK_WINDOW_CHARS = CHUNK_WINDOW_TOKENS * 4; // 800 chars
|
|
44
|
+
/**
 * Derive a short fingerprint describing how text is embedded for `model`.
 *
 * The fingerprint hashes the model identifier, the strings the query and
 * document formatters produce for fixed probe inputs, and the chunk
 * size/overlap token settings, so it changes whenever any of those change.
 *
 * @param {string} [model=DEFAULT_EMBED_MODEL] - Embedding model identifier.
 * @returns {string} The first 6 hex characters of the SHA-256 digest.
 */
export function getEmbeddingFingerprint(model = DEFAULT_EMBED_MODEL) {
    const formattedQuery = formatQueryForEmbedding(EMBED_FINGERPRINT_PROBE_QUERY, model);
    const formattedDoc = formatDocForEmbedding(EMBED_FINGERPRINT_PROBE_DOC, EMBED_FINGERPRINT_PROBE_TITLE, model);
    const parts = [];
    parts.push(`model:${model}`);
    parts.push(`query:${formattedQuery}`);
    parts.push(`doc:${formattedDoc}`);
    parts.push(`chunk_tokens:${CHUNK_SIZE_TOKENS}`);
    parts.push(`chunk_overlap_tokens:${CHUNK_OVERLAP_TOKENS}`);
    const hasher = createHash("sha256");
    hasher.update(parts.join("\n"));
    // Only a short prefix of the digest is kept.
    return hasher.digest("hex").slice(0, 6);
}
|
|
39
54
|
/**
|
|
40
55
|
* Get the LlamaCpp instance for a store — prefers the store's own instance,
|
|
41
56
|
* falls back to the global singleton.
|
|
@@ -161,6 +176,60 @@ export function findBestCutoff(breakPoints, targetCharPos, windowChars = CHUNK_W
|
|
|
161
176
|
}
|
|
162
177
|
return bestPos;
|
|
163
178
|
}
|
|
179
|
+
/**
 * Combine two break-point lists (for example regex-derived and AST-derived)
 * into one list with at most one entry per position.
 *
 * When several entries share a `pos`, the one with the strictly highest
 * `score` wins; on a tie the entry seen first (from `a`, then `b`) is kept.
 *
 * @param {Iterable<{pos: number, score: number}>} a - First set of break points.
 * @param {Iterable<{pos: number, score: number}>} b - Second set of break points.
 * @returns {Array<{pos: number, score: number}>} Winning entries sorted by ascending `pos`.
 */
export function mergeBreakPoints(a, b) {
    const bestByPos = new Map();
    for (const source of [a, b]) {
        for (const candidate of source) {
            const current = bestByPos.get(candidate.pos);
            // Keep the incumbent unless the candidate is strictly better.
            if (current && !(candidate.score > current.score)) {
                continue;
            }
            bestByPos.set(candidate.pos, candidate);
        }
    }
    return [...bestByPos.values()].sort((left, right) => left.pos - right.pos);
}
|
|
199
|
+
/**
 * Core chunk algorithm that operates on precomputed break points and code fences.
 * This is the shared implementation used by both regex-only and AST-aware chunking.
 *
 * Content that already fits in `maxChars` is returned as a single chunk.
 * Otherwise the content is walked front to back: each chunk targets
 * `maxChars` characters, and `findBestCutoff` is asked for a better cut
 * position near that target. Consecutive chunks overlap by `overlapChars`
 * unless that would not move past the start of the previous chunk.
 *
 * @param {string} content - Full document text.
 * @param {Array} breakPoints - Candidate break points, passed through to `findBestCutoff`.
 * @param {Array} codeFences - Code fence regions, passed through to `findBestCutoff`.
 * @param {number} [maxChars=CHUNK_SIZE_CHARS] - Target maximum chunk length in characters; must be positive when the content needs splitting.
 * @param {number} [overlapChars=CHUNK_OVERLAP_CHARS] - Characters shared between consecutive chunks.
 * @param {number} [windowChars=CHUNK_WINDOW_CHARS] - Search window passed to `findBestCutoff`.
 * @returns {Array<{text: string, pos: number}>} Chunks with their start offset in `content`.
 * @throws {RangeError} If the content must be split but `maxChars` is not a positive number.
 */
export function chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS) {
    if (content.length <= maxChars) {
        return [{ text: content, pos: 0 }];
    }
    // A zero or negative size can never make progress (the loop would emit
    // empty chunks forever), and NaN would silently drop the whole document.
    if (!(maxChars > 0)) {
        throw new RangeError(`maxChars must be a positive number, received ${maxChars}`);
    }
    const chunks = [];
    let charPos = 0;
    while (charPos < content.length) {
        const chunkStart = charPos;
        const targetEndPos = Math.min(chunkStart + maxChars, content.length);
        let endPos = targetEndPos;
        if (endPos < content.length) {
            // 0.7 is forwarded as findBestCutoff's fourth argument
            // (NOTE(review): presumably its decay factor — confirm there).
            const bestCutoff = findBestCutoff(breakPoints, targetEndPos, windowChars, 0.7, codeFences);
            // Only accept a cutoff that makes progress and does not exceed the target.
            if (bestCutoff > chunkStart && bestCutoff <= targetEndPos) {
                endPos = bestCutoff;
            }
        }
        // With maxChars > 0, endPos is always past chunkStart here.
        chunks.push({ text: content.slice(chunkStart, endPos), pos: chunkStart });
        if (endPos >= content.length) {
            break;
        }
        // Step back to create overlap, but never to (or before) the start of
        // the chunk just emitted, which would stall the loop.
        charPos = endPos - overlapChars;
        if (charPos <= chunkStart) {
            charPos = endPos;
        }
    }
    return chunks;
}
|
|
164
233
|
// Hybrid query: strong BM25 signal detection thresholds
|
|
165
234
|
// Skip expensive LLM expansion when top result is strong AND clearly separated from runner-up
|
|
166
235
|
export const STRONG_SIGNAL_MIN_SCORE = 0.85;
|
|
@@ -172,7 +241,7 @@ export const RERANK_CANDIDATE_LIMIT = 40;
|
|
|
172
241
|
// Path utilities
|
|
173
242
|
// =============================================================================
|
|
174
243
|
/**
 * Return the home directory used by qmd.
 *
 * Thin wrapper that delegates to `qmdHomedir()` from `./paths.js`.
 *
 * @returns {*} Whatever `qmdHomedir()` returns.
 */
export function homedir() {
    const home = qmdHomedir();
    return home;
}
|
|
177
246
|
/**
|
|
178
247
|
* Check if a path is absolute.
|
|
@@ -191,7 +260,8 @@ export function isAbsolutePath(path) {
|
|
|
191
260
|
if (path.startsWith('/')) {
|
|
192
261
|
// Check if it's a Git Bash style path like /c/ or /c/Users (C-Z only, not A or B)
|
|
193
262
|
// Requires path[2] === '/' to distinguish from Unix paths like /c or /cache
|
|
194
|
-
|
|
263
|
+
// Skipped on WSL where /c/ is a valid drvfs mount point, not a drive letter
|
|
264
|
+
if (!isWSL() && path.length >= 3 && path[2] === '/') {
|
|
195
265
|
const driveLetter = path[1];
|
|
196
266
|
if (driveLetter && /[c-zC-Z]/.test(driveLetter)) {
|
|
197
267
|
return true;
|
|
@@ -213,6 +283,13 @@ export function isAbsolutePath(path) {
|
|
|
213
283
|
/**
 * Convert every backslash in `path` to a forward slash.
 *
 * @param {string} path - Path that may contain Windows-style separators.
 * @returns {string} The same path using only `/` as separator.
 */
export function normalizePathSeparators(path) {
    const segments = path.split('\\');
    return segments.join('/');
}
|
|
286
|
+
/**
 * Report whether the process appears to run inside WSL (Windows Subsystem
 * for Linux), based on the WSL_DISTRO_NAME and WSL_INTEROP environment
 * variables. On WSL, paths like /c/work/... are valid drvfs mount points,
 * not Git Bash drive paths.
 *
 * @returns {boolean} True when either variable is set to a non-empty value.
 */
function isWSL() {
    const { WSL_DISTRO_NAME: distroName, WSL_INTEROP: interop } = process.env;
    return Boolean(distroName || interop);
}
|
|
216
293
|
/**
|
|
217
294
|
* Get the relative path from a prefix.
|
|
218
295
|
* Returns null if path is not under prefix.
|
|
@@ -256,8 +333,9 @@ export function resolve(...paths) {
|
|
|
256
333
|
windowsDrive = firstPath.slice(0, 2);
|
|
257
334
|
result = firstPath.slice(2);
|
|
258
335
|
}
|
|
259
|
-
else if (firstPath.startsWith('/') && firstPath.length >= 3 && firstPath[2] === '/') {
|
|
336
|
+
else if (!isWSL() && firstPath.startsWith('/') && firstPath.length >= 3 && firstPath[2] === '/') {
|
|
260
337
|
// Git Bash style: /c/ -> C: (C-Z drives only, not A or B)
|
|
338
|
+
// Skipped on WSL where /c/ is a valid drvfs mount point, not a drive letter
|
|
261
339
|
const driveLetter = firstPath[1];
|
|
262
340
|
if (driveLetter && /[c-zC-Z]/.test(driveLetter)) {
|
|
263
341
|
windowsDrive = driveLetter.toUpperCase() + ':';
|
|
@@ -288,8 +366,9 @@ export function resolve(...paths) {
|
|
|
288
366
|
windowsDrive = p.slice(0, 2);
|
|
289
367
|
result = p.slice(2);
|
|
290
368
|
}
|
|
291
|
-
else if (p.startsWith('/') && p.length >= 3 && p[2] === '/') {
|
|
369
|
+
else if (!isWSL() && p.startsWith('/') && p.length >= 3 && p[2] === '/') {
|
|
292
370
|
// Git Bash style (C-Z drives only, not A or B)
|
|
371
|
+
// Skipped on WSL where /c/ is a valid drvfs mount point, not a drive letter
|
|
293
372
|
const driveLetter = p[1];
|
|
294
373
|
if (driveLetter && /[c-zC-Z]/.test(driveLetter)) {
|
|
295
374
|
windowsDrive = driveLetter.toUpperCase() + ':';
|
|
@@ -332,6 +411,10 @@ let _productionMode = false;
|
|
|
332
411
|
/**
 * Turn on production mode by setting the module-level `_productionMode`
 * flag to true. The flag stays set until explicitly reset.
 */
export function enableProductionMode() {
    _productionMode = true;
}
|
|
414
|
+
/**
 * Clear the module-level `_productionMode` flag again.
 * Intended for test suites only.
 */
export function _resetProductionModeForTesting() {
    _productionMode = false;
}
|
|
335
418
|
export function getDefaultDbPath(indexName = "index") {
|
|
336
419
|
// Always allow override via INDEX_PATH (for testing)
|
|
337
420
|
if (process.env.INDEX_PATH) {
|
|
@@ -398,21 +481,25 @@ export function normalizeVirtualPath(input) {
|
|
|
398
481
|
/**
 * Parse a virtual path of the form `qmd://collection[/path][?index=name]`.
 *
 * The input is normalized with `normalizeVirtualPath` first. The text is
 * split on `?`: the first piece is the path, the second piece (if any) is
 * treated as the query string, and any further pieces are ignored.
 *
 * @param {string} virtualPath - Virtual path to parse.
 * @returns {{collectionName: string, path: string, indexName?: string} | null}
 *   Parsed parts, or null when the path part is not a `qmd://` path with a
 *   collection name. `path` is the empty string for the collection root.
 *   `indexName` is present only when the `index` query parameter has a
 *   non-empty value after trimming.
 */
export function parseVirtualPath(virtualPath) {
    const normalized = normalizeVirtualPath(virtualPath);
    const pieces = normalized.split("?");
    const pathPart = pieces[0] ?? normalized;
    const queryString = pieces[1] ?? "";
    // Accepts qmd://name, qmd://name/ and qmd://name/path
    const match = pathPart.match(/^qmd:\/\/([^\/]+)\/?(.*)$/);
    if (!match?.[1]) {
        return null;
    }
    const parsed = {
        collectionName: match[1],
        path: match[2] ?? '',
    };
    const rawIndex = new URLSearchParams(queryString).get("index");
    const indexName = rawIndex === null ? "" : rawIndex.trim();
    if (indexName) {
        parsed.indexName = indexName;
    }
    return parsed;
}
|
|
411
497
|
/**
 * Compose a `qmd://` virtual path from a collection name and a path
 * relative to that collection.
 *
 * @param {string} collectionName - Name of the collection.
 * @param {string} path - Path inside the collection (inserted as-is).
 * @param {string} [indexName] - Optional index name; when truthy it is
 *   appended as a URI-encoded `?index=` query parameter.
 * @returns {string} The virtual path.
 */
export function buildVirtualPath(collectionName, path, indexName) {
    const virtualPath = `qmd://${collectionName}/${path}`;
    if (!indexName) {
        return virtualPath;
    }
    return `${virtualPath}?index=${encodeURIComponent(indexName)}`;
}
|
|
417
504
|
/**
|
|
418
505
|
* Check if a path is explicitly a virtual path.
|
|
@@ -482,6 +569,7 @@ function createSqliteVecUnavailableError(reason) {
|
|
|
482
569
|
"Install Homebrew SQLite so the sqlite-vec extension can be loaded, " +
|
|
483
570
|
"and set BREW_PREFIX if Homebrew is installed in a non-standard location.");
|
|
484
571
|
}
|
|
572
|
+
let _sqliteVecUnavailableReason = null;
|
|
485
573
|
/**
 * Extract a printable message from a caught value.
 *
 * @param {unknown} err - Anything that was thrown.
 * @returns {string} `err.message` for Error instances, otherwise `String(err)`.
 */
function getErrorMessage(err) {
    if (err instanceof Error) {
        return err.message;
    }
    return String(err);
}
|
|
@@ -498,15 +586,76 @@ export function verifySqliteVecLoaded(db) {
|
|
|
498
586
|
}
|
|
499
587
|
}
|
|
500
588
|
let _sqliteVecAvailable = null;
|
|
589
|
+
// Matches a single Han, Hiragana, Katakana or Hangul character.
const CJK_CHAR_PATTERN = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]/u;
// Matches every maximal run of such characters.
const CJK_RUN_PATTERN = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]+/gu;
// Marker value stored in store_config once the FTS index has been rebuilt
// with CJK-normalized text.
const FTS_CJK_NORMALIZED_VERSION = "1";
/**
 * Space out CJK text so FTS5 can index it character by character.
 *
 * FTS5's unicode61 tokenizer does not segment CJK text into searchable
 * words. Each run of CJK characters is replaced by the same characters
 * separated by single spaces, with one extra space on each side of the run.
 * Exact CJK queries can then be translated into phrase queries, while Latin
 * text is left to the default tokenizer.
 *
 * @param {string} text - Text to normalize.
 * @returns {string} Text with every CJK run spaced out.
 */
export function normalizeCjkForFTS(text) {
    const spaceOutRun = (run) => ` ${[...run].join(' ')} `;
    return text.replace(CJK_RUN_PATTERN, (run) => spaceOutRun(run));
}
|
|
600
|
+
/**
 * Check whether `text` contains at least one character matched by
 * `CJK_CHAR_PATTERN`.
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} True if a CJK character is present.
 */
function containsCjk(text) {
    const firstMatch = CJK_CHAR_PATTERN.exec(text);
    return firstMatch !== null;
}
|
|
603
|
+
/**
 * Sanitize a multi-word phrase for use in an FTS5 query.
 *
 * The phrase is CJK-normalized, split on whitespace, each token is passed
 * through `sanitizeFTS5Term`, and tokens that come back empty are dropped.
 *
 * @param {string} phrase - Raw phrase text.
 * @returns {string} The surviving sanitized tokens joined by single spaces.
 */
function sanitizeFTS5Phrase(phrase) {
    const tokens = normalizeCjkForFTS(phrase).split(/\s+/);
    const sanitized = tokens.map(token => sanitizeFTS5Term(token));
    const kept = [];
    for (const term of sanitized) {
        if (term) {
            kept.push(term);
        }
    }
    return kept.join(' ');
}
|
|
610
|
+
/**
 * One-time migration: rebuild `documents_fts` so that filepath, title and
 * body are stored in CJK-normalized form.
 *
 * Does nothing when `store_config` already records the current
 * `fts_cjk_normalized_version`. Otherwise the FTS table is emptied (or
 * dropped and recreated if the delete fails), refilled from the active
 * documents, and the version marker is written.
 *
 * NOTE(review): `normalizeCjkForFTS` calls `.replace` on its argument, so
 * this assumes `title` and `doc` are never NULL — confirm against the schema.
 *
 * @param {object} db - Database handle exposing `prepare`, `exec` and `transaction`.
 */
function rebuildFTSForCjkNormalization(db) {
    const marker = db.prepare(`SELECT value FROM store_config WHERE key = 'fts_cjk_normalized_version'`).get();
    const alreadyNormalized = marker?.value === FTS_CJK_NORMALIZED_VERSION;
    if (alreadyNormalized) {
        return;
    }
    try {
        db.exec(`DELETE FROM documents_fts WHERE rowid >= 0`);
    }
    catch {
        // Some older/corrupt FTS5 shadow-table states can reject bulk deletes even
        // though reads still work. Recreate the virtual table; documents_fts is a
        // derived index, so rebuilding it from documents/content is safe.
        db.exec(`DROP TABLE IF EXISTS documents_fts`);
        db.exec(`
            CREATE VIRTUAL TABLE documents_fts USING fts5(
                filepath, title, body,
                tokenize='porter unicode61'
            )
        `);
    }
    const activeDocs = db.prepare(`
        SELECT d.id, d.collection, d.path, d.title, content.doc as body
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE d.active = 1
    `).all();
    const insertRow = db.prepare(`INSERT INTO documents_fts(rowid, filepath, title, body) VALUES (?, ?, ?, ?)`);
    // Re-insert every active document inside a single transaction.
    const reinsertAll = db.transaction(() => {
        for (const doc of activeDocs) {
            const filepath = normalizeCjkForFTS(`${doc.collection}/${doc.path}`);
            const title = normalizeCjkForFTS(doc.title);
            const body = normalizeCjkForFTS(doc.body);
            insertRow.run(doc.id, filepath, title, body);
        }
    });
    reinsertAll();
    // Record completion last so an interrupted rebuild is retried next time.
    db.prepare(`
        INSERT OR REPLACE INTO store_config(key, value)
        VALUES ('fts_cjk_normalized_version', ?)
    `).run(FTS_CJK_NORMALIZED_VERSION);
}
|
|
501
647
|
function initializeDatabase(db) {
|
|
502
648
|
try {
|
|
503
649
|
loadSqliteVec(db);
|
|
504
650
|
verifySqliteVecLoaded(db);
|
|
505
651
|
_sqliteVecAvailable = true;
|
|
652
|
+
_sqliteVecUnavailableReason = null;
|
|
506
653
|
}
|
|
507
|
-
catch {
|
|
654
|
+
catch (err) {
|
|
508
655
|
// sqlite-vec is optional — vector search won't work but FTS is fine
|
|
509
656
|
_sqliteVecAvailable = false;
|
|
657
|
+
_sqliteVecUnavailableReason = getErrorMessage(err);
|
|
658
|
+
console.warn(_sqliteVecUnavailableReason);
|
|
510
659
|
}
|
|
511
660
|
db.exec("PRAGMA journal_mode = WAL");
|
|
512
661
|
db.exec("PRAGMA foreign_keys = ON");
|
|
@@ -548,19 +697,16 @@ function initializeDatabase(db) {
|
|
|
548
697
|
created_at TEXT NOT NULL
|
|
549
698
|
)
|
|
550
699
|
`);
|
|
551
|
-
// Content vectors
|
|
552
|
-
|
|
553
|
-
const hasSeqColumn = cvInfo.some(col => col.name === 'seq');
|
|
554
|
-
if (cvInfo.length > 0 && !hasSeqColumn) {
|
|
555
|
-
db.exec(`DROP TABLE IF EXISTS content_vectors`);
|
|
556
|
-
db.exec(`DROP TABLE IF EXISTS vectors_vec`);
|
|
557
|
-
}
|
|
700
|
+
// Content vectors. Avoid PRAGMA schema probes during startup; legacy vector
|
|
701
|
+
// columns are repaired lazily when a vector/embedding query first needs them.
|
|
558
702
|
db.exec(`
|
|
559
703
|
CREATE TABLE IF NOT EXISTS content_vectors (
|
|
560
704
|
hash TEXT NOT NULL,
|
|
561
705
|
seq INTEGER NOT NULL DEFAULT 0,
|
|
562
706
|
pos INTEGER NOT NULL DEFAULT 0,
|
|
563
707
|
model TEXT NOT NULL,
|
|
708
|
+
embed_fingerprint TEXT NOT NULL DEFAULT '',
|
|
709
|
+
total_chunks INTEGER NOT NULL DEFAULT 1,
|
|
564
710
|
embedded_at TEXT NOT NULL,
|
|
565
711
|
PRIMARY KEY (hash, seq)
|
|
566
712
|
)
|
|
@@ -591,9 +737,12 @@ function initializeDatabase(db) {
|
|
|
591
737
|
tokenize='porter unicode61'
|
|
592
738
|
)
|
|
593
739
|
`);
|
|
594
|
-
// Triggers
|
|
740
|
+
// Triggers keep FTS in sync for callers that write directly to documents.
|
|
741
|
+
// Production indexing paths rebuild entries in TypeScript so CJK text can be
|
|
742
|
+
// normalized before it reaches the unicode61 tokenizer.
|
|
743
|
+
db.exec(`DROP TRIGGER IF EXISTS documents_ai`);
|
|
595
744
|
db.exec(`
|
|
596
|
-
CREATE TRIGGER
|
|
745
|
+
CREATE TRIGGER documents_ai AFTER INSERT ON documents
|
|
597
746
|
WHEN new.active = 1
|
|
598
747
|
BEGIN
|
|
599
748
|
INSERT INTO documents_fts(rowid, filepath, title, body)
|
|
@@ -605,13 +754,15 @@ function initializeDatabase(db) {
|
|
|
605
754
|
WHERE new.active = 1;
|
|
606
755
|
END
|
|
607
756
|
`);
|
|
757
|
+
db.exec(`DROP TRIGGER IF EXISTS documents_ad`);
|
|
608
758
|
db.exec(`
|
|
609
|
-
CREATE TRIGGER
|
|
759
|
+
CREATE TRIGGER documents_ad AFTER DELETE ON documents BEGIN
|
|
610
760
|
DELETE FROM documents_fts WHERE rowid = old.id;
|
|
611
761
|
END
|
|
612
762
|
`);
|
|
763
|
+
db.exec(`DROP TRIGGER IF EXISTS documents_au`);
|
|
613
764
|
db.exec(`
|
|
614
|
-
CREATE TRIGGER
|
|
765
|
+
CREATE TRIGGER documents_au AFTER UPDATE ON documents
|
|
615
766
|
BEGIN
|
|
616
767
|
-- Delete from FTS if no longer active
|
|
617
768
|
DELETE FROM documents_fts WHERE rowid = old.id AND new.active = 0;
|
|
@@ -626,6 +777,7 @@ function initializeDatabase(db) {
|
|
|
626
777
|
WHERE new.active = 1;
|
|
627
778
|
END
|
|
628
779
|
`);
|
|
780
|
+
rebuildFTSForCjkNormalization(db);
|
|
629
781
|
}
|
|
630
782
|
function rowToNamedCollection(row) {
|
|
631
783
|
return {
|
|
@@ -767,7 +919,7 @@ export function isSqliteVecAvailable() {
|
|
|
767
919
|
}
|
|
768
920
|
function ensureVecTableInternal(db, dimensions) {
|
|
769
921
|
if (!_sqliteVecAvailable) {
|
|
770
|
-
throw
|
|
922
|
+
throw createSqliteVecUnavailableError(_sqliteVecUnavailableReason ?? "vector operations require a SQLite build with extension loading support");
|
|
771
923
|
}
|
|
772
924
|
const tableInfo = db.prepare(`SELECT sql FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
|
|
773
925
|
if (tableInfo) {
|
|
@@ -777,7 +929,10 @@ function ensureVecTableInternal(db, dimensions) {
|
|
|
777
929
|
const existingDims = match?.[1] ? parseInt(match[1], 10) : null;
|
|
778
930
|
if (existingDims === dimensions && hasHashSeq && hasCosine)
|
|
779
931
|
return;
|
|
780
|
-
|
|
932
|
+
if (existingDims !== null && existingDims !== dimensions) {
|
|
933
|
+
throw new Error(`Embedding dimension mismatch: existing vectors are ${existingDims}d but the current model produces ${dimensions}d. ` +
|
|
934
|
+
`Run 'qmd embed -f' to re-embed with the new model.`);
|
|
935
|
+
}
|
|
781
936
|
db.exec("DROP TABLE IF EXISTS vectors_vec");
|
|
782
937
|
}
|
|
783
938
|
db.exec(`CREATE VIRTUAL TABLE vectors_vec USING vec0(hash_seq TEXT PRIMARY KEY, embedding float[${dimensions}] distance_metric=cosine)`);
|
|
@@ -828,7 +983,7 @@ export async function reindexCollection(store, collectionPath, globPattern, coll
|
|
|
828
983
|
}
|
|
829
984
|
const hash = await hashContent(content);
|
|
830
985
|
const title = extractTitle(content, relativeFile);
|
|
831
|
-
const existing =
|
|
986
|
+
const existing = findOrMigrateLegacyDocument(db, collectionName, path);
|
|
832
987
|
if (existing) {
|
|
833
988
|
if (existing.hash === hash) {
|
|
834
989
|
if (existing.title !== title) {
|
|
@@ -867,6 +1022,125 @@ export async function reindexCollection(store, collectionPath, globPattern, coll
|
|
|
867
1022
|
const orphanedCleaned = cleanupOrphanedContent(db);
|
|
868
1023
|
return { indexed, updated, unchanged, removed, orphanedCleaned };
|
|
869
1024
|
}
|
|
1025
|
+
/**
 * Validate an optional numeric option that must be a positive integer.
 *
 * @param {string} name - Option name, used in the error message.
 * @param {*} value - Caller-supplied value; `undefined` means "not set".
 * @param {number} fallback - Value returned when `value` is `undefined`.
 * @returns {number} `value` when valid, otherwise `fallback` for `undefined`.
 * @throws {Error} If `value` is defined but is not an integer >= 1.
 */
function validatePositiveIntegerOption(name, value, fallback) {
    if (value === undefined) {
        return fallback;
    }
    const isPositiveInteger = Number.isInteger(value) && value >= 1;
    if (isPositiveInteger) {
        return value;
    }
    throw new Error(`${name} must be a positive integer`);
}
|
|
1033
|
+
/**
 * Resolve the batching limits for an embedding run.
 *
 * Each limit is taken from `options` when provided (and validated as a
 * positive integer) or falls back to the module default.
 *
 * @param {{maxDocsPerBatch?: number, maxBatchBytes?: number}} [options] - Caller options.
 * @returns {{maxDocsPerBatch: number, maxBatchBytes: number}} Effective limits.
 * @throws {Error} If a provided limit is not a positive integer.
 */
function resolveEmbedOptions(options) {
    const maxDocsPerBatch = validatePositiveIntegerOption("maxDocsPerBatch", options?.maxDocsPerBatch, DEFAULT_EMBED_MAX_DOCS_PER_BATCH);
    const maxBatchBytes = validatePositiveIntegerOption("maxBatchBytes", options?.maxBatchBytes, DEFAULT_EMBED_MAX_BATCH_BYTES);
    return { maxDocsPerBatch, maxBatchBytes };
}
|
|
1039
|
+
// Columns that content_vectors is expected to have, with the definition used
// when a missing one has to be added via ALTER TABLE.
const CONTENT_VECTOR_DESIRED_COLUMNS = [
    { name: "seq", definition: "INTEGER NOT NULL DEFAULT 0" },
    { name: "pos", definition: "INTEGER NOT NULL DEFAULT 0" },
    { name: "model", definition: "TEXT NOT NULL DEFAULT ''" },
    { name: "embed_fingerprint", definition: "TEXT NOT NULL DEFAULT ''" },
    { name: "total_chunks", definition: "INTEGER NOT NULL DEFAULT 1" },
    { name: "embedded_at", definition: "TEXT NOT NULL DEFAULT ''" },
];
/**
 * Decide whether an error is a "missing column" error that names one of the
 * expected content_vectors columns.
 *
 * The column name must appear as a whole identifier. A plain substring check
 * would also fire for unrelated identifiers that merely contain a column
 * name, such as `hash_seq` (contains "seq") or `position` (contains "pos").
 *
 * @param {unknown} error - The caught value.
 * @returns {boolean} True if the message reports a missing column and names
 *   one of `CONTENT_VECTOR_DESIRED_COLUMNS` as a whole word.
 */
function isContentVectorColumnError(error) {
    const message = error instanceof Error ? error.message : String(error);
    if (!/(no such column|has no column named)/i.test(message)) {
        return false;
    }
    // Column names consist only of [a-z_], so they are safe to embed in a
    // pattern. \b treats "_" as a word character, so "seq" does not match
    // inside "hash_seq" but does match in "v.seq".
    return CONTENT_VECTOR_DESIRED_COLUMNS.some(({ name }) => new RegExp(`\\b${name}\\b`).test(message));
}
|
|
1054
|
+
/**
 * Add every expected column to `content_vectors`, tolerating columns that
 * already exist.
 *
 * The series is intentionally idempotent: most columns will already be
 * present, and another caller may have repaired a missing one between the
 * failed query and this ALTER series.
 *
 * @param {object} db - Database handle exposing `exec`.
 * @throws Rethrows any ALTER failure other than "duplicate column name".
 */
function runContentVectorColumnRepairs(db) {
    for (const { name, definition } of CONTENT_VECTOR_DESIRED_COLUMNS) {
        try {
            db.exec(`ALTER TABLE content_vectors ADD COLUMN ${name} ${definition}`);
        }
        catch (error) {
            const message = error instanceof Error ? error.message : String(error);
            const alreadyPresent = message.includes("duplicate column name");
            if (alreadyPresent) {
                continue;
            }
            throw error;
        }
    }
}
|
|
1070
|
+
/**
 * Run `operation`, repairing the `content_vectors` columns and retrying
 * once if it fails because of a missing column.
 *
 * Any other error, and any error from the second attempt, is rethrown
 * unchanged. `operation` is invoked synchronously, so only synchronous
 * throws are intercepted.
 *
 * @param {object} db - Database handle passed to the column repair routine.
 * @param {Function} operation - Zero-argument callback to execute.
 * @returns {*} Whatever `operation` returns.
 */
function withLazyContentVectorMigration(db, operation) {
    try {
        return operation();
    }
    catch (error) {
        if (!isContentVectorColumnError(error)) {
            throw error;
        }
    }
    // First attempt hit a missing content_vectors column: repair, then give
    // the operation exactly one more chance.
    runContentVectorColumnRepairs(db);
    return operation();
}
|
|
1085
|
+
/**
 * List active documents that still need embeddings for `model`.
 *
 * A content hash is pending when it has no `content_vectors` rows for the
 * given model and current embedding fingerprint, or when it has fewer rows
 * than the largest `total_chunks` recorded for it. Results are grouped by
 * hash and ordered by the smallest path of each hash.
 *
 * The query runs through `withLazyContentVectorMigration`, so missing
 * `content_vectors` columns are repaired on demand.
 *
 * @param {object} db - Database handle exposing `prepare`.
 * @param {string} [collection] - When truthy, restrict to this collection.
 * @param {string} [model=DEFAULT_EMBED_MODEL] - Embedding model identifier.
 * @returns {Array<{hash: string, path: string, bytes: number}>} Pending documents.
 */
function getPendingEmbeddingDocs(db, collection, model = DEFAULT_EMBED_MODEL) {
    const collectionFilter = collection ? `AND d.collection = ?` : ``;
    const fingerprint = getEmbeddingFingerprint(model);
    // Bind order follows placeholder order: subquery first, then the filter.
    const params = [model, fingerprint];
    if (collection) {
        params.push(collection);
    }
    const runQuery = () => {
        const stmt = db.prepare(`
            SELECT d.hash, MIN(d.path) as path, length(CAST(c.doc AS BLOB)) as bytes
            FROM documents d
            JOIN content c ON d.hash = c.hash
            LEFT JOIN (
                SELECT hash, model, COUNT(*) AS chunk_count, MAX(total_chunks) AS expected_chunks
                FROM content_vectors
                WHERE model = ? AND embed_fingerprint = ?
                GROUP BY hash, model, embed_fingerprint
            ) v ON d.hash = v.hash
            WHERE d.active = 1
            AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
            ${collectionFilter}
            GROUP BY d.hash
            ORDER BY MIN(d.path)
        `);
        return stmt.all(...params);
    };
    return withLazyContentVectorMigration(db, runQuery);
}
|
|
1108
|
+
/**
 * Split documents into consecutive batches bounded by document count and
 * total byte size.
 *
 * A new batch is started when the current one already holds
 * `maxDocsPerBatch` documents, or when adding the next document would push a
 * non-empty batch past `maxBatchBytes`. A single document larger than
 * `maxBatchBytes` still gets a batch of its own. Input order is preserved.
 *
 * @param {Array<{bytes: number}>} docs - Documents with their size in bytes.
 * @param {number} maxDocsPerBatch - Maximum number of documents per batch.
 * @param {number} maxBatchBytes - Byte budget per batch.
 * @returns {Array<Array<object>>} Batches of the original document objects.
 */
function buildEmbeddingBatches(docs, maxDocsPerBatch, maxBatchBytes) {
    const batches = [];
    let currentBatch = [];
    let currentBytes = 0;
    for (const doc of docs) {
        // A missing or non-numeric size counts as 0. Letting NaN into the
        // running total would make every later byte comparison false and
        // silently disable the byte cap for the rest of the batch.
        const rawBytes = Number(doc.bytes);
        const docBytes = Number.isNaN(rawBytes) ? 0 : Math.max(0, rawBytes);
        const wouldExceedDocs = currentBatch.length >= maxDocsPerBatch;
        const wouldExceedBytes = currentBatch.length > 0 && (currentBytes + docBytes) > maxBatchBytes;
        if (wouldExceedDocs || wouldExceedBytes) {
            batches.push(currentBatch);
            currentBatch = [];
            currentBytes = 0;
        }
        currentBatch.push(doc);
        currentBytes += docBytes;
    }
    if (currentBatch.length > 0) {
        batches.push(currentBatch);
    }
    return batches;
}
|
|
1129
|
+
/**
 * Load the document bodies for one batch of pending documents.
 *
 * Bodies are fetched from the `content` table with a single `IN (...)`
 * query keyed by hash. Each returned entry is a copy of the batch entry
 * with a `body` property added; hashes with no matching row (or a
 * null body) get the empty string.
 *
 * @param {object} db - Database handle exposing `prepare`.
 * @param {Array<{hash: string}>} batch - Batch entries to hydrate.
 * @returns {Array<object>} Batch entries, in order, each with a `body` string.
 */
function getEmbeddingDocsForBatch(db, batch) {
    if (batch.length === 0) {
        return [];
    }
    const placeholders = batch.map(() => "?").join(",");
    const hashes = batch.map(doc => doc.hash);
    const rows = db.prepare(`
        SELECT hash, doc as body
        FROM content
        WHERE hash IN (${placeholders})
    `).all(...hashes);
    const bodyByHash = new Map();
    for (const row of rows) {
        bodyByHash.set(row.hash, row.body);
    }
    const hydrated = [];
    for (const doc of batch) {
        hydrated.push({ ...doc, body: bodyByHash.get(doc.hash) ?? "" });
    }
    return hydrated;
}
|
|
870
1144
|
/**
|
|
871
1145
|
* Generate vector embeddings for documents that need them.
|
|
872
1146
|
* Pure function — no console output, no db lifecycle management.
|
|
@@ -874,104 +1148,238 @@ export async function reindexCollection(store, collectionPath, globPattern, coll
|
|
|
874
1148
|
*/
|
|
875
1149
|
export async function generateEmbeddings(store, options) {
|
|
876
1150
|
const db = store.db;
|
|
877
|
-
const
|
|
1151
|
+
const llm = getLlm(store);
|
|
1152
|
+
const model = options?.model ?? llm.embedModelName ?? DEFAULT_EMBED_MODEL;
|
|
1153
|
+
const fingerprint = getEmbeddingFingerprint(model);
|
|
878
1154
|
const now = new Date().toISOString();
|
|
1155
|
+
const { maxDocsPerBatch, maxBatchBytes } = resolveEmbedOptions(options);
|
|
1156
|
+
const encoder = new TextEncoder();
|
|
879
1157
|
if (options?.force) {
|
|
880
|
-
clearAllEmbeddings(db);
|
|
1158
|
+
clearAllEmbeddings(db, options?.collection);
|
|
881
1159
|
}
|
|
882
|
-
const
|
|
883
|
-
if (
|
|
1160
|
+
const docsToEmbed = getPendingEmbeddingDocs(db, options?.collection, model);
|
|
1161
|
+
if (docsToEmbed.length === 0) {
|
|
884
1162
|
return { docsProcessed: 0, chunksEmbedded: 0, errors: 0, durationMs: 0 };
|
|
885
1163
|
}
|
|
886
|
-
const
|
|
887
|
-
|
|
888
|
-
const encoder = new TextEncoder();
|
|
889
|
-
const bodyBytes = encoder.encode(item.body).length;
|
|
890
|
-
if (bodyBytes === 0)
|
|
891
|
-
continue;
|
|
892
|
-
const title = extractTitle(item.body, item.path);
|
|
893
|
-
const chunks = await chunkDocumentByTokens(item.body);
|
|
894
|
-
for (let seq = 0; seq < chunks.length; seq++) {
|
|
895
|
-
allChunks.push({
|
|
896
|
-
hash: item.hash,
|
|
897
|
-
title,
|
|
898
|
-
text: chunks[seq].text,
|
|
899
|
-
seq,
|
|
900
|
-
pos: chunks[seq].pos,
|
|
901
|
-
tokens: chunks[seq].tokens,
|
|
902
|
-
bytes: encoder.encode(chunks[seq].text).length,
|
|
903
|
-
});
|
|
904
|
-
}
|
|
905
|
-
}
|
|
906
|
-
if (allChunks.length === 0) {
|
|
907
|
-
return { docsProcessed: 0, chunksEmbedded: 0, errors: 0, durationMs: 0 };
|
|
908
|
-
}
|
|
909
|
-
const totalBytes = allChunks.reduce((sum, chk) => sum + chk.bytes, 0);
|
|
910
|
-
const totalChunks = allChunks.length;
|
|
911
|
-
const totalDocs = hashesToEmbed.length;
|
|
1164
|
+
const totalBytes = docsToEmbed.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0);
|
|
1165
|
+
const totalDocs = docsToEmbed.length;
|
|
912
1166
|
const startTime = Date.now();
|
|
913
1167
|
// Use store's LlamaCpp or global singleton, wrapped in a session
|
|
914
|
-
const
|
|
915
|
-
const sessionOptions = { maxDuration: 30 * 60 * 1000, name: 'generateEmbeddings' };
|
|
1168
|
+
const embedModelUri = model;
|
|
916
1169
|
// Create a session manager for this llm instance
|
|
917
1170
|
const result = await withLLMSessionForLlm(llm, async (session) => {
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
if (!firstResult) {
|
|
923
|
-
throw new Error("Failed to get embedding dimensions from first chunk");
|
|
924
|
-
}
|
|
925
|
-
store.ensureVecTable(firstResult.embedding.length);
|
|
926
|
-
let chunksEmbedded = 0, errors = 0, bytesProcessed = 0;
|
|
1171
|
+
let chunksEmbedded = 0;
|
|
1172
|
+
let bytesProcessed = 0;
|
|
1173
|
+
let totalChunks = 0;
|
|
1174
|
+
let vectorTableInitialized = false;
|
|
927
1175
|
const BATCH_SIZE = 32;
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
1176
|
+
const RETRY_AFTER_SUCCESSFUL_CHUNKS = 64;
|
|
1177
|
+
const MAX_RETRY_ATTEMPTS = 3;
|
|
1178
|
+
const failures = new Map();
|
|
1179
|
+
const retryQueue = new Map();
|
|
1180
|
+
let successesSinceRetry = 0;
|
|
1181
|
+
const failureList = () => [...failures.values()];
|
|
1182
|
+
const activeErrorCount = () => failures.size;
|
|
1183
|
+
const chunkKey = (chunk) => `${chunk.hash}:${chunk.seq}`;
|
|
1184
|
+
const reasonFromError = (error) => {
|
|
1185
|
+
const raw = error instanceof Error ? error.message : String(error);
|
|
1186
|
+
return raw.length > 180 ? `${raw.slice(0, 177)}...` : raw;
|
|
1187
|
+
};
|
|
1188
|
+
const recordFailure = (chunk, reason) => {
|
|
1189
|
+
const key = chunkKey(chunk);
|
|
1190
|
+
const previous = failures.get(key);
|
|
1191
|
+
failures.set(key, {
|
|
1192
|
+
path: chunk.path,
|
|
1193
|
+
hash: chunk.hash,
|
|
1194
|
+
seq: chunk.seq,
|
|
1195
|
+
attempts: (previous?.attempts ?? 0) + 1,
|
|
1196
|
+
reason,
|
|
1197
|
+
});
|
|
1198
|
+
retryQueue.set(key, chunk);
|
|
1199
|
+
};
|
|
1200
|
+
const clearFailure = (chunk) => {
|
|
1201
|
+
const key = chunkKey(chunk);
|
|
1202
|
+
failures.delete(key);
|
|
1203
|
+
retryQueue.delete(key);
|
|
1204
|
+
};
|
|
1205
|
+
const tryEmbedChunk = async (chunk) => {
|
|
932
1206
|
try {
|
|
933
|
-
const
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now);
|
|
939
|
-
chunksEmbedded++;
|
|
940
|
-
}
|
|
941
|
-
else {
|
|
942
|
-
errors++;
|
|
943
|
-
}
|
|
944
|
-
bytesProcessed += chunk.bytes;
|
|
1207
|
+
const text = formatDocForEmbedding(chunk.text, chunk.title, embedModelUri);
|
|
1208
|
+
const result = await session.embed(text, { model });
|
|
1209
|
+
if (!result) {
|
|
1210
|
+
recordFailure(chunk, "embedding returned no vector");
|
|
1211
|
+
return false;
|
|
945
1212
|
}
|
|
1213
|
+
insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now, chunk.expectedTotalChunks, fingerprint);
|
|
1214
|
+
chunksEmbedded++;
|
|
1215
|
+
successesSinceRetry++;
|
|
1216
|
+
clearFailure(chunk);
|
|
1217
|
+
return true;
|
|
946
1218
|
}
|
|
947
|
-
catch {
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
1219
|
+
catch (error) {
|
|
1220
|
+
recordFailure(chunk, reasonFromError(error));
|
|
1221
|
+
return false;
|
|
1222
|
+
}
|
|
1223
|
+
};
|
|
1224
|
+
const retryFailedChunks = async (force = false) => {
|
|
1225
|
+
if (!session.isValid || retryQueue.size === 0)
|
|
1226
|
+
return;
|
|
1227
|
+
if (!force && successesSinceRetry < RETRY_AFTER_SUCCESSFUL_CHUNKS)
|
|
1228
|
+
return;
|
|
1229
|
+
successesSinceRetry = 0;
|
|
1230
|
+
// Normal mode: one retry pass after enough unrelated chunks succeeded.
|
|
1231
|
+
// Force mode: we have run out of other chunks for this batch, so keep
|
|
1232
|
+
// retrying outstanding failures until they recover or hit the cap. The
|
|
1233
|
+
// cap prevents endless loops on permanently bad chunks.
|
|
1234
|
+
do {
|
|
1235
|
+
let retried = 0;
|
|
1236
|
+
for (const [key, chunk] of [...retryQueue]) {
|
|
1237
|
+
const failure = failures.get(key);
|
|
1238
|
+
if (!failure || failure.attempts >= MAX_RETRY_ATTEMPTS)
|
|
1239
|
+
continue;
|
|
1240
|
+
retried++;
|
|
1241
|
+
await tryEmbedChunk(chunk);
|
|
1242
|
+
}
|
|
1243
|
+
if (!force || retried === 0)
|
|
1244
|
+
break;
|
|
1245
|
+
} while (session.isValid && [...retryQueue].some(([key]) => {
|
|
1246
|
+
const failure = failures.get(key);
|
|
1247
|
+
return !!failure && failure.attempts < MAX_RETRY_ATTEMPTS;
|
|
1248
|
+
}));
|
|
1249
|
+
};
|
|
1250
|
+
const batches = buildEmbeddingBatches(docsToEmbed, maxDocsPerBatch, maxBatchBytes);
|
|
1251
|
+
for (const batchMeta of batches) {
|
|
1252
|
+
// Abort early if session has been invalidated
|
|
1253
|
+
if (!session.isValid) {
|
|
1254
|
+
console.warn(`⚠ Session expired — skipping remaining document batches`);
|
|
1255
|
+
break;
|
|
1256
|
+
}
|
|
1257
|
+
const batchDocs = getEmbeddingDocsForBatch(db, batchMeta);
|
|
1258
|
+
const batchChunks = [];
|
|
1259
|
+
const expectedChunksByHash = new Map();
|
|
1260
|
+
const batchBytes = batchMeta.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0);
|
|
1261
|
+
for (const doc of batchDocs) {
|
|
1262
|
+
if (!doc.body.trim())
|
|
1263
|
+
continue;
|
|
1264
|
+
const title = extractTitle(doc.body, doc.path);
|
|
1265
|
+
const chunks = await chunkDocumentByTokens(doc.body, undefined, undefined, undefined, doc.path, options?.chunkStrategy, session.signal);
|
|
1266
|
+
for (let seq = 0; seq < chunks.length; seq++) {
|
|
1267
|
+
batchChunks.push({
|
|
1268
|
+
hash: doc.hash,
|
|
1269
|
+
path: doc.path,
|
|
1270
|
+
title,
|
|
1271
|
+
text: chunks[seq].text,
|
|
1272
|
+
seq,
|
|
1273
|
+
pos: chunks[seq].pos,
|
|
1274
|
+
tokens: chunks[seq].tokens,
|
|
1275
|
+
bytes: encoder.encode(chunks[seq].text).length,
|
|
1276
|
+
expectedTotalChunks: chunks.length,
|
|
1277
|
+
});
|
|
1278
|
+
}
|
|
1279
|
+
expectedChunksByHash.set(doc.hash, chunks.length);
|
|
1280
|
+
}
|
|
1281
|
+
totalChunks += batchChunks.length;
|
|
1282
|
+
if (batchChunks.length === 0) {
|
|
1283
|
+
bytesProcessed += batchBytes;
|
|
1284
|
+
options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors: activeErrorCount(), failures: failureList() });
|
|
1285
|
+
continue;
|
|
1286
|
+
}
|
|
1287
|
+
if (!vectorTableInitialized) {
|
|
1288
|
+
const firstChunk = batchChunks[0];
|
|
1289
|
+
const firstText = formatDocForEmbedding(firstChunk.text, firstChunk.title, embedModelUri);
|
|
1290
|
+
const firstResult = await session.embed(firstText, { model });
|
|
1291
|
+
if (!firstResult) {
|
|
1292
|
+
throw new Error("Failed to get embedding dimensions from first chunk");
|
|
1293
|
+
}
|
|
1294
|
+
store.ensureVecTable(firstResult.embedding.length);
|
|
1295
|
+
vectorTableInitialized = true;
|
|
1296
|
+
}
|
|
1297
|
+
const totalBatchChunkBytes = batchChunks.reduce((sum, chunk) => sum + chunk.bytes, 0);
|
|
1298
|
+
let batchChunkBytesProcessed = 0;
|
|
1299
|
+
for (let batchStart = 0; batchStart < batchChunks.length; batchStart += BATCH_SIZE) {
|
|
1300
|
+
// Abort early if session has been invalidated (e.g. max duration exceeded)
|
|
1301
|
+
if (!session.isValid) {
|
|
1302
|
+
const remainingChunks = batchChunks.slice(batchStart);
|
|
1303
|
+
for (const chunk of remainingChunks)
|
|
1304
|
+
recordFailure(chunk, "LLM session expired before embedding chunk");
|
|
1305
|
+
console.warn(`⚠ Session expired — skipping ${remainingChunks.length} remaining chunks`);
|
|
1306
|
+
break;
|
|
1307
|
+
}
|
|
1308
|
+
// Abort early if active error rate is too high (>80% of attempted chunks failed)
|
|
1309
|
+
const processed = chunksEmbedded + activeErrorCount();
|
|
1310
|
+
if (processed >= BATCH_SIZE && activeErrorCount() > processed * 0.8) {
|
|
1311
|
+
const remainingChunks = batchChunks.slice(batchStart);
|
|
1312
|
+
for (const chunk of remainingChunks)
|
|
1313
|
+
recordFailure(chunk, "embedding aborted because error rate was too high");
|
|
1314
|
+
console.warn(`⚠ Error rate too high (${activeErrorCount()}/${processed}) — aborting embedding`);
|
|
1315
|
+
break;
|
|
1316
|
+
}
|
|
1317
|
+
const batchEnd = Math.min(batchStart + BATCH_SIZE, batchChunks.length);
|
|
1318
|
+
const chunkBatch = batchChunks.slice(batchStart, batchEnd);
|
|
1319
|
+
const texts = chunkBatch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title, embedModelUri));
|
|
1320
|
+
try {
|
|
1321
|
+
const embeddings = await session.embedBatch(texts, { model });
|
|
1322
|
+
for (let i = 0; i < chunkBatch.length; i++) {
|
|
1323
|
+
const chunk = chunkBatch[i];
|
|
1324
|
+
const embedding = embeddings[i];
|
|
1325
|
+
if (embedding) {
|
|
1326
|
+
insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now, chunk.expectedTotalChunks, fingerprint);
|
|
955
1327
|
chunksEmbedded++;
|
|
1328
|
+
successesSinceRetry++;
|
|
1329
|
+
clearFailure(chunk);
|
|
956
1330
|
}
|
|
957
1331
|
else {
|
|
958
|
-
|
|
1332
|
+
recordFailure(chunk, "batch embedding returned no vector");
|
|
959
1333
|
}
|
|
1334
|
+
batchChunkBytesProcessed += chunk.bytes;
|
|
1335
|
+
}
|
|
1336
|
+
await retryFailedChunks();
|
|
1337
|
+
}
|
|
1338
|
+
catch (error) {
|
|
1339
|
+
// Batch failed — try individual embeddings as fallback. If an
|
|
1340
|
+
// individual retry succeeds, any prior failure for that chunk is
|
|
1341
|
+
// cleared, so the visible error count reflects outstanding failures.
|
|
1342
|
+
const batchReason = reasonFromError(error);
|
|
1343
|
+
if (!session.isValid) {
|
|
1344
|
+
for (const chunk of chunkBatch)
|
|
1345
|
+
recordFailure(chunk, `batch failed and session expired: ${batchReason}`);
|
|
1346
|
+
batchChunkBytesProcessed += chunkBatch.reduce((sum, c) => sum + c.bytes, 0);
|
|
960
1347
|
}
|
|
961
|
-
|
|
962
|
-
|
|
1348
|
+
else {
|
|
1349
|
+
for (const chunk of chunkBatch) {
|
|
1350
|
+
await tryEmbedChunk(chunk);
|
|
1351
|
+
batchChunkBytesProcessed += chunk.bytes;
|
|
1352
|
+
await retryFailedChunks();
|
|
1353
|
+
}
|
|
963
1354
|
}
|
|
964
|
-
bytesProcessed += chunk.bytes;
|
|
965
1355
|
}
|
|
1356
|
+
const proportionalBytes = totalBatchChunkBytes === 0
|
|
1357
|
+
? batchBytes
|
|
1358
|
+
: Math.min(batchBytes, Math.round((batchChunkBytesProcessed / totalBatchChunkBytes) * batchBytes));
|
|
1359
|
+
options?.onProgress?.({
|
|
1360
|
+
chunksEmbedded,
|
|
1361
|
+
totalChunks,
|
|
1362
|
+
bytesProcessed: bytesProcessed + proportionalBytes,
|
|
1363
|
+
totalBytes,
|
|
1364
|
+
errors: activeErrorCount(),
|
|
1365
|
+
failures: failureList(),
|
|
1366
|
+
});
|
|
1367
|
+
}
|
|
1368
|
+
await retryFailedChunks(true);
|
|
1369
|
+
const removedPartialChunks = removeIncompleteEmbeddings(db, expectedChunksByHash, model);
|
|
1370
|
+
if (removedPartialChunks > 0) {
|
|
1371
|
+
chunksEmbedded = Math.max(0, chunksEmbedded - removedPartialChunks);
|
|
966
1372
|
}
|
|
967
|
-
|
|
1373
|
+
bytesProcessed += batchBytes;
|
|
1374
|
+
options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors: activeErrorCount(), failures: failureList() });
|
|
968
1375
|
}
|
|
969
|
-
return { chunksEmbedded, errors };
|
|
970
|
-
},
|
|
1376
|
+
return { chunksEmbedded, errors: activeErrorCount(), failures: failureList() };
|
|
1377
|
+
}, { maxDuration: 30 * 60 * 1000, name: 'generateEmbeddings' });
|
|
971
1378
|
return {
|
|
972
1379
|
docsProcessed: totalDocs,
|
|
973
1380
|
chunksEmbedded: result.chunksEmbedded,
|
|
974
1381
|
errors: result.errors,
|
|
1382
|
+
failures: result.failures,
|
|
975
1383
|
durationMs: Date.now() - startTime,
|
|
976
1384
|
};
|
|
977
1385
|
}
|
|
@@ -992,9 +1400,9 @@ export function createStore(dbPath) {
|
|
|
992
1400
|
close: () => db.close(),
|
|
993
1401
|
ensureVecTable: (dimensions) => ensureVecTableInternal(db, dimensions),
|
|
994
1402
|
// Index health
|
|
995
|
-
getHashesNeedingEmbedding: () => getHashesNeedingEmbedding(db),
|
|
996
|
-
getIndexHealth: () => getIndexHealth(db),
|
|
997
|
-
getStatus: () => getStatus(db),
|
|
1403
|
+
getHashesNeedingEmbedding: (model) => getHashesNeedingEmbedding(db, undefined, model ?? store.llm?.embedModelName ?? DEFAULT_EMBED_MODEL),
|
|
1404
|
+
getIndexHealth: (model) => getIndexHealth(db, model ?? store.llm?.embedModelName ?? DEFAULT_EMBED_MODEL),
|
|
1405
|
+
getStatus: (model) => getStatus(db, model ?? store.llm?.embedModelName ?? DEFAULT_EMBED_MODEL),
|
|
998
1406
|
// Caching
|
|
999
1407
|
getCacheKey,
|
|
1000
1408
|
getCachedResult: (cacheKey) => getCachedResult(db, cacheKey),
|
|
@@ -1022,8 +1430,8 @@ export function createStore(dbPath) {
|
|
|
1022
1430
|
searchFTS: (query, limit, collectionName) => searchFTS(db, query, limit, collectionName),
|
|
1023
1431
|
searchVec: (query, model, limit, collectionName, session, precomputedEmbedding) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding),
|
|
1024
1432
|
// Query expansion & reranking
|
|
1025
|
-
expandQuery: (query, model, intent) => expandQuery(query, model, db, intent, store.llm),
|
|
1026
|
-
rerank: (query, documents, model, intent) => rerank(query, documents, model, db, intent, store.llm),
|
|
1433
|
+
expandQuery: (query, model, intent) => expandQuery(query, model ?? store.llm?.generateModelName ?? DEFAULT_QUERY_MODEL, db, intent, store.llm),
|
|
1434
|
+
rerank: (query, documents, model, intent) => rerank(query, documents, model ?? store.llm?.rerankModelName ?? DEFAULT_RERANK_MODEL, db, intent, store.llm),
|
|
1027
1435
|
// Document retrieval
|
|
1028
1436
|
findDocument: (filename, options) => findDocument(db, filename, options),
|
|
1029
1437
|
getDocumentBody: (doc, fromLine, maxLines) => getDocumentBody(db, doc, fromLine, maxLines),
|
|
@@ -1036,6 +1444,7 @@ export function createStore(dbPath) {
|
|
|
1036
1444
|
insertContent: (hash, content, createdAt) => insertContent(db, hash, content, createdAt),
|
|
1037
1445
|
insertDocument: (collectionName, path, title, hash, createdAt, modifiedAt) => insertDocument(db, collectionName, path, title, hash, createdAt, modifiedAt),
|
|
1038
1446
|
findActiveDocument: (collectionName, path) => findActiveDocument(db, collectionName, path),
|
|
1447
|
+
findOrMigrateLegacyDocument: (collectionName, path) => findOrMigrateLegacyDocument(db, collectionName, path),
|
|
1039
1448
|
updateDocumentTitle: (documentId, title, modifiedAt) => updateDocumentTitle(db, documentId, title, modifiedAt),
|
|
1040
1449
|
updateDocument: (documentId, title, hash, modifiedAt) => updateDocument(db, documentId, title, hash, modifiedAt),
|
|
1041
1450
|
deactivateDocument: (collectionName, path) => deactivateDocument(db, collectionName, path),
|
|
@@ -1043,7 +1452,7 @@ export function createStore(dbPath) {
|
|
|
1043
1452
|
// Vector/embedding operations
|
|
1044
1453
|
getHashesForEmbedding: () => getHashesForEmbedding(db),
|
|
1045
1454
|
clearAllEmbeddings: () => clearAllEmbeddings(db),
|
|
1046
|
-
insertEmbedding: (hash, seq, pos, embedding, model, embeddedAt) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt),
|
|
1455
|
+
insertEmbedding: (hash, seq, pos, embedding, model, embeddedAt, totalChunks, fingerprint) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt, totalChunks, fingerprint),
|
|
1047
1456
|
};
|
|
1048
1457
|
return store;
|
|
1049
1458
|
}
|
|
@@ -1056,11 +1465,11 @@ export function getDocid(hash) {
|
|
|
1056
1465
|
/**
|
|
1057
1466
|
* Handelize a filename to be more token-friendly.
|
|
1058
1467
|
* - Convert triple underscore `___` to `/` (folder separator)
|
|
1059
|
-
* - Convert to lowercase
|
|
1060
1468
|
* - Replace sequences of non-word chars (except /) with single dash
|
|
1061
1469
|
* - Remove leading/trailing dashes from path segments
|
|
1062
1470
|
* - Preserve folder structure (a/b/c/d.md stays structured)
|
|
1063
1471
|
* - Preserve file extension
|
|
1472
|
+
* - Preserve original case (important for case-sensitive filesystems)
|
|
1064
1473
|
*/
|
|
1065
1474
|
/** Replace emoji/symbol codepoints with their hex representation (e.g. 🐘 → 1f418) */
|
|
1066
1475
|
function emojiToHex(str) {
|
|
@@ -1085,7 +1494,6 @@ export function handelize(path) {
|
|
|
1085
1494
|
}
|
|
1086
1495
|
const result = path
|
|
1087
1496
|
.replace(/___/g, '/') // Triple underscore becomes folder separator
|
|
1088
|
-
.toLowerCase()
|
|
1089
1497
|
.split('/')
|
|
1090
1498
|
.map((segment, idx, arr) => {
|
|
1091
1499
|
const isLastSegment = idx === arr.length - 1;
|
|
@@ -1097,7 +1505,7 @@ export function handelize(path) {
|
|
|
1097
1505
|
const ext = extMatch ? extMatch[1] : '';
|
|
1098
1506
|
const nameWithoutExt = ext ? segment.slice(0, -ext.length) : segment;
|
|
1099
1507
|
const cleanedName = nameWithoutExt
|
|
1100
|
-
.replace(/[^\p{L}\p{N}$]+/gu, '-') // Keep
|
|
1508
|
+
.replace(/[^\p{L}\p{N}$]+/gu, '-') // Keep letters, numbers, "$"; dash-separate rest (including dots)
|
|
1101
1509
|
.replace(/^-+|-+$/g, ''); // Remove leading/trailing dashes
|
|
1102
1510
|
return cleanedName + ext;
|
|
1103
1511
|
}
|
|
@@ -1118,17 +1526,85 @@ export function handelize(path) {
|
|
|
1118
1526
|
// =============================================================================
|
|
1119
1527
|
// Index health
|
|
1120
1528
|
// =============================================================================
|
|
1121
|
-
export function getHashesNeedingEmbedding(db) {
|
|
1122
|
-
const
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
|
|
1529
|
+
export function getHashesNeedingEmbedding(db, collection, model = DEFAULT_EMBED_MODEL) {
|
|
1530
|
+
const collectionFilter = collection ? `AND d.collection = ?` : ``;
|
|
1531
|
+
const fingerprint = getEmbeddingFingerprint(model);
|
|
1532
|
+
return withLazyContentVectorMigration(db, () => {
|
|
1533
|
+
const stmt = db.prepare(`
|
|
1534
|
+
SELECT COUNT(DISTINCT d.hash) as count
|
|
1535
|
+
FROM documents d
|
|
1536
|
+
LEFT JOIN (
|
|
1537
|
+
SELECT hash, model, COUNT(*) AS chunk_count, MAX(total_chunks) AS expected_chunks
|
|
1538
|
+
FROM content_vectors
|
|
1539
|
+
WHERE model = ? AND embed_fingerprint = ?
|
|
1540
|
+
GROUP BY hash, model, embed_fingerprint
|
|
1541
|
+
) v ON d.hash = v.hash
|
|
1542
|
+
WHERE d.active = 1
|
|
1543
|
+
AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
|
|
1544
|
+
${collectionFilter}
|
|
1545
|
+
`);
|
|
1546
|
+
const result = (collection ? stmt.get(model, fingerprint, collection) : stmt.get(model, fingerprint));
|
|
1547
|
+
return result.count;
|
|
1548
|
+
});
|
|
1129
1549
|
}
|
|
1130
|
-
export function
|
|
1131
|
-
const
|
|
1550
|
+
export async function maybeAdoptLegacyEmbeddingFingerprint(store, model = DEFAULT_EMBED_MODEL) {
|
|
1551
|
+
const db = store.db;
|
|
1552
|
+
const fingerprint = getEmbeddingFingerprint(model);
|
|
1553
|
+
const legacyCount = withLazyContentVectorMigration(db, () => {
|
|
1554
|
+
const row = db.prepare(`SELECT COUNT(DISTINCT hash) AS count FROM content_vectors WHERE model = ? AND embed_fingerprint = ''`).get(model);
|
|
1555
|
+
return row.count;
|
|
1556
|
+
});
|
|
1557
|
+
if (legacyCount === 0) {
|
|
1558
|
+
return { checked: false, adopted: 0, reason: "no legacy empty-fingerprint embeddings" };
|
|
1559
|
+
}
|
|
1560
|
+
const sample = withLazyContentVectorMigration(db, () => db.prepare(`
|
|
1561
|
+
SELECT cv.hash, cv.seq, cv.pos, cv.total_chunks, c.doc AS body, MIN(d.path) AS path
|
|
1562
|
+
FROM content_vectors cv
|
|
1563
|
+
JOIN documents d ON d.hash = cv.hash AND d.active = 1
|
|
1564
|
+
JOIN content c ON c.hash = cv.hash
|
|
1565
|
+
WHERE cv.model = ? AND cv.embed_fingerprint = ''
|
|
1566
|
+
GROUP BY cv.hash, cv.seq, cv.pos, cv.total_chunks, c.doc
|
|
1567
|
+
ORDER BY cv.hash, cv.seq
|
|
1568
|
+
LIMIT 1
|
|
1569
|
+
`).get(model));
|
|
1570
|
+
if (!sample) {
|
|
1571
|
+
return { checked: false, adopted: 0, reason: `${legacyCount} legacy docs have no active sample` };
|
|
1572
|
+
}
|
|
1573
|
+
const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
|
|
1574
|
+
if (!tableExists) {
|
|
1575
|
+
return { checked: false, adopted: 0, reason: "vectors_vec table is missing" };
|
|
1576
|
+
}
|
|
1577
|
+
const expectedHashSeq = `${sample.hash}_${sample.seq}`;
|
|
1578
|
+
const title = extractTitle(sample.body, sample.path);
|
|
1579
|
+
const llm = getLlm(store);
|
|
1580
|
+
return await withLLMSessionForLlm(llm, async (session) => {
|
|
1581
|
+
const chunks = await chunkDocumentByTokens(sample.body, undefined, undefined, undefined, sample.path, undefined, session.signal);
|
|
1582
|
+
const chunk = chunks[sample.seq];
|
|
1583
|
+
if (!chunk) {
|
|
1584
|
+
return { checked: true, adopted: 0, reason: `sample chunk ${expectedHashSeq} no longer exists` };
|
|
1585
|
+
}
|
|
1586
|
+
const result = await session.embed(formatDocForEmbedding(chunk.text, title, model), { model });
|
|
1587
|
+
if (!result) {
|
|
1588
|
+
return { checked: true, adopted: 0, reason: "failed to embed legacy sample" };
|
|
1589
|
+
}
|
|
1590
|
+
const nearest = db.prepare(`
|
|
1591
|
+
SELECT hash_seq, distance
|
|
1592
|
+
FROM vectors_vec
|
|
1593
|
+
WHERE embedding MATCH ? AND k = 1
|
|
1594
|
+
`).get(new Float32Array(result.embedding));
|
|
1595
|
+
if (!nearest) {
|
|
1596
|
+
return { checked: true, adopted: 0, reason: "legacy sample vector not found" };
|
|
1597
|
+
}
|
|
1598
|
+
const threshold = 0.0001;
|
|
1599
|
+
if (nearest.hash_seq !== expectedHashSeq || nearest.distance > threshold) {
|
|
1600
|
+
return { checked: true, adopted: 0, reason: `legacy sample differs from current fingerprint (nearest ${nearest.hash_seq}, distance ${nearest.distance.toFixed(6)})` };
|
|
1601
|
+
}
|
|
1602
|
+
const update = withLazyContentVectorMigration(db, () => db.prepare(`UPDATE content_vectors SET embed_fingerprint = ? WHERE model = ? AND embed_fingerprint = ''`).run(fingerprint, model));
|
|
1603
|
+
return { checked: true, adopted: update.changes, reason: `sample ${expectedHashSeq} matched current fingerprint at distance ${nearest.distance.toFixed(6)}` };
|
|
1604
|
+
});
|
|
1605
|
+
}
|
|
1606
|
+
export function getIndexHealth(db, model = DEFAULT_EMBED_MODEL) {
|
|
1607
|
+
const needsEmbedding = getHashesNeedingEmbedding(db, undefined, model);
|
|
1132
1608
|
const totalDocs = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get().count;
|
|
1133
1609
|
const mostRecent = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get();
|
|
1134
1610
|
let daysStale = null;
|
|
@@ -1181,13 +1657,15 @@ export function deleteInactiveDocuments(db) {
|
|
|
1181
1657
|
return result.changes;
|
|
1182
1658
|
}
|
|
1183
1659
|
/**
|
|
1184
|
-
* Remove orphaned content hashes that are not referenced by any
|
|
1660
|
+
* Remove orphaned content hashes that are not referenced by any document.
|
|
1661
|
+
* Inactive documents are soft-deleted tombstones, so their content rows must
|
|
1662
|
+
* remain referenced until deleteInactiveDocuments() hard-deletes them.
|
|
1185
1663
|
* Returns the number of orphaned content hashes deleted.
|
|
1186
1664
|
*/
|
|
1187
1665
|
export function cleanupOrphanedContent(db) {
|
|
1188
1666
|
const result = db.prepare(`
|
|
1189
1667
|
DELETE FROM content
|
|
1190
|
-
WHERE hash NOT IN (SELECT DISTINCT hash FROM documents
|
|
1668
|
+
WHERE hash NOT IN (SELECT DISTINCT hash FROM documents)
|
|
1191
1669
|
`).run();
|
|
1192
1670
|
return result.changes;
|
|
1193
1671
|
}
|
|
@@ -1196,39 +1674,50 @@ export function cleanupOrphanedContent(db) {
|
|
|
1196
1674
|
* Returns the number of orphaned embedding chunks deleted.
|
|
1197
1675
|
*/
|
|
1198
1676
|
export function cleanupOrphanedVectors(db) {
|
|
1199
|
-
//
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
if (!tableExists) {
|
|
1677
|
+
// sqlite-vec may not be loaded (e.g. Bun's bun:sqlite lacks loadExtension).
|
|
1678
|
+
// The vectors_vec virtual table can appear in sqlite_master from a prior
|
|
1679
|
+
// session, but querying it without the vec0 module loaded will crash (#380).
|
|
1680
|
+
if (!isSqliteVecAvailable()) {
|
|
1204
1681
|
return 0;
|
|
1205
1682
|
}
|
|
1206
|
-
//
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1683
|
+
// The schema entry can exist even when sqlite-vec itself is unavailable
|
|
1684
|
+
// (for example when reopening a DB without vec0 loaded). In that case,
|
|
1685
|
+
// touching the virtual table throws "no such module: vec0" and cleanup
|
|
1686
|
+
// should degrade gracefully like the rest of the vector features.
|
|
1687
|
+
try {
|
|
1688
|
+
db.prepare(`SELECT 1 FROM vectors_vec LIMIT 0`).get();
|
|
1689
|
+
}
|
|
1690
|
+
catch {
|
|
1214
1691
|
return 0;
|
|
1215
1692
|
}
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
SELECT
|
|
1693
|
+
return withLazyContentVectorMigration(db, () => {
|
|
1694
|
+
// Count orphaned vectors first
|
|
1695
|
+
const countResult = db.prepare(`
|
|
1696
|
+
SELECT COUNT(*) as c FROM content_vectors cv
|
|
1220
1697
|
WHERE NOT EXISTS (
|
|
1221
1698
|
SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
|
|
1222
1699
|
)
|
|
1223
|
-
)
|
|
1224
|
-
|
|
1225
|
-
|
|
1226
|
-
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1230
|
-
|
|
1231
|
-
|
|
1700
|
+
`).get();
|
|
1701
|
+
if (countResult.c === 0) {
|
|
1702
|
+
return 0;
|
|
1703
|
+
}
|
|
1704
|
+
// Delete from vectors_vec first
|
|
1705
|
+
db.exec(`
|
|
1706
|
+
DELETE FROM vectors_vec WHERE hash_seq IN (
|
|
1707
|
+
SELECT cv.hash || '_' || cv.seq FROM content_vectors cv
|
|
1708
|
+
WHERE NOT EXISTS (
|
|
1709
|
+
SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
|
|
1710
|
+
)
|
|
1711
|
+
)
|
|
1712
|
+
`);
|
|
1713
|
+
// Delete from content_vectors
|
|
1714
|
+
db.exec(`
|
|
1715
|
+
DELETE FROM content_vectors WHERE hash NOT IN (
|
|
1716
|
+
SELECT hash FROM documents WHERE active = 1
|
|
1717
|
+
)
|
|
1718
|
+
`);
|
|
1719
|
+
return countResult.c;
|
|
1720
|
+
});
|
|
1232
1721
|
}
|
|
1233
1722
|
/**
|
|
1234
1723
|
* Run VACUUM to reclaim unused space in the database.
|
|
@@ -1290,6 +1779,21 @@ export function insertContent(db, hash, content, createdAt) {
|
|
|
1290
1779
|
db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`)
|
|
1291
1780
|
.run(hash, content, createdAt);
|
|
1292
1781
|
}
|
|
1782
|
+
function rebuildDocumentFTS(db, documentId) {
|
|
1783
|
+
const row = db.prepare(`
|
|
1784
|
+
SELECT d.id, d.collection, d.path, d.title, content.doc as body
|
|
1785
|
+
FROM documents d
|
|
1786
|
+
JOIN content ON content.hash = d.hash
|
|
1787
|
+
WHERE d.id = ? AND d.active = 1
|
|
1788
|
+
`).get(documentId);
|
|
1789
|
+
db.prepare(`DELETE FROM documents_fts WHERE rowid = ?`).run(documentId);
|
|
1790
|
+
if (!row)
|
|
1791
|
+
return;
|
|
1792
|
+
db.prepare(`
|
|
1793
|
+
INSERT INTO documents_fts(rowid, filepath, title, body)
|
|
1794
|
+
VALUES (?, ?, ?, ?)
|
|
1795
|
+
`).run(row.id, normalizeCjkForFTS(`${row.collection}/${row.path}`), normalizeCjkForFTS(row.title), normalizeCjkForFTS(row.body));
|
|
1796
|
+
}
|
|
1293
1797
|
/**
|
|
1294
1798
|
* Insert a new document into the documents table.
|
|
1295
1799
|
*/
|
|
@@ -1303,6 +1807,9 @@ export function insertDocument(db, collectionName, path, title, hash, createdAt,
|
|
|
1303
1807
|
modified_at = excluded.modified_at,
|
|
1304
1808
|
active = 1
|
|
1305
1809
|
`).run(collectionName, path, title, hash, createdAt, modifiedAt);
|
|
1810
|
+
const row = db.prepare(`SELECT id FROM documents WHERE collection = ? AND path = ?`).get(collectionName, path);
|
|
1811
|
+
if (row)
|
|
1812
|
+
rebuildDocumentFTS(db, row.id);
|
|
1306
1813
|
}
|
|
1307
1814
|
/**
|
|
1308
1815
|
* Find an active document by collection name and path.
|
|
@@ -1314,12 +1821,48 @@ export function findActiveDocument(db, collectionName, path) {
|
|
|
1314
1821
|
`).get(collectionName, path);
|
|
1315
1822
|
return row ?? null;
|
|
1316
1823
|
}
|
|
1824
|
+
/**
|
|
1825
|
+
* Find an active document, falling back to a case-insensitive path match.
|
|
1826
|
+
* If found under a different casing, renames it in-place and rebuilds the
|
|
1827
|
+
* FTS entry. Embeddings are keyed by content hash, so the rename is
|
|
1828
|
+
* safe — no re-embedding required.
|
|
1829
|
+
*
|
|
1830
|
+
* @internal Used by reindexCollection and indexFiles during qmd update.
|
|
1831
|
+
* Returns null if the document does not exist under either path.
|
|
1832
|
+
*/
|
|
1833
|
+
export function findOrMigrateLegacyDocument(db, collectionName, path) {
|
|
1834
|
+
const existing = findActiveDocument(db, collectionName, path);
|
|
1835
|
+
if (existing)
|
|
1836
|
+
return existing;
|
|
1837
|
+
const legacy = db.prepare(`
|
|
1838
|
+
SELECT id, hash, title FROM documents
|
|
1839
|
+
WHERE collection = ? AND path COLLATE NOCASE = ? AND active = 1
|
|
1840
|
+
ORDER BY id
|
|
1841
|
+
LIMIT 1
|
|
1842
|
+
`).get(collectionName, path);
|
|
1843
|
+
if (!legacy)
|
|
1844
|
+
return null;
|
|
1845
|
+
// Wrap rename + FTS rebuild in a transaction for atomicity.
|
|
1846
|
+
const migrate = db.transaction(() => {
|
|
1847
|
+
// Use OR IGNORE so a UNIQUE conflict (e.g. both "readme.md" and
|
|
1848
|
+
// "README.md" already exist) is a no-op rather than crashing.
|
|
1849
|
+
const result = db.prepare(`UPDATE OR IGNORE documents SET path = ? WHERE id = ? AND active = 1`).run(path, legacy.id);
|
|
1850
|
+
if (result.changes === 0)
|
|
1851
|
+
return false;
|
|
1852
|
+
rebuildDocumentFTS(db, legacy.id);
|
|
1853
|
+
return true;
|
|
1854
|
+
});
|
|
1855
|
+
if (!migrate())
|
|
1856
|
+
return null;
|
|
1857
|
+
return findActiveDocument(db, collectionName, path);
|
|
1858
|
+
}
|
|
1317
1859
|
/**
|
|
1318
1860
|
* Update the title and modified_at timestamp for a document.
|
|
1319
1861
|
*/
|
|
1320
1862
|
export function updateDocumentTitle(db, documentId, title, modifiedAt) {
|
|
1321
1863
|
db.prepare(`UPDATE documents SET title = ?, modified_at = ? WHERE id = ?`)
|
|
1322
1864
|
.run(title, modifiedAt, documentId);
|
|
1865
|
+
rebuildDocumentFTS(db, documentId);
|
|
1323
1866
|
}
|
|
1324
1867
|
/**
|
|
1325
1868
|
* Update an existing document's hash, title, and modified_at timestamp.
|
|
@@ -1328,6 +1871,7 @@ export function updateDocumentTitle(db, documentId, title, modifiedAt) {
|
|
|
1328
1871
|
export function updateDocument(db, documentId, title, hash, modifiedAt) {
|
|
1329
1872
|
db.prepare(`UPDATE documents SET title = ?, hash = ?, modified_at = ? WHERE id = ?`)
|
|
1330
1873
|
.run(title, hash, modifiedAt, documentId);
|
|
1874
|
+
rebuildDocumentFTS(db, documentId);
|
|
1331
1875
|
}
|
|
1332
1876
|
/**
|
|
1333
1877
|
* Deactivate a document (mark as inactive but don't delete).
|
|
@@ -1346,52 +1890,44 @@ export function getActiveDocumentPaths(db, collectionName) {
|
|
|
1346
1890
|
return rows.map(r => r.path);
|
|
1347
1891
|
}
|
|
1348
1892
|
export { formatQueryForEmbedding, formatDocForEmbedding };
|
|
1893
|
+
/**
|
|
1894
|
+
* Chunk a document using regex-only break point detection.
|
|
1895
|
+
* This is the sync, backward-compatible API used by tests and legacy callers.
|
|
1896
|
+
*/
|
|
1349
1897
|
export function chunkDocument(content, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS) {
|
|
1350
|
-
if (content.length <= maxChars) {
|
|
1351
|
-
return [{ text: content, pos: 0 }];
|
|
1352
|
-
}
|
|
1353
|
-
// Pre-scan all break points and code fences once
|
|
1354
1898
|
const breakPoints = scanBreakPoints(content);
|
|
1355
1899
|
const codeFences = findCodeFences(content);
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
|
|
1361
|
-
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
|
|
1365
|
-
|
|
1366
|
-
|
|
1367
|
-
|
|
1368
|
-
|
|
1369
|
-
|
|
1370
|
-
|
|
1371
|
-
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
|
|
1375
|
-
chunks.push({ text: content.slice(charPos, endPos), pos: charPos });
|
|
1376
|
-
// Move forward, but overlap with previous chunk
|
|
1377
|
-
// For last chunk, don't overlap (just go to the end)
|
|
1378
|
-
if (endPos >= content.length) {
|
|
1379
|
-
break;
|
|
1380
|
-
}
|
|
1381
|
-
charPos = endPos - overlapChars;
|
|
1382
|
-
const lastChunkPos = chunks.at(-1).pos;
|
|
1383
|
-
if (charPos <= lastChunkPos) {
|
|
1384
|
-
// Prevent infinite loop - move forward at least a bit
|
|
1385
|
-
charPos = endPos;
|
|
1900
|
+
return chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars, overlapChars, windowChars);
|
|
1901
|
+
}
|
|
1902
|
+
/**
|
|
1903
|
+
* Async AST-aware chunking. Detects language from filepath, computes AST
|
|
1904
|
+
* break points for supported code files, merges with regex break points,
|
|
1905
|
+
* and delegates to the shared chunk algorithm.
|
|
1906
|
+
*
|
|
1907
|
+
* Falls back to regex-only when strategy is "regex", filepath is absent,
|
|
1908
|
+
* or language is unsupported.
|
|
1909
|
+
*/
|
|
1910
|
+
export async function chunkDocumentAsync(content, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS, filepath, chunkStrategy = "regex") {
|
|
1911
|
+
const regexPoints = scanBreakPoints(content);
|
|
1912
|
+
const codeFences = findCodeFences(content);
|
|
1913
|
+
let breakPoints = regexPoints;
|
|
1914
|
+
if (chunkStrategy === "auto" && filepath) {
|
|
1915
|
+
const { getASTBreakPoints } = await import("./ast.js");
|
|
1916
|
+
const astPoints = await getASTBreakPoints(content, filepath);
|
|
1917
|
+
if (astPoints.length > 0) {
|
|
1918
|
+
breakPoints = mergeBreakPoints(regexPoints, astPoints);
|
|
1386
1919
|
}
|
|
1387
1920
|
}
|
|
1388
|
-
return
|
|
1921
|
+
return chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars, overlapChars, windowChars);
|
|
1389
1922
|
}
|
|
1390
1923
|
/**
|
|
1391
1924
|
* Chunk a document by actual token count using the LLM tokenizer.
|
|
1392
1925
|
* More accurate than character-based chunking but requires async.
|
|
1926
|
+
*
|
|
1927
|
+
* When filepath and chunkStrategy are provided, uses AST-aware break points
|
|
1928
|
+
* for supported code files.
|
|
1393
1929
|
*/
|
|
1394
|
-
export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKENS, overlapTokens = CHUNK_OVERLAP_TOKENS, windowTokens = CHUNK_WINDOW_TOKENS) {
|
|
1930
|
+
export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKENS, overlapTokens = CHUNK_OVERLAP_TOKENS, windowTokens = CHUNK_WINDOW_TOKENS, filepath, chunkStrategy = "regex", signal) {
|
|
1395
1931
|
const llm = getDefaultLlamaCpp();
|
|
1396
1932
|
// Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3)
|
|
1397
1933
|
// If chunks exceed limit, they'll be re-split with actual ratio
|
|
@@ -1400,29 +1936,58 @@ export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKE
|
|
|
1400
1936
|
const overlapChars = overlapTokens * avgCharsPerToken;
|
|
1401
1937
|
const windowChars = windowTokens * avgCharsPerToken;
|
|
1402
1938
|
// Chunk in character space with conservative estimate
|
|
1403
|
-
|
|
1939
|
+
// Use AST-aware chunking for the first pass when filepath/strategy provided
|
|
1940
|
+
let charChunks = await chunkDocumentAsync(content, maxChars, overlapChars, windowChars, filepath, chunkStrategy);
|
|
1404
1941
|
// Tokenize and split any chunks that still exceed limit
|
|
1405
1942
|
const results = [];
|
|
1406
|
-
|
|
1407
|
-
|
|
1408
|
-
|
|
1409
|
-
|
|
1943
|
+
const clampOverlapChars = (value, maxChars) => {
|
|
1944
|
+
if (maxChars <= 1)
|
|
1945
|
+
return 0;
|
|
1946
|
+
return Math.max(0, Math.min(maxChars - 1, Math.floor(value)));
|
|
1947
|
+
};
|
|
1948
|
+
const pushChunkWithinTokenLimit = async (text, pos) => {
|
|
1949
|
+
if (signal?.aborted)
|
|
1950
|
+
return;
|
|
1951
|
+
const tokens = await llm.tokenize(text);
|
|
1952
|
+
if (tokens.length <= maxTokens || text.length <= 1) {
|
|
1953
|
+
results.push({ text, pos, tokens: tokens.length });
|
|
1954
|
+
return;
|
|
1410
1955
|
}
|
|
1411
|
-
|
|
1412
|
-
|
|
1413
|
-
|
|
1414
|
-
|
|
1415
|
-
|
|
1416
|
-
|
|
1417
|
-
|
|
1418
|
-
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
|
|
1956
|
+
const actualCharsPerToken = text.length / tokens.length;
|
|
1957
|
+
let safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95);
|
|
1958
|
+
if (!Number.isFinite(safeMaxChars) || safeMaxChars < 1) {
|
|
1959
|
+
safeMaxChars = Math.floor(text.length / 2);
|
|
1960
|
+
}
|
|
1961
|
+
safeMaxChars = Math.max(1, Math.min(text.length - 1, safeMaxChars));
|
|
1962
|
+
let nextOverlapChars = clampOverlapChars(overlapChars * actualCharsPerToken / 2, safeMaxChars);
|
|
1963
|
+
let nextWindowChars = Math.max(0, Math.floor(windowChars * actualCharsPerToken / 2));
|
|
1964
|
+
let subChunks = chunkDocument(text, safeMaxChars, nextOverlapChars, nextWindowChars);
|
|
1965
|
+
// Pathological single-line blobs can produce no meaningful breakpoint progress.
|
|
1966
|
+
// Fall back to a simple half split so every recursion step strictly shrinks.
|
|
1967
|
+
if (subChunks.length <= 1
|
|
1968
|
+
|| subChunks[0]?.text.length === text.length) {
|
|
1969
|
+
safeMaxChars = Math.max(1, Math.floor(text.length / 2));
|
|
1970
|
+
nextOverlapChars = 0;
|
|
1971
|
+
nextWindowChars = 0;
|
|
1972
|
+
subChunks = chunkDocument(text, safeMaxChars, nextOverlapChars, nextWindowChars);
|
|
1973
|
+
}
|
|
1974
|
+
if (subChunks.length <= 1
|
|
1975
|
+
|| subChunks[0]?.text.length === text.length) {
|
|
1976
|
+
const fallbackTokens = tokens.slice(0, Math.max(1, maxTokens));
|
|
1977
|
+
const truncatedText = await llm.detokenize(fallbackTokens);
|
|
1978
|
+
results.push({
|
|
1979
|
+
text: truncatedText,
|
|
1980
|
+
pos,
|
|
1981
|
+
tokens: fallbackTokens.length,
|
|
1982
|
+
});
|
|
1983
|
+
return;
|
|
1425
1984
|
}
|
|
1985
|
+
for (const subChunk of subChunks) {
|
|
1986
|
+
await pushChunkWithinTokenLimit(text.slice(subChunk.pos, subChunk.pos + subChunk.text.length), pos + subChunk.pos);
|
|
1987
|
+
}
|
|
1988
|
+
};
|
|
1989
|
+
for (const chunk of charChunks) {
|
|
1990
|
+
await pushChunkWithinTokenLimit(chunk.text, chunk.pos);
|
|
1426
1991
|
}
|
|
1427
1992
|
return results;
|
|
1428
1993
|
}
|
|
@@ -1523,7 +2088,7 @@ export function matchFilesByGlob(db, pattern) {
|
|
|
1523
2088
|
`).all();
|
|
1524
2089
|
const isMatch = picomatch(pattern);
|
|
1525
2090
|
return allFiles
|
|
1526
|
-
.filter(f => isMatch(f.virtual_path) || isMatch(f.path))
|
|
2091
|
+
.filter(f => isMatch(f.virtual_path) || isMatch(f.path) || isMatch(f.collection + '/' + f.path))
|
|
1527
2092
|
.map(f => ({
|
|
1528
2093
|
filepath: f.virtual_path, // Virtual path for precise lookup
|
|
1529
2094
|
displayPath: f.path, // Relative path for display
|
|
@@ -1874,8 +2439,23 @@ export function getTopLevelPathsWithoutContext(db, collectionName) {
|
|
|
1874
2439
|
// =============================================================================
|
|
1875
2440
|
// FTS Search
|
|
1876
2441
|
// =============================================================================
|
|
1877
|
-
function sanitizeFTS5Term(term) {
|
|
1878
|
-
return term.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase();
|
|
2442
|
+
/**
 * Normalize a single query term for use in an FTS5 expression.
 *
 * Every character that is not a Unicode letter, a Unicode number, an
 * apostrophe or an underscore is removed, and the remainder is lower-cased.
 *
 * @param term - Raw term text.
 * @returns The cleaned, lower-cased term (possibly an empty string).
 */
export function sanitizeFTS5Term(term) {
    const disallowedChars = /[^\p{L}\p{N}'_]/gu;
    const kept = term.replace(disallowedChars, '');
    return kept.toLowerCase();
}
|
|
2445
|
+
/**
 * Decide whether a token is a hyphenated compound (e.g. multi-agent, DEC-0054, gpt-4).
 *
 * The token must start with a letter or digit and contain at least one hyphen
 * that is immediately followed by a letter or digit. All remaining characters
 * may only be letters, digits, apostrophes or hyphens. A leading hyphen
 * (negation syntax such as `-sports`) therefore never qualifies.
 *
 * @param token - Whitespace-delimited query token.
 * @returns true when the whole token matches the hyphenated-compound shape.
 */
function isHyphenatedToken(token) {
    const hyphenatedShape = /^[\p{L}\p{N}][\p{L}\p{N}'-]*-[\p{L}\p{N}][\p{L}\p{N}'-]*$/u;
    return hyphenatedShape.test(token);
}
|
|
2452
|
+
/**
 * Turn a hyphenated term into the inner text of an FTS5 phrase.
 *
 * The term is split on hyphens, each piece is cleaned with sanitizeFTS5Term,
 * pieces that end up empty are discarded, and the survivors are joined with
 * single spaces. The caller wraps the result in double quotes, e.g.
 * `multi-agent` becomes `multi agent`.
 *
 * @param term - Hyphenated token.
 * @returns Space-separated sanitized parts (empty string if nothing survives).
 */
function sanitizeHyphenatedTerm(term) {
    const cleanedParts = [];
    for (const piece of term.split('-')) {
        const cleaned = sanitizeFTS5Term(piece);
        if (cleaned) {
            cleanedParts.push(cleaned);
        }
    }
    return cleanedParts.join(' ');
}
|
|
1880
2460
|
/**
|
|
1881
2461
|
* Parse lex query syntax into FTS5 query.
|
|
@@ -1883,14 +2463,23 @@ function sanitizeFTS5Term(term) {
|
|
|
1883
2463
|
* Supports:
|
|
1884
2464
|
* - Quoted phrases: "exact phrase" → "exact phrase" (exact match)
|
|
1885
2465
|
* - Negation: -term or -"phrase" → uses FTS5 NOT operator
|
|
2466
|
+
* - Hyphenated tokens: multi-agent, DEC-0054, gpt-4 → treated as phrases
|
|
1886
2467
|
* - Plain terms: term → "term"* (prefix match)
|
|
1887
2468
|
*
|
|
1888
2469
|
* FTS5 NOT is a binary operator: `term1 NOT term2` means "match term1 but not term2".
|
|
1889
2470
|
* So `-term` only works when there are also positive terms.
|
|
1890
2471
|
*
|
|
2472
|
+
* Hyphen disambiguation: `-sports` at a word boundary is negation, but `multi-agent`
|
|
2473
|
+
* (where `-` is between word characters) is treated as a hyphenated phrase.
|
|
2474
|
+
* When a leading `-` is followed by what looks like a hyphenated compound word
|
|
2475
|
+
* (e.g., `-multi-agent`), the entire token is treated as a negated phrase.
|
|
2476
|
+
*
|
|
1891
2477
|
* Examples:
|
|
1892
2478
|
* performance -sports → "performance"* NOT "sports"*
|
|
1893
2479
|
* "machine learning" → "machine learning"
|
|
2480
|
+
* multi-agent memory → "multi agent" AND "memory"*
|
|
2481
|
+
* DEC-0054 → "dec 0054"
|
|
2482
|
+
* -multi-agent → NOT "multi agent"
|
|
1894
2483
|
*/
|
|
1895
2484
|
function buildFTS5Query(query) {
|
|
1896
2485
|
const positive = [];
|
|
@@ -1916,7 +2505,7 @@ function buildFTS5Query(query) {
|
|
|
1916
2505
|
const phrase = s.slice(start, i).trim();
|
|
1917
2506
|
i++; // skip closing quote
|
|
1918
2507
|
if (phrase.length > 0) {
|
|
1919
|
-
const sanitized = phrase
|
|
2508
|
+
const sanitized = sanitizeFTS5Phrase(phrase);
|
|
1920
2509
|
if (sanitized) {
|
|
1921
2510
|
const ftsPhrase = `"${sanitized}"`; // Exact phrase, no prefix match
|
|
1922
2511
|
if (negated) {
|
|
@@ -1934,14 +2523,42 @@ function buildFTS5Query(query) {
|
|
|
1934
2523
|
while (i < s.length && !/[\s"]/.test(s[i]))
|
|
1935
2524
|
i++;
|
|
1936
2525
|
const term = s.slice(start, i);
|
|
1937
|
-
|
|
1938
|
-
|
|
1939
|
-
|
|
1940
|
-
|
|
1941
|
-
|
|
2526
|
+
// Handle hyphenated tokens: multi-agent, DEC-0054, gpt-4
|
|
2527
|
+
// These get split into phrase queries so FTS5 porter tokenizer matches them.
|
|
2528
|
+
if (isHyphenatedToken(term)) {
|
|
2529
|
+
const sanitized = sanitizeHyphenatedTerm(term);
|
|
2530
|
+
if (sanitized) {
|
|
2531
|
+
const ftsPhrase = `"${sanitized}"`; // Phrase match (no prefix)
|
|
2532
|
+
if (negated) {
|
|
2533
|
+
negative.push(ftsPhrase);
|
|
2534
|
+
}
|
|
2535
|
+
else {
|
|
2536
|
+
positive.push(ftsPhrase);
|
|
2537
|
+
}
|
|
1942
2538
|
}
|
|
1943
|
-
|
|
1944
|
-
|
|
2539
|
+
}
|
|
2540
|
+
else if (containsCjk(term)) {
|
|
2541
|
+
const sanitized = sanitizeFTS5Phrase(term);
|
|
2542
|
+
if (sanitized) {
|
|
2543
|
+
const ftsPhrase = `"${sanitized}"`; // CJK phrase over character tokens
|
|
2544
|
+
if (negated) {
|
|
2545
|
+
negative.push(ftsPhrase);
|
|
2546
|
+
}
|
|
2547
|
+
else {
|
|
2548
|
+
positive.push(ftsPhrase);
|
|
2549
|
+
}
|
|
2550
|
+
}
|
|
2551
|
+
}
|
|
2552
|
+
else {
|
|
2553
|
+
const sanitized = sanitizeFTS5Term(term);
|
|
2554
|
+
if (sanitized) {
|
|
2555
|
+
const ftsTerm = `"${sanitized}"*`; // Prefix match
|
|
2556
|
+
if (negated) {
|
|
2557
|
+
negative.push(ftsTerm);
|
|
2558
|
+
}
|
|
2559
|
+
else {
|
|
2560
|
+
positive.push(ftsTerm);
|
|
2561
|
+
}
|
|
1945
2562
|
}
|
|
1946
2563
|
}
|
|
1947
2564
|
}
|
|
@@ -1964,8 +2581,9 @@ function buildFTS5Query(query) {
|
|
|
1964
2581
|
* Returns error message if invalid, null if valid.
|
|
1965
2582
|
*/
|
|
1966
2583
|
export function validateSemanticQuery(query) {
|
|
1967
|
-
// Check for negation syntax
|
|
1968
|
-
|
|
2584
|
+
// Check for negation syntax — only at token boundaries (start of string or after whitespace).
|
|
2585
|
+
// Hyphenated words like "real-time" or "write-ahead" must not trigger this.
|
|
2586
|
+
if (/(^|\s)-[\w"]/.test(query)) {
|
|
1969
2587
|
return 'Negation (-term) is not supported in vec/hyde queries. Use lex for exclusions.';
|
|
1970
2588
|
}
|
|
1971
2589
|
return null;
|
|
@@ -1984,26 +2602,42 @@ export function searchFTS(db, query, limit = 20, collectionName) {
|
|
|
1984
2602
|
const ftsQuery = buildFTS5Query(query);
|
|
1985
2603
|
if (!ftsQuery)
|
|
1986
2604
|
return [];
|
|
2605
|
+
// Use a CTE to force FTS5 to run first, then filter by collection.
|
|
2606
|
+
// Without the CTE, SQLite's query planner combines FTS5 MATCH with the
|
|
2607
|
+
// collection filter in a single WHERE clause, which can cause it to
|
|
2608
|
+
// abandon the FTS5 index and fall back to a full scan — turning an 8ms
|
|
2609
|
+
// query into a 17-second query on large collections.
|
|
2610
|
+
const params = [ftsQuery];
|
|
2611
|
+
// When filtering by collection, fetch extra candidates from the FTS index
|
|
2612
|
+
// since some will be filtered out. Without a collection filter we can
|
|
2613
|
+
// fetch exactly the requested limit.
|
|
2614
|
+
const ftsLimit = collectionName ? limit * 10 : limit;
|
|
1987
2615
|
let sql = `
|
|
2616
|
+
WITH fts_matches AS (
|
|
2617
|
+
SELECT rowid, bm25(documents_fts, 1.5, 4.0, 1.0) as bm25_score
|
|
2618
|
+
FROM documents_fts
|
|
2619
|
+
WHERE documents_fts MATCH ?
|
|
2620
|
+
ORDER BY bm25_score ASC
|
|
2621
|
+
LIMIT ${ftsLimit}
|
|
2622
|
+
)
|
|
1988
2623
|
SELECT
|
|
1989
2624
|
'qmd://' || d.collection || '/' || d.path as filepath,
|
|
1990
2625
|
d.collection || '/' || d.path as display_path,
|
|
1991
2626
|
d.title,
|
|
1992
2627
|
content.doc as body,
|
|
1993
2628
|
d.hash,
|
|
1994
|
-
|
|
1995
|
-
FROM
|
|
1996
|
-
JOIN documents d ON d.id =
|
|
2629
|
+
fm.bm25_score
|
|
2630
|
+
FROM fts_matches fm
|
|
2631
|
+
JOIN documents d ON d.id = fm.rowid
|
|
1997
2632
|
JOIN content ON content.hash = d.hash
|
|
1998
|
-
WHERE
|
|
2633
|
+
WHERE d.active = 1
|
|
1999
2634
|
`;
|
|
2000
|
-
const params = [ftsQuery];
|
|
2001
2635
|
if (collectionName) {
|
|
2002
2636
|
sql += ` AND d.collection = ?`;
|
|
2003
2637
|
params.push(String(collectionName));
|
|
2004
2638
|
}
|
|
2005
2639
|
// bm25 lower is better; sort ascending.
|
|
2006
|
-
sql += ` ORDER BY bm25_score ASC LIMIT ?`;
|
|
2640
|
+
sql += ` ORDER BY fm.bm25_score ASC LIMIT ?`;
|
|
2007
2641
|
params.push(limit);
|
|
2008
2642
|
const rows = db.prepare(sql).all(...params);
|
|
2009
2643
|
return rows.map(row => {
|
|
@@ -2075,7 +2709,7 @@ export async function searchVec(db, query, model, limit = 20, collectionName, se
|
|
|
2075
2709
|
docSql += ` AND d.collection = ?`;
|
|
2076
2710
|
params.push(collectionName);
|
|
2077
2711
|
}
|
|
2078
|
-
const docRows = db.prepare(docSql).all(...params);
|
|
2712
|
+
const docRows = withLazyContentVectorMigration(db, () => db.prepare(docSql).all(...params));
|
|
2079
2713
|
// Combine with distances and dedupe by filepath
|
|
2080
2714
|
const seen = new Map();
|
|
2081
2715
|
for (const row of docRows) {
|
|
@@ -2122,34 +2756,124 @@ async function getEmbedding(text, model, isQuery, session, llmOverride) {
|
|
|
2122
2756
|
* Get all unique content hashes that need embeddings (from active documents).
|
|
2123
2757
|
* Returns hash, document body, and a sample path for display purposes.
|
|
2124
2758
|
*/
|
|
2125
|
-
export function getHashesForEmbedding(db) {
|
|
2126
|
-
|
|
2759
|
+
export function getHashesForEmbedding(db, model = DEFAULT_EMBED_MODEL) {
    const embedFingerprint = getEmbeddingFingerprint(model);
    // The subquery counts stored chunks per hash for this model/fingerprint only;
    // a hash is returned when it has no such rows at all, or fewer rows than the
    // largest total_chunks recorded for it (i.e. an incomplete embedding).
    const pendingHashesSql = `
    SELECT d.hash, c.doc as body, MIN(d.path) as path
    FROM documents d
    JOIN content c ON d.hash = c.hash
    LEFT JOIN (
      SELECT hash, model, COUNT(*) AS chunk_count, MAX(total_chunks) AS expected_chunks
      FROM content_vectors
      WHERE model = ? AND embed_fingerprint = ?
      GROUP BY hash, model, embed_fingerprint
    ) v ON d.hash = v.hash
    WHERE d.active = 1
      AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
    GROUP BY d.hash
  `;
    // The statement is prepared inside the callback so it is compiled only once
    // withLazyContentVectorMigration invokes it.
    const fetchPending = () => db.prepare(pendingHashesSql).all(model, embedFingerprint);
    return withLazyContentVectorMigration(db, fetchPending);
}
|
|
2135
2776
|
/**
 * Remove stored embeddings, either for the entire index or for one collection.
 *
 * Without `collection`: every row of content_vectors is deleted and the
 * vectors_vec table is dropped.
 *
 * With `collection`: only hashes that belong to active documents of that
 * collection and are NOT also used by an active document of a different
 * collection are cleared. Their vectors_vec rows (keyed "hash_seq") are
 * deleted when that table exists, then their content_vectors rows. Shared
 * hashes stay untouched. If content_vectors is empty afterwards, vectors_vec
 * is dropped as well.
 *
 * @param db - Database handle exposing exec() and prepare().
 * @param collection - Optional collection name to scope the clear to.
 */
export function clearAllEmbeddings(db, collection) {
    const dropVecTable = () => db.exec(`DROP TABLE IF EXISTS vectors_vec`);
    if (!collection) {
        db.exec(`DELETE FROM content_vectors`);
        dropVecTable();
        return;
    }
    // Hashes used by this collection's active documents and by no other
    // collection's active documents.
    const ownedHashesSql = `
    SELECT DISTINCT d.hash
    FROM documents d
    WHERE d.collection = ? AND d.active = 1
      AND NOT EXISTS (
        SELECT 1 FROM documents d2
        WHERE d2.hash = d.hash
          AND d2.active = 1
          AND d2.collection != d.collection
      )
  `;
    const hasVecTable = db
        .prepare(`SELECT 1 FROM sqlite_master WHERE type='table' AND name='vectors_vec'`)
        .get();
    withLazyContentVectorMigration(db, () => {
        if (hasVecTable) {
            // Collect the (hash, seq) pairs first: they are needed to build the
            // vectors_vec keys before the content_vectors rows disappear.
            const doomedChunks = db.prepare(`
        SELECT cv.hash, cv.seq
        FROM content_vectors cv
        WHERE cv.hash IN (${ownedHashesSql})
      `).all(collection);
            const purgeVecRow = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
            for (const { hash, seq } of doomedChunks) {
                purgeVecRow.run(`${hash}_${seq}`);
            }
        }
        db.prepare(`
      DELETE FROM content_vectors
      WHERE hash IN (${ownedHashesSql})
    `).run(collection);
        const leftover = db
            .prepare(`SELECT COUNT(*) AS n FROM content_vectors`)
            .get();
        if (leftover.n === 0) {
            dropVecTable();
        }
    });
}
|
|
2143
2836
|
/**
 * Store one chunk embedding in both content_vectors and vectors_vec.
 * The vectors_vec key is the string "<hash>_<seq>".
 *
 * The content_vectors row is written first so that getHashesForEmbedding
 * (which checks only content_vectors) won't re-select the hash on a crash
 * between the two inserts.
 *
 * vectors_vec is updated with DELETE followed by INSERT rather than
 * INSERT OR REPLACE, because sqlite-vec's vec0 virtual tables silently ignore
 * the OR REPLACE conflict clause.
 *
 * @param db - Database handle exposing prepare().
 * @param hash - Content hash of the document.
 * @param seq - Chunk sequence number within the document.
 * @param pos - Chunk position stored alongside the row.
 * @param embedding - Vector payload written to vectors_vec.
 * @param model - Embedding model name.
 * @param embeddedAt - Timestamp value stored in embedded_at.
 * @param totalChunks - Expected number of chunks for this hash (default 1).
 * @param fingerprint - Embedding fingerprint; defaults to getEmbeddingFingerprint(model).
 */
export function insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt, totalChunks = 1, fingerprint = getEmbeddingFingerprint(model)) {
    const vecKey = `${hash}_${seq}`;
    const writeBothTables = () => {
        // Metadata row first — crash-safe ordering.
        const upsertMeta = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embed_fingerprint, total_chunks, embedded_at) VALUES (?, ?, ?, ?, ?, ?, ?)`);
        upsertMeta.run(hash, seq, pos, model, fingerprint, totalChunks, embeddedAt);
        // vec0 has no working OR REPLACE, so clear any previous vector explicitly.
        const removeOldVec = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
        const addNewVec = db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`);
        removeOldVec.run(vecKey);
        addNewVec.run(vecKey, embedding);
    };
    withLazyContentVectorMigration(db, writeBothTables);
}
|
|
2859
|
+
/**
 * Delete partially embedded documents so they can be embedded again from scratch.
 *
 * For each [hash, expectedChunks] entry, the content_vectors rows for that hash
 * and model are counted. A hash with zero rows, or with exactly the expected
 * number of rows, is left alone. Any other count is treated as incomplete:
 * the matching vectors_vec rows ("<hash>_<seq>") and all content_vectors rows
 * for that hash/model are deleted.
 *
 * @param db - Database handle exposing prepare().
 * @param expectedChunksByHash - Iterable of [hash, expectedChunks] pairs (e.g. a Map).
 * @param model - Embedding model name used to scope the rows.
 * @returns The result of withLazyContentVectorMigration — presumably the
 *   callback's return value, i.e. the number of content_vectors rows removed.
 */
function removeIncompleteEmbeddings(db, expectedChunksByHash, model) {
    const purgePartialHashes = () => {
        const selectSeqs = db.prepare(`SELECT seq FROM content_vectors WHERE hash = ? AND model = ?`);
        const dropContentRows = db.prepare(`DELETE FROM content_vectors WHERE hash = ? AND model = ?`);
        const dropVecRow = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
        let removedCount = 0;
        for (const [hash, expectedChunks] of expectedChunksByHash) {
            const stored = selectSeqs.all(hash, model);
            const isPartial = stored.length !== 0 && stored.length !== expectedChunks;
            if (isPartial) {
                // Vector rows go first; their keys are derived from the rows just read.
                stored.forEach(({ seq }) => dropVecRow.run(`${hash}_${seq}`));
                dropContentRows.run(hash, model);
                removedCount += stored.length;
            }
        }
        return removedCount;
    };
    return withLazyContentVectorMigration(db, purgePartialHashes);
}
|
|
2154
2878
|
// =============================================================================
|
|
2155
2879
|
// Query expansion
|
|
@@ -2161,12 +2885,15 @@ export async function expandQuery(query, model = DEFAULT_QUERY_MODEL, db, intent
|
|
|
2161
2885
|
if (cached) {
|
|
2162
2886
|
try {
|
|
2163
2887
|
const parsed = JSON.parse(cached);
|
|
2888
|
+
if (!Array.isArray(parsed))
|
|
2889
|
+
return [];
|
|
2890
|
+
const rows = parsed;
|
|
2164
2891
|
// Migrate old cache format: { type, text } → { type, query }
|
|
2165
|
-
if (
|
|
2166
|
-
return
|
|
2892
|
+
if (rows.length > 0 && typeof rows[0]?.query === "string") {
|
|
2893
|
+
return rows.map((r) => ({ type: r.type, query: String(r.query) }));
|
|
2167
2894
|
}
|
|
2168
|
-
else if (
|
|
2169
|
-
return
|
|
2895
|
+
else if (rows.length > 0 && typeof rows[0]?.text === "string") {
|
|
2896
|
+
return rows.map((r) => ({ type: r.type, query: String(r.text) }));
|
|
2170
2897
|
}
|
|
2171
2898
|
}
|
|
2172
2899
|
catch {
|
|
@@ -2473,7 +3200,7 @@ export function getDocumentBody(db, doc, fromLine, maxLines) {
|
|
|
2473
3200
|
let body = row.body;
|
|
2474
3201
|
if (fromLine !== undefined || maxLines !== undefined) {
|
|
2475
3202
|
const lines = body.split('\n');
|
|
2476
|
-
const start = (fromLine || 1) - 1;
|
|
3203
|
+
const start = Math.max(0, (fromLine || 1) - 1);
|
|
2477
3204
|
const end = maxLines !== undefined ? start + maxLines : lines.length;
|
|
2478
3205
|
body = lines.slice(start, end).join('\n');
|
|
2479
3206
|
}
|
|
@@ -2484,7 +3211,7 @@ export function getDocumentBody(db, doc, fromLine, maxLines) {
|
|
|
2484
3211
|
* Returns documents without body by default (use getDocumentBody to load)
|
|
2485
3212
|
*/
|
|
2486
3213
|
export function findDocuments(db, pattern, options = {}) {
|
|
2487
|
-
const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?');
|
|
3214
|
+
const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?') && !pattern.includes('{');
|
|
2488
3215
|
const errors = [];
|
|
2489
3216
|
const maxBytes = options.maxBytes ?? DEFAULT_MULTI_GET_MAX_BYTES;
|
|
2490
3217
|
const bodyCol = options.includeBody ? `, content.doc as body` : ``;
|
|
@@ -2581,7 +3308,7 @@ export function findDocuments(db, pattern, options = {}) {
|
|
|
2581
3308
|
// =============================================================================
|
|
2582
3309
|
// Status
|
|
2583
3310
|
// =============================================================================
|
|
2584
|
-
export function getStatus(db) {
|
|
3311
|
+
export function getStatus(db, model = DEFAULT_EMBED_MODEL) {
|
|
2585
3312
|
// DB is source of truth for collections — config provides supplementary metadata
|
|
2586
3313
|
const dbCollections = db.prepare(`
|
|
2587
3314
|
SELECT
|
|
@@ -2614,7 +3341,7 @@ export function getStatus(db) {
|
|
|
2614
3341
|
return new Date(b.lastUpdated).getTime() - new Date(a.lastUpdated).getTime();
|
|
2615
3342
|
});
|
|
2616
3343
|
const totalDocs = db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 1`).get().c;
|
|
2617
|
-
const needsEmbedding = getHashesNeedingEmbedding(db);
|
|
3344
|
+
const needsEmbedding = getHashesNeedingEmbedding(db, undefined, model);
|
|
2618
3345
|
const hasVectors = !!db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
|
|
2619
3346
|
return {
|
|
2620
3347
|
totalDocuments: totalDocs,
|
|
@@ -2661,7 +3388,7 @@ export function extractSnippet(body, query, maxLen = 500, chunkPos, chunkLen, in
|
|
|
2661
3388
|
const totalLines = body.split('\n').length;
|
|
2662
3389
|
let searchBody = body;
|
|
2663
3390
|
let lineOffset = 0;
|
|
2664
|
-
if (chunkPos && chunkPos
|
|
3391
|
+
if (chunkPos !== undefined && chunkPos >= 0) {
|
|
2665
3392
|
// Search within the chunk region, with some padding for context
|
|
2666
3393
|
// Use provided chunkLen or fall back to max chunk size (covers variable-length chunks)
|
|
2667
3394
|
const searchLen = chunkLen || CHUNK_SIZE_CHARS;
|
|
@@ -2692,6 +3419,22 @@ export function extractSnippet(body, query, maxLen = 500, chunkPos, chunkLen, in
|
|
|
2692
3419
|
bestLine = i;
|
|
2693
3420
|
}
|
|
2694
3421
|
}
|
|
3422
|
+
if (chunkPos !== undefined && chunkPos >= 0 && bestScore <= 0) {
|
|
3423
|
+
if (chunkPos === 0) {
|
|
3424
|
+
// chunkPos=0 may be the chunk selector's initialization default for queries
|
|
3425
|
+
// where lexical chunk scoring found no winner (e.g. tokens filtered to empty
|
|
3426
|
+
// by the length>2 guard). Retry with full body so the real match isn't missed.
|
|
3427
|
+
return extractSnippet(body, query, maxLen, undefined, undefined, intent);
|
|
3428
|
+
}
|
|
3429
|
+
// For chunkPos > 0 the reranker actively picked this chunk. Tokens failing to
|
|
3430
|
+
// match literally is most likely a tokenizer limitation (quoted phrases, FTS5
|
|
3431
|
+
// syntax, HYDE passages, semantic hits), so anchor on the chunk start rather
|
|
3432
|
+
// than disregarding the reranker's pick.
|
|
3433
|
+
const contextStart = Math.max(0, chunkPos - 100);
|
|
3434
|
+
bestLine = chunkPos > contextStart
|
|
3435
|
+
? searchBody.slice(0, chunkPos - contextStart).split('\n').length - 1
|
|
3436
|
+
: 0;
|
|
3437
|
+
}
|
|
2695
3438
|
const start = Math.max(0, bestLine - 1);
|
|
2696
3439
|
const end = Math.min(lines.length, bestLine + 3);
|
|
2697
3440
|
const snippetLines = lines.slice(start, end);
|
|
@@ -2729,6 +3472,20 @@ export function addLineNumbers(text, startLine = 1) {
|
|
|
2729
3472
|
const lines = text.split('\n');
|
|
2730
3473
|
return lines.map((line, i) => `${startLine + i}: ${line}`).join('\n');
|
|
2731
3474
|
}
|
|
3475
|
+
/**
 * Compute the per-list RRF weights used by hybridQuery.
 *
 * A ranked list whose metadata has queryType === "original" (the original
 * query's FTS or vector results) is weighted 2.0. Every other list — the
 * lex/vec/hyde expansions — is weighted 1.0. The weight depends only on the
 * list's own metadata, never on its index, so an expansion list inserted
 * ahead of the original vector list cannot take over the original's boost.
 *
 * @param rankedListMeta - Array of metadata objects, one per ranked list.
 * @returns Array of weights in the same order as `rankedListMeta`.
 */
export function getHybridRrfWeights(rankedListMeta) {
    const weights = [];
    for (const meta of rankedListMeta) {
        if (meta.queryType === "original") {
            weights.push(2.0);
        }
        else {
            weights.push(1.0);
        }
    }
    return weights;
}
|
|
2732
3489
|
/**
|
|
2733
3490
|
* Hybrid search: BM25 + vector + query expansion + RRF + chunked reranking.
|
|
2734
3491
|
*
|
|
@@ -2817,7 +3574,8 @@ export async function hybridQuery(store, query, options) {
|
|
|
2817
3574
|
}
|
|
2818
3575
|
// Batch embed all vector queries in a single call
|
|
2819
3576
|
const llm = getLlm(store);
|
|
2820
|
-
const
|
|
3577
|
+
const embedModel = llm.embedModelName;
|
|
3578
|
+
const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text, embedModel));
|
|
2821
3579
|
hooks?.onEmbedStart?.(textsToEmbed.length);
|
|
2822
3580
|
const embedStart = Date.now();
|
|
2823
3581
|
const embeddings = await llm.embedBatch(textsToEmbed);
|
|
@@ -2827,7 +3585,7 @@ export async function hybridQuery(store, query, options) {
|
|
|
2827
3585
|
const embedding = embeddings[i]?.embedding;
|
|
2828
3586
|
if (!embedding)
|
|
2829
3587
|
continue;
|
|
2830
|
-
const vecResults = await store.searchVec(vecQueries[i].text,
|
|
3588
|
+
const vecResults = await store.searchVec(vecQueries[i].text, embedModel, 20, collection, undefined, embedding);
|
|
2831
3589
|
if (vecResults.length > 0) {
|
|
2832
3590
|
for (const r of vecResults)
|
|
2833
3591
|
docidMap.set(r.filepath, r.docid);
|
|
@@ -2843,8 +3601,9 @@ export async function hybridQuery(store, query, options) {
|
|
|
2843
3601
|
}
|
|
2844
3602
|
}
|
|
2845
3603
|
}
|
|
2846
|
-
// Step 4: RRF fusion —
|
|
2847
|
-
|
|
3604
|
+
// Step 4: RRF fusion — original-query FTS and vector lists get 2x weight;
|
|
3605
|
+
// expansion-derived lists stay at 1x independent of insertion order.
|
|
3606
|
+
const weights = getHybridRrfWeights(rankedListMeta);
|
|
2848
3607
|
const fused = reciprocalRankFusion(rankedLists, weights);
|
|
2849
3608
|
const rrfTraceByFile = explain ? buildRrfTrace(rankedLists, weights, rankedListMeta) : null;
|
|
2850
3609
|
const candidates = fused.slice(0, candidateLimit);
|
|
@@ -2855,8 +3614,9 @@ export async function hybridQuery(store, query, options) {
|
|
|
2855
3614
|
const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 2);
|
|
2856
3615
|
const intentTerms = intent ? extractIntentTerms(intent) : [];
|
|
2857
3616
|
const docChunkMap = new Map();
|
|
3617
|
+
const chunkStrategy = options?.chunkStrategy;
|
|
2858
3618
|
for (const cand of candidates) {
|
|
2859
|
-
const chunks =
|
|
3619
|
+
const chunks = await chunkDocumentAsync(cand.body, undefined, undefined, undefined, cand.file, chunkStrategy);
|
|
2860
3620
|
if (chunks.length === 0)
|
|
2861
3621
|
continue;
|
|
2862
3622
|
// Pick chunk with most keyword overlap (fallback: first chunk)
|
|
@@ -3024,10 +3784,11 @@ export async function vectorSearchQuery(store, query, options) {
|
|
|
3024
3784
|
const vecExpanded = allExpanded.filter(q => q.type !== 'lex');
|
|
3025
3785
|
options?.hooks?.onExpand?.(query, vecExpanded, Date.now() - expandStart);
|
|
3026
3786
|
// Run original + vec/hyde expanded through vector, sequentially — concurrent embed() hangs
|
|
3787
|
+
const embedModel = getLlm(store).embedModelName;
|
|
3027
3788
|
const queryTexts = [query, ...vecExpanded.map(q => q.query)];
|
|
3028
3789
|
const allResults = new Map();
|
|
3029
3790
|
for (const q of queryTexts) {
|
|
3030
|
-
const vecResults = await store.searchVec(q,
|
|
3791
|
+
const vecResults = await store.searchVec(q, embedModel, limit, collection);
|
|
3031
3792
|
for (const r of vecResults) {
|
|
3032
3793
|
const existing = allResults.get(r.filepath);
|
|
3033
3794
|
if (!existing || r.score > existing.score) {
|
|
@@ -3128,7 +3889,8 @@ export async function structuredSearch(store, searches, options) {
|
|
|
3128
3889
|
const vecSearches = searches.filter((s) => s.type === 'vec' || s.type === 'hyde');
|
|
3129
3890
|
if (vecSearches.length > 0) {
|
|
3130
3891
|
const llm = getLlm(store);
|
|
3131
|
-
const
|
|
3892
|
+
const embedModel = llm.embedModelName;
|
|
3893
|
+
const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query, embedModel));
|
|
3132
3894
|
hooks?.onEmbedStart?.(textsToEmbed.length);
|
|
3133
3895
|
const embedStart = Date.now();
|
|
3134
3896
|
const embeddings = await llm.embedBatch(textsToEmbed);
|
|
@@ -3138,7 +3900,7 @@ export async function structuredSearch(store, searches, options) {
|
|
|
3138
3900
|
if (!embedding)
|
|
3139
3901
|
continue;
|
|
3140
3902
|
for (const coll of collectionList) {
|
|
3141
|
-
const vecResults = await store.searchVec(vecSearches[i].query,
|
|
3903
|
+
const vecResults = await store.searchVec(vecSearches[i].query, embedModel, 20, coll, undefined, embedding);
|
|
3142
3904
|
if (vecResults.length > 0) {
|
|
3143
3905
|
for (const r of vecResults)
|
|
3144
3906
|
docidMap.set(r.filepath, r.docid);
|
|
@@ -3174,8 +3936,9 @@ export async function structuredSearch(store, searches, options) {
|
|
|
3174
3936
|
const queryTerms = primaryQuery.toLowerCase().split(/\s+/).filter(t => t.length > 2);
|
|
3175
3937
|
const intentTerms = intent ? extractIntentTerms(intent) : [];
|
|
3176
3938
|
const docChunkMap = new Map();
|
|
3939
|
+
const ssChunkStrategy = options?.chunkStrategy;
|
|
3177
3940
|
for (const cand of candidates) {
|
|
3178
|
-
const chunks =
|
|
3941
|
+
const chunks = await chunkDocumentAsync(cand.body, undefined, undefined, undefined, cand.file, ssChunkStrategy);
|
|
3179
3942
|
if (chunks.length === 0)
|
|
3180
3943
|
continue;
|
|
3181
3944
|
// Pick chunk with most keyword overlap
|