@tobilu/qmd 2.1.0 → 2.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +81 -0
- package/README.md +3 -0
- package/bin/qmd +39 -3
- package/dist/ast.d.ts +1 -0
- package/dist/ast.js +18 -8
- package/dist/bench/bench.d.ts +2 -0
- package/dist/bench/bench.js +108 -13
- package/dist/bench/score.d.ts +11 -4
- package/dist/bench/score.js +34 -13
- package/dist/bench/types.d.ts +13 -0
- package/dist/cli/qmd.d.ts +26 -0
- package/dist/cli/qmd.js +1172 -121
- package/dist/collections.d.ts +9 -0
- package/dist/collections.js +32 -7
- package/dist/db.d.ts +6 -3
- package/dist/db.js +1 -1
- package/dist/index.d.ts +4 -0
- package/dist/index.js +5 -2
- package/dist/llm.d.ts +65 -3
- package/dist/llm.js +376 -63
- package/dist/mcp/server.d.ts +6 -3
- package/dist/mcp/server.js +41 -26
- package/dist/paths.d.ts +1 -0
- package/dist/paths.js +4 -0
- package/dist/store.d.ts +92 -17
- package/dist/store.js +676 -176
- package/package.json +23 -12
- package/scripts/build.mjs +29 -0
- package/scripts/check-package-grammars.mjs +29 -0
- package/scripts/package-smoke.mjs +65 -0
- package/scripts/test-all.mjs +27 -0
- package/skills/qmd/SKILL.md +203 -0
- package/skills/qmd/references/mcp-setup.md +102 -0
- package/skills/release/SKILL.md +139 -0
- package/skills/release/scripts/install-hooks.sh +38 -0
- package/dist/embedded-skills.d.ts +0 -6
- package/dist/embedded-skills.js +0 -14
package/dist/store.js
CHANGED
|
@@ -16,18 +16,21 @@ import { createHash } from "crypto";
|
|
|
16
16
|
import { readFileSync, realpathSync, statSync, mkdirSync } from "node:fs";
|
|
17
17
|
// Note: node:path resolve is not imported — we export our own cross-platform resolve()
|
|
18
18
|
import fastGlob from "fast-glob";
|
|
19
|
-
import {
|
|
19
|
+
import { qmdHomedir } from "./paths.js";
|
|
20
|
+
import { LlamaCpp, getDefaultLlamaCpp, formatQueryForEmbedding, formatDocForEmbedding, withLLMSessionForLlm, DEFAULT_EMBED_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, } from "./llm.js";
|
|
20
21
|
// =============================================================================
|
|
21
22
|
// Configuration
|
|
22
23
|
// =============================================================================
|
|
23
|
-
const
|
|
24
|
-
export const
|
|
25
|
-
export const
|
|
26
|
-
export const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";
|
|
24
|
+
export const DEFAULT_EMBED_MODEL = DEFAULT_EMBED_MODEL_URI;
|
|
25
|
+
export const DEFAULT_RERANK_MODEL = DEFAULT_RERANK_MODEL_URI;
|
|
26
|
+
export const DEFAULT_QUERY_MODEL = DEFAULT_GENERATE_MODEL_URI;
|
|
27
27
|
export const DEFAULT_GLOB = "**/*.md";
|
|
28
28
|
export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024; // 10KB
|
|
29
29
|
export const DEFAULT_EMBED_MAX_DOCS_PER_BATCH = 64;
|
|
30
30
|
export const DEFAULT_EMBED_MAX_BATCH_BYTES = 64 * 1024 * 1024; // 64MB
|
|
31
|
+
const EMBED_FINGERPRINT_PROBE_QUERY = "__qmd_embedding_query_probe__";
|
|
32
|
+
const EMBED_FINGERPRINT_PROBE_TITLE = "__qmd_embedding_title_probe__";
|
|
33
|
+
const EMBED_FINGERPRINT_PROBE_DOC = "__qmd_embedding_document_probe__";
|
|
31
34
|
// Chunking: 900 tokens per chunk with 15% overlap
|
|
32
35
|
// Increased from 800 to accommodate smart chunking finding natural break points
|
|
33
36
|
export const CHUNK_SIZE_TOKENS = 900;
|
|
@@ -38,6 +41,16 @@ export const CHUNK_OVERLAP_CHARS = CHUNK_OVERLAP_TOKENS * 4; // 540 chars
|
|
|
38
41
|
// Search window for finding optimal break points (in tokens, ~200 tokens)
|
|
39
42
|
export const CHUNK_WINDOW_TOKENS = 200;
|
|
40
43
|
export const CHUNK_WINDOW_CHARS = CHUNK_WINDOW_TOKENS * 4; // 800 chars
|
|
44
|
+
/**
 * Derive a short fingerprint describing how embeddings are produced for a model.
 *
 * The fingerprint covers the model identifier, the query and document prompt
 * formats (rendered against fixed probe strings), and the chunk size/overlap
 * token counts, so a change to any of those inputs yields a different value.
 *
 * @param {string} [model=DEFAULT_EMBED_MODEL] - Embedding model identifier.
 * @returns {string} The first 6 hex characters of the SHA-256 digest.
 */
export function getEmbeddingFingerprint(model = DEFAULT_EMBED_MODEL) {
    const modelLine = `model:${model}`;
    const queryLine = `query:${formatQueryForEmbedding(EMBED_FINGERPRINT_PROBE_QUERY, model)}`;
    const docLine = `doc:${formatDocForEmbedding(EMBED_FINGERPRINT_PROBE_DOC, EMBED_FINGERPRINT_PROBE_TITLE, model)}`;
    const payload = `${modelLine}\n${queryLine}\n${docLine}\nchunk_tokens:${CHUNK_SIZE_TOKENS}\nchunk_overlap_tokens:${CHUNK_OVERLAP_TOKENS}`;
    const digest = createHash("sha256").update(payload).digest("hex");
    // Only a short prefix is kept; it is an identifier, not a security token.
    return digest.slice(0, 6);
}
|
|
41
54
|
/**
|
|
42
55
|
* Get the LlamaCpp instance for a store — prefers the store's own instance,
|
|
43
56
|
* falls back to the global singleton.
|
|
@@ -228,7 +241,7 @@ export const RERANK_CANDIDATE_LIMIT = 40;
|
|
|
228
241
|
// Path utilities
|
|
229
242
|
// =============================================================================
|
|
230
243
|
/**
 * Return the home directory used by this module.
 *
 * Thin wrapper that delegates to qmdHomedir() imported from ./paths.js.
 */
export function homedir() {
    const home = qmdHomedir();
    return home;
}
|
|
233
246
|
/**
|
|
234
247
|
* Check if a path is absolute.
|
|
@@ -468,21 +481,25 @@ export function normalizeVirtualPath(input) {
|
|
|
468
481
|
/**
 * Parse a `qmd://` virtual path into its components.
 *
 * Accepted shapes: `qmd://name`, `qmd://name/`, `qmd://name/path`, each with an
 * optional `?index=<name>` query suffix.
 *
 * @param virtualPath - Path to parse; it is normalized with normalizeVirtualPath() first.
 * @returns `{ collectionName, path }` (plus `indexName` when a non-blank
 *          `index` query parameter is present), or `null` when the input does
 *          not match the `qmd://collection[/path]` form.
 */
export function parseVirtualPath(virtualPath) {
    const normalized = normalizeVirtualPath(virtualPath);
    // Everything before the first "?" is the path; the next segment is the query.
    const segments = normalized.split("?");
    const pathPart = segments[0] ?? normalized;
    const queryString = segments[1] ?? "";
    const match = /^qmd:\/\/([^\/]+)\/?(.*)$/.exec(pathPart);
    if (!match?.[1]) {
        return null;
    }
    const parsed = {
        collectionName: match[1],
        path: match[2] ?? '', // Empty string for collection root
    };
    // A missing or whitespace-only index name is treated as "no index".
    const indexName = new URLSearchParams(queryString).get("index")?.trim();
    if (indexName) {
        parsed.indexName = indexName;
    }
    return parsed;
}
|
|
481
497
|
/**
|
|
482
498
|
* Build a virtual path from collection name and relative path.
|
|
483
499
|
*/
|
|
484
|
-
export function buildVirtualPath(collectionName, path) {
|
|
485
|
-
|
|
500
|
+
/**
 * Build a `qmd://` virtual path from a collection name and a relative path.
 *
 * @param collectionName - Collection the document belongs to.
 * @param path - Path relative to the collection root ('' for the root itself).
 * @param [indexName] - Optional index name, appended as a URI-encoded
 *        `?index=` query parameter.
 * @returns The virtual path string.
 */
export function buildVirtualPath(collectionName, path, indexName) {
    const base = `qmd://${collectionName}/${path}`;
    if (!indexName) {
        return base;
    }
    // parseVirtualPath() trims the index name and ignores blank values, so trim
    // here as well; otherwise a whitespace-only name would emit a query suffix
    // that the parser silently discards.
    const trimmedIndex = String(indexName).trim();
    return trimmedIndex ? `${base}?index=${encodeURIComponent(trimmedIndex)}` : base;
}
|
|
487
504
|
/**
|
|
488
505
|
* Check if a path is explicitly a virtual path.
|
|
@@ -552,6 +569,7 @@ function createSqliteVecUnavailableError(reason) {
|
|
|
552
569
|
"Install Homebrew SQLite so the sqlite-vec extension can be loaded, " +
|
|
553
570
|
"and set BREW_PREFIX if Homebrew is installed in a non-standard location.");
|
|
554
571
|
}
|
|
572
|
+
let _sqliteVecUnavailableReason = null;
|
|
555
573
|
/**
 * Extract a printable message from a thrown value.
 *
 * @param err - Any thrown value.
 * @returns `err.message` for Error instances, otherwise `String(err)`.
 */
function getErrorMessage(err) {
    if (err instanceof Error) {
        return err.message;
    }
    return String(err);
}
|
|
@@ -568,16 +586,76 @@ export function verifySqliteVecLoaded(db) {
|
|
|
568
586
|
}
|
|
569
587
|
}
|
|
570
588
|
let _sqliteVecAvailable = null;
|
|
589
|
+
// Matches a single Han, Hiragana, Katakana or Hangul character.
const CJK_CHAR_PATTERN = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]/u;
// Matches a maximal run of such characters (global, for use with replace()).
const CJK_RUN_PATTERN = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]+/gu;
// Version stamp for the CJK-normalized FTS index.
const FTS_CJK_NORMALIZED_VERSION = "1";
/**
 * FTS5's unicode61 tokenizer does not segment CJK text into searchable words.
 * Normalize CJK runs by spacing every character so exact CJK queries can be
 * translated into phrase queries while Latin text keeps the default tokenizer.
 *
 * @param text - Text to normalize. `null`/`undefined` yield an empty string
 *        and other non-string values are coerced with String(), so a single
 *        malformed value cannot abort a bulk index rebuild.
 * @returns The text with every CJK run replaced by its characters separated
 *          (and surrounded) by single spaces.
 */
export function normalizeCjkForFTS(text) {
    if (text == null) {
        return "";
    }
    // Array.from iterates by code point, so astral-plane Han characters stay intact.
    return String(text).replace(CJK_RUN_PATTERN, run => ` ${Array.from(run).join(' ')} `);
}
/**
 * Report whether the text contains at least one Han, Hiragana, Katakana or
 * Hangul character.
 */
function containsCjk(text) {
    return CJK_CHAR_PATTERN.test(text);
}
|
|
603
|
+
/**
 * Sanitize a multi-word phrase for use in an FTS5 query.
 *
 * The phrase is CJK-normalized, split on whitespace, each token is passed
 * through sanitizeFTS5Term(), and the tokens that remain non-empty are joined
 * with single spaces.
 *
 * @param phrase - Raw phrase text.
 * @returns The sanitized phrase (possibly empty).
 */
function sanitizeFTS5Phrase(phrase) {
    const terms = [];
    for (const token of normalizeCjkForFTS(phrase).split(/\s+/)) {
        const term = sanitizeFTS5Term(token);
        // Drop tokens that sanitize down to nothing.
        if (term) {
            terms.push(term);
        }
    }
    return terms.join(' ');
}
|
|
610
|
+
/**
 * Rebuild the documents_fts index with CJK-normalized text, once per
 * normalization version.
 *
 * Does nothing when store_config already records the current
 * FTS_CJK_NORMALIZED_VERSION. Otherwise it empties (or recreates) documents_fts,
 * re-inserts every active document with normalized filepath/title/body, and
 * then records the version.
 *
 * @param db - Open database handle exposing prepare/exec/transaction.
 */
function rebuildFTSForCjkNormalization(db) {
    const stamped = db.prepare(`SELECT value FROM store_config WHERE key = 'fts_cjk_normalized_version'`).get();
    if (stamped?.value === FTS_CJK_NORMALIZED_VERSION) {
        return;
    }
    try {
        db.exec(`DELETE FROM documents_fts WHERE rowid >= 0`);
    }
    catch {
        // Some older/corrupt FTS5 shadow-table states can reject bulk deletes even
        // though reads still work. Recreate the virtual table; documents_fts is a
        // derived index, so rebuilding it from documents/content is safe.
        db.exec(`DROP TABLE IF EXISTS documents_fts`);
        db.exec(`
      CREATE VIRTUAL TABLE documents_fts USING fts5(
        filepath, title, body,
        tokenize='porter unicode61'
      )
    `);
    }
    const activeDocs = db.prepare(`
    SELECT d.id, d.collection, d.path, d.title, content.doc as body
    FROM documents d
    JOIN content ON content.hash = d.hash
    WHERE d.active = 1
  `).all();
    const insertRow = db.prepare(`INSERT INTO documents_fts(rowid, filepath, title, body) VALUES (?, ?, ?, ?)`);
    // Insert every row inside a single transaction.
    const insertAll = db.transaction(() => {
        for (const { id, collection, path, title, body } of activeDocs) {
            const filepath = normalizeCjkForFTS(`${collection}/${path}`);
            insertRow.run(id, filepath, normalizeCjkForFTS(title), normalizeCjkForFTS(body));
        }
    });
    insertAll();
    // Record the version last, so a failure above leaves it unset.
    db.prepare(`
    INSERT OR REPLACE INTO store_config(key, value)
    VALUES ('fts_cjk_normalized_version', ?)
  `).run(FTS_CJK_NORMALIZED_VERSION);
}
|
|
571
647
|
function initializeDatabase(db) {
|
|
572
648
|
try {
|
|
573
649
|
loadSqliteVec(db);
|
|
574
650
|
verifySqliteVecLoaded(db);
|
|
575
651
|
_sqliteVecAvailable = true;
|
|
652
|
+
_sqliteVecUnavailableReason = null;
|
|
576
653
|
}
|
|
577
654
|
catch (err) {
|
|
578
655
|
// sqlite-vec is optional — vector search won't work but FTS is fine
|
|
579
656
|
_sqliteVecAvailable = false;
|
|
580
|
-
|
|
657
|
+
_sqliteVecUnavailableReason = getErrorMessage(err);
|
|
658
|
+
console.warn(_sqliteVecUnavailableReason);
|
|
581
659
|
}
|
|
582
660
|
db.exec("PRAGMA journal_mode = WAL");
|
|
583
661
|
db.exec("PRAGMA foreign_keys = ON");
|
|
@@ -619,19 +697,16 @@ function initializeDatabase(db) {
|
|
|
619
697
|
created_at TEXT NOT NULL
|
|
620
698
|
)
|
|
621
699
|
`);
|
|
622
|
-
// Content vectors
|
|
623
|
-
|
|
624
|
-
const hasSeqColumn = cvInfo.some(col => col.name === 'seq');
|
|
625
|
-
if (cvInfo.length > 0 && !hasSeqColumn) {
|
|
626
|
-
db.exec(`DROP TABLE IF EXISTS content_vectors`);
|
|
627
|
-
db.exec(`DROP TABLE IF EXISTS vectors_vec`);
|
|
628
|
-
}
|
|
700
|
+
// Content vectors. Avoid PRAGMA schema probes during startup; legacy vector
|
|
701
|
+
// columns are repaired lazily when a vector/embedding query first needs them.
|
|
629
702
|
db.exec(`
|
|
630
703
|
CREATE TABLE IF NOT EXISTS content_vectors (
|
|
631
704
|
hash TEXT NOT NULL,
|
|
632
705
|
seq INTEGER NOT NULL DEFAULT 0,
|
|
633
706
|
pos INTEGER NOT NULL DEFAULT 0,
|
|
634
707
|
model TEXT NOT NULL,
|
|
708
|
+
embed_fingerprint TEXT NOT NULL DEFAULT '',
|
|
709
|
+
total_chunks INTEGER NOT NULL DEFAULT 1,
|
|
635
710
|
embedded_at TEXT NOT NULL,
|
|
636
711
|
PRIMARY KEY (hash, seq)
|
|
637
712
|
)
|
|
@@ -662,9 +737,12 @@ function initializeDatabase(db) {
|
|
|
662
737
|
tokenize='porter unicode61'
|
|
663
738
|
)
|
|
664
739
|
`);
|
|
665
|
-
// Triggers
|
|
740
|
+
// Triggers keep FTS in sync for callers that write directly to documents.
|
|
741
|
+
// Production indexing paths rebuild entries in TypeScript so CJK text can be
|
|
742
|
+
// normalized before it reaches the unicode61 tokenizer.
|
|
743
|
+
db.exec(`DROP TRIGGER IF EXISTS documents_ai`);
|
|
666
744
|
db.exec(`
|
|
667
|
-
CREATE TRIGGER
|
|
745
|
+
CREATE TRIGGER documents_ai AFTER INSERT ON documents
|
|
668
746
|
WHEN new.active = 1
|
|
669
747
|
BEGIN
|
|
670
748
|
INSERT INTO documents_fts(rowid, filepath, title, body)
|
|
@@ -676,13 +754,15 @@ function initializeDatabase(db) {
|
|
|
676
754
|
WHERE new.active = 1;
|
|
677
755
|
END
|
|
678
756
|
`);
|
|
757
|
+
db.exec(`DROP TRIGGER IF EXISTS documents_ad`);
|
|
679
758
|
db.exec(`
|
|
680
|
-
CREATE TRIGGER
|
|
759
|
+
CREATE TRIGGER documents_ad AFTER DELETE ON documents BEGIN
|
|
681
760
|
DELETE FROM documents_fts WHERE rowid = old.id;
|
|
682
761
|
END
|
|
683
762
|
`);
|
|
763
|
+
db.exec(`DROP TRIGGER IF EXISTS documents_au`);
|
|
684
764
|
db.exec(`
|
|
685
|
-
CREATE TRIGGER
|
|
765
|
+
CREATE TRIGGER documents_au AFTER UPDATE ON documents
|
|
686
766
|
BEGIN
|
|
687
767
|
-- Delete from FTS if no longer active
|
|
688
768
|
DELETE FROM documents_fts WHERE rowid = old.id AND new.active = 0;
|
|
@@ -697,6 +777,7 @@ function initializeDatabase(db) {
|
|
|
697
777
|
WHERE new.active = 1;
|
|
698
778
|
END
|
|
699
779
|
`);
|
|
780
|
+
rebuildFTSForCjkNormalization(db);
|
|
700
781
|
}
|
|
701
782
|
function rowToNamedCollection(row) {
|
|
702
783
|
return {
|
|
@@ -838,7 +919,7 @@ export function isSqliteVecAvailable() {
|
|
|
838
919
|
}
|
|
839
920
|
function ensureVecTableInternal(db, dimensions) {
|
|
840
921
|
if (!_sqliteVecAvailable) {
|
|
841
|
-
throw
|
|
922
|
+
throw createSqliteVecUnavailableError(_sqliteVecUnavailableReason ?? "vector operations require a SQLite build with extension loading support");
|
|
842
923
|
}
|
|
843
924
|
const tableInfo = db.prepare(`SELECT sql FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
|
|
844
925
|
if (tableInfo) {
|
|
@@ -902,7 +983,7 @@ export async function reindexCollection(store, collectionPath, globPattern, coll
|
|
|
902
983
|
}
|
|
903
984
|
const hash = await hashContent(content);
|
|
904
985
|
const title = extractTitle(content, relativeFile);
|
|
905
|
-
const existing =
|
|
986
|
+
const existing = findOrMigrateLegacyDocument(db, collectionName, path);
|
|
906
987
|
if (existing) {
|
|
907
988
|
if (existing.hash === hash) {
|
|
908
989
|
if (existing.title !== title) {
|
|
@@ -955,16 +1036,74 @@ function resolveEmbedOptions(options) {
|
|
|
955
1036
|
maxBatchBytes: validatePositiveIntegerOption("maxBatchBytes", options?.maxBatchBytes, DEFAULT_EMBED_MAX_BATCH_BYTES),
|
|
956
1037
|
};
|
|
957
1038
|
}
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
1039
|
+
// Columns content_vectors is expected to have, with the definition used when
// a missing column has to be added to a legacy table.
const CONTENT_VECTOR_DESIRED_COLUMNS = [
    { name: "seq", definition: "INTEGER NOT NULL DEFAULT 0" },
    { name: "pos", definition: "INTEGER NOT NULL DEFAULT 0" },
    { name: "model", definition: "TEXT NOT NULL DEFAULT ''" },
    { name: "embed_fingerprint", definition: "TEXT NOT NULL DEFAULT ''" },
    { name: "total_chunks", definition: "INTEGER NOT NULL DEFAULT 1" },
    { name: "embedded_at", definition: "TEXT NOT NULL DEFAULT ''" },
];
/**
 * Decide whether an error is a missing-column error that names one of the
 * content_vectors columns listed in CONTENT_VECTOR_DESIRED_COLUMNS.
 *
 * @param error - Thrown value (Error or anything else).
 * @returns `true` when the message reports a missing column and mentions one
 *          of the desired column names as a whole identifier.
 */
function isContentVectorColumnError(error) {
    const message = error instanceof Error ? error.message : String(error);
    if (!/(no such column|has no column named)/i.test(message)) {
        return false;
    }
    // Match whole identifiers only. A plain substring test lets short names
    // such as "pos" or "model" match unrelated columns ("d.position",
    // "c.model_id") and trigger a pointless repair + retry. The names are
    // static identifiers, so they need no regex escaping.
    return CONTENT_VECTOR_DESIRED_COLUMNS.some(col => new RegExp(`\\b${col.name}\\b`).test(message));
}
|
|
1054
|
+
/**
 * Add every column in CONTENT_VECTOR_DESIRED_COLUMNS to content_vectors,
 * ignoring columns that already exist.
 *
 * @param db - Open database handle exposing exec().
 * @throws Re-throws any ALTER TABLE failure other than "duplicate column name".
 */
function runContentVectorColumnRepairs(db) {
    for (const { name, definition } of CONTENT_VECTOR_DESIRED_COLUMNS) {
        try {
            db.exec(`ALTER TABLE content_vectors ADD COLUMN ${name} ${definition}`);
        }
        catch (error) {
            const detail = error instanceof Error ? error.message : String(error);
            // The repair series is intentionally idempotent: most columns should
            // already exist, and another caller may have repaired a missing column
            // between the failed query and this ALTER series.
            const alreadyPresent = detail.includes("duplicate column name");
            if (alreadyPresent) {
                continue;
            }
            throw error;
        }
    }
}
|
|
1070
|
+
/**
 * Run `operation`, repairing legacy content_vectors columns on demand.
 *
 * If the first attempt fails with a missing content_vectors column error, the
 * column repairs are applied and the operation is attempted exactly once more.
 * Any other error, or a failure of the second attempt, propagates unchanged.
 *
 * @param db - Open database handle passed to the repair routine.
 * @param operation - Synchronous callback to execute.
 * @returns Whatever `operation` returns.
 */
function withLazyContentVectorMigration(db, operation) {
    try {
        return operation();
    }
    catch (error) {
        if (!isContentVectorColumnError(error)) {
            throw error;
        }
        runContentVectorColumnRepairs(db);
    }
    // Single retry after the repair; a second failure is not caught.
    return operation();
}
|
|
1085
|
+
/**
 * List active documents that still need embeddings for the given model.
 *
 * A content hash is pending when content_vectors has no rows for it under the
 * model and its current embedding fingerprint, or when it has fewer rows than
 * the recorded total_chunks. Results are grouped by hash and ordered by the
 * smallest path for that hash.
 *
 * @param db - Open database handle.
 * @param [collection] - When truthy, restrict results to this collection.
 * @param {string} [model=DEFAULT_EMBED_MODEL] - Embedding model identifier.
 * @returns Rows of `{ hash, path, bytes }`, where bytes is the byte length of
 *          the stored document content.
 */
function getPendingEmbeddingDocs(db, collection, model = DEFAULT_EMBED_MODEL) {
    const fingerprint = getEmbeddingFingerprint(model);
    const collectionFilter = collection ? `AND d.collection = ?` : ``;
    // Bind parameters follow placeholder order: model, fingerprint, then collection.
    const params = collection ? [model, fingerprint, collection] : [model, fingerprint];
    const sql = `
      SELECT d.hash, MIN(d.path) as path, length(CAST(c.doc AS BLOB)) as bytes
      FROM documents d
      JOIN content c ON d.hash = c.hash
      LEFT JOIN (
        SELECT hash, model, COUNT(*) AS chunk_count, MAX(total_chunks) AS expected_chunks
        FROM content_vectors
        WHERE model = ? AND embed_fingerprint = ?
        GROUP BY hash, model, embed_fingerprint
      ) v ON d.hash = v.hash
      WHERE d.active = 1
      AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
      ${collectionFilter}
      GROUP BY d.hash
      ORDER BY MIN(d.path)
    `;
    // Prepared inside the callback so a missing-column failure at prepare time
    // goes through the lazy column repair and is retried.
    return withLazyContentVectorMigration(db, () => db.prepare(sql).all(...params));
}
|
|
969
1108
|
function buildEmbeddingBatches(docs, maxDocsPerBatch, maxBatchBytes) {
|
|
970
1109
|
const batches = [];
|
|
@@ -1009,14 +1148,16 @@ function getEmbeddingDocsForBatch(db, batch) {
|
|
|
1009
1148
|
*/
|
|
1010
1149
|
export async function generateEmbeddings(store, options) {
|
|
1011
1150
|
const db = store.db;
|
|
1012
|
-
const
|
|
1151
|
+
const llm = getLlm(store);
|
|
1152
|
+
const model = options?.model ?? llm.embedModelName ?? DEFAULT_EMBED_MODEL;
|
|
1153
|
+
const fingerprint = getEmbeddingFingerprint(model);
|
|
1013
1154
|
const now = new Date().toISOString();
|
|
1014
1155
|
const { maxDocsPerBatch, maxBatchBytes } = resolveEmbedOptions(options);
|
|
1015
1156
|
const encoder = new TextEncoder();
|
|
1016
1157
|
if (options?.force) {
|
|
1017
|
-
clearAllEmbeddings(db);
|
|
1158
|
+
clearAllEmbeddings(db, options?.collection);
|
|
1018
1159
|
}
|
|
1019
|
-
const docsToEmbed = getPendingEmbeddingDocs(db);
|
|
1160
|
+
const docsToEmbed = getPendingEmbeddingDocs(db, options?.collection, model);
|
|
1020
1161
|
if (docsToEmbed.length === 0) {
|
|
1021
1162
|
return { docsProcessed: 0, chunksEmbedded: 0, errors: 0, durationMs: 0 };
|
|
1022
1163
|
}
|
|
@@ -1024,16 +1165,88 @@ export async function generateEmbeddings(store, options) {
|
|
|
1024
1165
|
const totalDocs = docsToEmbed.length;
|
|
1025
1166
|
const startTime = Date.now();
|
|
1026
1167
|
// Use store's LlamaCpp or global singleton, wrapped in a session
|
|
1027
|
-
const
|
|
1028
|
-
const embedModelUri = llm.embedModelName;
|
|
1168
|
+
const embedModelUri = model;
|
|
1029
1169
|
// Create a session manager for this llm instance
|
|
1030
1170
|
const result = await withLLMSessionForLlm(llm, async (session) => {
|
|
1031
1171
|
let chunksEmbedded = 0;
|
|
1032
|
-
let errors = 0;
|
|
1033
1172
|
let bytesProcessed = 0;
|
|
1034
1173
|
let totalChunks = 0;
|
|
1035
1174
|
let vectorTableInitialized = false;
|
|
1036
1175
|
const BATCH_SIZE = 32;
|
|
1176
|
+
const RETRY_AFTER_SUCCESSFUL_CHUNKS = 64;
|
|
1177
|
+
const MAX_RETRY_ATTEMPTS = 3;
|
|
1178
|
+
const failures = new Map();
|
|
1179
|
+
const retryQueue = new Map();
|
|
1180
|
+
let successesSinceRetry = 0;
|
|
1181
|
+
const failureList = () => [...failures.values()];
|
|
1182
|
+
const activeErrorCount = () => failures.size;
|
|
1183
|
+
const chunkKey = (chunk) => `${chunk.hash}:${chunk.seq}`;
|
|
1184
|
+
const reasonFromError = (error) => {
|
|
1185
|
+
const raw = error instanceof Error ? error.message : String(error);
|
|
1186
|
+
return raw.length > 180 ? `${raw.slice(0, 177)}...` : raw;
|
|
1187
|
+
};
|
|
1188
|
+
const recordFailure = (chunk, reason) => {
|
|
1189
|
+
const key = chunkKey(chunk);
|
|
1190
|
+
const previous = failures.get(key);
|
|
1191
|
+
failures.set(key, {
|
|
1192
|
+
path: chunk.path,
|
|
1193
|
+
hash: chunk.hash,
|
|
1194
|
+
seq: chunk.seq,
|
|
1195
|
+
attempts: (previous?.attempts ?? 0) + 1,
|
|
1196
|
+
reason,
|
|
1197
|
+
});
|
|
1198
|
+
retryQueue.set(key, chunk);
|
|
1199
|
+
};
|
|
1200
|
+
const clearFailure = (chunk) => {
|
|
1201
|
+
const key = chunkKey(chunk);
|
|
1202
|
+
failures.delete(key);
|
|
1203
|
+
retryQueue.delete(key);
|
|
1204
|
+
};
|
|
1205
|
+
const tryEmbedChunk = async (chunk) => {
|
|
1206
|
+
try {
|
|
1207
|
+
const text = formatDocForEmbedding(chunk.text, chunk.title, embedModelUri);
|
|
1208
|
+
const result = await session.embed(text, { model });
|
|
1209
|
+
if (!result) {
|
|
1210
|
+
recordFailure(chunk, "embedding returned no vector");
|
|
1211
|
+
return false;
|
|
1212
|
+
}
|
|
1213
|
+
insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now, chunk.expectedTotalChunks, fingerprint);
|
|
1214
|
+
chunksEmbedded++;
|
|
1215
|
+
successesSinceRetry++;
|
|
1216
|
+
clearFailure(chunk);
|
|
1217
|
+
return true;
|
|
1218
|
+
}
|
|
1219
|
+
catch (error) {
|
|
1220
|
+
recordFailure(chunk, reasonFromError(error));
|
|
1221
|
+
return false;
|
|
1222
|
+
}
|
|
1223
|
+
};
|
|
1224
|
+
const retryFailedChunks = async (force = false) => {
|
|
1225
|
+
if (!session.isValid || retryQueue.size === 0)
|
|
1226
|
+
return;
|
|
1227
|
+
if (!force && successesSinceRetry < RETRY_AFTER_SUCCESSFUL_CHUNKS)
|
|
1228
|
+
return;
|
|
1229
|
+
successesSinceRetry = 0;
|
|
1230
|
+
// Normal mode: one retry pass after enough unrelated chunks succeeded.
|
|
1231
|
+
// Force mode: we have run out of other chunks for this batch, so keep
|
|
1232
|
+
// retrying outstanding failures until they recover or hit the cap. The
|
|
1233
|
+
// cap prevents endless loops on permanently bad chunks.
|
|
1234
|
+
do {
|
|
1235
|
+
let retried = 0;
|
|
1236
|
+
for (const [key, chunk] of [...retryQueue]) {
|
|
1237
|
+
const failure = failures.get(key);
|
|
1238
|
+
if (!failure || failure.attempts >= MAX_RETRY_ATTEMPTS)
|
|
1239
|
+
continue;
|
|
1240
|
+
retried++;
|
|
1241
|
+
await tryEmbedChunk(chunk);
|
|
1242
|
+
}
|
|
1243
|
+
if (!force || retried === 0)
|
|
1244
|
+
break;
|
|
1245
|
+
} while (session.isValid && [...retryQueue].some(([key]) => {
|
|
1246
|
+
const failure = failures.get(key);
|
|
1247
|
+
return !!failure && failure.attempts < MAX_RETRY_ATTEMPTS;
|
|
1248
|
+
}));
|
|
1249
|
+
};
|
|
1037
1250
|
const batches = buildEmbeddingBatches(docsToEmbed, maxDocsPerBatch, maxBatchBytes);
|
|
1038
1251
|
for (const batchMeta of batches) {
|
|
1039
1252
|
// Abort early if session has been invalidated
|
|
@@ -1043,6 +1256,7 @@ export async function generateEmbeddings(store, options) {
|
|
|
1043
1256
|
}
|
|
1044
1257
|
const batchDocs = getEmbeddingDocsForBatch(db, batchMeta);
|
|
1045
1258
|
const batchChunks = [];
|
|
1259
|
+
const expectedChunksByHash = new Map();
|
|
1046
1260
|
const batchBytes = batchMeta.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0);
|
|
1047
1261
|
for (const doc of batchDocs) {
|
|
1048
1262
|
if (!doc.body.trim())
|
|
@@ -1052,19 +1266,22 @@ export async function generateEmbeddings(store, options) {
|
|
|
1052
1266
|
for (let seq = 0; seq < chunks.length; seq++) {
|
|
1053
1267
|
batchChunks.push({
|
|
1054
1268
|
hash: doc.hash,
|
|
1269
|
+
path: doc.path,
|
|
1055
1270
|
title,
|
|
1056
1271
|
text: chunks[seq].text,
|
|
1057
1272
|
seq,
|
|
1058
1273
|
pos: chunks[seq].pos,
|
|
1059
1274
|
tokens: chunks[seq].tokens,
|
|
1060
1275
|
bytes: encoder.encode(chunks[seq].text).length,
|
|
1276
|
+
expectedTotalChunks: chunks.length,
|
|
1061
1277
|
});
|
|
1062
1278
|
}
|
|
1279
|
+
expectedChunksByHash.set(doc.hash, chunks.length);
|
|
1063
1280
|
}
|
|
1064
1281
|
totalChunks += batchChunks.length;
|
|
1065
1282
|
if (batchChunks.length === 0) {
|
|
1066
1283
|
bytesProcessed += batchBytes;
|
|
1067
|
-
options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors });
|
|
1284
|
+
options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors: activeErrorCount(), failures: failureList() });
|
|
1068
1285
|
continue;
|
|
1069
1286
|
}
|
|
1070
1287
|
if (!vectorTableInitialized) {
|
|
@@ -1082,17 +1299,19 @@ export async function generateEmbeddings(store, options) {
|
|
|
1082
1299
|
for (let batchStart = 0; batchStart < batchChunks.length; batchStart += BATCH_SIZE) {
|
|
1083
1300
|
// Abort early if session has been invalidated (e.g. max duration exceeded)
|
|
1084
1301
|
if (!session.isValid) {
|
|
1085
|
-
const
|
|
1086
|
-
|
|
1087
|
-
|
|
1302
|
+
const remainingChunks = batchChunks.slice(batchStart);
|
|
1303
|
+
for (const chunk of remainingChunks)
|
|
1304
|
+
recordFailure(chunk, "LLM session expired before embedding chunk");
|
|
1305
|
+
console.warn(`⚠ Session expired — skipping ${remainingChunks.length} remaining chunks`);
|
|
1088
1306
|
break;
|
|
1089
1307
|
}
|
|
1090
|
-
// Abort early if error rate is too high (>80% of
|
|
1091
|
-
const processed = chunksEmbedded +
|
|
1092
|
-
if (processed >= BATCH_SIZE &&
|
|
1093
|
-
const
|
|
1094
|
-
|
|
1095
|
-
|
|
1308
|
+
// Abort early if active error rate is too high (>80% of attempted chunks failed)
|
|
1309
|
+
const processed = chunksEmbedded + activeErrorCount();
|
|
1310
|
+
if (processed >= BATCH_SIZE && activeErrorCount() > processed * 0.8) {
|
|
1311
|
+
const remainingChunks = batchChunks.slice(batchStart);
|
|
1312
|
+
for (const chunk of remainingChunks)
|
|
1313
|
+
recordFailure(chunk, "embedding aborted because error rate was too high");
|
|
1314
|
+
console.warn(`⚠ Error rate too high (${activeErrorCount()}/${processed}) — aborting embedding`);
|
|
1096
1315
|
break;
|
|
1097
1316
|
}
|
|
1098
1317
|
const batchEnd = Math.min(batchStart + BATCH_SIZE, batchChunks.length);
|
|
@@ -1104,39 +1323,33 @@ export async function generateEmbeddings(store, options) {
|
|
|
1104
1323
|
const chunk = chunkBatch[i];
|
|
1105
1324
|
const embedding = embeddings[i];
|
|
1106
1325
|
if (embedding) {
|
|
1107
|
-
insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now);
|
|
1326
|
+
insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now, chunk.expectedTotalChunks, fingerprint);
|
|
1108
1327
|
chunksEmbedded++;
|
|
1328
|
+
successesSinceRetry++;
|
|
1329
|
+
clearFailure(chunk);
|
|
1109
1330
|
}
|
|
1110
1331
|
else {
|
|
1111
|
-
|
|
1332
|
+
recordFailure(chunk, "batch embedding returned no vector");
|
|
1112
1333
|
}
|
|
1113
1334
|
batchChunkBytesProcessed += chunk.bytes;
|
|
1114
1335
|
}
|
|
1336
|
+
await retryFailedChunks();
|
|
1115
1337
|
}
|
|
1116
|
-
catch {
|
|
1117
|
-
// Batch failed — try individual embeddings as fallback
|
|
1118
|
-
//
|
|
1338
|
+
catch (error) {
|
|
1339
|
+
// Batch failed — try individual embeddings as fallback. If an
|
|
1340
|
+
// individual retry succeeds, any prior failure for that chunk is
|
|
1341
|
+
// cleared, so the visible error count reflects outstanding failures.
|
|
1342
|
+
const batchReason = reasonFromError(error);
|
|
1119
1343
|
if (!session.isValid) {
|
|
1120
|
-
|
|
1344
|
+
for (const chunk of chunkBatch)
|
|
1345
|
+
recordFailure(chunk, `batch failed and session expired: ${batchReason}`);
|
|
1121
1346
|
batchChunkBytesProcessed += chunkBatch.reduce((sum, c) => sum + c.bytes, 0);
|
|
1122
1347
|
}
|
|
1123
1348
|
else {
|
|
1124
1349
|
for (const chunk of chunkBatch) {
|
|
1125
|
-
|
|
1126
|
-
const text = formatDocForEmbedding(chunk.text, chunk.title, embedModelUri);
|
|
1127
|
-
const result = await session.embed(text, { model });
|
|
1128
|
-
if (result) {
|
|
1129
|
-
insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now);
|
|
1130
|
-
chunksEmbedded++;
|
|
1131
|
-
}
|
|
1132
|
-
else {
|
|
1133
|
-
errors++;
|
|
1134
|
-
}
|
|
1135
|
-
}
|
|
1136
|
-
catch {
|
|
1137
|
-
errors++;
|
|
1138
|
-
}
|
|
1350
|
+
await tryEmbedChunk(chunk);
|
|
1139
1351
|
batchChunkBytesProcessed += chunk.bytes;
|
|
1352
|
+
await retryFailedChunks();
|
|
1140
1353
|
}
|
|
1141
1354
|
}
|
|
1142
1355
|
}
|
|
@@ -1148,18 +1361,25 @@ export async function generateEmbeddings(store, options) {
|
|
|
1148
1361
|
totalChunks,
|
|
1149
1362
|
bytesProcessed: bytesProcessed + proportionalBytes,
|
|
1150
1363
|
totalBytes,
|
|
1151
|
-
errors,
|
|
1364
|
+
errors: activeErrorCount(),
|
|
1365
|
+
failures: failureList(),
|
|
1152
1366
|
});
|
|
1153
1367
|
}
|
|
1368
|
+
await retryFailedChunks(true);
|
|
1369
|
+
const removedPartialChunks = removeIncompleteEmbeddings(db, expectedChunksByHash, model);
|
|
1370
|
+
if (removedPartialChunks > 0) {
|
|
1371
|
+
chunksEmbedded = Math.max(0, chunksEmbedded - removedPartialChunks);
|
|
1372
|
+
}
|
|
1154
1373
|
bytesProcessed += batchBytes;
|
|
1155
|
-
options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors });
|
|
1374
|
+
options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors: activeErrorCount(), failures: failureList() });
|
|
1156
1375
|
}
|
|
1157
|
-
return { chunksEmbedded, errors };
|
|
1376
|
+
return { chunksEmbedded, errors: activeErrorCount(), failures: failureList() };
|
|
1158
1377
|
}, { maxDuration: 30 * 60 * 1000, name: 'generateEmbeddings' });
|
|
1159
1378
|
return {
|
|
1160
1379
|
docsProcessed: totalDocs,
|
|
1161
1380
|
chunksEmbedded: result.chunksEmbedded,
|
|
1162
1381
|
errors: result.errors,
|
|
1382
|
+
failures: result.failures,
|
|
1163
1383
|
durationMs: Date.now() - startTime,
|
|
1164
1384
|
};
|
|
1165
1385
|
}
|
|
@@ -1180,9 +1400,9 @@ export function createStore(dbPath) {
|
|
|
1180
1400
|
close: () => db.close(),
|
|
1181
1401
|
ensureVecTable: (dimensions) => ensureVecTableInternal(db, dimensions),
|
|
1182
1402
|
// Index health
|
|
1183
|
-
getHashesNeedingEmbedding: () => getHashesNeedingEmbedding(db),
|
|
1184
|
-
getIndexHealth: () => getIndexHealth(db),
|
|
1185
|
-
getStatus: () => getStatus(db),
|
|
1403
|
+
getHashesNeedingEmbedding: (model) => getHashesNeedingEmbedding(db, undefined, model ?? store.llm?.embedModelName ?? DEFAULT_EMBED_MODEL),
|
|
1404
|
+
getIndexHealth: (model) => getIndexHealth(db, model ?? store.llm?.embedModelName ?? DEFAULT_EMBED_MODEL),
|
|
1405
|
+
getStatus: (model) => getStatus(db, model ?? store.llm?.embedModelName ?? DEFAULT_EMBED_MODEL),
|
|
1186
1406
|
// Caching
|
|
1187
1407
|
getCacheKey,
|
|
1188
1408
|
getCachedResult: (cacheKey) => getCachedResult(db, cacheKey),
|
|
@@ -1210,8 +1430,8 @@ export function createStore(dbPath) {
|
|
|
1210
1430
|
searchFTS: (query, limit, collectionName) => searchFTS(db, query, limit, collectionName),
|
|
1211
1431
|
searchVec: (query, model, limit, collectionName, session, precomputedEmbedding) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding),
|
|
1212
1432
|
// Query expansion & reranking
|
|
1213
|
-
expandQuery: (query, model, intent) => expandQuery(query, model, db, intent, store.llm),
|
|
1214
|
-
rerank: (query, documents, model, intent) => rerank(query, documents, model, db, intent, store.llm),
|
|
1433
|
+
expandQuery: (query, model, intent) => expandQuery(query, model ?? store.llm?.generateModelName ?? DEFAULT_QUERY_MODEL, db, intent, store.llm),
|
|
1434
|
+
rerank: (query, documents, model, intent) => rerank(query, documents, model ?? store.llm?.rerankModelName ?? DEFAULT_RERANK_MODEL, db, intent, store.llm),
|
|
1215
1435
|
// Document retrieval
|
|
1216
1436
|
findDocument: (filename, options) => findDocument(db, filename, options),
|
|
1217
1437
|
getDocumentBody: (doc, fromLine, maxLines) => getDocumentBody(db, doc, fromLine, maxLines),
|
|
@@ -1224,6 +1444,7 @@ export function createStore(dbPath) {
|
|
|
1224
1444
|
insertContent: (hash, content, createdAt) => insertContent(db, hash, content, createdAt),
|
|
1225
1445
|
insertDocument: (collectionName, path, title, hash, createdAt, modifiedAt) => insertDocument(db, collectionName, path, title, hash, createdAt, modifiedAt),
|
|
1226
1446
|
findActiveDocument: (collectionName, path) => findActiveDocument(db, collectionName, path),
|
|
1447
|
+
findOrMigrateLegacyDocument: (collectionName, path) => findOrMigrateLegacyDocument(db, collectionName, path),
|
|
1227
1448
|
updateDocumentTitle: (documentId, title, modifiedAt) => updateDocumentTitle(db, documentId, title, modifiedAt),
|
|
1228
1449
|
updateDocument: (documentId, title, hash, modifiedAt) => updateDocument(db, documentId, title, hash, modifiedAt),
|
|
1229
1450
|
deactivateDocument: (collectionName, path) => deactivateDocument(db, collectionName, path),
|
|
@@ -1231,7 +1452,7 @@ export function createStore(dbPath) {
|
|
|
1231
1452
|
// Vector/embedding operations
|
|
1232
1453
|
getHashesForEmbedding: () => getHashesForEmbedding(db),
|
|
1233
1454
|
clearAllEmbeddings: () => clearAllEmbeddings(db),
|
|
1234
|
-
insertEmbedding: (hash, seq, pos, embedding, model, embeddedAt) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt),
|
|
1455
|
+
insertEmbedding: (hash, seq, pos, embedding, model, embeddedAt, totalChunks, fingerprint) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt, totalChunks, fingerprint),
|
|
1235
1456
|
};
|
|
1236
1457
|
return store;
|
|
1237
1458
|
}
|
|
@@ -1244,11 +1465,11 @@ export function getDocid(hash) {
|
|
|
1244
1465
|
/**
|
|
1245
1466
|
* Handelize a filename to be more token-friendly.
|
|
1246
1467
|
* - Convert triple underscore `___` to `/` (folder separator)
|
|
1247
|
-
* - Convert to lowercase
|
|
1248
1468
|
* - Replace sequences of non-word chars (except /) with single dash
|
|
1249
1469
|
* - Remove leading/trailing dashes from path segments
|
|
1250
1470
|
* - Preserve folder structure (a/b/c/d.md stays structured)
|
|
1251
1471
|
* - Preserve file extension
|
|
1472
|
+
* - Preserve original case (important for case-sensitive filesystems)
|
|
1252
1473
|
*/
|
|
1253
1474
|
/** Replace emoji/symbol codepoints with their hex representation (e.g. 🐘 → 1f418) */
|
|
1254
1475
|
function emojiToHex(str) {
|
|
@@ -1273,7 +1494,6 @@ export function handelize(path) {
|
|
|
1273
1494
|
}
|
|
1274
1495
|
const result = path
|
|
1275
1496
|
.replace(/___/g, '/') // Triple underscore becomes folder separator
|
|
1276
|
-
.toLowerCase()
|
|
1277
1497
|
.split('/')
|
|
1278
1498
|
.map((segment, idx, arr) => {
|
|
1279
1499
|
const isLastSegment = idx === arr.length - 1;
|
|
@@ -1306,17 +1526,85 @@ export function handelize(path) {
|
|
|
1306
1526
|
// =============================================================================
|
|
1307
1527
|
// Index health
|
|
1308
1528
|
// =============================================================================
|
|
1309
|
-
export function getHashesNeedingEmbedding(db) {
|
|
1310
|
-
const
|
|
1311
|
-
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1529
|
+
export function getHashesNeedingEmbedding(db, collection, model = DEFAULT_EMBED_MODEL) {
|
|
1530
|
+
const collectionFilter = collection ? `AND d.collection = ?` : ``;
|
|
1531
|
+
const fingerprint = getEmbeddingFingerprint(model);
|
|
1532
|
+
return withLazyContentVectorMigration(db, () => {
|
|
1533
|
+
const stmt = db.prepare(`
|
|
1534
|
+
SELECT COUNT(DISTINCT d.hash) as count
|
|
1535
|
+
FROM documents d
|
|
1536
|
+
LEFT JOIN (
|
|
1537
|
+
SELECT hash, model, COUNT(*) AS chunk_count, MAX(total_chunks) AS expected_chunks
|
|
1538
|
+
FROM content_vectors
|
|
1539
|
+
WHERE model = ? AND embed_fingerprint = ?
|
|
1540
|
+
GROUP BY hash, model, embed_fingerprint
|
|
1541
|
+
) v ON d.hash = v.hash
|
|
1542
|
+
WHERE d.active = 1
|
|
1543
|
+
AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
|
|
1544
|
+
${collectionFilter}
|
|
1545
|
+
`);
|
|
1546
|
+
const result = (collection ? stmt.get(model, fingerprint, collection) : stmt.get(model, fingerprint));
|
|
1547
|
+
return result.count;
|
|
1548
|
+
});
|
|
1317
1549
|
}
|
|
1318
|
-
export function
|
|
1319
|
-
const
|
|
1550
|
+
export async function maybeAdoptLegacyEmbeddingFingerprint(store, model = DEFAULT_EMBED_MODEL) {
|
|
1551
|
+
const db = store.db;
|
|
1552
|
+
const fingerprint = getEmbeddingFingerprint(model);
|
|
1553
|
+
const legacyCount = withLazyContentVectorMigration(db, () => {
|
|
1554
|
+
const row = db.prepare(`SELECT COUNT(DISTINCT hash) AS count FROM content_vectors WHERE model = ? AND embed_fingerprint = ''`).get(model);
|
|
1555
|
+
return row.count;
|
|
1556
|
+
});
|
|
1557
|
+
if (legacyCount === 0) {
|
|
1558
|
+
return { checked: false, adopted: 0, reason: "no legacy empty-fingerprint embeddings" };
|
|
1559
|
+
}
|
|
1560
|
+
const sample = withLazyContentVectorMigration(db, () => db.prepare(`
|
|
1561
|
+
SELECT cv.hash, cv.seq, cv.pos, cv.total_chunks, c.doc AS body, MIN(d.path) AS path
|
|
1562
|
+
FROM content_vectors cv
|
|
1563
|
+
JOIN documents d ON d.hash = cv.hash AND d.active = 1
|
|
1564
|
+
JOIN content c ON c.hash = cv.hash
|
|
1565
|
+
WHERE cv.model = ? AND cv.embed_fingerprint = ''
|
|
1566
|
+
GROUP BY cv.hash, cv.seq, cv.pos, cv.total_chunks, c.doc
|
|
1567
|
+
ORDER BY cv.hash, cv.seq
|
|
1568
|
+
LIMIT 1
|
|
1569
|
+
`).get(model));
|
|
1570
|
+
if (!sample) {
|
|
1571
|
+
return { checked: false, adopted: 0, reason: `${legacyCount} legacy docs have no active sample` };
|
|
1572
|
+
}
|
|
1573
|
+
const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
|
|
1574
|
+
if (!tableExists) {
|
|
1575
|
+
return { checked: false, adopted: 0, reason: "vectors_vec table is missing" };
|
|
1576
|
+
}
|
|
1577
|
+
const expectedHashSeq = `${sample.hash}_${sample.seq}`;
|
|
1578
|
+
const title = extractTitle(sample.body, sample.path);
|
|
1579
|
+
const llm = getLlm(store);
|
|
1580
|
+
return await withLLMSessionForLlm(llm, async (session) => {
|
|
1581
|
+
const chunks = await chunkDocumentByTokens(sample.body, undefined, undefined, undefined, sample.path, undefined, session.signal);
|
|
1582
|
+
const chunk = chunks[sample.seq];
|
|
1583
|
+
if (!chunk) {
|
|
1584
|
+
return { checked: true, adopted: 0, reason: `sample chunk ${expectedHashSeq} no longer exists` };
|
|
1585
|
+
}
|
|
1586
|
+
const result = await session.embed(formatDocForEmbedding(chunk.text, title, model), { model });
|
|
1587
|
+
if (!result) {
|
|
1588
|
+
return { checked: true, adopted: 0, reason: "failed to embed legacy sample" };
|
|
1589
|
+
}
|
|
1590
|
+
const nearest = db.prepare(`
|
|
1591
|
+
SELECT hash_seq, distance
|
|
1592
|
+
FROM vectors_vec
|
|
1593
|
+
WHERE embedding MATCH ? AND k = 1
|
|
1594
|
+
`).get(new Float32Array(result.embedding));
|
|
1595
|
+
if (!nearest) {
|
|
1596
|
+
return { checked: true, adopted: 0, reason: "legacy sample vector not found" };
|
|
1597
|
+
}
|
|
1598
|
+
const threshold = 0.0001;
|
|
1599
|
+
if (nearest.hash_seq !== expectedHashSeq || nearest.distance > threshold) {
|
|
1600
|
+
return { checked: true, adopted: 0, reason: `legacy sample differs from current fingerprint (nearest ${nearest.hash_seq}, distance ${nearest.distance.toFixed(6)})` };
|
|
1601
|
+
}
|
|
1602
|
+
const update = withLazyContentVectorMigration(db, () => db.prepare(`UPDATE content_vectors SET embed_fingerprint = ? WHERE model = ? AND embed_fingerprint = ''`).run(fingerprint, model));
|
|
1603
|
+
return { checked: true, adopted: update.changes, reason: `sample ${expectedHashSeq} matched current fingerprint at distance ${nearest.distance.toFixed(6)}` };
|
|
1604
|
+
});
|
|
1605
|
+
}
|
|
1606
|
+
export function getIndexHealth(db, model = DEFAULT_EMBED_MODEL) {
|
|
1607
|
+
const needsEmbedding = getHashesNeedingEmbedding(db, undefined, model);
|
|
1320
1608
|
const totalDocs = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get().count;
|
|
1321
1609
|
const mostRecent = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get();
|
|
1322
1610
|
let daysStale = null;
|
|
@@ -1369,13 +1657,15 @@ export function deleteInactiveDocuments(db) {
|
|
|
1369
1657
|
return result.changes;
|
|
1370
1658
|
}
|
|
1371
1659
|
/**
|
|
1372
|
-
* Remove orphaned content hashes that are not referenced by any
|
|
1660
|
+
* Remove orphaned content hashes that are not referenced by any document.
|
|
1661
|
+
* Inactive documents are soft-deleted tombstones, so their content rows must
|
|
1662
|
+
* remain referenced until deleteInactiveDocuments() hard-deletes them.
|
|
1373
1663
|
* Returns the number of orphaned content hashes deleted.
|
|
1374
1664
|
*/
|
|
1375
1665
|
export function cleanupOrphanedContent(db) {
|
|
1376
1666
|
const result = db.prepare(`
|
|
1377
1667
|
DELETE FROM content
|
|
1378
|
-
WHERE hash NOT IN (SELECT DISTINCT hash FROM documents
|
|
1668
|
+
WHERE hash NOT IN (SELECT DISTINCT hash FROM documents)
|
|
1379
1669
|
`).run();
|
|
1380
1670
|
return result.changes;
|
|
1381
1671
|
}
|
|
@@ -1400,32 +1690,34 @@ export function cleanupOrphanedVectors(db) {
|
|
|
1400
1690
|
catch {
|
|
1401
1691
|
return 0;
|
|
1402
1692
|
}
|
|
1403
|
-
|
|
1404
|
-
|
|
1405
|
-
|
|
1406
|
-
|
|
1407
|
-
SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
|
|
1408
|
-
)
|
|
1409
|
-
`).get();
|
|
1410
|
-
if (countResult.c === 0) {
|
|
1411
|
-
return 0;
|
|
1412
|
-
}
|
|
1413
|
-
// Delete from vectors_vec first
|
|
1414
|
-
db.exec(`
|
|
1415
|
-
DELETE FROM vectors_vec WHERE hash_seq IN (
|
|
1416
|
-
SELECT cv.hash || '_' || cv.seq FROM content_vectors cv
|
|
1693
|
+
return withLazyContentVectorMigration(db, () => {
|
|
1694
|
+
// Count orphaned vectors first
|
|
1695
|
+
const countResult = db.prepare(`
|
|
1696
|
+
SELECT COUNT(*) as c FROM content_vectors cv
|
|
1417
1697
|
WHERE NOT EXISTS (
|
|
1418
1698
|
SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
|
|
1419
1699
|
)
|
|
1420
|
-
)
|
|
1421
|
-
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
|
|
1425
|
-
|
|
1426
|
-
|
|
1427
|
-
|
|
1428
|
-
|
|
1700
|
+
`).get();
|
|
1701
|
+
if (countResult.c === 0) {
|
|
1702
|
+
return 0;
|
|
1703
|
+
}
|
|
1704
|
+
// Delete from vectors_vec first
|
|
1705
|
+
db.exec(`
|
|
1706
|
+
DELETE FROM vectors_vec WHERE hash_seq IN (
|
|
1707
|
+
SELECT cv.hash || '_' || cv.seq FROM content_vectors cv
|
|
1708
|
+
WHERE NOT EXISTS (
|
|
1709
|
+
SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
|
|
1710
|
+
)
|
|
1711
|
+
)
|
|
1712
|
+
`);
|
|
1713
|
+
// Delete from content_vectors
|
|
1714
|
+
db.exec(`
|
|
1715
|
+
DELETE FROM content_vectors WHERE hash NOT IN (
|
|
1716
|
+
SELECT hash FROM documents WHERE active = 1
|
|
1717
|
+
)
|
|
1718
|
+
`);
|
|
1719
|
+
return countResult.c;
|
|
1720
|
+
});
|
|
1429
1721
|
}
|
|
1430
1722
|
/**
|
|
1431
1723
|
* Run VACUUM to reclaim unused space in the database.
|
|
@@ -1487,6 +1779,21 @@ export function insertContent(db, hash, content, createdAt) {
|
|
|
1487
1779
|
db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`)
|
|
1488
1780
|
.run(hash, content, createdAt);
|
|
1489
1781
|
}
|
|
1782
|
+
function rebuildDocumentFTS(db, documentId) {
|
|
1783
|
+
const row = db.prepare(`
|
|
1784
|
+
SELECT d.id, d.collection, d.path, d.title, content.doc as body
|
|
1785
|
+
FROM documents d
|
|
1786
|
+
JOIN content ON content.hash = d.hash
|
|
1787
|
+
WHERE d.id = ? AND d.active = 1
|
|
1788
|
+
`).get(documentId);
|
|
1789
|
+
db.prepare(`DELETE FROM documents_fts WHERE rowid = ?`).run(documentId);
|
|
1790
|
+
if (!row)
|
|
1791
|
+
return;
|
|
1792
|
+
db.prepare(`
|
|
1793
|
+
INSERT INTO documents_fts(rowid, filepath, title, body)
|
|
1794
|
+
VALUES (?, ?, ?, ?)
|
|
1795
|
+
`).run(row.id, normalizeCjkForFTS(`${row.collection}/${row.path}`), normalizeCjkForFTS(row.title), normalizeCjkForFTS(row.body));
|
|
1796
|
+
}
|
|
1490
1797
|
/**
|
|
1491
1798
|
* Insert a new document into the documents table.
|
|
1492
1799
|
*/
|
|
@@ -1500,6 +1807,9 @@ export function insertDocument(db, collectionName, path, title, hash, createdAt,
|
|
|
1500
1807
|
modified_at = excluded.modified_at,
|
|
1501
1808
|
active = 1
|
|
1502
1809
|
`).run(collectionName, path, title, hash, createdAt, modifiedAt);
|
|
1810
|
+
const row = db.prepare(`SELECT id FROM documents WHERE collection = ? AND path = ?`).get(collectionName, path);
|
|
1811
|
+
if (row)
|
|
1812
|
+
rebuildDocumentFTS(db, row.id);
|
|
1503
1813
|
}
|
|
1504
1814
|
/**
|
|
1505
1815
|
* Find an active document by collection name and path.
|
|
@@ -1511,12 +1821,48 @@ export function findActiveDocument(db, collectionName, path) {
|
|
|
1511
1821
|
`).get(collectionName, path);
|
|
1512
1822
|
return row ?? null;
|
|
1513
1823
|
}
|
|
1824
|
+
/**
|
|
1825
|
+
* Find an active document, falling back to a case-insensitive path match.
|
|
1826
|
+
* If found under a different casing, renames it in-place and rebuilds the
|
|
1827
|
+
* FTS entry. Embeddings are keyed by content hash, so the rename is
|
|
1828
|
+
* safe — no re-embedding required.
|
|
1829
|
+
*
|
|
1830
|
+
* @internal Used by reindexCollection and indexFiles during qmd update.
|
|
1831
|
+
* Returns null if the document does not exist under either path.
|
|
1832
|
+
*/
|
|
1833
|
+
export function findOrMigrateLegacyDocument(db, collectionName, path) {
|
|
1834
|
+
const existing = findActiveDocument(db, collectionName, path);
|
|
1835
|
+
if (existing)
|
|
1836
|
+
return existing;
|
|
1837
|
+
const legacy = db.prepare(`
|
|
1838
|
+
SELECT id, hash, title FROM documents
|
|
1839
|
+
WHERE collection = ? AND path COLLATE NOCASE = ? AND active = 1
|
|
1840
|
+
ORDER BY id
|
|
1841
|
+
LIMIT 1
|
|
1842
|
+
`).get(collectionName, path);
|
|
1843
|
+
if (!legacy)
|
|
1844
|
+
return null;
|
|
1845
|
+
// Wrap rename + FTS rebuild in a transaction for atomicity.
|
|
1846
|
+
const migrate = db.transaction(() => {
|
|
1847
|
+
// Use OR IGNORE so a UNIQUE conflict (e.g. both "readme.md" and
|
|
1848
|
+
// "README.md" already exist) is a no-op rather than crashing.
|
|
1849
|
+
const result = db.prepare(`UPDATE OR IGNORE documents SET path = ? WHERE id = ? AND active = 1`).run(path, legacy.id);
|
|
1850
|
+
if (result.changes === 0)
|
|
1851
|
+
return false;
|
|
1852
|
+
rebuildDocumentFTS(db, legacy.id);
|
|
1853
|
+
return true;
|
|
1854
|
+
});
|
|
1855
|
+
if (!migrate())
|
|
1856
|
+
return null;
|
|
1857
|
+
return findActiveDocument(db, collectionName, path);
|
|
1858
|
+
}
|
|
1514
1859
|
/**
|
|
1515
1860
|
* Update the title and modified_at timestamp for a document.
|
|
1516
1861
|
*/
|
|
1517
1862
|
export function updateDocumentTitle(db, documentId, title, modifiedAt) {
|
|
1518
1863
|
db.prepare(`UPDATE documents SET title = ?, modified_at = ? WHERE id = ?`)
|
|
1519
1864
|
.run(title, modifiedAt, documentId);
|
|
1865
|
+
rebuildDocumentFTS(db, documentId);
|
|
1520
1866
|
}
|
|
1521
1867
|
/**
|
|
1522
1868
|
* Update an existing document's hash, title, and modified_at timestamp.
|
|
@@ -1525,6 +1871,7 @@ export function updateDocumentTitle(db, documentId, title, modifiedAt) {
|
|
|
1525
1871
|
export function updateDocument(db, documentId, title, hash, modifiedAt) {
|
|
1526
1872
|
db.prepare(`UPDATE documents SET title = ?, hash = ?, modified_at = ? WHERE id = ?`)
|
|
1527
1873
|
.run(title, hash, modifiedAt, documentId);
|
|
1874
|
+
rebuildDocumentFTS(db, documentId);
|
|
1528
1875
|
}
|
|
1529
1876
|
/**
|
|
1530
1877
|
* Deactivate a document (mark as inactive but don't delete).
|
|
@@ -1593,31 +1940,54 @@ export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKE
|
|
|
1593
1940
|
let charChunks = await chunkDocumentAsync(content, maxChars, overlapChars, windowChars, filepath, chunkStrategy);
|
|
1594
1941
|
// Tokenize and split any chunks that still exceed limit
|
|
1595
1942
|
const results = [];
|
|
1596
|
-
|
|
1597
|
-
|
|
1943
|
+
const clampOverlapChars = (value, maxChars) => {
|
|
1944
|
+
if (maxChars <= 1)
|
|
1945
|
+
return 0;
|
|
1946
|
+
return Math.max(0, Math.min(maxChars - 1, Math.floor(value)));
|
|
1947
|
+
};
|
|
1948
|
+
const pushChunkWithinTokenLimit = async (text, pos) => {
|
|
1598
1949
|
if (signal?.aborted)
|
|
1599
|
-
|
|
1600
|
-
const tokens = await llm.tokenize(
|
|
1601
|
-
if (tokens.length <= maxTokens) {
|
|
1602
|
-
results.push({ text
|
|
1950
|
+
return;
|
|
1951
|
+
const tokens = await llm.tokenize(text);
|
|
1952
|
+
if (tokens.length <= maxTokens || text.length <= 1) {
|
|
1953
|
+
results.push({ text, pos, tokens: tokens.length });
|
|
1954
|
+
return;
|
|
1603
1955
|
}
|
|
1604
|
-
|
|
1605
|
-
|
|
1606
|
-
|
|
1607
|
-
|
|
1608
|
-
|
|
1609
|
-
|
|
1610
|
-
|
|
1611
|
-
|
|
1612
|
-
|
|
1613
|
-
|
|
1614
|
-
|
|
1615
|
-
|
|
1616
|
-
|
|
1617
|
-
|
|
1618
|
-
|
|
1619
|
-
|
|
1956
|
+
const actualCharsPerToken = text.length / tokens.length;
|
|
1957
|
+
let safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95);
|
|
1958
|
+
if (!Number.isFinite(safeMaxChars) || safeMaxChars < 1) {
|
|
1959
|
+
safeMaxChars = Math.floor(text.length / 2);
|
|
1960
|
+
}
|
|
1961
|
+
safeMaxChars = Math.max(1, Math.min(text.length - 1, safeMaxChars));
|
|
1962
|
+
let nextOverlapChars = clampOverlapChars(overlapChars * actualCharsPerToken / 2, safeMaxChars);
|
|
1963
|
+
let nextWindowChars = Math.max(0, Math.floor(windowChars * actualCharsPerToken / 2));
|
|
1964
|
+
let subChunks = chunkDocument(text, safeMaxChars, nextOverlapChars, nextWindowChars);
|
|
1965
|
+
// Pathological single-line blobs can produce no meaningful breakpoint progress.
|
|
1966
|
+
// Fall back to a simple half split so every recursion step strictly shrinks.
|
|
1967
|
+
if (subChunks.length <= 1
|
|
1968
|
+
|| subChunks[0]?.text.length === text.length) {
|
|
1969
|
+
safeMaxChars = Math.max(1, Math.floor(text.length / 2));
|
|
1970
|
+
nextOverlapChars = 0;
|
|
1971
|
+
nextWindowChars = 0;
|
|
1972
|
+
subChunks = chunkDocument(text, safeMaxChars, nextOverlapChars, nextWindowChars);
|
|
1973
|
+
}
|
|
1974
|
+
if (subChunks.length <= 1
|
|
1975
|
+
|| subChunks[0]?.text.length === text.length) {
|
|
1976
|
+
const fallbackTokens = tokens.slice(0, Math.max(1, maxTokens));
|
|
1977
|
+
const truncatedText = await llm.detokenize(fallbackTokens);
|
|
1978
|
+
results.push({
|
|
1979
|
+
text: truncatedText,
|
|
1980
|
+
pos,
|
|
1981
|
+
tokens: fallbackTokens.length,
|
|
1982
|
+
});
|
|
1983
|
+
return;
|
|
1620
1984
|
}
|
|
1985
|
+
for (const subChunk of subChunks) {
|
|
1986
|
+
await pushChunkWithinTokenLimit(text.slice(subChunk.pos, subChunk.pos + subChunk.text.length), pos + subChunk.pos);
|
|
1987
|
+
}
|
|
1988
|
+
};
|
|
1989
|
+
for (const chunk of charChunks) {
|
|
1990
|
+
await pushChunkWithinTokenLimit(chunk.text, chunk.pos);
|
|
1621
1991
|
}
|
|
1622
1992
|
return results;
|
|
1623
1993
|
}
|
|
@@ -2135,7 +2505,7 @@ function buildFTS5Query(query) {
|
|
|
2135
2505
|
const phrase = s.slice(start, i).trim();
|
|
2136
2506
|
i++; // skip closing quote
|
|
2137
2507
|
if (phrase.length > 0) {
|
|
2138
|
-
const sanitized = phrase
|
|
2508
|
+
const sanitized = sanitizeFTS5Phrase(phrase);
|
|
2139
2509
|
if (sanitized) {
|
|
2140
2510
|
const ftsPhrase = `"${sanitized}"`; // Exact phrase, no prefix match
|
|
2141
2511
|
if (negated) {
|
|
@@ -2167,6 +2537,18 @@ function buildFTS5Query(query) {
|
|
|
2167
2537
|
}
|
|
2168
2538
|
}
|
|
2169
2539
|
}
|
|
2540
|
+
else if (containsCjk(term)) {
|
|
2541
|
+
const sanitized = sanitizeFTS5Phrase(term);
|
|
2542
|
+
if (sanitized) {
|
|
2543
|
+
const ftsPhrase = `"${sanitized}"`; // CJK phrase over character tokens
|
|
2544
|
+
if (negated) {
|
|
2545
|
+
negative.push(ftsPhrase);
|
|
2546
|
+
}
|
|
2547
|
+
else {
|
|
2548
|
+
positive.push(ftsPhrase);
|
|
2549
|
+
}
|
|
2550
|
+
}
|
|
2551
|
+
}
|
|
2170
2552
|
else {
|
|
2171
2553
|
const sanitized = sanitizeFTS5Term(term);
|
|
2172
2554
|
if (sanitized) {
|
|
@@ -2199,8 +2581,9 @@ function buildFTS5Query(query) {
|
|
|
2199
2581
|
* Returns error message if invalid, null if valid.
|
|
2200
2582
|
*/
|
|
2201
2583
|
export function validateSemanticQuery(query) {
|
|
2202
|
-
// Check for negation syntax
|
|
2203
|
-
|
|
2584
|
+
// Check for negation syntax — only at token boundaries (start of string or after whitespace).
|
|
2585
|
+
// Hyphenated words like "real-time" or "write-ahead" must not trigger this.
|
|
2586
|
+
if (/(^|\s)-[\w"]/.test(query)) {
|
|
2204
2587
|
return 'Negation (-term) is not supported in vec/hyde queries. Use lex for exclusions.';
|
|
2205
2588
|
}
|
|
2206
2589
|
return null;
|
|
@@ -2326,7 +2709,7 @@ export async function searchVec(db, query, model, limit = 20, collectionName, se
|
|
|
2326
2709
|
docSql += ` AND d.collection = ?`;
|
|
2327
2710
|
params.push(collectionName);
|
|
2328
2711
|
}
|
|
2329
|
-
const docRows = db.prepare(docSql).all(...params);
|
|
2712
|
+
const docRows = withLazyContentVectorMigration(db, () => db.prepare(docSql).all(...params));
|
|
2330
2713
|
// Combine with distances and dedupe by filepath
|
|
2331
2714
|
const seen = new Map();
|
|
2332
2715
|
for (const row of docRows) {
|
|
@@ -2373,23 +2756,82 @@ async function getEmbedding(text, model, isQuery, session, llmOverride) {
|
|
|
2373
2756
|
* Get all unique content hashes that need embeddings (from active documents).
|
|
2374
2757
|
* Returns hash, document body, and a sample path for display purposes.
|
|
2375
2758
|
*/
|
|
2376
|
-
export function getHashesForEmbedding(db) {
|
|
2377
|
-
|
|
2759
|
+
export function getHashesForEmbedding(db, model = DEFAULT_EMBED_MODEL) {
|
|
2760
|
+
const fingerprint = getEmbeddingFingerprint(model);
|
|
2761
|
+
return withLazyContentVectorMigration(db, () => db.prepare(`
|
|
2378
2762
|
SELECT d.hash, c.doc as body, MIN(d.path) as path
|
|
2379
2763
|
FROM documents d
|
|
2380
2764
|
JOIN content c ON d.hash = c.hash
|
|
2381
|
-
LEFT JOIN
|
|
2382
|
-
|
|
2765
|
+
LEFT JOIN (
|
|
2766
|
+
SELECT hash, model, COUNT(*) AS chunk_count, MAX(total_chunks) AS expected_chunks
|
|
2767
|
+
FROM content_vectors
|
|
2768
|
+
WHERE model = ? AND embed_fingerprint = ?
|
|
2769
|
+
GROUP BY hash, model, embed_fingerprint
|
|
2770
|
+
) v ON d.hash = v.hash
|
|
2771
|
+
WHERE d.active = 1
|
|
2772
|
+
AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
|
|
2383
2773
|
GROUP BY d.hash
|
|
2384
|
-
`).all();
|
|
2774
|
+
`).all(model, fingerprint));
|
|
2385
2775
|
}
|
|
2386
2776
|
/**
|
|
2387
|
-
* Clear
|
|
2388
|
-
*
|
|
2389
|
-
|
|
2390
|
-
|
|
2391
|
-
|
|
2392
|
-
|
|
2777
|
+
* Clear embeddings for the whole index, or just for one collection.
|
|
2778
|
+
*
|
|
2779
|
+
* When `collection` is omitted the entire content_vectors table is emptied and
|
|
2780
|
+
* the vectors_vec virtual table is dropped (it is recreated with the right
|
|
2781
|
+
* dimensions on the next embed run).
|
|
2782
|
+
*
|
|
2783
|
+
* When `collection` is provided, only vectors whose hash is referenced
|
|
2784
|
+
* exclusively by active documents in that collection are removed. Hashes
|
|
2785
|
+
* shared with active documents in other collections are left in place so
|
|
2786
|
+
* vector search keeps working there (content_vectors is keyed globally by
|
|
2787
|
+
* content hash; identical document bodies across collections share a row).
|
|
2788
|
+
* vectors_vec is preserved so other collections keep working unless the scoped
|
|
2789
|
+
* clear empties content_vectors entirely, in which case it is dropped so the
|
|
2790
|
+
* next embed can recreate the table with the current dimensions.
|
|
2791
|
+
*/
|
|
2792
|
+
export function clearAllEmbeddings(db, collection) {
|
|
2793
|
+
if (!collection) {
|
|
2794
|
+
db.exec(`DELETE FROM content_vectors`);
|
|
2795
|
+
db.exec(`DROP TABLE IF EXISTS vectors_vec`);
|
|
2796
|
+
return;
|
|
2797
|
+
}
|
|
2798
|
+
const exclusiveHashesQuery = `
|
|
2799
|
+
SELECT DISTINCT d.hash
|
|
2800
|
+
FROM documents d
|
|
2801
|
+
WHERE d.collection = ? AND d.active = 1
|
|
2802
|
+
AND NOT EXISTS (
|
|
2803
|
+
SELECT 1 FROM documents d2
|
|
2804
|
+
WHERE d2.hash = d.hash
|
|
2805
|
+
AND d2.active = 1
|
|
2806
|
+
AND d2.collection != d.collection
|
|
2807
|
+
)
|
|
2808
|
+
`;
|
|
2809
|
+
const vecTableExists = db
|
|
2810
|
+
.prepare(`SELECT 1 FROM sqlite_master WHERE type='table' AND name='vectors_vec'`)
|
|
2811
|
+
.get();
|
|
2812
|
+
withLazyContentVectorMigration(db, () => {
|
|
2813
|
+
if (vecTableExists) {
|
|
2814
|
+
const hashSeqRows = db.prepare(`
|
|
2815
|
+
SELECT cv.hash, cv.seq
|
|
2816
|
+
FROM content_vectors cv
|
|
2817
|
+
WHERE cv.hash IN (${exclusiveHashesQuery})
|
|
2818
|
+
`).all(collection);
|
|
2819
|
+
const delVec = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
|
|
2820
|
+
for (const row of hashSeqRows) {
|
|
2821
|
+
delVec.run(`${row.hash}_${row.seq}`);
|
|
2822
|
+
}
|
|
2823
|
+
}
|
|
2824
|
+
db.prepare(`
|
|
2825
|
+
DELETE FROM content_vectors
|
|
2826
|
+
WHERE hash IN (${exclusiveHashesQuery})
|
|
2827
|
+
`).run(collection);
|
|
2828
|
+
const remaining = db
|
|
2829
|
+
.prepare(`SELECT COUNT(*) AS n FROM content_vectors`)
|
|
2830
|
+
.get();
|
|
2831
|
+
if (remaining.n === 0) {
|
|
2832
|
+
db.exec(`DROP TABLE IF EXISTS vectors_vec`);
|
|
2833
|
+
}
|
|
2834
|
+
});
|
|
2393
2835
|
}
|
|
2394
2836
|
/**
|
|
2395
2837
|
* Insert a single embedding into both content_vectors and vectors_vec tables.
|
|
@@ -2401,16 +2843,37 @@ export function clearAllEmbeddings(db) {
|
|
|
2401
2843
|
* vectors_vec uses DELETE + INSERT instead of INSERT OR REPLACE because sqlite-vec's
|
|
2402
2844
|
* vec0 virtual tables silently ignore the OR REPLACE conflict clause.
|
|
2403
2845
|
*/
|
|
2404
|
-
export function insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt) {
|
|
2846
|
+
export function insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt, totalChunks = 1, fingerprint = getEmbeddingFingerprint(model)) {
|
|
2405
2847
|
const hashSeq = `${hash}_${seq}`;
|
|
2406
|
-
|
|
2407
|
-
|
|
2408
|
-
|
|
2409
|
-
|
|
2410
|
-
|
|
2411
|
-
|
|
2412
|
-
|
|
2413
|
-
|
|
2848
|
+
withLazyContentVectorMigration(db, () => {
|
|
2849
|
+
// Insert content_vectors first — crash-safe ordering (see getHashesForEmbedding)
|
|
2850
|
+
const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embed_fingerprint, total_chunks, embedded_at) VALUES (?, ?, ?, ?, ?, ?, ?)`);
|
|
2851
|
+
insertContentVectorStmt.run(hash, seq, pos, model, fingerprint, totalChunks, embeddedAt);
|
|
2852
|
+
// vec0 virtual tables don't support OR REPLACE — use DELETE + INSERT
|
|
2853
|
+
const deleteVecStmt = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
|
|
2854
|
+
const insertVecStmt = db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`);
|
|
2855
|
+
deleteVecStmt.run(hashSeq);
|
|
2856
|
+
insertVecStmt.run(hashSeq, embedding);
|
|
2857
|
+
});
|
|
2858
|
+
}
|
|
2859
|
+
function removeIncompleteEmbeddings(db, expectedChunksByHash, model) {
|
|
2860
|
+
return withLazyContentVectorMigration(db, () => {
|
|
2861
|
+
let removed = 0;
|
|
2862
|
+
const rowsStmt = db.prepare(`SELECT seq FROM content_vectors WHERE hash = ? AND model = ?`);
|
|
2863
|
+
const deleteContentStmt = db.prepare(`DELETE FROM content_vectors WHERE hash = ? AND model = ?`);
|
|
2864
|
+
const deleteVecStmt = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
|
|
2865
|
+
for (const [hash, expectedChunks] of expectedChunksByHash) {
|
|
2866
|
+
const rows = rowsStmt.all(hash, model);
|
|
2867
|
+
if (rows.length === 0 || rows.length === expectedChunks)
|
|
2868
|
+
continue;
|
|
2869
|
+
for (const row of rows) {
|
|
2870
|
+
deleteVecStmt.run(`${hash}_${row.seq}`);
|
|
2871
|
+
}
|
|
2872
|
+
deleteContentStmt.run(hash, model);
|
|
2873
|
+
removed += rows.length;
|
|
2874
|
+
}
|
|
2875
|
+
return removed;
|
|
2876
|
+
});
|
|
2414
2877
|
}
|
|
2415
2878
|
// =============================================================================
|
|
2416
2879
|
// Query expansion
|
|
@@ -2422,12 +2885,15 @@ export async function expandQuery(query, model = DEFAULT_QUERY_MODEL, db, intent
|
|
|
2422
2885
|
if (cached) {
|
|
2423
2886
|
try {
|
|
2424
2887
|
const parsed = JSON.parse(cached);
|
|
2888
|
+
if (!Array.isArray(parsed))
|
|
2889
|
+
return [];
|
|
2890
|
+
const rows = parsed;
|
|
2425
2891
|
// Migrate old cache format: { type, text } → { type, query }
|
|
2426
|
-
if (
|
|
2427
|
-
return
|
|
2892
|
+
if (rows.length > 0 && typeof rows[0]?.query === "string") {
|
|
2893
|
+
return rows.map((r) => ({ type: r.type, query: String(r.query) }));
|
|
2428
2894
|
}
|
|
2429
|
-
else if (
|
|
2430
|
-
return
|
|
2895
|
+
else if (rows.length > 0 && typeof rows[0]?.text === "string") {
|
|
2896
|
+
return rows.map((r) => ({ type: r.type, query: String(r.text) }));
|
|
2431
2897
|
}
|
|
2432
2898
|
}
|
|
2433
2899
|
catch {
|
|
@@ -2734,7 +3200,7 @@ export function getDocumentBody(db, doc, fromLine, maxLines) {
|
|
|
2734
3200
|
let body = row.body;
|
|
2735
3201
|
if (fromLine !== undefined || maxLines !== undefined) {
|
|
2736
3202
|
const lines = body.split('\n');
|
|
2737
|
-
const start = (fromLine || 1) - 1;
|
|
3203
|
+
const start = Math.max(0, (fromLine || 1) - 1);
|
|
2738
3204
|
const end = maxLines !== undefined ? start + maxLines : lines.length;
|
|
2739
3205
|
body = lines.slice(start, end).join('\n');
|
|
2740
3206
|
}
|
|
@@ -2842,7 +3308,7 @@ export function findDocuments(db, pattern, options = {}) {
|
|
|
2842
3308
|
// =============================================================================
|
|
2843
3309
|
// Status
|
|
2844
3310
|
// =============================================================================
|
|
2845
|
-
export function getStatus(db) {
|
|
3311
|
+
export function getStatus(db, model = DEFAULT_EMBED_MODEL) {
|
|
2846
3312
|
// DB is source of truth for collections — config provides supplementary metadata
|
|
2847
3313
|
const dbCollections = db.prepare(`
|
|
2848
3314
|
SELECT
|
|
@@ -2875,7 +3341,7 @@ export function getStatus(db) {
|
|
|
2875
3341
|
return new Date(b.lastUpdated).getTime() - new Date(a.lastUpdated).getTime();
|
|
2876
3342
|
});
|
|
2877
3343
|
const totalDocs = db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 1`).get().c;
|
|
2878
|
-
const needsEmbedding = getHashesNeedingEmbedding(db);
|
|
3344
|
+
const needsEmbedding = getHashesNeedingEmbedding(db, undefined, model);
|
|
2879
3345
|
const hasVectors = !!db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
|
|
2880
3346
|
return {
|
|
2881
3347
|
totalDocuments: totalDocs,
|
|
@@ -2922,7 +3388,7 @@ export function extractSnippet(body, query, maxLen = 500, chunkPos, chunkLen, in
|
|
|
2922
3388
|
const totalLines = body.split('\n').length;
|
|
2923
3389
|
let searchBody = body;
|
|
2924
3390
|
let lineOffset = 0;
|
|
2925
|
-
if (chunkPos && chunkPos
|
|
3391
|
+
if (chunkPos !== undefined && chunkPos >= 0) {
|
|
2926
3392
|
// Search within the chunk region, with some padding for context
|
|
2927
3393
|
// Use provided chunkLen or fall back to max chunk size (covers variable-length chunks)
|
|
2928
3394
|
const searchLen = chunkLen || CHUNK_SIZE_CHARS;
|
|
@@ -2953,6 +3419,22 @@ export function extractSnippet(body, query, maxLen = 500, chunkPos, chunkLen, in
|
|
|
2953
3419
|
bestLine = i;
|
|
2954
3420
|
}
|
|
2955
3421
|
}
|
|
3422
|
+
if (chunkPos !== undefined && chunkPos >= 0 && bestScore <= 0) {
|
|
3423
|
+
if (chunkPos === 0) {
|
|
3424
|
+
// chunkPos=0 may be the chunk selector's initialization default for queries
|
|
3425
|
+
// where lexical chunk scoring found no winner (e.g. tokens filtered to empty
|
|
3426
|
+
// by the length>2 guard). Retry with full body so the real match isn't missed.
|
|
3427
|
+
return extractSnippet(body, query, maxLen, undefined, undefined, intent);
|
|
3428
|
+
}
|
|
3429
|
+
// For chunkPos > 0 the reranker actively picked this chunk. Tokens failing to
|
|
3430
|
+
// match literally is most likely a tokenizer limitation (quoted phrases, FTS5
|
|
3431
|
+
// syntax, HYDE passages, semantic hits), so anchor on the chunk start rather
|
|
3432
|
+
// than disregarding the reranker's pick.
|
|
3433
|
+
const contextStart = Math.max(0, chunkPos - 100);
|
|
3434
|
+
bestLine = chunkPos > contextStart
|
|
3435
|
+
? searchBody.slice(0, chunkPos - contextStart).split('\n').length - 1
|
|
3436
|
+
: 0;
|
|
3437
|
+
}
|
|
2956
3438
|
const start = Math.max(0, bestLine - 1);
|
|
2957
3439
|
const end = Math.min(lines.length, bestLine + 3);
|
|
2958
3440
|
const snippetLines = lines.slice(start, end);
|
|
@@ -2990,6 +3472,20 @@ export function addLineNumbers(text, startLine = 1) {
|
|
|
2990
3472
|
const lines = text.split('\n');
|
|
2991
3473
|
return lines.map((line, i) => `${startLine + i}: ${line}`).join('\n');
|
|
2992
3474
|
}
|
|
3475
|
+
/**
 * Compute the reciprocal-rank-fusion weight for every ranked list that
 * hybridQuery is about to fuse.
 *
 * Lists retrieved with the caller's original query are treated as the primary
 * evidence and weighted 2x:
 *   - original FTS
 *   - original vector search
 *
 * Every expansion-derived list (lex/vec/hyde) is weighted 1x. The weight is
 * decided solely by each entry's `queryType`, never by its position in the
 * array, so a lex expansion inserted ahead of the original vector search
 * cannot steal the original vector boost.
 *
 * @param rankedListMeta - array of per-list metadata entries; only the
 *   `queryType` field of each entry is read here.
 * @returns array of numeric weights, index-aligned with `rankedListMeta`.
 */
export function getHybridRrfWeights(rankedListMeta) {
    const weightForList = (listMeta) => {
        // Only lists produced by the unmodified user query earn the boost.
        if (listMeta.queryType === "original") {
            return 2.0;
        }
        return 1.0;
    };
    return rankedListMeta.map((listMeta) => weightForList(listMeta));
}
|
|
2993
3489
|
/**
|
|
2994
3490
|
* Hybrid search: BM25 + vector + query expansion + RRF + chunked reranking.
|
|
2995
3491
|
*
|
|
@@ -3078,7 +3574,8 @@ export async function hybridQuery(store, query, options) {
|
|
|
3078
3574
|
}
|
|
3079
3575
|
// Batch embed all vector queries in a single call
|
|
3080
3576
|
const llm = getLlm(store);
|
|
3081
|
-
const
|
|
3577
|
+
const embedModel = llm.embedModelName;
|
|
3578
|
+
const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text, embedModel));
|
|
3082
3579
|
hooks?.onEmbedStart?.(textsToEmbed.length);
|
|
3083
3580
|
const embedStart = Date.now();
|
|
3084
3581
|
const embeddings = await llm.embedBatch(textsToEmbed);
|
|
@@ -3088,7 +3585,7 @@ export async function hybridQuery(store, query, options) {
|
|
|
3088
3585
|
const embedding = embeddings[i]?.embedding;
|
|
3089
3586
|
if (!embedding)
|
|
3090
3587
|
continue;
|
|
3091
|
-
const vecResults = await store.searchVec(vecQueries[i].text,
|
|
3588
|
+
const vecResults = await store.searchVec(vecQueries[i].text, embedModel, 20, collection, undefined, embedding);
|
|
3092
3589
|
if (vecResults.length > 0) {
|
|
3093
3590
|
for (const r of vecResults)
|
|
3094
3591
|
docidMap.set(r.filepath, r.docid);
|
|
@@ -3104,8 +3601,9 @@ export async function hybridQuery(store, query, options) {
|
|
|
3104
3601
|
}
|
|
3105
3602
|
}
|
|
3106
3603
|
}
|
|
3107
|
-
// Step 4: RRF fusion —
|
|
3108
|
-
|
|
3604
|
+
// Step 4: RRF fusion — original-query FTS and vector lists get 2x weight;
|
|
3605
|
+
// expansion-derived lists stay at 1x independent of insertion order.
|
|
3606
|
+
const weights = getHybridRrfWeights(rankedListMeta);
|
|
3109
3607
|
const fused = reciprocalRankFusion(rankedLists, weights);
|
|
3110
3608
|
const rrfTraceByFile = explain ? buildRrfTrace(rankedLists, weights, rankedListMeta) : null;
|
|
3111
3609
|
const candidates = fused.slice(0, candidateLimit);
|
|
@@ -3286,10 +3784,11 @@ export async function vectorSearchQuery(store, query, options) {
|
|
|
3286
3784
|
const vecExpanded = allExpanded.filter(q => q.type !== 'lex');
|
|
3287
3785
|
options?.hooks?.onExpand?.(query, vecExpanded, Date.now() - expandStart);
|
|
3288
3786
|
// Run original + vec/hyde expanded through vector, sequentially — concurrent embed() hangs
|
|
3787
|
+
const embedModel = getLlm(store).embedModelName;
|
|
3289
3788
|
const queryTexts = [query, ...vecExpanded.map(q => q.query)];
|
|
3290
3789
|
const allResults = new Map();
|
|
3291
3790
|
for (const q of queryTexts) {
|
|
3292
|
-
const vecResults = await store.searchVec(q,
|
|
3791
|
+
const vecResults = await store.searchVec(q, embedModel, limit, collection);
|
|
3293
3792
|
for (const r of vecResults) {
|
|
3294
3793
|
const existing = allResults.get(r.filepath);
|
|
3295
3794
|
if (!existing || r.score > existing.score) {
|
|
@@ -3390,7 +3889,8 @@ export async function structuredSearch(store, searches, options) {
|
|
|
3390
3889
|
const vecSearches = searches.filter((s) => s.type === 'vec' || s.type === 'hyde');
|
|
3391
3890
|
if (vecSearches.length > 0) {
|
|
3392
3891
|
const llm = getLlm(store);
|
|
3393
|
-
const
|
|
3892
|
+
const embedModel = llm.embedModelName;
|
|
3893
|
+
const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query, embedModel));
|
|
3394
3894
|
hooks?.onEmbedStart?.(textsToEmbed.length);
|
|
3395
3895
|
const embedStart = Date.now();
|
|
3396
3896
|
const embeddings = await llm.embedBatch(textsToEmbed);
|
|
@@ -3400,7 +3900,7 @@ export async function structuredSearch(store, searches, options) {
|
|
|
3400
3900
|
if (!embedding)
|
|
3401
3901
|
continue;
|
|
3402
3902
|
for (const coll of collectionList) {
|
|
3403
|
-
const vecResults = await store.searchVec(vecSearches[i].query,
|
|
3903
|
+
const vecResults = await store.searchVec(vecSearches[i].query, embedModel, 20, coll, undefined, embedding);
|
|
3404
3904
|
if (vecResults.length > 0) {
|
|
3405
3905
|
for (const r of vecResults)
|
|
3406
3906
|
docidMap.set(r.filepath, r.docid);
|