@tobilu/qmd 1.1.6 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +39 -0
- package/README.md +198 -39
- package/bin/qmd +23 -0
- package/dist/{formatter.d.ts → cli/formatter.d.ts} +1 -1
- package/dist/{formatter.js → cli/formatter.js} +1 -1
- package/dist/{qmd.js → cli/qmd.js} +266 -154
- package/dist/embedded-skills.d.ts +6 -0
- package/dist/embedded-skills.js +14 -0
- package/dist/index.d.ts +129 -38
- package/dist/index.js +175 -41
- package/dist/llm.d.ts +6 -0
- package/dist/llm.js +24 -1
- package/dist/maintenance.d.ts +23 -0
- package/dist/maintenance.js +37 -0
- package/dist/{mcp.js → mcp/server.js} +41 -61
- package/dist/store.d.ts +83 -19
- package/dist/store.js +561 -84
- package/package.json +12 -11
- /package/dist/{qmd.d.ts → cli/qmd.d.ts} +0 -0
- /package/dist/{mcp.d.ts → mcp/server.d.ts} +0 -0
package/dist/store.js
CHANGED
|
@@ -13,9 +13,10 @@
|
|
|
13
13
|
import { openDatabase, loadSqliteVec } from "./db.js";
|
|
14
14
|
import picomatch from "picomatch";
|
|
15
15
|
import { createHash } from "crypto";
|
|
16
|
-
import { realpathSync, statSync, mkdirSync } from "node:fs";
|
|
17
|
-
|
|
18
|
-
import
|
|
16
|
+
import { readFileSync, realpathSync, statSync, mkdirSync } from "node:fs";
|
|
17
|
+
// Note: node:path resolve is not imported — we export our own cross-platform resolve()
|
|
18
|
+
import fastGlob from "fast-glob";
|
|
19
|
+
import { LlamaCpp, getDefaultLlamaCpp, formatQueryForEmbedding, formatDocForEmbedding, withLLMSessionForLlm, } from "./llm.js";
|
|
19
20
|
// =============================================================================
|
|
20
21
|
// Configuration
|
|
21
22
|
// =============================================================================
|
|
@@ -35,6 +36,13 @@ export const CHUNK_OVERLAP_CHARS = CHUNK_OVERLAP_TOKENS * 4; // 540 chars
|
|
|
35
36
|
// Search window for finding optimal break points (in tokens, ~200 tokens)
|
|
36
37
|
export const CHUNK_WINDOW_TOKENS = 200;
|
|
37
38
|
export const CHUNK_WINDOW_CHARS = CHUNK_WINDOW_TOKENS * 4; // 800 chars
|
|
39
|
+
/**
 * Pick the LlamaCpp instance to use for a store.
 *
 * @param {object} store - Store object; its optional `llm` property takes precedence.
 * @returns The store's own `llm` when it is neither null nor undefined,
 *          otherwise whatever `getDefaultLlamaCpp()` returns.
 */
function getLlm(store) {
    const { llm } = store;
    // Only null/undefined fall through to the shared default instance.
    return llm ?? getDefaultLlamaCpp();
}
|
|
38
46
|
/**
|
|
39
47
|
* Patterns for detecting break points in markdown documents.
|
|
40
48
|
* Higher scores indicate better places to split.
|
|
@@ -442,8 +450,8 @@ export function resolveVirtualPath(db, virtualPath) {
|
|
|
442
450
|
* Returns null if the file is not in any indexed collection.
|
|
443
451
|
*/
|
|
444
452
|
export function toVirtualPath(db, absolutePath) {
|
|
445
|
-
// Get all collections from
|
|
446
|
-
const collections =
|
|
453
|
+
// Get all collections from DB
|
|
454
|
+
const collections = getStoreCollections(db);
|
|
447
455
|
// Find which collection this absolute path belongs to
|
|
448
456
|
for (const coll of collections) {
|
|
449
457
|
if (absolutePath.startsWith(coll.path + '/') || absolutePath === coll.path) {
|
|
@@ -556,6 +564,25 @@ function initializeDatabase(db) {
|
|
|
556
564
|
embedded_at TEXT NOT NULL,
|
|
557
565
|
PRIMARY KEY (hash, seq)
|
|
558
566
|
)
|
|
567
|
+
`);
|
|
568
|
+
// Store collections — makes the DB self-contained (no external config needed)
|
|
569
|
+
db.exec(`
|
|
570
|
+
CREATE TABLE IF NOT EXISTS store_collections (
|
|
571
|
+
name TEXT PRIMARY KEY,
|
|
572
|
+
path TEXT NOT NULL,
|
|
573
|
+
pattern TEXT NOT NULL DEFAULT '**/*.md',
|
|
574
|
+
ignore_patterns TEXT,
|
|
575
|
+
include_by_default INTEGER DEFAULT 1,
|
|
576
|
+
update_command TEXT,
|
|
577
|
+
context TEXT
|
|
578
|
+
)
|
|
579
|
+
`);
|
|
580
|
+
// Store config — key-value metadata (e.g. config_hash for sync optimization)
|
|
581
|
+
db.exec(`
|
|
582
|
+
CREATE TABLE IF NOT EXISTS store_config (
|
|
583
|
+
key TEXT PRIMARY KEY,
|
|
584
|
+
value TEXT
|
|
585
|
+
)
|
|
559
586
|
`);
|
|
560
587
|
// FTS - index filepath (collection/path), title, and content
|
|
561
588
|
db.exec(`
|
|
@@ -600,6 +627,141 @@ function initializeDatabase(db) {
|
|
|
600
627
|
END
|
|
601
628
|
`);
|
|
602
629
|
}
|
|
630
|
+
/**
 * Convert a `store_collections` row into a named-collection object.
 *
 * `name`, `path` and `pattern` are always copied. The optional keys are only
 * present when the row carries a meaningful value:
 *  - `ignore`: parsed from the JSON text in `ignore_patterns`
 *  - `includeByDefault`: set (to false) only when `include_by_default` is exactly 0
 *  - `update`: copied from `update_command`
 *  - `context`: parsed from the JSON text in `context`
 *
 * @param {object} row - Row as returned by the SQLite driver.
 * @returns {object} Collection object with keys in the order listed above.
 */
function rowToNamedCollection(row) {
    const { name, path, pattern } = row;
    const collection = { name, path, pattern };
    if (row.ignore_patterns) {
        collection.ignore = JSON.parse(row.ignore_patterns);
    }
    if (row.include_by_default === 0) {
        collection.includeByDefault = false;
    }
    if (row.update_command) {
        collection.update = row.update_command;
    }
    if (row.context) {
        collection.context = JSON.parse(row.context);
    }
    return collection;
}
|
|
641
|
+
/**
 * Load every row of `store_collections` and convert each one with
 * `rowToNamedCollection`.
 *
 * @param {object} db - Database handle exposing `prepare(sql).all()`.
 * @returns {object[]} One collection object per row.
 */
export function getStoreCollections(db) {
    const selectAll = db.prepare(`SELECT * FROM store_collections`);
    return selectAll.all().map(rowToNamedCollection);
}
|
|
645
|
+
/**
 * Look up a single collection by name in `store_collections`.
 *
 * @param {object} db - Database handle exposing `prepare(sql).get(...)`.
 * @param {string} name - Collection name (primary key).
 * @returns {object|null} The converted row, or null when no row matches.
 */
export function getStoreCollection(db, name) {
    const lookup = db.prepare(`SELECT * FROM store_collections WHERE name = ?`);
    const match = lookup.get(name);
    return match == null ? null : rowToNamedCollection(match);
}
|
|
651
|
+
/**
 * Read the `global_context` entry from `store_config`.
 *
 * @param {object} db - Database handle exposing `prepare(sql).get()`.
 * @returns {string|undefined} The stored value, or undefined when the row is
 *          missing or its value is falsy (NULL, empty string).
 */
export function getStoreGlobalContext(db) {
    const entry = db.prepare(`SELECT value FROM store_config WHERE key = 'global_context'`).get();
    // A missing row and an empty value both collapse to undefined.
    return entry?.value || undefined;
}
|
|
657
|
+
/**
 * List every context stored in the database as flat
 * `{ collection, path, context }` entries.
 *
 * The global context (when set) comes first, reported under collection "*"
 * and path "/". It is followed by one entry per key of each collection's JSON
 * context map, in the order the rows and keys are returned.
 *
 * @param {object} db - Database handle.
 * @returns {Array<{collection: string, path: string, context: *}>}
 */
export function getStoreContexts(db) {
    const globalContext = getStoreGlobalContext(db);
    const entries = globalContext
        ? [{ collection: "*", path: "/", context: globalContext }]
        : [];
    const collectionRows = db
        .prepare(`SELECT name, context FROM store_collections WHERE context IS NOT NULL`)
        .all();
    for (const { name, context: rawContext } of collectionRows) {
        const contextByPath = JSON.parse(rawContext);
        Object.entries(contextByPath).forEach(([path, context]) => {
            entries.push({ collection: name, path, context });
        });
    }
    return entries;
}
|
|
674
|
+
/**
 * Insert a collection into `store_collections`, or overwrite every column of
 * the existing row with the same name.
 *
 * @param {object} db - Database handle exposing `prepare(sql).run(...)`.
 * @param {string} name - Collection name (primary key).
 * @param {object} collection - Collection settings: `path`, and optionally
 *        `pattern`, `ignore`, `includeByDefault`, `update`, `context`.
 */
export function upsertStoreCollection(db, name, collection) {
    // Normalise optional settings into their column representation.
    const pattern = collection.pattern || '**/*.md';
    const ignorePatterns = collection.ignore ? JSON.stringify(collection.ignore) : null;
    // Only an explicit `false` opts a collection out of the default set.
    const includeByDefault = collection.includeByDefault === false ? 0 : 1;
    const updateCommand = collection.update || null;
    const context = collection.context ? JSON.stringify(collection.context) : null;
    const statement = db.prepare(`
        INSERT INTO store_collections (name, path, pattern, ignore_patterns, include_by_default, update_command, context)
        VALUES (?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(name) DO UPDATE SET
        path = excluded.path,
        pattern = excluded.pattern,
        ignore_patterns = excluded.ignore_patterns,
        include_by_default = excluded.include_by_default,
        update_command = excluded.update_command,
        context = excluded.context
    `);
    statement.run(name, collection.path, pattern, ignorePatterns, includeByDefault, updateCommand, context);
}
|
|
687
|
+
/**
 * Delete a collection row from `store_collections`.
 *
 * @param {object} db - Database handle exposing `prepare(sql).run(...)`.
 * @param {string} name - Name of the collection to delete.
 * @returns {boolean} True when at least one row was deleted.
 */
export function deleteStoreCollection(db, name) {
    const { changes } = db.prepare(`DELETE FROM store_collections WHERE name = ?`).run(name);
    return changes > 0;
}
|
|
691
|
+
/**
 * Rename a collection row in `store_collections`.
 *
 * @param {object} db - Database handle.
 * @param {string} oldName - Current collection name.
 * @param {string} newName - Desired collection name.
 * @returns {boolean} True when a row named `oldName` was renamed.
 * @throws {Error} When a collection named `newName` already exists.
 */
export function renameStoreCollection(db, oldName, newName) {
    // Refuse to clobber an existing collection.
    const clash = db.prepare(`SELECT name FROM store_collections WHERE name = ?`).get(newName);
    if (clash != null) {
        throw new Error(`Collection '${newName}' already exists`);
    }
    const { changes } = db
        .prepare(`UPDATE store_collections SET name = ? WHERE name = ?`)
        .run(newName, oldName);
    return changes > 0;
}
|
|
700
|
+
/**
 * Set (or overwrite) the context text for one path inside a collection's
 * JSON context map.
 *
 * @param {object} db - Database handle.
 * @param {string} collectionName - Collection whose context map is edited.
 * @param {string} path - Key inside the context map.
 * @param {string} text - Context text to store under `path`.
 * @returns {boolean} False when the collection does not exist, true once the
 *          updated map has been written back.
 */
export function updateStoreContext(db, collectionName, path, text) {
    const current = db
        .prepare(`SELECT context FROM store_collections WHERE name = ?`)
        .get(collectionName);
    if (current == null) {
        return false;
    }
    // A NULL/empty column means the collection has no contexts yet.
    const contextByPath = current.context ? JSON.parse(current.context) : {};
    contextByPath[path] = text;
    const serialized = JSON.stringify(contextByPath);
    db.prepare(`UPDATE store_collections SET context = ? WHERE name = ?`).run(serialized, collectionName);
    return true;
}
|
|
709
|
+
/**
 * Remove the context stored for one path inside a collection's JSON context map.
 *
 * When the last entry is removed the column is reset to NULL rather than
 * storing an empty object.
 *
 * @param {object} db - Database handle.
 * @param {string} collectionName - Collection whose context map is edited.
 * @param {string} path - Key inside the context map to remove.
 * @returns {boolean} True only when an entry for `path` existed and was
 *          removed; false when the collection is missing, has no contexts,
 *          or has no entry for `path`.
 */
export function removeStoreContext(db, collectionName, path) {
    const row = db.prepare(`SELECT context FROM store_collections WHERE name = ?`).get(collectionName);
    if (row == null || !row.context) {
        return false;
    }
    const ctxMap = JSON.parse(row.context);
    // Own-property check: the `in` operator also matches inherited names such
    // as "toString" or "constructor", which would report a removal (and issue
    // a pointless UPDATE) for a context that was never stored.
    if (!Object.prototype.hasOwnProperty.call(ctxMap, path)) {
        return false;
    }
    delete ctxMap[path];
    const newCtx = Object.keys(ctxMap).length > 0 ? JSON.stringify(ctxMap) : null;
    db.prepare(`UPDATE store_collections SET context = ? WHERE name = ?`).run(newCtx, collectionName);
    return true;
}
|
|
723
|
+
/**
 * Store or clear the `global_context` entry in `store_config`.
 *
 * @param {object} db - Database handle exposing `prepare(sql).run(...)`.
 * @param {string|undefined} value - New global context; passing exactly
 *        `undefined` deletes the entry, any other value is upserted.
 */
export function setStoreGlobalContext(db, value) {
    if (value === undefined) {
        db.prepare(`DELETE FROM store_config WHERE key = 'global_context'`).run();
        return;
    }
    const upsert = db.prepare(`INSERT INTO store_config (key, value) VALUES ('global_context', ?) ON CONFLICT(key) DO UPDATE SET value = excluded.value`);
    upsert.run(value);
}
|
|
731
|
+
/**
 * Mirror an external config object into the SQLite store tables.
 *
 * The external config is authoritative: every collection it lists is
 * upserted, every `store_collections` row it does not list is deleted, and
 * the global context is set (or cleared when `config.global_context` is
 * undefined). A SHA-256 of the JSON-serialised config is kept under the
 * `config_hash` key of `store_config`; when the stored hash already matches,
 * the whole sync is skipped.
 *
 * @param {object} db - Database handle.
 * @param {object} config - Config with a `collections` map (name → settings)
 *        and an optional `global_context`.
 */
export function syncConfigToDb(db, config) {
    const digest = createHash('sha256').update(JSON.stringify(config)).digest('hex');
    const stored = db.prepare(`SELECT value FROM store_config WHERE key = 'config_hash'`).get();
    if (stored != null && stored.value === digest) {
        return; // Config unchanged, skip sync
    }
    // Upsert everything the config declares.
    const configNames = new Set(Object.keys(config.collections));
    Object.entries(config.collections).forEach(([name, coll]) => {
        upsertStoreCollection(db, name, coll);
    });
    // Drop rows the config no longer mentions.
    const staleNames = db
        .prepare(`SELECT name FROM store_collections`)
        .all()
        .map((row) => row.name)
        .filter((name) => !configNames.has(name));
    for (const name of staleNames) {
        db.prepare(`DELETE FROM store_collections WHERE name = ?`).run(name);
    }
    // Passing undefined clears the stored global context.
    setStoreGlobalContext(db, config.global_context);
    // Remember what was synced so an identical config is skipped next time.
    db.prepare(`INSERT INTO store_config (key, value) VALUES ('config_hash', ?) ON CONFLICT(key) DO UPDATE SET value = excluded.value`).run(digest);
}
|
|
603
765
|
/**
 * Report whether the module-level `_sqliteVecAvailable` flag is exactly `true`.
 * (The flag is assigned outside this view; any other value yields false.)
 *
 * @returns {boolean}
 */
export function isSqliteVecAvailable() {
    const available = _sqliteVecAvailable === true;
    return available;
}
|
|
@@ -620,6 +782,199 @@ function ensureVecTableInternal(db, dimensions) {
|
|
|
620
782
|
}
|
|
621
783
|
db.exec(`CREATE VIRTUAL TABLE vectors_vec USING vec0(hash_seq TEXT PRIMARY KEY, embedding float[${dimensions}] distance_metric=cosine)`);
|
|
622
784
|
}
|
|
785
|
+
/**
 * Re-index a single collection by scanning the filesystem and updating the database.
 * No console output and no db lifecycle management: results are reported through
 * the return value and the optional `options.onProgress` callback.
 *
 * Files are discovered with fast-glob relative to `collectionPath`; a fixed set
 * of directories (node_modules, .git, .cache, vendor, dist, build), any
 * `options.ignorePatterns`, and every path containing a dot-prefixed segment
 * are excluded. Documents that were active in the database but were not seen
 * during the scan are deactivated, then orphaned content is cleaned up.
 *
 * @param {object} store - Store whose `db` handle receives the updates.
 * @param {string} collectionPath - Directory used as the glob `cwd`.
 * @param {string|string[]} globPattern - Pattern passed to fast-glob.
 * @param {string} collectionName - Collection the documents are stored under.
 * @param {object} [options]
 * @param {string[]} [options.ignorePatterns] - Extra glob ignore patterns.
 * @param {Function} [options.onProgress] - Called with `{ file, current, total }`
 *        exactly once per scanned file, including unreadable and empty files.
 * @returns {Promise<{indexed: number, updated: number, unchanged: number, removed: number, orphanedCleaned: *}>}
 *          `orphanedCleaned` is whatever `cleanupOrphanedContent` returns.
 */
export async function reindexCollection(store, collectionPath, globPattern, collectionName, options) {
    const db = store.db;
    const now = new Date().toISOString();
    const excludeDirs = ["node_modules", ".git", ".cache", "vendor", "dist", "build"];
    const allIgnore = [
        ...excludeDirs.map(d => `**/${d}/**`),
        ...(options?.ignorePatterns || []),
    ];
    const allFiles = await fastGlob(globPattern, {
        cwd: collectionPath,
        onlyFiles: true,
        followSymbolicLinks: false,
        dot: false,
        ignore: allIgnore,
    });
    // Filter hidden files/folders
    const files = allFiles.filter(file => !file.split("/").some(part => part.startsWith(".")));
    const total = files.length;
    let indexed = 0;
    let updated = 0;
    let unchanged = 0;
    let processed = 0;
    const seenPaths = new Set();
    // Every scanned file is reported exactly once so `current` reaches `total`.
    const reportProgress = (file) => {
        processed++;
        options?.onProgress?.({ file, current: processed, total });
    };
    // statSync throws (it never returns a falsy value), so a file that vanishes
    // between the read and the stat would abort the whole run. Returning null
    // lets the `stat ? ... : now` fallbacks below actually take effect.
    const statOrNull = (target) => {
        try {
            return statSync(target);
        }
        catch {
            return null;
        }
    };
    for (const relativeFile of files) {
        const filepath = getRealPath(resolve(collectionPath, relativeFile));
        const path = handelize(relativeFile);
        // Recorded before reading, so unreadable or empty files are not
        // deactivated by the sweep at the end.
        seenPaths.add(path);
        let content;
        try {
            content = readFileSync(filepath, "utf-8");
        }
        catch {
            // Best effort: an unreadable file is skipped, not fatal.
            reportProgress(relativeFile);
            continue;
        }
        if (!content.trim()) {
            // Whitespace-only files are not indexed, but still count as progress.
            reportProgress(relativeFile);
            continue;
        }
        const hash = await hashContent(content);
        const title = extractTitle(content, relativeFile);
        const existing = findActiveDocument(db, collectionName, path);
        if (!existing) {
            indexed++;
            insertContent(db, hash, content, now);
            const stat = statOrNull(filepath);
            insertDocument(db, collectionName, path, title, hash, stat ? new Date(stat.birthtime).toISOString() : now, stat ? new Date(stat.mtime).toISOString() : now);
        }
        else if (existing.hash !== hash) {
            // Content changed: store the new body and repoint the document.
            insertContent(db, hash, content, now);
            const stat = statOrNull(filepath);
            updateDocument(db, existing.id, title, hash, stat ? new Date(stat.mtime).toISOString() : now);
            updated++;
        }
        else if (existing.title !== title) {
            // Same content hash but a different extracted title.
            updateDocumentTitle(db, existing.id, title, now);
            updated++;
        }
        else {
            unchanged++;
        }
        reportProgress(relativeFile);
    }
    // Deactivate documents that no longer exist
    const allActive = getActiveDocumentPaths(db, collectionName);
    let removed = 0;
    for (const path of allActive) {
        if (!seenPaths.has(path)) {
            deactivateDocument(db, collectionName, path);
            removed++;
        }
    }
    const orphanedCleaned = cleanupOrphanedContent(db);
    return { indexed, updated, unchanged, removed, orphanedCleaned };
}
|
|
870
|
+
/**
 * Generate vector embeddings for documents that need them.
 * No console output and no db lifecycle management; progress is reported
 * through `options.onProgress`.
 * Uses the store's LlamaCpp instance if set, otherwise the global singleton.
 *
 * Steps: optionally clear all embeddings (`options.force`), fetch the hashes
 * to embed, split each non-empty body into chunks, embed the first chunk once
 * to learn the vector length for `store.ensureVecTable`, then embed all chunks
 * in batches of 32. A batch that throws is retried one chunk at a time.
 *
 * @param {object} store - Store providing `db`, `ensureVecTable` and optionally `llm`.
 * @param {object} [options]
 * @param {string} [options.model] - Model name recorded with each embedding
 *        (defaults to `DEFAULT_EMBED_MODEL`).
 * @param {boolean} [options.force] - Clear all existing embeddings first.
 * @param {Function} [options.onProgress] - Called after every batch with
 *        `{ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors }`.
 * @returns {Promise<{docsProcessed: number, chunksEmbedded: number, errors: number, durationMs: number}>}
 * @throws {Error} When the first chunk cannot be embedded.
 */
export async function generateEmbeddings(store, options) {
    const { db } = store;
    const model = options?.model ?? DEFAULT_EMBED_MODEL;
    const now = new Date().toISOString();
    const nothingDone = () => ({ docsProcessed: 0, chunksEmbedded: 0, errors: 0, durationMs: 0 });
    if (options?.force) {
        clearAllEmbeddings(db);
    }
    const hashesToEmbed = getHashesForEmbedding(db);
    if (hashesToEmbed.length === 0) {
        return nothingDone();
    }
    // One stateless encoder is enough for all byte-length measurements.
    const encoder = new TextEncoder();
    const byteLength = (text) => encoder.encode(text).length;
    const pending = [];
    for (const doc of hashesToEmbed) {
        if (byteLength(doc.body) === 0) {
            continue;
        }
        const title = extractTitle(doc.body, doc.path);
        const pieces = await chunkDocumentByTokens(doc.body);
        pieces.forEach((piece, seq) => {
            pending.push({
                hash: doc.hash,
                title,
                text: piece.text,
                seq,
                pos: piece.pos,
                tokens: piece.tokens,
                bytes: byteLength(piece.text),
            });
        });
    }
    if (pending.length === 0) {
        return nothingDone();
    }
    const totalBytes = pending.reduce((sum, chk) => sum + chk.bytes, 0);
    const totalChunks = pending.length;
    const totalDocs = hashesToEmbed.length;
    const startTime = Date.now();
    // Use store's LlamaCpp or global singleton, wrapped in a session
    const llm = getLlm(store);
    const sessionOptions = { maxDuration: 30 * 60 * 1000, name: 'generateEmbeddings' };
    const outcome = await withLLMSessionForLlm(llm, async (session) => {
        // Embed the first chunk once just to learn the vector length.
        const [probe] = pending;
        const probeResult = await session.embed(formatDocForEmbedding(probe.text, probe.title));
        if (!probeResult) {
            throw new Error("Failed to get embedding dimensions from first chunk");
        }
        store.ensureVecTable(probeResult.embedding.length);
        let chunksEmbedded = 0;
        let errors = 0;
        let bytesProcessed = 0;
        const BATCH_SIZE = 32;
        // Persist one vector and count it; the count only moves if the insert succeeds.
        const saveEmbedding = (chunk, vector) => {
            insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(vector), model, now);
            chunksEmbedded++;
        };
        for (let offset = 0; offset < pending.length; offset += BATCH_SIZE) {
            const batch = pending.slice(offset, Math.min(offset + BATCH_SIZE, pending.length));
            const texts = batch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title));
            try {
                const embeddings = await session.embedBatch(texts);
                for (const [i, chunk] of batch.entries()) {
                    const embedding = embeddings[i];
                    if (embedding) {
                        saveEmbedding(chunk, embedding.embedding);
                    }
                    else {
                        errors++;
                    }
                    bytesProcessed += chunk.bytes;
                }
            }
            catch {
                // Batch failed — try individual embeddings as fallback
                for (const chunk of batch) {
                    try {
                        const single = await session.embed(formatDocForEmbedding(chunk.text, chunk.title));
                        if (single) {
                            saveEmbedding(chunk, single.embedding);
                        }
                        else {
                            errors++;
                        }
                    }
                    catch {
                        errors++;
                    }
                    bytesProcessed += chunk.bytes;
                }
            }
            options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors });
        }
        return { chunksEmbedded, errors };
    }, sessionOptions);
    return {
        docsProcessed: totalDocs,
        chunksEmbedded: outcome.chunksEmbedded,
        errors: outcome.errors,
        durationMs: Date.now() - startTime,
    };
}
|
|
623
978
|
/**
|
|
624
979
|
* Create a new store instance with the given database path.
|
|
625
980
|
* If no path is provided, uses the default path (~/.cache/qmd/index.sqlite).
|
|
@@ -631,7 +986,7 @@ export function createStore(dbPath) {
|
|
|
631
986
|
const resolvedPath = dbPath || getDefaultDbPath();
|
|
632
987
|
const db = openDatabase(resolvedPath);
|
|
633
988
|
initializeDatabase(db);
|
|
634
|
-
|
|
989
|
+
const store = {
|
|
635
990
|
db,
|
|
636
991
|
dbPath: resolvedPath,
|
|
637
992
|
close: () => db.close(),
|
|
@@ -667,8 +1022,8 @@ export function createStore(dbPath) {
|
|
|
667
1022
|
searchFTS: (query, limit, collectionName) => searchFTS(db, query, limit, collectionName),
|
|
668
1023
|
searchVec: (query, model, limit, collectionName, session, precomputedEmbedding) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding),
|
|
669
1024
|
// Query expansion & reranking
|
|
670
|
-
expandQuery: (query, model, intent) => expandQuery(query, model, db, intent),
|
|
671
|
-
rerank: (query, documents, model, intent) => rerank(query, documents, model, db, intent),
|
|
1025
|
+
expandQuery: (query, model, intent) => expandQuery(query, model, db, intent, store.llm),
|
|
1026
|
+
rerank: (query, documents, model, intent) => rerank(query, documents, model, db, intent, store.llm),
|
|
672
1027
|
// Document retrieval
|
|
673
1028
|
findDocument: (filename, options) => findDocument(db, filename, options),
|
|
674
1029
|
getDocumentBody: (doc, fromLine, maxLines) => getDocumentBody(db, doc, fromLine, maxLines),
|
|
@@ -690,6 +1045,7 @@ export function createStore(dbPath) {
|
|
|
690
1045
|
clearAllEmbeddings: () => clearAllEmbeddings(db),
|
|
691
1046
|
insertEmbedding: (hash, seq, pos, embedding, model, embeddedAt) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt),
|
|
692
1047
|
};
|
|
1048
|
+
return store;
|
|
693
1049
|
}
|
|
694
1050
|
/**
|
|
695
1051
|
* Extract short docid from a full hash (first 6 characters).
|
|
@@ -1188,15 +1544,15 @@ export function matchFilesByGlob(db, pattern) {
|
|
|
1188
1544
|
* @returns Context string or null if no context is defined
|
|
1189
1545
|
*/
|
|
1190
1546
|
export function getContextForPath(db, collectionName, path) {
|
|
1191
|
-
const
|
|
1192
|
-
const coll = getCollection(collectionName);
|
|
1547
|
+
const coll = getStoreCollection(db, collectionName);
|
|
1193
1548
|
if (!coll)
|
|
1194
1549
|
return null;
|
|
1195
1550
|
// Collect ALL matching contexts (global + all path prefixes)
|
|
1196
1551
|
const contexts = [];
|
|
1197
1552
|
// Add global context if present
|
|
1198
|
-
|
|
1199
|
-
|
|
1553
|
+
const globalCtx = getStoreGlobalContext(db);
|
|
1554
|
+
if (globalCtx) {
|
|
1555
|
+
contexts.push(globalCtx);
|
|
1200
1556
|
}
|
|
1201
1557
|
// Add all matching path contexts (from most general to most specific)
|
|
1202
1558
|
if (coll.context) {
|
|
@@ -1221,15 +1577,14 @@ export function getContextForPath(db, collectionName, path) {
|
|
|
1221
1577
|
}
|
|
1222
1578
|
/**
|
|
1223
1579
|
* Get context for a file path (virtual or filesystem).
|
|
1224
|
-
* Resolves the collection and relative path
|
|
1580
|
+
* Resolves the collection and relative path from the DB store_collections table.
|
|
1225
1581
|
*/
|
|
1226
1582
|
export function getContextForFile(db, filepath) {
|
|
1227
1583
|
// Handle undefined or null filepath
|
|
1228
1584
|
if (!filepath)
|
|
1229
1585
|
return null;
|
|
1230
|
-
// Get all collections from
|
|
1231
|
-
const collections =
|
|
1232
|
-
const config = collectionsLoadConfig();
|
|
1586
|
+
// Get all collections from DB
|
|
1587
|
+
const collections = getStoreCollections(db);
|
|
1233
1588
|
// Parse virtual path format: qmd://collection/path
|
|
1234
1589
|
let collectionName = null;
|
|
1235
1590
|
let relativePath = null;
|
|
@@ -1256,8 +1611,8 @@ export function getContextForFile(db, filepath) {
|
|
|
1256
1611
|
if (!collectionName || relativePath === null)
|
|
1257
1612
|
return null;
|
|
1258
1613
|
}
|
|
1259
|
-
// Get the collection from
|
|
1260
|
-
const coll =
|
|
1614
|
+
// Get the collection from DB
|
|
1615
|
+
const coll = getStoreCollection(db, collectionName);
|
|
1261
1616
|
if (!coll)
|
|
1262
1617
|
return null;
|
|
1263
1618
|
// Verify this document exists in the database
|
|
@@ -1272,8 +1627,9 @@ export function getContextForFile(db, filepath) {
|
|
|
1272
1627
|
// Collect ALL matching contexts (global + all path prefixes)
|
|
1273
1628
|
const contexts = [];
|
|
1274
1629
|
// Add global context if present
|
|
1275
|
-
|
|
1276
|
-
|
|
1630
|
+
const globalCtx = getStoreGlobalContext(db);
|
|
1631
|
+
if (globalCtx) {
|
|
1632
|
+
contexts.push(globalCtx);
|
|
1277
1633
|
}
|
|
1278
1634
|
// Add all matching path contexts (from most general to most specific)
|
|
1279
1635
|
if (coll.context) {
|
|
@@ -1297,11 +1653,10 @@ export function getContextForFile(db, filepath) {
|
|
|
1297
1653
|
return contexts.length > 0 ? contexts.join('\n\n') : null;
|
|
1298
1654
|
}
|
|
1299
1655
|
/**
|
|
1300
|
-
* Get collection by name from
|
|
1301
|
-
* Returns collection metadata from ~/.config/qmd/index.yml
|
|
1656
|
+
* Get collection by name from DB store_collections table.
|
|
1302
1657
|
*/
|
|
1303
1658
|
export function getCollectionByName(db, name) {
|
|
1304
|
-
const collection =
|
|
1659
|
+
const collection = getStoreCollection(db, name);
|
|
1305
1660
|
if (!collection)
|
|
1306
1661
|
return null;
|
|
1307
1662
|
return {
|
|
@@ -1312,10 +1667,10 @@ export function getCollectionByName(db, name) {
|
|
|
1312
1667
|
}
|
|
1313
1668
|
/**
|
|
1314
1669
|
* List all collections with document counts from database.
|
|
1315
|
-
* Merges
|
|
1670
|
+
* Merges store_collections config with database statistics.
|
|
1316
1671
|
*/
|
|
1317
1672
|
export function listCollections(db) {
|
|
1318
|
-
const collections =
|
|
1673
|
+
const collections = getStoreCollections(db);
|
|
1319
1674
|
// Get document counts from database for each collection
|
|
1320
1675
|
const result = collections.map(coll => {
|
|
1321
1676
|
const stats = db.prepare(`
|
|
@@ -1333,6 +1688,7 @@ export function listCollections(db) {
|
|
|
1333
1688
|
doc_count: stats?.doc_count || 0,
|
|
1334
1689
|
active_count: stats?.active_count || 0,
|
|
1335
1690
|
last_modified: stats?.last_modified || null,
|
|
1691
|
+
includeByDefault: coll.includeByDefault !== false,
|
|
1336
1692
|
};
|
|
1337
1693
|
});
|
|
1338
1694
|
return result;
|
|
@@ -1349,8 +1705,8 @@ export function removeCollection(db, collectionName) {
|
|
|
1349
1705
|
DELETE FROM content
|
|
1350
1706
|
WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
|
|
1351
1707
|
`).run();
|
|
1352
|
-
// Remove from
|
|
1353
|
-
|
|
1708
|
+
// Remove from store_collections
|
|
1709
|
+
deleteStoreCollection(db, collectionName);
|
|
1354
1710
|
return {
|
|
1355
1711
|
deletedDocs: docResult.changes,
|
|
1356
1712
|
cleanedHashes: cleanupResult.changes
|
|
@@ -1364,8 +1720,8 @@ export function renameCollection(db, oldName, newName) {
|
|
|
1364
1720
|
// Update all documents with the new collection name in database
|
|
1365
1721
|
db.prepare(`UPDATE documents SET collection = ? WHERE collection = ?`)
|
|
1366
1722
|
.run(newName, oldName);
|
|
1367
|
-
// Rename in
|
|
1368
|
-
|
|
1723
|
+
// Rename in store_collections
|
|
1724
|
+
renameStoreCollection(db, oldName, newName);
|
|
1369
1725
|
}
|
|
1370
1726
|
// =============================================================================
|
|
1371
1727
|
// Context Management Operations
|
|
@@ -1379,16 +1735,16 @@ export function insertContext(db, collectionId, pathPrefix, context) {
|
|
|
1379
1735
|
if (!coll) {
|
|
1380
1736
|
throw new Error(`Collection with id ${collectionId} not found`);
|
|
1381
1737
|
}
|
|
1382
|
-
//
|
|
1383
|
-
|
|
1738
|
+
// Add context to store_collections
|
|
1739
|
+
updateStoreContext(db, coll.name, pathPrefix, context);
|
|
1384
1740
|
}
|
|
1385
1741
|
/**
|
|
1386
1742
|
* Delete a context for a specific collection and path prefix.
|
|
1387
1743
|
* Returns the number of contexts deleted.
|
|
1388
1744
|
*/
|
|
1389
1745
|
export function deleteContext(db, collectionName, pathPrefix) {
    // Remove context from store_collections; the boolean outcome is mapped
    // to a deleted-count of 1 or 0.
    return removeStoreContext(db, collectionName, pathPrefix) ? 1 : 0;
}
|
|
1394
1750
|
/**
|
|
@@ -1398,12 +1754,12 @@ export function deleteContext(db, collectionName, pathPrefix) {
|
|
|
1398
1754
|
export function deleteGlobalContexts(db) {
|
|
1399
1755
|
let deletedCount = 0;
|
|
1400
1756
|
// Remove global context
|
|
1401
|
-
|
|
1757
|
+
setStoreGlobalContext(db, undefined);
|
|
1402
1758
|
deletedCount++;
|
|
1403
1759
|
// Remove root context (empty string) from all collections
|
|
1404
|
-
const collections =
|
|
1760
|
+
const collections = getStoreCollections(db);
|
|
1405
1761
|
for (const coll of collections) {
|
|
1406
|
-
const success =
|
|
1762
|
+
const success = removeStoreContext(db, coll.name, '');
|
|
1407
1763
|
if (success) {
|
|
1408
1764
|
deletedCount++;
|
|
1409
1765
|
}
|
|
@@ -1415,7 +1771,7 @@ export function deleteGlobalContexts(db) {
|
|
|
1415
1771
|
* Returns contexts ordered by collection name, then by path prefix length (longest first).
|
|
1416
1772
|
*/
|
|
1417
1773
|
export function listPathContexts(db) {
|
|
1418
|
-
const allContexts =
|
|
1774
|
+
const allContexts = getStoreContexts(db);
|
|
1419
1775
|
// Convert to expected format and sort
|
|
1420
1776
|
return allContexts.map(ctx => ({
|
|
1421
1777
|
collection_name: ctx.collection,
|
|
@@ -1438,7 +1794,7 @@ export function listPathContexts(db) {
|
|
|
1438
1794
|
* Get all collections (name only - from the DB store_collections table).
|
|
1439
1795
|
*/
|
|
1440
1796
|
export function getAllCollections(db) {
    // Keep only the name of each collection returned by getStoreCollections.
    return getStoreCollections(db).map(({ name }) => ({ name }));
}
|
|
1444
1800
|
/**
|
|
@@ -1446,11 +1802,11 @@ export function getAllCollections(db) {
|
|
|
1446
1802
|
* Returns collections that have no context entries at all (not even root context).
|
|
1447
1803
|
*/
|
|
1448
1804
|
export function getCollectionsWithoutContext(db) {
|
|
1449
|
-
// Get all collections from
|
|
1450
|
-
const
|
|
1805
|
+
// Get all collections from DB
|
|
1806
|
+
const allCollections = getStoreCollections(db);
|
|
1451
1807
|
// Filter to those without context
|
|
1452
1808
|
const collectionsWithoutContext = [];
|
|
1453
|
-
for (const coll of
|
|
1809
|
+
for (const coll of allCollections) {
|
|
1454
1810
|
// Check if collection has any context
|
|
1455
1811
|
if (!coll.context || Object.keys(coll.context).length === 0) {
|
|
1456
1812
|
// Get doc count from database
|
|
@@ -1478,13 +1834,13 @@ export function getTopLevelPathsWithoutContext(db, collectionName) {
|
|
|
1478
1834
|
SELECT DISTINCT path FROM documents
|
|
1479
1835
|
WHERE collection = ? AND active = 1
|
|
1480
1836
|
`).all(collectionName);
|
|
1481
|
-
// Get existing contexts for this collection from
|
|
1482
|
-
const
|
|
1483
|
-
if (!
|
|
1837
|
+
// Get existing contexts for this collection from DB
|
|
1838
|
+
const dbColl = getStoreCollection(db, collectionName);
|
|
1839
|
+
if (!dbColl)
|
|
1484
1840
|
return [];
|
|
1485
1841
|
const contextPrefixes = new Set();
|
|
1486
|
-
if (
|
|
1487
|
-
for (const prefix of Object.keys(
|
|
1842
|
+
if (dbColl.context) {
|
|
1843
|
+
for (const prefix of Object.keys(dbColl.context)) {
|
|
1488
1844
|
contextPrefixes.add(prefix);
|
|
1489
1845
|
}
|
|
1490
1846
|
}
|
|
@@ -1754,12 +2110,12 @@ export async function searchVec(db, query, model, limit = 20, collectionName, se
|
|
|
1754
2110
|
// =============================================================================
|
|
1755
2111
|
// Embeddings
|
|
1756
2112
|
// =============================================================================
|
|
1757
|
-
async function getEmbedding(text, model, isQuery, session) {
|
|
2113
|
+
async function getEmbedding(text, model, isQuery, session, llmOverride) {
|
|
1758
2114
|
// Format text using the appropriate prompt template
|
|
1759
2115
|
const formattedText = isQuery ? formatQueryForEmbedding(text, model) : formatDocForEmbedding(text, undefined, model);
|
|
1760
2116
|
const result = session
|
|
1761
2117
|
? await session.embed(formattedText, { model, isQuery })
|
|
1762
|
-
: await getDefaultLlamaCpp().embed(formattedText, { model, isQuery });
|
|
2118
|
+
: await (llmOverride ?? getDefaultLlamaCpp()).embed(formattedText, { model, isQuery });
|
|
1763
2119
|
return result?.embedding || null;
|
|
1764
2120
|
}
|
|
1765
2121
|
/**
|
|
@@ -1798,26 +2154,33 @@ export function insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt
|
|
|
1798
2154
|
// =============================================================================
|
|
1799
2155
|
// Query expansion
|
|
1800
2156
|
// =============================================================================
|
|
1801
|
-
export async function expandQuery(query, model = DEFAULT_QUERY_MODEL, db, intent) {
|
|
2157
|
+
export async function expandQuery(query, model = DEFAULT_QUERY_MODEL, db, intent, llmOverride) {
|
|
1802
2158
|
// Check cache first — stored as JSON preserving types
|
|
1803
2159
|
const cacheKey = getCacheKey("expandQuery", { query, model, ...(intent && { intent }) });
|
|
1804
2160
|
const cached = getCachedResult(db, cacheKey);
|
|
1805
2161
|
if (cached) {
|
|
1806
2162
|
try {
|
|
1807
|
-
|
|
2163
|
+
const parsed = JSON.parse(cached);
|
|
2164
|
+
// Migrate old cache format: { type, text } → { type, query }
|
|
2165
|
+
if (parsed.length > 0 && parsed[0].query) {
|
|
2166
|
+
return parsed;
|
|
2167
|
+
}
|
|
2168
|
+
else if (parsed.length > 0 && parsed[0].text) {
|
|
2169
|
+
return parsed.map((r) => ({ type: r.type, query: r.text }));
|
|
2170
|
+
}
|
|
1808
2171
|
}
|
|
1809
2172
|
catch {
|
|
1810
2173
|
// Old cache format (pre-typed, newline-separated text) — re-expand
|
|
1811
2174
|
}
|
|
1812
2175
|
}
|
|
1813
|
-
const llm = getDefaultLlamaCpp();
|
|
2176
|
+
const llm = llmOverride ?? getDefaultLlamaCpp();
|
|
1814
2177
|
// Note: LlamaCpp uses hardcoded model, model parameter is ignored
|
|
1815
2178
|
const results = await llm.expandQuery(query, { intent });
|
|
1816
2179
|
// Map Queryable[] → ExpandedQuery[] (same shape, decoupled from llm.ts internals).
|
|
1817
2180
|
// Filter out entries that duplicate the original query text.
|
|
1818
2181
|
const expanded = results
|
|
1819
2182
|
.filter(r => r.text !== query)
|
|
1820
|
-
.map(r => ({ type: r.type,
|
|
2183
|
+
.map(r => ({ type: r.type, query: r.text }));
|
|
1821
2184
|
if (expanded.length > 0) {
|
|
1822
2185
|
setCachedResult(db, cacheKey, JSON.stringify(expanded));
|
|
1823
2186
|
}
|
|
@@ -1826,7 +2189,7 @@ export async function expandQuery(query, model = DEFAULT_QUERY_MODEL, db, intent
|
|
|
1826
2189
|
// =============================================================================
|
|
1827
2190
|
// Reranking
|
|
1828
2191
|
// =============================================================================
|
|
1829
|
-
export async function rerank(query, documents, model = DEFAULT_RERANK_MODEL, db, intent) {
|
|
2192
|
+
export async function rerank(query, documents, model = DEFAULT_RERANK_MODEL, db, intent, llmOverride) {
|
|
1830
2193
|
// Prepend intent to rerank query so the reranker scores with domain context
|
|
1831
2194
|
const rerankQuery = intent ? `${intent}\n\n${query}` : query;
|
|
1832
2195
|
const cachedResults = new Map();
|
|
@@ -1849,7 +2212,7 @@ export async function rerank(query, documents, model = DEFAULT_RERANK_MODEL, db,
|
|
|
1849
2212
|
}
|
|
1850
2213
|
// Rerank uncached documents using LlamaCpp
|
|
1851
2214
|
if (uncachedDocsByChunk.size > 0) {
|
|
1852
|
-
const llm = getDefaultLlamaCpp();
|
|
2215
|
+
const llm = llmOverride ?? getDefaultLlamaCpp();
|
|
1853
2216
|
const uncachedDocs = [...uncachedDocsByChunk.values()];
|
|
1854
2217
|
const rerankResult = await llm.rerank(rerankQuery, uncachedDocs, { model });
|
|
1855
2218
|
// Cache results by chunk text so identical chunks across files are scored once.
|
|
@@ -2026,9 +2389,9 @@ export function findDocument(db, filename, options = {}) {
|
|
|
2026
2389
|
LIMIT 1
|
|
2027
2390
|
`).get(`%${filepath}`);
|
|
2028
2391
|
}
|
|
2029
|
-
// Try to match by absolute path (requires looking up collection paths from
|
|
2392
|
+
// Try to match by absolute path (requires looking up collection paths from DB)
|
|
2030
2393
|
if (!doc && !filepath.startsWith('qmd://')) {
|
|
2031
|
-
const collections =
|
|
2394
|
+
const collections = getStoreCollections(db);
|
|
2032
2395
|
for (const coll of collections) {
|
|
2033
2396
|
let relativePath = null;
|
|
2034
2397
|
// If filepath is absolute and starts with collection path, extract relative part
|
|
@@ -2088,9 +2451,9 @@ export function getDocumentBody(db, doc, fromLine, maxLines) {
|
|
|
2088
2451
|
WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
|
|
2089
2452
|
`).get(filepath);
|
|
2090
2453
|
}
|
|
2091
|
-
// Try absolute path by looking up in
|
|
2454
|
+
// Try absolute path by looking up in DB store_collections
|
|
2092
2455
|
if (!row) {
|
|
2093
|
-
const collections =
|
|
2456
|
+
const collections = getStoreCollections(db);
|
|
2094
2457
|
for (const coll of collections) {
|
|
2095
2458
|
if (filepath.startsWith(coll.path + '/')) {
|
|
2096
2459
|
const relativePath = filepath.slice(coll.path.length + 1);
|
|
@@ -2219,23 +2582,27 @@ export function findDocuments(db, pattern, options = {}) {
|
|
|
2219
2582
|
// Status
|
|
2220
2583
|
// =============================================================================
|
|
2221
2584
|
export function getStatus(db) {
|
|
2222
|
-
//
|
|
2223
|
-
const
|
|
2224
|
-
|
|
2225
|
-
|
|
2226
|
-
|
|
2227
|
-
|
|
2228
|
-
|
|
2229
|
-
|
|
2230
|
-
|
|
2231
|
-
|
|
2232
|
-
|
|
2585
|
+
// DB is source of truth for collections — config provides supplementary metadata
|
|
2586
|
+
const dbCollections = db.prepare(`
|
|
2587
|
+
SELECT
|
|
2588
|
+
collection as name,
|
|
2589
|
+
COUNT(*) as active_count,
|
|
2590
|
+
MAX(modified_at) as last_doc_update
|
|
2591
|
+
FROM documents
|
|
2592
|
+
WHERE active = 1
|
|
2593
|
+
GROUP BY collection
|
|
2594
|
+
`).all();
|
|
2595
|
+
// Build a lookup from store_collections for path/pattern metadata
|
|
2596
|
+
const storeCollections = getStoreCollections(db);
|
|
2597
|
+
const configLookup = new Map(storeCollections.map(c => [c.name, { path: c.path, pattern: c.pattern }]));
|
|
2598
|
+
const collections = dbCollections.map(row => {
|
|
2599
|
+
const config = configLookup.get(row.name);
|
|
2233
2600
|
return {
|
|
2234
|
-
name:
|
|
2235
|
-
path:
|
|
2236
|
-
pattern:
|
|
2237
|
-
documents:
|
|
2238
|
-
lastUpdated:
|
|
2601
|
+
name: row.name,
|
|
2602
|
+
path: config?.path ?? null,
|
|
2603
|
+
pattern: config?.pattern ?? null,
|
|
2604
|
+
documents: row.active_count,
|
|
2605
|
+
lastUpdated: row.last_doc_update || new Date().toISOString(),
|
|
2239
2606
|
};
|
|
2240
2607
|
});
|
|
2241
2608
|
// Sort by last update time (most recent first)
|
|
@@ -2382,6 +2749,7 @@ export async function hybridQuery(store, query, options) {
|
|
|
2382
2749
|
const collection = options?.collection;
|
|
2383
2750
|
const explain = options?.explain ?? false;
|
|
2384
2751
|
const intent = options?.intent;
|
|
2752
|
+
const skipRerank = options?.skipRerank ?? false;
|
|
2385
2753
|
const hooks = options?.hooks;
|
|
2386
2754
|
const rankedLists = [];
|
|
2387
2755
|
const rankedListMeta = [];
|
|
@@ -2425,7 +2793,7 @@ export async function hybridQuery(store, query, options) {
|
|
|
2425
2793
|
// 3a: Run FTS for all lex expansions right away (no LLM needed)
|
|
2426
2794
|
for (const q of expanded) {
|
|
2427
2795
|
if (q.type === 'lex') {
|
|
2428
|
-
const ftsResults = store.searchFTS(q.
|
|
2796
|
+
const ftsResults = store.searchFTS(q.query, 20, collection);
|
|
2429
2797
|
if (ftsResults.length > 0) {
|
|
2430
2798
|
for (const r of ftsResults)
|
|
2431
2799
|
docidMap.set(r.filepath, r.docid);
|
|
@@ -2433,7 +2801,7 @@ export async function hybridQuery(store, query, options) {
|
|
|
2433
2801
|
file: r.filepath, displayPath: r.displayPath,
|
|
2434
2802
|
title: r.title, body: r.body || "", score: r.score,
|
|
2435
2803
|
})));
|
|
2436
|
-
rankedListMeta.push({ source: "fts", queryType: "lex", query: q.
|
|
2804
|
+
rankedListMeta.push({ source: "fts", queryType: "lex", query: q.query });
|
|
2437
2805
|
}
|
|
2438
2806
|
}
|
|
2439
2807
|
}
|
|
@@ -2444,11 +2812,11 @@ export async function hybridQuery(store, query, options) {
|
|
|
2444
2812
|
];
|
|
2445
2813
|
for (const q of expanded) {
|
|
2446
2814
|
if (q.type === 'vec' || q.type === 'hyde') {
|
|
2447
|
-
vecQueries.push({ text: q.
|
|
2815
|
+
vecQueries.push({ text: q.query, queryType: q.type });
|
|
2448
2816
|
}
|
|
2449
2817
|
}
|
|
2450
2818
|
// Batch embed all vector queries in a single call
|
|
2451
|
-
const llm =
|
|
2819
|
+
const llm = getLlm(store);
|
|
2452
2820
|
const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text));
|
|
2453
2821
|
hooks?.onEmbedStart?.(textsToEmbed.length);
|
|
2454
2822
|
const embedStart = Date.now();
|
|
@@ -2486,7 +2854,6 @@ export async function hybridQuery(store, query, options) {
|
|
|
2486
2854
|
// Reranking full bodies is O(tokens) — the critical perf lesson that motivated this refactor.
|
|
2487
2855
|
const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 2);
|
|
2488
2856
|
const intentTerms = intent ? extractIntentTerms(intent) : [];
|
|
2489
|
-
const chunksToRerank = [];
|
|
2490
2857
|
const docChunkMap = new Map();
|
|
2491
2858
|
for (const cand of candidates) {
|
|
2492
2859
|
const chunks = chunkDocument(cand.body);
|
|
@@ -2508,10 +2875,65 @@ export async function hybridQuery(store, query, options) {
|
|
|
2508
2875
|
bestIdx = i;
|
|
2509
2876
|
}
|
|
2510
2877
|
}
|
|
2511
|
-
chunksToRerank.push({ file: cand.file, text: chunks[bestIdx].text });
|
|
2512
2878
|
docChunkMap.set(cand.file, { chunks, bestIdx });
|
|
2513
2879
|
}
|
|
2880
|
+
if (skipRerank) {
|
|
2881
|
+
// Skip LLM reranking — return candidates scored by RRF only
|
|
2882
|
+
const seenFiles = new Set();
|
|
2883
|
+
return candidates
|
|
2884
|
+
.map((cand, i) => {
|
|
2885
|
+
const chunkInfo = docChunkMap.get(cand.file);
|
|
2886
|
+
const bestIdx = chunkInfo?.bestIdx ?? 0;
|
|
2887
|
+
const bestChunk = chunkInfo?.chunks[bestIdx]?.text || cand.body || "";
|
|
2888
|
+
const bestChunkPos = chunkInfo?.chunks[bestIdx]?.pos || 0;
|
|
2889
|
+
const rrfRank = i + 1;
|
|
2890
|
+
const rrfScore = 1 / rrfRank;
|
|
2891
|
+
const trace = rrfTraceByFile?.get(cand.file);
|
|
2892
|
+
const explainData = explain ? {
|
|
2893
|
+
ftsScores: trace?.contributions.filter(c => c.source === "fts").map(c => c.backendScore) ?? [],
|
|
2894
|
+
vectorScores: trace?.contributions.filter(c => c.source === "vec").map(c => c.backendScore) ?? [],
|
|
2895
|
+
rrf: {
|
|
2896
|
+
rank: rrfRank,
|
|
2897
|
+
positionScore: rrfScore,
|
|
2898
|
+
weight: 1.0,
|
|
2899
|
+
baseScore: trace?.baseScore ?? 0,
|
|
2900
|
+
topRankBonus: trace?.topRankBonus ?? 0,
|
|
2901
|
+
totalScore: trace?.totalScore ?? 0,
|
|
2902
|
+
contributions: trace?.contributions ?? [],
|
|
2903
|
+
},
|
|
2904
|
+
rerankScore: 0,
|
|
2905
|
+
blendedScore: rrfScore,
|
|
2906
|
+
} : undefined;
|
|
2907
|
+
return {
|
|
2908
|
+
file: cand.file,
|
|
2909
|
+
displayPath: cand.displayPath,
|
|
2910
|
+
title: cand.title,
|
|
2911
|
+
body: cand.body,
|
|
2912
|
+
bestChunk,
|
|
2913
|
+
bestChunkPos,
|
|
2914
|
+
score: rrfScore,
|
|
2915
|
+
context: store.getContextForFile(cand.file),
|
|
2916
|
+
docid: docidMap.get(cand.file) || "",
|
|
2917
|
+
...(explainData ? { explain: explainData } : {}),
|
|
2918
|
+
};
|
|
2919
|
+
})
|
|
2920
|
+
.filter(r => {
|
|
2921
|
+
if (seenFiles.has(r.file))
|
|
2922
|
+
return false;
|
|
2923
|
+
seenFiles.add(r.file);
|
|
2924
|
+
return true;
|
|
2925
|
+
})
|
|
2926
|
+
.filter(r => r.score >= minScore)
|
|
2927
|
+
.slice(0, limit);
|
|
2928
|
+
}
|
|
2514
2929
|
// Step 6: Rerank chunks (NOT full bodies)
|
|
2930
|
+
const chunksToRerank = [];
|
|
2931
|
+
for (const cand of candidates) {
|
|
2932
|
+
const chunkInfo = docChunkMap.get(cand.file);
|
|
2933
|
+
if (chunkInfo) {
|
|
2934
|
+
chunksToRerank.push({ file: cand.file, text: chunkInfo.chunks[chunkInfo.bestIdx].text });
|
|
2935
|
+
}
|
|
2936
|
+
}
|
|
2515
2937
|
hooks?.onRerankStart?.(chunksToRerank.length);
|
|
2516
2938
|
const rerankStart = Date.now();
|
|
2517
2939
|
const reranked = await store.rerank(query, chunksToRerank, undefined, intent);
|
|
@@ -2602,7 +3024,7 @@ export async function vectorSearchQuery(store, query, options) {
|
|
|
2602
3024
|
const vecExpanded = allExpanded.filter(q => q.type !== 'lex');
|
|
2603
3025
|
options?.hooks?.onExpand?.(query, vecExpanded, Date.now() - expandStart);
|
|
2604
3026
|
// Run original + vec/hyde expanded through vector, sequentially — concurrent embed() hangs
|
|
2605
|
-
const queryTexts = [query, ...vecExpanded.map(q => q.
|
|
3027
|
+
const queryTexts = [query, ...vecExpanded.map(q => q.query)];
|
|
2606
3028
|
const allResults = new Map();
|
|
2607
3029
|
for (const q of queryTexts) {
|
|
2608
3030
|
const vecResults = await store.searchVec(q, DEFAULT_EMBED_MODEL, limit, collection);
|
|
@@ -2650,6 +3072,7 @@ export async function structuredSearch(store, searches, options) {
|
|
|
2650
3072
|
const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT;
|
|
2651
3073
|
const explain = options?.explain ?? false;
|
|
2652
3074
|
const intent = options?.intent;
|
|
3075
|
+
const skipRerank = options?.skipRerank ?? false;
|
|
2653
3076
|
const hooks = options?.hooks;
|
|
2654
3077
|
const collections = options?.collections;
|
|
2655
3078
|
if (searches.length === 0)
|
|
@@ -2704,7 +3127,7 @@ export async function structuredSearch(store, searches, options) {
|
|
|
2704
3127
|
if (hasVectors) {
|
|
2705
3128
|
const vecSearches = searches.filter((s) => s.type === 'vec' || s.type === 'hyde');
|
|
2706
3129
|
if (vecSearches.length > 0) {
|
|
2707
|
-
const llm =
|
|
3130
|
+
const llm = getLlm(store);
|
|
2708
3131
|
const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query));
|
|
2709
3132
|
hooks?.onEmbedStart?.(textsToEmbed.length);
|
|
2710
3133
|
const embedStart = Date.now();
|
|
@@ -2750,7 +3173,6 @@ export async function structuredSearch(store, searches, options) {
|
|
|
2750
3173
|
|| searches[0]?.query || "";
|
|
2751
3174
|
const queryTerms = primaryQuery.toLowerCase().split(/\s+/).filter(t => t.length > 2);
|
|
2752
3175
|
const intentTerms = intent ? extractIntentTerms(intent) : [];
|
|
2753
|
-
const chunksToRerank = [];
|
|
2754
3176
|
const docChunkMap = new Map();
|
|
2755
3177
|
for (const cand of candidates) {
|
|
2756
3178
|
const chunks = chunkDocument(cand.body);
|
|
@@ -2772,10 +3194,65 @@ export async function structuredSearch(store, searches, options) {
|
|
|
2772
3194
|
bestIdx = i;
|
|
2773
3195
|
}
|
|
2774
3196
|
}
|
|
2775
|
-
chunksToRerank.push({ file: cand.file, text: chunks[bestIdx].text });
|
|
2776
3197
|
docChunkMap.set(cand.file, { chunks, bestIdx });
|
|
2777
3198
|
}
|
|
3199
|
+
if (skipRerank) {
|
|
3200
|
+
// Skip LLM reranking — return candidates scored by RRF only
|
|
3201
|
+
const seenFiles = new Set();
|
|
3202
|
+
return candidates
|
|
3203
|
+
.map((cand, i) => {
|
|
3204
|
+
const chunkInfo = docChunkMap.get(cand.file);
|
|
3205
|
+
const bestIdx = chunkInfo?.bestIdx ?? 0;
|
|
3206
|
+
const bestChunk = chunkInfo?.chunks[bestIdx]?.text || cand.body || "";
|
|
3207
|
+
const bestChunkPos = chunkInfo?.chunks[bestIdx]?.pos || 0;
|
|
3208
|
+
const rrfRank = i + 1;
|
|
3209
|
+
const rrfScore = 1 / rrfRank;
|
|
3210
|
+
const trace = rrfTraceByFile?.get(cand.file);
|
|
3211
|
+
const explainData = explain ? {
|
|
3212
|
+
ftsScores: trace?.contributions.filter(c => c.source === "fts").map(c => c.backendScore) ?? [],
|
|
3213
|
+
vectorScores: trace?.contributions.filter(c => c.source === "vec").map(c => c.backendScore) ?? [],
|
|
3214
|
+
rrf: {
|
|
3215
|
+
rank: rrfRank,
|
|
3216
|
+
positionScore: rrfScore,
|
|
3217
|
+
weight: 1.0,
|
|
3218
|
+
baseScore: trace?.baseScore ?? 0,
|
|
3219
|
+
topRankBonus: trace?.topRankBonus ?? 0,
|
|
3220
|
+
totalScore: trace?.totalScore ?? 0,
|
|
3221
|
+
contributions: trace?.contributions ?? [],
|
|
3222
|
+
},
|
|
3223
|
+
rerankScore: 0,
|
|
3224
|
+
blendedScore: rrfScore,
|
|
3225
|
+
} : undefined;
|
|
3226
|
+
return {
|
|
3227
|
+
file: cand.file,
|
|
3228
|
+
displayPath: cand.displayPath,
|
|
3229
|
+
title: cand.title,
|
|
3230
|
+
body: cand.body,
|
|
3231
|
+
bestChunk,
|
|
3232
|
+
bestChunkPos,
|
|
3233
|
+
score: rrfScore,
|
|
3234
|
+
context: store.getContextForFile(cand.file),
|
|
3235
|
+
docid: docidMap.get(cand.file) || "",
|
|
3236
|
+
...(explainData ? { explain: explainData } : {}),
|
|
3237
|
+
};
|
|
3238
|
+
})
|
|
3239
|
+
.filter(r => {
|
|
3240
|
+
if (seenFiles.has(r.file))
|
|
3241
|
+
return false;
|
|
3242
|
+
seenFiles.add(r.file);
|
|
3243
|
+
return true;
|
|
3244
|
+
})
|
|
3245
|
+
.filter(r => r.score >= minScore)
|
|
3246
|
+
.slice(0, limit);
|
|
3247
|
+
}
|
|
2778
3248
|
// Step 5: Rerank chunks
|
|
3249
|
+
const chunksToRerank = [];
|
|
3250
|
+
for (const cand of candidates) {
|
|
3251
|
+
const chunkInfo = docChunkMap.get(cand.file);
|
|
3252
|
+
if (chunkInfo) {
|
|
3253
|
+
chunksToRerank.push({ file: cand.file, text: chunkInfo.chunks[chunkInfo.bestIdx].text });
|
|
3254
|
+
}
|
|
3255
|
+
}
|
|
2779
3256
|
hooks?.onRerankStart?.(chunksToRerank.length);
|
|
2780
3257
|
const rerankStart2 = Date.now();
|
|
2781
3258
|
const reranked = await store.rerank(primaryQuery, chunksToRerank, undefined, intent);
|