@comfanion/usethis_search 4.4.0 → 4.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/api.ts +34 -17
- package/cache/manager.ts +30 -19
- package/cli.ts +8 -5
- package/file-indexer.ts +28 -11
- package/hooks/message-before.ts +5 -5
- package/hooks/tool-substitution.ts +4 -120
- package/index.ts +17 -6
- package/package.json +3 -2
- package/tools/codeindex.ts +192 -184
- package/tools/graph.ts +265 -0
- package/tools/read-interceptor.ts +7 -3
- package/tools/search.ts +268 -190
- package/tools/workspace-state.ts +1 -2
- package/tools/workspace.ts +76 -108
- package/vectorizer/analyzers/lsp-client.ts +52 -6
- package/vectorizer/chunkers/chunker-factory.ts +6 -0
- package/vectorizer/chunkers/code-chunker.ts +73 -16
- package/vectorizer/chunkers/lsp-chunker.ts +313 -191
- package/vectorizer/graph-db.ts +6 -4
- package/vectorizer/index.ts +329 -134
- package/vectorizer/usage-tracker.ts +36 -0
- package/vectorizer.yaml +2 -2
package/vectorizer/index.ts
CHANGED
|
@@ -21,6 +21,7 @@ import { UsageTracker } from "./usage-tracker.ts";
|
|
|
21
21
|
import { ChunkStore } from "./chunk-store.ts";
|
|
22
22
|
import { decomposeQuery, rrfMerge, DEFAULT_DECOMPOSER_CONFIG } from "./query-decomposer.ts";
|
|
23
23
|
import type { DecomposerConfig } from "./query-decomposer.ts";
|
|
24
|
+
import { glob } from "glob";
|
|
24
25
|
|
|
25
26
|
// Suppress transformers.js logs unless DEBUG is set
|
|
26
27
|
const DEBUG = process.env.DEBUG?.includes("vectorizer") || process.env.DEBUG === "*";
|
|
@@ -106,8 +107,8 @@ const DEFAULT_WORKSPACE_CONFIG = {
|
|
|
106
107
|
maxFiles: 30, // Max number of files in workspace
|
|
107
108
|
attachTopN: 5, // Top N search results to attach with full content
|
|
108
109
|
attachRelatedPerFile: 3, // Max graph relations per main file
|
|
109
|
-
minScoreMain: 0.65, // Min score for main files
|
|
110
|
-
minScoreRelated: 0.5, // Min score for graph relations
|
|
110
|
+
minScoreMain: 0.65, // Min score for main files (override in vectorizer.yaml)
|
|
111
|
+
minScoreRelated: 0.5, // Min score for graph relations (override in vectorizer.yaml)
|
|
111
112
|
persistContent: false, // Save full content in snapshots (debug mode)
|
|
112
113
|
autoPruneSearch: true, // Replace old search outputs with compact summaries
|
|
113
114
|
substituteToolOutputs: true, // Replace tool outputs when files in workspace
|
|
@@ -498,6 +499,54 @@ function clearQueryCache() {
|
|
|
498
499
|
}
|
|
499
500
|
}
|
|
500
501
|
|
|
502
|
+
// ── Shared ONNX model singleton ─────────────────────────────────────────────
|
|
503
|
+
// Model lives for the entire session — not tied to indexer pool TTL.
|
|
504
|
+
// Indexer eviction clears caches/DB but model stays loaded for fast search.
|
|
505
|
+
let _sharedModel: any = null
|
|
506
|
+
let _sharedModelPromise: Promise<any> | null = null
|
|
507
|
+
|
|
508
|
+
/**
 * Return the session-wide embedding pipeline, loading it on first use.
 *
 * Concurrent callers share a single in-flight load. A failed load is never
 * cached: the in-flight slot is cleared only after the load has settled, so
 * the next call always starts a fresh attempt.
 *
 * @param retries Maximum number of load attempts (default 3). Values below 1
 *   or NaN are treated as 1 so that at least one attempt is always made.
 * @returns The loaded feature-extraction pipeline.
 * @throws Error when every attempt fails; the message includes the last failure.
 */
async function getSharedModel(retries = 3): Promise<any> {
  if (_sharedModel) return _sharedModel

  // Prevent concurrent loads — reuse in-flight promise
  if (_sharedModelPromise) return _sharedModelPromise

  const maxAttempts = Math.max(1, Math.floor(retries) || 1)

  // Thrown values are not guaranteed to be Error instances.
  const describe = (err: unknown): string =>
    (err instanceof Error ? err.message : String(err ?? "")) || "unknown"

  const load = async (): Promise<any> => {
    let lastError: unknown = null
    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
      try {
        if (DEBUG) console.log(`[vectorizer] Loading embedding model: ${EMBEDDING_MODEL}... (attempt ${attempt}/${maxAttempts})`)
        _sharedModel = await pipeline("feature-extraction", EMBEDDING_MODEL, {
          progress_callback: DEBUG ? undefined : null,
        })
        if (DEBUG) console.log(`[vectorizer] Model loaded: ${EMBEDDING_MODEL}`)
        return _sharedModel
      } catch (error) {
        lastError = error
        if (attempt < maxAttempts) {
          // Linear back-off: 2s, 4s, ...
          const delay = attempt * 2000
          if (DEBUG) console.log(`[vectorizer] Model load attempt ${attempt} failed: ${describe(lastError)}. Retrying in ${delay}ms...`)
          await new Promise(r => setTimeout(r, delay))
        }
      }
    }
    throw new Error(`Model loading failed after ${maxAttempts} attempts: ${describe(lastError)}`)
  }

  // Clear the slot in finally(): its callback always runs asynchronously, i.e.
  // after the assignment below, even when load() fails before its first await.
  // The identity check avoids clobbering a newer load started in the meantime.
  const inFlight: Promise<any> = load().finally(() => {
    if (_sharedModelPromise === inFlight) _sharedModelPromise = null
  })
  _sharedModelPromise = inFlight
  return inFlight
}
|
|
540
|
+
|
|
541
|
+
/**
 * Release the shared embedding model and reset the singleton state.
 *
 * Intended for process exit or explicit cleanup only. A failure inside the
 * model's own dispose() is ignored (best effort). The in-flight load slot is
 * reset whether or not a model was loaded.
 */
async function disposeSharedModel(): Promise<void> {
  if (!_sharedModel) {
    _sharedModelPromise = null
    return
  }

  try {
    await _sharedModel.dispose()
  } catch {
    /* best effort */
  }
  _sharedModel = null
  _sharedModelPromise = null
}
|
|
549
|
+
|
|
501
550
|
class CodebaseIndexer {
|
|
502
551
|
constructor(projectRoot, indexName = "code") {
|
|
503
552
|
this.root = projectRoot;
|
|
@@ -566,21 +615,14 @@ class CodebaseIndexer {
|
|
|
566
615
|
|
|
567
616
|
/**
 * Return this indexer's embedding model, attaching the shared singleton on
 * first use. Loading (including retries) is delegated to getSharedModel().
 */
async loadModel() {
  if (this.model) return this.model;

  this.model = await getSharedModel();
  return this.model;
}
|
|
582
622
|
|
|
583
623
|
async unloadModel() {
|
|
624
|
+
// Drop reference to shared model — do NOT dispose it.
|
|
625
|
+
// Model singleton lives for the entire session.
|
|
584
626
|
this.model = null;
|
|
585
627
|
// Release BM25 data held in memory
|
|
586
628
|
if (this.bm25) {
|
|
@@ -609,6 +651,14 @@ class CodebaseIndexer {
|
|
|
609
651
|
try { await this.usageTracker.save(); } catch { /* best effort */ }
|
|
610
652
|
this.usageTracker = null;
|
|
611
653
|
}
|
|
654
|
+
// Close LanceDB connection (releases file handles)
|
|
655
|
+
if (this.db) {
|
|
656
|
+
try {
|
|
657
|
+
// LanceDB v0.x doesn't expose close() — drop reference to release
|
|
658
|
+
// Arrow/IPC file handles are released when Connection is GC'd
|
|
659
|
+
this.db = null;
|
|
660
|
+
} catch { /* best effort */ }
|
|
661
|
+
}
|
|
612
662
|
this._chunkCache = null;
|
|
613
663
|
clearQueryCache();
|
|
614
664
|
if (global.gc) global.gc();
|
|
@@ -716,6 +766,8 @@ class CodebaseIndexer {
|
|
|
716
766
|
return null; // unchanged
|
|
717
767
|
}
|
|
718
768
|
|
|
769
|
+
if (DEBUG) console.log(`[vectorizer] prepareFile: ${relPath} [read]`);
|
|
770
|
+
|
|
719
771
|
// Extract metadata
|
|
720
772
|
const fileMeta = await extractFileMetadata(filePath, content);
|
|
721
773
|
const archived = this.isArchived(relPath, content);
|
|
@@ -723,9 +775,13 @@ class CodebaseIndexer {
|
|
|
723
775
|
// Clean content before chunking
|
|
724
776
|
const cleaned = cleanContent(content, fileMeta.file_type, CLEANING_CONFIG);
|
|
725
777
|
|
|
778
|
+
if (DEBUG) console.log(`[vectorizer] prepareFile: ${relPath} [chunking]`);
|
|
779
|
+
|
|
726
780
|
// Semantic chunking (async for LSP-based chunking)
|
|
727
781
|
const chunks = await chunkContent(cleaned, fileMeta.file_type, fileMeta.language, CHUNKING_CONFIG, filePath, this.root);
|
|
728
782
|
|
|
783
|
+
if (DEBUG) console.log(`[vectorizer] prepareFile: ${relPath} [${chunks.length} chunks, building graph]`);
|
|
784
|
+
|
|
729
785
|
// Assign chunk IDs
|
|
730
786
|
const chunksWithIds = this.graphBuilder
|
|
731
787
|
? this.graphBuilder.assignChunkIds(relPath, chunks)
|
|
@@ -753,23 +809,39 @@ class CodebaseIndexer {
|
|
|
753
809
|
}
|
|
754
810
|
|
|
755
811
|
// Return prepared rows (without vector — Phase 2 fills it)
|
|
756
|
-
const rows = chunksWithIds.map((chunk, i) =>
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
content
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
812
|
+
const rows = chunksWithIds.map((chunk, i) => {
|
|
813
|
+
// Build metadata prefix for embedding enrichment.
|
|
814
|
+
// This helps the embedding model associate function/class names with the code,
|
|
815
|
+
// improving search relevance for queries like "ensureBudget" or "WorkspaceCache".
|
|
816
|
+
// The prefix is stored in a separate field — original content stays clean.
|
|
817
|
+
const metaParts: string[] = []
|
|
818
|
+
if (relPath) metaParts.push(`File: ${relPath}`)
|
|
819
|
+
if (chunk.class_name) metaParts.push(`Class: ${chunk.class_name}`)
|
|
820
|
+
if (chunk.function_name) metaParts.push(`Method: ${chunk.function_name}`)
|
|
821
|
+
if (chunk.heading_context) metaParts.push(`Section: ${chunk.heading_context}`)
|
|
822
|
+
const metaPrefix = metaParts.length > 0 ? `// ${metaParts.join(" | ")}\n` : ""
|
|
823
|
+
|
|
824
|
+
return {
|
|
825
|
+
chunk_id: chunk.chunk_id,
|
|
826
|
+
file: relPath,
|
|
827
|
+
chunk_index: i,
|
|
828
|
+
content: chunk.content,
|
|
829
|
+
// Enriched content for embedding — metadata prefix + original content.
|
|
830
|
+
// Phase 2 embeds this instead of raw content.
|
|
831
|
+
content_for_embedding: metaPrefix + chunk.content,
|
|
832
|
+
archived,
|
|
833
|
+
file_type: fileMeta.file_type,
|
|
834
|
+
language: fileMeta.language,
|
|
835
|
+
last_modified: fileMeta.last_modified,
|
|
836
|
+
file_size: fileMeta.file_size,
|
|
837
|
+
heading_context: chunk.heading_context || "",
|
|
838
|
+
function_name: chunk.function_name || "",
|
|
839
|
+
class_name: chunk.class_name || "",
|
|
840
|
+
tags: (fileMeta.tags || []).join(","),
|
|
841
|
+
start_line: chunk.start_line ?? -1,
|
|
842
|
+
end_line: chunk.end_line ?? -1,
|
|
843
|
+
}
|
|
844
|
+
});
|
|
773
845
|
|
|
774
846
|
// Store chunks in ChunkStore (Phase 1 — BM25 available immediately)
|
|
775
847
|
if (this.chunkStore) {
|
|
@@ -781,6 +853,7 @@ class CodebaseIndexer {
|
|
|
781
853
|
}
|
|
782
854
|
}
|
|
783
855
|
|
|
856
|
+
if (DEBUG) console.log(`[vectorizer] prepareFile: ${relPath} [done, ${rows.length} rows]`);
|
|
784
857
|
return { relPath, hash, rows };
|
|
785
858
|
}
|
|
786
859
|
|
|
@@ -805,19 +878,24 @@ class CodebaseIndexer {
|
|
|
805
878
|
|
|
806
879
|
// Batch embed
|
|
807
880
|
const allData = [];
|
|
881
|
+
let embedErrors = 0;
|
|
808
882
|
for (let i = 0; i < allRows.length; i += batchSize) {
|
|
809
883
|
const batch = allRows.slice(i, i + batchSize);
|
|
810
|
-
const texts = batch.map(r => r.content);
|
|
811
|
-
|
|
812
|
-
// Embed batch — @xenova/transformers processes array inputs efficiently
|
|
813
|
-
const embeddings = [];
|
|
814
|
-
for (const text of texts) {
|
|
815
|
-
const result = await model(text, { pooling: "mean", normalize: true });
|
|
816
|
-
embeddings.push(Array.from(result.data));
|
|
817
|
-
}
|
|
818
884
|
|
|
819
|
-
for (
|
|
820
|
-
|
|
885
|
+
for (const row of batch) {
|
|
886
|
+
try {
|
|
887
|
+
// Use enriched content (with metadata prefix) for embedding,
|
|
888
|
+
// but store original content in LanceDB for display.
|
|
889
|
+
const textToEmbed = row.content_for_embedding || row.content;
|
|
890
|
+
const result = await model(textToEmbed, { pooling: "mean", normalize: true });
|
|
891
|
+
// Don't store content_for_embedding in LanceDB — it's only for embedding
|
|
892
|
+
const { content_for_embedding, ...rowWithoutEmbeddingText } = row;
|
|
893
|
+
allData.push({ ...rowWithoutEmbeddingText, vector: Array.from(result.data) });
|
|
894
|
+
} catch (e) {
|
|
895
|
+
embedErrors++;
|
|
896
|
+
if (DEBUG) console.log(`[vectorizer] Embed failed for ${row.chunk_id || row.file}: ${(e as Error).message}`);
|
|
897
|
+
// Skip this chunk — don't let one bad chunk kill the entire index
|
|
898
|
+
}
|
|
821
899
|
}
|
|
822
900
|
|
|
823
901
|
if (onProgress) {
|
|
@@ -825,14 +903,25 @@ class CodebaseIndexer {
|
|
|
825
903
|
}
|
|
826
904
|
}
|
|
827
905
|
|
|
906
|
+
if (embedErrors > 0 && DEBUG) {
|
|
907
|
+
console.log(`[vectorizer] ${embedErrors} chunks failed to embed (skipped)`);
|
|
908
|
+
}
|
|
909
|
+
|
|
828
910
|
// Bulk store in LanceDB
|
|
911
|
+
if (allData.length === 0) return 0;
|
|
912
|
+
|
|
829
913
|
const tableName = "chunks";
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
914
|
+
try {
|
|
915
|
+
const tables = await this.db.tableNames();
|
|
916
|
+
if (tables.includes(tableName)) {
|
|
917
|
+
const table = await this.db.openTable(tableName);
|
|
918
|
+
await table.add(allData);
|
|
919
|
+
} else {
|
|
920
|
+
await this.db.createTable(tableName, allData);
|
|
921
|
+
}
|
|
922
|
+
} catch (e) {
|
|
923
|
+
if (DEBUG) console.log(`[vectorizer] LanceDB store failed: ${(e as Error).message}`);
|
|
924
|
+
throw e; // Re-throw — caller (indexAll) will catch and log
|
|
836
925
|
}
|
|
837
926
|
|
|
838
927
|
// Update hashes + mark vectorized in ChunkStore
|
|
@@ -876,8 +965,8 @@ class CodebaseIndexer {
|
|
|
876
965
|
// Clean content before chunking
|
|
877
966
|
const cleaned = cleanContent(content, fileMeta.file_type, CLEANING_CONFIG);
|
|
878
967
|
|
|
879
|
-
// Semantic chunking
|
|
880
|
-
const chunks = chunkContent(cleaned, fileMeta.file_type, fileMeta.language, CHUNKING_CONFIG);
|
|
968
|
+
// Semantic chunking (await for LSP-based chunking when filePath is provided)
|
|
969
|
+
const chunks = await chunkContent(cleaned, fileMeta.file_type, fileMeta.language, CHUNKING_CONFIG, filePath, this.root);
|
|
881
970
|
|
|
882
971
|
// v3: Assign chunk IDs for graph tracking (works without graph — just adds IDs)
|
|
883
972
|
const chunksWithIds = this.graphBuilder
|
|
@@ -916,12 +1005,22 @@ class CodebaseIndexer {
|
|
|
916
1005
|
|
|
917
1006
|
const data = [];
|
|
918
1007
|
for (let i = 0; i < chunksWithIds.length; i++) {
|
|
919
|
-
const
|
|
1008
|
+
const chunk = chunksWithIds[i];
|
|
1009
|
+
// Build metadata prefix for embedding enrichment (same as prepareFile)
|
|
1010
|
+
const metaParts: string[] = [];
|
|
1011
|
+
if (relPath) metaParts.push(`File: ${relPath}`);
|
|
1012
|
+
if (chunk.class_name) metaParts.push(`Class: ${chunk.class_name}`);
|
|
1013
|
+
if (chunk.function_name) metaParts.push(`Method: ${chunk.function_name}`);
|
|
1014
|
+
if (chunk.heading_context) metaParts.push(`Section: ${chunk.heading_context}`);
|
|
1015
|
+
const metaPrefix = metaParts.length > 0 ? `// ${metaParts.join(" | ")}\n` : "";
|
|
1016
|
+
const textToEmbed = metaPrefix + chunk.content;
|
|
1017
|
+
|
|
1018
|
+
const embedding = await this.embed(textToEmbed);
|
|
920
1019
|
data.push({
|
|
921
|
-
chunk_id:
|
|
1020
|
+
chunk_id: chunk.chunk_id,
|
|
922
1021
|
file: relPath,
|
|
923
1022
|
chunk_index: i,
|
|
924
|
-
content:
|
|
1023
|
+
content: chunk.content,
|
|
925
1024
|
vector: embedding,
|
|
926
1025
|
archived: archived,
|
|
927
1026
|
// v2 metadata
|
|
@@ -929,13 +1028,13 @@ class CodebaseIndexer {
|
|
|
929
1028
|
language: fileMeta.language,
|
|
930
1029
|
last_modified: fileMeta.last_modified,
|
|
931
1030
|
file_size: fileMeta.file_size,
|
|
932
|
-
heading_context:
|
|
933
|
-
function_name:
|
|
934
|
-
class_name:
|
|
1031
|
+
heading_context: chunk.heading_context || "",
|
|
1032
|
+
function_name: chunk.function_name || "",
|
|
1033
|
+
class_name: chunk.class_name || "",
|
|
935
1034
|
tags: (fileMeta.tags || []).join(","),
|
|
936
1035
|
// Line numbers for "from-to" extraction (default to -1 when unknown)
|
|
937
|
-
start_line:
|
|
938
|
-
end_line:
|
|
1036
|
+
start_line: chunk.start_line ?? -1,
|
|
1037
|
+
end_line: chunk.end_line ?? -1,
|
|
939
1038
|
});
|
|
940
1039
|
}
|
|
941
1040
|
|
|
@@ -1110,7 +1209,13 @@ class CodebaseIndexer {
|
|
|
1110
1209
|
content: neighborChunk.content,
|
|
1111
1210
|
relation: edge.predicate,
|
|
1112
1211
|
score,
|
|
1113
|
-
via: edge.source
|
|
1212
|
+
via: edge.source,
|
|
1213
|
+
start_line: neighborChunk.start_line,
|
|
1214
|
+
end_line: neighborChunk.end_line,
|
|
1215
|
+
chunk_index: neighborChunk.chunk_index,
|
|
1216
|
+
language: neighborChunk.language,
|
|
1217
|
+
function_name: neighborChunk.function_name,
|
|
1218
|
+
class_name: neighborChunk.class_name,
|
|
1114
1219
|
});
|
|
1115
1220
|
}
|
|
1116
1221
|
|
|
@@ -1374,69 +1479,99 @@ class CodebaseIndexer {
|
|
|
1374
1479
|
}
|
|
1375
1480
|
|
|
1376
1481
|
/**
 * Look up a single chunk by ID.
 *
 * Resolution order: bounded in-memory LRU cache, then ChunkStore, then a
 * LanceDB point query on the "chunks" table. A found chunk is inserted into
 * the cache (capacity 500, least-recently-used entry evicted first).
 *
 * @param chunkId ID of the chunk to find.
 * @returns The chunk row, or null when no store has it (lookup failures are
 *   treated as "not found").
 */
async findChunkById(chunkId) {
  if (!this._chunkCache) {
    this._chunkCache = new Map();
  }

  // Cache hit: delete + re-insert moves the entry to the most-recent end,
  // since Map iterates in insertion order.
  const hit = this._chunkCache.get(chunkId);
  if (hit) {
    this._chunkCache.delete(chunkId);
    this._chunkCache.set(chunkId, hit);
    return hit;
  }

  // First choice: ChunkStore point lookup.
  let found = null;
  if (this.chunkStore) {
    try {
      found = this.chunkStore.getChunkById(chunkId);
    } catch {
      // best effort
    }
  }

  // Fallback: LanceDB point lookup, only when ChunkStore returned nothing.
  if (!found) {
    try {
      const tableNames = await this.db.tableNames();
      if (tableNames.includes("chunks")) {
        const table = await this.db.openTable("chunks");
        // Double single quotes so the ID cannot break out of the filter literal.
        const escapedId = chunkId.replace(/'/g, "''");
        const matches = await table.filter(`chunk_id = '${escapedId}'`).limit(1).execute();
        if (matches.length > 0) {
          found = matches[0];
        }
      }
    } catch (e) {
      if (DEBUG) console.log("[vectorizer] LanceDB point lookup failed:", e.message);
    }
  }

  if (!found) return null;

  // LRU eviction: the first key in iteration order is the least recently used.
  if (this._chunkCache.size >= 500) {
    const lruKey = this._chunkCache.keys().next().value;
    if (lruKey !== undefined) {
      this._chunkCache.delete(lruKey);
    }
  }
  this._chunkCache.set(chunkId, found);
  return found;
}
|
|
1418
1539
|
|
|
1419
1540
|
/**
|
|
1420
1541
|
* Find all chunks belonging to a specific file path.
|
|
1542
|
+
* Uses ChunkStore (SQLite) for efficient file-level queries — no full cache load.
|
|
1421
1543
|
* @param {string} filePath - Relative file path (e.g. "src/auth.ts")
|
|
1422
1544
|
* @returns {Promise<Array>} Array of chunks from this file
|
|
1423
1545
|
*/
|
|
1424
1546
|
async findChunksByPath(filePath) {
  // Order chunks by their position in the file; a missing index sorts as 0.
  const byChunkIndex = (a, b) => (a.chunk_index || 0) - (b.chunk_index || 0);

  // Primary source: ChunkStore. An empty result or an error falls through
  // to the LanceDB query below.
  if (this.chunkStore) {
    try {
      const stored = this.chunkStore.getChunksByFile(filePath);
      if (stored.length > 0) {
        stored.sort(byChunkIndex);
        return stored;
      }
    } catch {
      // fallback below
    }
  }

  // Fallback: filter the LanceDB "chunks" table by file (capped at 1000 rows).
  try {
    const tableNames = await this.db.tableNames();
    if (tableNames.includes("chunks")) {
      const table = await this.db.openTable("chunks");
      const rows = await table.filter(`file = '${filePath.replace(/'/g, "''")}'`).limit(1000).execute();
      rows.sort(byChunkIndex);
      return rows;
    }
  } catch (e) {
    if (DEBUG) console.log("[vectorizer] findChunksByPath LanceDB failed:", e.message);
  }

  return [];
}
|
|
1441
1576
|
|
|
1442
1577
|
cosineSimilarity(vecA, vecB) {
|
|
@@ -1448,11 +1583,11 @@ class CodebaseIndexer {
|
|
|
1448
1583
|
normA += vecA[i] * vecA[i];
|
|
1449
1584
|
normB += vecB[i] * vecB[i];
|
|
1450
1585
|
}
|
|
1451
|
-
|
|
1586
|
+
const denom = Math.sqrt(normA) * Math.sqrt(normB);
|
|
1587
|
+
return denom === 0 ? 0 : dotProduct / denom;
|
|
1452
1588
|
}
|
|
1453
1589
|
|
|
1454
1590
|
async checkHealth(extraIgnore = []) {
|
|
1455
|
-
const { glob } = await import("glob");
|
|
1456
1591
|
const preset = INDEX_PRESETS[this.indexName] || DEFAULT_PRESETS.code;
|
|
1457
1592
|
|
|
1458
1593
|
const ignore = [
|
|
@@ -1544,7 +1679,6 @@ class CodebaseIndexer {
|
|
|
1544
1679
|
}
|
|
1545
1680
|
|
|
1546
1681
|
async indexAll(onProgress = null, extraIgnore = []) {
|
|
1547
|
-
const { glob } = await import("glob");
|
|
1548
1682
|
const preset = INDEX_PRESETS[this.indexName] || DEFAULT_PRESETS.code;
|
|
1549
1683
|
|
|
1550
1684
|
const ignore = [
|
|
@@ -1611,22 +1745,33 @@ class CodebaseIndexer {
|
|
|
1611
1745
|
const total = files.length;
|
|
1612
1746
|
const CONCURRENCY = 5;
|
|
1613
1747
|
|
|
1748
|
+
// Helper: write to indexer.log (always, not just DEBUG)
|
|
1749
|
+
const logPath = path.join(this.root, ".opencode", "indexer.log");
|
|
1750
|
+
const fsSync = await import("fs");
|
|
1751
|
+
const logToFile = (msg: string) => {
|
|
1752
|
+
const ts = new Date().toISOString().slice(11, 19);
|
|
1753
|
+
try { fsSync.appendFileSync(logPath, `${ts} ${msg}\n`); } catch { /* non-fatal */ }
|
|
1754
|
+
};
|
|
1755
|
+
|
|
1614
1756
|
// ══════════════════════════════════════════════════════════════════════════
|
|
1615
|
-
// Phase 1: Prepare files
|
|
1757
|
+
// Phase 1: Prepare files in concurrent batches of CONCURRENCY (chunk + graph, no embedding)
|
|
1616
1758
|
// ══════════════════════════════════════════════════════════════════════════
|
|
1759
|
+
logToFile(`Phase 1: preparing ${total} files (concurrency=${CONCURRENCY})`);
|
|
1760
|
+
|
|
1617
1761
|
const preparedFiles = [];
|
|
1618
1762
|
let prepared = 0;
|
|
1619
1763
|
let skipped = 0;
|
|
1764
|
+
let errors = 0;
|
|
1620
1765
|
|
|
1621
|
-
// Process in batches of CONCURRENCY
|
|
1622
1766
|
for (let i = 0; i < files.length; i += CONCURRENCY) {
|
|
1623
1767
|
const batch = files.slice(i, i + CONCURRENCY);
|
|
1624
1768
|
const promises = batch.map(async (relPath) => {
|
|
1625
1769
|
const filePath = path.join(this.root, relPath);
|
|
1626
1770
|
try {
|
|
1627
|
-
|
|
1628
|
-
|
|
1629
|
-
|
|
1771
|
+
return await this.prepareFile(filePath);
|
|
1772
|
+
} catch (e) {
|
|
1773
|
+
logToFile(`ERROR prepare ${relPath}: ${(e as Error).message}`);
|
|
1774
|
+
errors++;
|
|
1630
1775
|
return null;
|
|
1631
1776
|
}
|
|
1632
1777
|
});
|
|
@@ -1643,7 +1788,7 @@ class CodebaseIndexer {
|
|
|
1643
1788
|
}
|
|
1644
1789
|
}
|
|
1645
1790
|
|
|
1646
|
-
|
|
1791
|
+
logToFile(`Phase 1 done: ${prepared} prepared, ${skipped} skipped, ${errors} errors`);
|
|
1647
1792
|
|
|
1648
1793
|
// ══════════════════════════════════════════════════════════════════════════
|
|
1649
1794
|
// Phase 2: Batch embed + store (sequential, batch forward pass)
|
|
@@ -1651,19 +1796,24 @@ class CodebaseIndexer {
|
|
|
1651
1796
|
let chunksEmbedded = 0;
|
|
1652
1797
|
if (preparedFiles.length > 0) {
|
|
1653
1798
|
const totalChunks = preparedFiles.reduce((sum, pf) => sum + pf.rows.length, 0);
|
|
1654
|
-
|
|
1655
|
-
|
|
1656
|
-
chunksEmbedded = await this.embedAndStore(preparedFiles, 32, (done, embedTotal, phase) => {
|
|
1657
|
-
if (onProgress) onProgress(done, embedTotal, `embedding`, done, "embed");
|
|
1658
|
-
});
|
|
1799
|
+
logToFile(`Phase 2: embedding ${totalChunks} chunks from ${preparedFiles.length} files`);
|
|
1659
1800
|
|
|
1660
|
-
|
|
1801
|
+
try {
|
|
1802
|
+
chunksEmbedded = await this.embedAndStore(preparedFiles, 32, (done, embedTotal, phase) => {
|
|
1803
|
+
if (onProgress) onProgress(done, embedTotal, `embedding`, done, "embed");
|
|
1804
|
+
});
|
|
1805
|
+
logToFile(`Phase 2 done: ${chunksEmbedded} chunks embedded and stored`);
|
|
1806
|
+
} catch (e) {
|
|
1807
|
+
logToFile(`Phase 2 FAILED: ${(e as Error).message}\n${(e as Error).stack || ""}`);
|
|
1808
|
+
throw e;
|
|
1809
|
+
}
|
|
1810
|
+
} else {
|
|
1811
|
+
logToFile(`Phase 2 skipped: no prepared files`);
|
|
1661
1812
|
}
|
|
1662
1813
|
|
|
1663
|
-
const indexed = prepared;
|
|
1814
|
+
const indexed = prepared;
|
|
1664
1815
|
|
|
1665
1816
|
// FR-005: Build semantic similarity edges as post-pass
|
|
1666
|
-
// Disabled by default (O(n²) — slow on large repos). Enable via graph.semantic_edges: true
|
|
1667
1817
|
let semanticEdges = 0;
|
|
1668
1818
|
if (chunksEmbedded > 0 && this.graphBuilder && this.graphDB && GRAPH_CONFIG.semantic_edges) {
|
|
1669
1819
|
try {
|
|
@@ -1676,30 +1826,25 @@ class CodebaseIndexer {
|
|
|
1676
1826
|
.filter(r => r.chunk_id && r.vector)
|
|
1677
1827
|
.map(r => ({ chunk_id: r.chunk_id, vector: Array.from(r.vector), file: r.file }));
|
|
1678
1828
|
|
|
1679
|
-
// Skip if too many chunks — O(n²) becomes prohibitive
|
|
1680
1829
|
const maxChunks = GRAPH_CONFIG.semantic_edges_max_chunks ?? 500;
|
|
1681
1830
|
if (chunkData.length > maxChunks) {
|
|
1682
|
-
|
|
1831
|
+
logToFile(`Semantic edges skipped: ${chunkData.length} chunks > max ${maxChunks}`);
|
|
1683
1832
|
} else {
|
|
1684
1833
|
semanticEdges = await this.graphBuilder.buildSemanticEdges(chunkData, 0.8, 3);
|
|
1685
|
-
|
|
1834
|
+
logToFile(`Semantic edges: ${semanticEdges} built`);
|
|
1686
1835
|
}
|
|
1687
1836
|
}
|
|
1688
1837
|
} catch (e) {
|
|
1689
|
-
|
|
1690
|
-
// non-fatal — explicit edges still work
|
|
1838
|
+
logToFile(`Semantic edges FAILED: ${(e as Error).message}`);
|
|
1691
1839
|
}
|
|
1692
1840
|
}
|
|
1693
1841
|
|
|
1694
|
-
// Cleanup
|
|
1842
|
+
// Cleanup LSP
|
|
1695
1843
|
if (this.graphBuilder) {
|
|
1696
|
-
try {
|
|
1697
|
-
await this.graphBuilder.cleanup();
|
|
1698
|
-
} catch {
|
|
1699
|
-
// Best effort — continue even if cleanup fails
|
|
1700
|
-
}
|
|
1844
|
+
try { await this.graphBuilder.cleanup(); } catch { /* best effort */ }
|
|
1701
1845
|
}
|
|
1702
1846
|
|
|
1847
|
+
logToFile(`indexAll complete: ${indexed} indexed, ${chunksEmbedded} embedded, ${semanticEdges} semantic edges`);
|
|
1703
1848
|
return { indexed, skipped, total, semanticEdges };
|
|
1704
1849
|
}
|
|
1705
1850
|
|
|
@@ -1763,9 +1908,13 @@ class CodebaseIndexer {
|
|
|
1763
1908
|
if (entry.isDirectory() && entry.name !== "lancedb") {
|
|
1764
1909
|
try {
|
|
1765
1910
|
const indexer = await new CodebaseIndexer(this.root, entry.name).init();
|
|
1766
|
-
|
|
1767
|
-
|
|
1768
|
-
|
|
1911
|
+
try {
|
|
1912
|
+
const stat = await indexer.getStats();
|
|
1913
|
+
if (stat.fileCount > 0 || stat.chunkCount > 0) {
|
|
1914
|
+
stats.push(stat);
|
|
1915
|
+
}
|
|
1916
|
+
} finally {
|
|
1917
|
+
await indexer.unloadModel();
|
|
1769
1918
|
}
|
|
1770
1919
|
} catch {}
|
|
1771
1920
|
}
|
|
@@ -1775,20 +1924,38 @@ class CodebaseIndexer {
|
|
|
1775
1924
|
}
|
|
1776
1925
|
|
|
1777
1926
|
/**
 * Close or flush every store this indexer holds and drop the references, so
 * the backing files can be deleted afterwards. Every step is best effort: a
 * failure in one store does not stop the others.
 */
async _closeStorageHandles() {
  if (this.chunkStore) {
    try { this.chunkStore.close(); } catch { /* best effort */ }
    this.chunkStore = null;
  }
  if (this.graphBuilder) {
    try { await this.graphBuilder.cleanup(); } catch { /* best effort */ }
    this.graphBuilder = null;
  }
  if (this.graphDB) {
    try { await this.graphDB.close(); } catch { /* best effort */ }
    this.graphDB = null;
  }
  if (this.usageTracker) {
    try { await this.usageTracker.save(); } catch { /* best effort */ }
    this.usageTracker = null;
  }
  // No explicit close is called on the LanceDB connection; only the reference is dropped.
  this.db = null;
}

/** Reset the in-memory index state shared by clear() and clearAll(). */
_resetInMemoryState() {
  this.hashes = {};
  if (this.bm25) { this.bm25.clear(); this.bm25 = null; }
  this._bm25Rows = null;
  this._chunkCache = null;
  this.metrics = null;
  this.model = null;
}

/**
 * Delete this index's cache directory (this.cacheDir) and re-initialise.
 * Open handles are closed before their files are removed.
 */
async clear() {
  await this._closeStorageHandles();

  await fs.rm(this.cacheDir, { recursive: true, force: true });
  this._resetInMemoryState();
  await this.init();
}

/**
 * Delete the whole base directory (this.baseDir), clear the query cache and
 * re-initialise. Open handles are closed before their files are removed.
 */
async clearAll() {
  await this._closeStorageHandles();

  await fs.rm(this.baseDir, { recursive: true, force: true });
  this._resetInMemoryState();
  clearQueryCache();
  await this.init();
}
|
|
@@ -1832,7 +1999,9 @@ function getWorkspaceConfig() {
|
|
|
1832
1999
|
// ── Singleton indexer pool ──────────────────────────────────────────────────
|
|
1833
2000
|
// Prevents LevelDB lock conflicts when parallel searches hit the same index.
|
|
1834
2001
|
// Each unique (projectRoot, indexName) gets one shared CodebaseIndexer.
|
|
1835
|
-
|
|
2002
|
+
// TTL eviction: idle indexers are unloaded after POOL_IDLE_TTL_MS to free memory.
|
|
2003
|
+
const POOL_IDLE_TTL_MS = 5 * 60 * 1000; // 5 minutes idle → drop the indexer's shared-model reference + free caches (the shared model itself stays loaded)
|
|
2004
|
+
const _indexerPool = new Map<string, { indexer: CodebaseIndexer; refCount: number; initPromise: Promise<CodebaseIndexer>; idleTimer?: ReturnType<typeof setTimeout> }>();
|
|
1836
2005
|
|
|
1837
2006
|
/**
|
|
1838
2007
|
* Get or create a shared CodebaseIndexer for the given project + index.
|
|
@@ -1851,6 +2020,11 @@ async function getIndexer(projectRoot: string, indexName: string = "code"): Prom
|
|
|
1851
2020
|
const existing = _indexerPool.get(key);
|
|
1852
2021
|
if (existing) {
|
|
1853
2022
|
existing.refCount++;
|
|
2023
|
+
// Cancel pending idle eviction — someone is using it again
|
|
2024
|
+
if (existing.idleTimer) {
|
|
2025
|
+
clearTimeout(existing.idleTimer);
|
|
2026
|
+
existing.idleTimer = undefined;
|
|
2027
|
+
}
|
|
1854
2028
|
return existing.initPromise;
|
|
1855
2029
|
}
|
|
1856
2030
|
|
|
@@ -1862,15 +2036,35 @@ async function getIndexer(projectRoot: string, indexName: string = "code"): Prom
|
|
|
1862
2036
|
|
|
1863
2037
|
/**
|
|
1864
2038
|
* Release a reference to a shared indexer. When refCount reaches 0,
|
|
1865
|
-
*
|
|
1866
|
-
*
|
|
2039
|
+
* starts a TTL timer. If no one reuses within POOL_IDLE_TTL_MS,
|
|
2040
|
+
* the indexer is unloaded: its reference to the shared model is dropped (the model itself is not disposed), caches are freed and the DB handle is released.
|
|
1867
2041
|
*/
|
|
1868
2042
|
function releaseIndexer(projectRoot: string, indexName: string = "code") {
  const key = `${projectRoot}::${indexName}`;
  const entry = _indexerPool.get(key);
  if (!entry) return;

  // Never let the count go negative on unbalanced releases.
  entry.refCount = Math.max(0, entry.refCount - 1);
  if (entry.refCount !== 0) return;

  // Last user gone: (re)start the idle eviction timer.
  if (entry.idleTimer) clearTimeout(entry.idleTimer);

  const evictIfStillIdle = async () => {
    // Re-read the pool: the entry may have been destroyed or re-acquired.
    const current = _indexerPool.get(key);
    if (!current || current.refCount > 0) return;

    _indexerPool.delete(key);
    try {
      const indexer = await current.initPromise;
      await indexer.unloadModel();
    } catch {
      // best effort
    }
  };

  const timer = setTimeout(evictIfStillIdle, POOL_IDLE_TTL_MS);
  entry.idleTimer = timer;

  // Don't keep the process alive just for idle eviction.
  if (timer && typeof timer === "object" && "unref" in timer) {
    (timer as NodeJS.Timeout).unref();
  }
}
|
|
1875
2069
|
|
|
1876
2070
|
/**
|
|
@@ -1881,6 +2075,7 @@ async function destroyIndexer(projectRoot: string, indexName: string = "code") {
|
|
|
1881
2075
|
const key = `${projectRoot}::${indexName}`;
|
|
1882
2076
|
const entry = _indexerPool.get(key);
|
|
1883
2077
|
if (!entry) return;
|
|
2078
|
+
if (entry.idleTimer) clearTimeout(entry.idleTimer);
|
|
1884
2079
|
_indexerPool.delete(key);
|
|
1885
2080
|
try {
|
|
1886
2081
|
const indexer = await entry.initPromise;
|
|
@@ -1894,4 +2089,4 @@ function getDecomposerConfig() {
|
|
|
1894
2089
|
return DECOMPOSER_CONFIG;
|
|
1895
2090
|
}
|
|
1896
2091
|
|
|
1897
|
-
export { CodebaseIndexer, INDEX_PRESETS, getEmbeddingModel, getSearchConfig, getWorkspaceConfig, getDecomposerConfig, getIndexer, releaseIndexer, destroyIndexer };
|
|
2092
|
+
export { CodebaseIndexer, INDEX_PRESETS, getEmbeddingModel, getSearchConfig, getWorkspaceConfig, getDecomposerConfig, getIndexer, releaseIndexer, destroyIndexer, disposeSharedModel };
|