@comfanion/usethis_search 3.0.0-dev.26 → 3.0.0-dev.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@comfanion/usethis_search",
- "version": "3.0.0-dev.26",
+ "version": "3.0.0-dev.28",
  "description": "OpenCode plugin: semantic search with graph-based context (v3: graph relations, 1-hop context, LSP + regex analyzers)",
  "type": "module",
  "main": "./index.ts",
@@ -32,6 +32,7 @@
  "vectorizer/query-cache.ts",
  "vectorizer/search-metrics.ts",
  "vectorizer/graph-db.ts",
+ "vectorizer/chunk-store.ts",
  "vectorizer/usage-tracker.ts",
  "vectorizer/graph-builder.ts",
  "vectorizer/analyzers/regex-analyzer.ts",
package/tools/search.ts CHANGED
@@ -218,9 +218,12 @@ Examples:
  // ── Reranking — boost results where query keywords appear in text ──────
  const queryKeywords = args.query.toLowerCase().split(/\s+/).filter((w: string) => w.length > 2)
  for (const r of allResults) {
+ const isBM25Only = !!r._bm25Only
  const vectorScore = r._distance != null ? Math.max(0, 1 - r._distance / 2) : 0
  r._vectorScore = vectorScore
- r._bm25Component = r._combinedScore != null ? Math.max(0, r._combinedScore - vectorScore) : 0
+ r._bm25Component = isBM25Only
+ ? (r._combinedScore ?? 0)
+ : (r._combinedScore != null ? Math.max(0, r._combinedScore - vectorScore) : 0)
  const baseScore = r._combinedScore ?? vectorScore

  const text = (r.content || "").toLowerCase()
@@ -264,10 +267,15 @@ Examples:

  // ── Confidence signal ──────────────────────────────────────────────────
  const topScore = sortedGroups[0].best._finalScore ?? 0
+ const hasBM25Only = allResults.some((r: any) => r._bm25Only)
  const scope = args.searchAll ? "all indexes" : `index "${indexName}"`
  const filterLabel = args.filter ? ` filter:"${args.filter}"` : ""
  let output = `## Search Results for: "${args.query}" (${scope}${filterLabel})\n\n`

+ if (hasBM25Only) {
+ output += `> **BM25-only mode** — vector embeddings not yet available. Results are keyword-based. Quality will improve after embedding completes.\n\n`
+ }
+
  if (topScore < 0.45) {
  output += `> **Low confidence results.** Best score: ${topScore.toFixed(3)}. These results may not be relevant to your query.\n> Try more specific keywords or different phrasing.\n\n`
  }
@@ -287,8 +295,10 @@ Examples:
  const metaLine = metaParts.length > 0 ? ` (${metaParts.join(", ")})` : ""

  // Score breakdown
- const breakdownParts: string[] = [`vec: ${(r._vectorScore ?? 0).toFixed(2)}`]
- if (r._bm25Component > 0.005) breakdownParts.push(`bm25: +${r._bm25Component.toFixed(2)}`)
+ const breakdownParts: string[] = r._bm25Only
+ ? [`bm25: ${(r._bm25Component ?? 0).toFixed(2)}`]
+ : [`vec: ${(r._vectorScore ?? 0).toFixed(2)}`]
+ if (!r._bm25Only && r._bm25Component > 0.005) breakdownParts.push(`bm25: +${r._bm25Component.toFixed(2)}`)
  if (r._keywordBonus > 0.005) breakdownParts.push(`kw: +${r._keywordBonus.toFixed(2)}`)
  const breakdown = breakdownParts.join(", ")

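A note on the reranking change above: in hybrid mode _combinedScore already blends the vector and BM25 signals, so the BM25 contribution is recovered by subtracting the vector score, while in BM25-only mode the combined score is the normalized BM25 score itself. A minimal standalone sketch of that split — the Result shape and the sample values here are illustrative, not the plugin's actual types:

interface Result {
  _distance: number | null   // vector distance; null in BM25-only mode
  _combinedScore?: number    // hybrid blend, or normalized BM25 when _bm25Only
  _bm25Only?: boolean
}

function splitScores(r: Result): { vec: number; bm25: number } {
  // Same arithmetic as the diff: a distance in [0, 2] maps to a [0, 1] vector score
  const vec = r._distance != null ? Math.max(0, 1 - r._distance / 2) : 0
  const bm25 = r._bm25Only
    ? (r._combinedScore ?? 0)  // the whole score is BM25
    : (r._combinedScore != null ? Math.max(0, r._combinedScore - vec) : 0)
  return { vec, bm25 }
}

console.log(splitScores({ _distance: 0.6, _combinedScore: 0.85 }))
// hybrid: vec 0.7, bm25 ≈ 0.15
console.log(splitScores({ _distance: null, _combinedScore: 0.85, _bm25Only: true }))
// BM25-only: vec 0, bm25 0.85
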
package/vectorizer/chunk-store.ts ADDED
@@ -0,0 +1,207 @@
+ /**
+ * ChunkStore — SQLite-based persistent chunk storage.
+ * Populated by Phase 1 (no vectors needed). Provides BM25 search
+ * and metadata queries immediately, before embedding is complete.
+ *
+ * Uses bun:sqlite with WAL mode for concurrent read access.
+ */
+
+ import { Database } from "bun:sqlite"
+
+ export interface StoredChunk {
+ chunk_id: string
+ file: string
+ chunk_index: number
+ content: string
+ file_type: string
+ language: string
+ last_modified: string
+ file_size: number
+ heading_context: string
+ function_name: string
+ class_name: string
+ tags: string
+ start_line: number
+ end_line: number
+ archived: boolean
+ vectorized: boolean
+ }
+
+ export class ChunkStore {
+ private db: Database | null = null
+
+ // Prepared statements
+ private _stmtInsert: any = null
+ private _stmtByFile: any = null
+ private _stmtDeleteByFile: any = null
+ private _stmtAll: any = null
+ private _stmtByChunkId: any = null
+ private _stmtMarkVectorized: any = null
+ private _stmtHasVectors: any = null
+ private _stmtCount: any = null
+ private _stmtSearch: any = null
+
+ constructor(private dbPath: string) {}
+
+ async init(): Promise<this> {
+ const fullPath = this.dbPath.endsWith(".db") ? this.dbPath : this.dbPath + ".db"
+ this.db = new Database(fullPath)
+
+ this.db.exec("PRAGMA journal_mode = WAL")
+ this.db.exec("PRAGMA synchronous = NORMAL")
+ this.db.exec("PRAGMA cache_size = -4000") // 4MB cache
+
+ this.db.exec(`
+ CREATE TABLE IF NOT EXISTS chunks (
+ chunk_id TEXT PRIMARY KEY,
+ file TEXT NOT NULL,
+ chunk_index INTEGER NOT NULL DEFAULT 0,
+ content TEXT NOT NULL,
+ file_type TEXT NOT NULL DEFAULT '',
+ language TEXT NOT NULL DEFAULT '',
+ last_modified TEXT NOT NULL DEFAULT '',
+ file_size INTEGER NOT NULL DEFAULT 0,
+ heading_context TEXT NOT NULL DEFAULT '',
+ function_name TEXT NOT NULL DEFAULT '',
+ class_name TEXT NOT NULL DEFAULT '',
+ tags TEXT NOT NULL DEFAULT '',
+ start_line INTEGER NOT NULL DEFAULT -1,
+ end_line INTEGER NOT NULL DEFAULT -1,
+ archived INTEGER NOT NULL DEFAULT 0,
+ vectorized INTEGER NOT NULL DEFAULT 0
+ )
+ `)
+
+ this.db.exec("CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file)")
+ this.db.exec("CREATE INDEX IF NOT EXISTS idx_chunks_vectorized ON chunks(vectorized)")
+ this.db.exec("CREATE INDEX IF NOT EXISTS idx_chunks_language ON chunks(language)")
+
+ // Prepare statements
+ this._stmtInsert = this.db.prepare(`
+ INSERT OR REPLACE INTO chunks
+ (chunk_id, file, chunk_index, content, file_type, language, last_modified, file_size,
+ heading_context, function_name, class_name, tags, start_line, end_line, archived, vectorized)
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 0)
+ `)
+ this._stmtByFile = this.db.prepare("SELECT * FROM chunks WHERE file = ?")
+ this._stmtDeleteByFile = this.db.prepare("DELETE FROM chunks WHERE file = ?")
+ this._stmtAll = this.db.prepare("SELECT * FROM chunks")
+ this._stmtByChunkId = this.db.prepare("SELECT * FROM chunks WHERE chunk_id = ?")
+ this._stmtMarkVectorized = this.db.prepare("UPDATE chunks SET vectorized = 1 WHERE file = ?")
+ this._stmtHasVectors = this.db.prepare("SELECT COUNT(*) as cnt FROM chunks WHERE vectorized = 0")
+ this._stmtCount = this.db.prepare("SELECT COUNT(*) as cnt FROM chunks")
+
+ return this
+ }
+
+ /**
+ * Store chunks from Phase 1 (batch, in transaction).
+ */
+ storeChunks(rows: Array<{
+ chunk_id: string, file: string, chunk_index: number, content: string,
+ file_type: string, language: string, last_modified: string, file_size: number,
+ heading_context: string, function_name: string, class_name: string, tags: string,
+ start_line: number, end_line: number, archived: boolean
+ }>): void {
+ if (!this.db) throw new Error("ChunkStore not initialized")
+
+ const insertMany = this.db.transaction((items: typeof rows) => {
+ for (const r of items) {
+ this._stmtInsert.run(
+ r.chunk_id, r.file, r.chunk_index, r.content,
+ r.file_type, r.language, r.last_modified, r.file_size,
+ r.heading_context, r.function_name, r.class_name, r.tags,
+ r.start_line, r.end_line, r.archived ? 1 : 0
+ )
+ }
+ })
+ insertMany(rows)
+ }
+
+ /**
+ * Delete all chunks for a file (before re-indexing).
+ */
+ deleteByFile(filePath: string): void {
+ if (!this.db) throw new Error("ChunkStore not initialized")
+ this._stmtDeleteByFile.run(filePath)
+ }
+
+ /**
+ * Mark all chunks for a file as vectorized (Phase 2 complete).
+ */
+ markVectorized(filePath: string): void {
+ if (!this.db) throw new Error("ChunkStore not initialized")
+ this._stmtMarkVectorized.run(filePath)
+ }
+
+ /**
+ * Check if all chunks have vectors.
+ */
+ hasUnvectorizedChunks(): boolean {
+ if (!this.db) return false
+ const row = this._stmtHasVectors.get() as { cnt: number }
+ return row.cnt > 0
+ }
+
+ /**
+ * Get all chunks (for BM25 index building).
+ */
+ getAllChunks(): StoredChunk[] {
+ if (!this.db) return []
+ return this._stmtAll.all().map((r: any) => this.toChunk(r))
+ }
+
+ /**
+ * Get chunks for a specific file.
+ */
+ getChunksByFile(filePath: string): StoredChunk[] {
+ if (!this.db) return []
+ return this._stmtByFile.all(filePath).map((r: any) => this.toChunk(r))
+ }
+
+ /**
+ * Get a single chunk by ID.
+ */
+ getChunkById(chunkId: string): StoredChunk | null {
+ if (!this.db) return null
+ const row = this._stmtByChunkId.get(chunkId)
+ return row ? this.toChunk(row) : null
+ }
+
+ /**
+ * Total chunk count.
+ */
+ count(): number {
+ if (!this.db) return 0
+ const row = this._stmtCount.get() as { cnt: number }
+ return row.cnt
+ }
+
+ close(): void {
+ if (this.db) {
+ this.db.close()
+ this.db = null
+ }
+ }
+
+ private toChunk(row: any): StoredChunk {
+ return {
+ chunk_id: row.chunk_id,
+ file: row.file,
+ chunk_index: row.chunk_index,
+ content: row.content,
+ file_type: row.file_type,
+ language: row.language,
+ last_modified: row.last_modified,
+ file_size: row.file_size,
+ heading_context: row.heading_context,
+ function_name: row.function_name,
+ class_name: row.class_name,
+ tags: row.tags,
+ start_line: row.start_line,
+ end_line: row.end_line,
+ archived: !!row.archived,
+ vectorized: !!row.vectorized,
+ }
+ }
+ }
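The new file is the whole Phase 1 persistence layer. A sketch of its lifecycle under Bun (bun:sqlite is Bun-only); the path and chunk values are hypothetical, only the ChunkStore API comes from the file above:

import { ChunkStore } from "./chunk-store.ts"

const store = await new ChunkStore("/tmp/demo-chunks").init() // ".db" suffix is appended by init()

// Phase 1: persist chunks; the INSERT always writes vectorized = 0
store.deleteByFile("src/app.ts")
store.storeChunks([{
  chunk_id: "chunk:src/app.ts::_chunk_0", file: "src/app.ts", chunk_index: 0,
  content: "export function main() {}", file_type: "code", language: "typescript",
  last_modified: new Date().toISOString(), file_size: 27,
  heading_context: "", function_name: "main", class_name: "", tags: "",
  start_line: 1, end_line: 1, archived: false,
}])

console.log(store.count())                  // 1
console.log(store.hasUnvectorizedChunks())  // true — search runs in BM25-only mode

// Phase 2 finished embedding this file
store.markVectorized("src/app.ts")
console.log(store.hasUnvectorizedChunks())  // false
store.close()
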
@@ -18,6 +18,7 @@ import { SearchMetrics } from "./search-metrics.ts";
  import { GraphDB } from "./graph-db.ts";
  import { GraphBuilder, isStructuralPredicate } from "./graph-builder.ts";
  import { UsageTracker } from "./usage-tracker.ts";
+ import { ChunkStore } from "./chunk-store.ts";

  // Suppress transformers.js logs unless DEBUG is set
  const DEBUG = process.env.DEBUG?.includes("vectorizer") || process.env.DEBUG === "*";
@@ -448,6 +449,7 @@ class CodebaseIndexer {
  this.graphBuilder = null; // Graph builder orchestrator
  this._chunkCache = null; // Lazy Map<chunk_id, row> for findChunkById
  this.usageTracker = null; // Usage tracking & provenance (v3)
+ this.chunkStore = null; // SQLite chunk store (BM25 without vectors)
  }

  async init() {
@@ -459,6 +461,16 @@ class CodebaseIndexer {
  this.db = await lancedb.connect(path.join(this.cacheDir, "lancedb"));
  await this.loadHashes();

+ // ChunkStore — SQLite store for BM25 search without vectors
+ try {
+ const chunkStorePath = path.join(this.cacheDir, "chunks.db");
+ this.chunkStore = await new ChunkStore(chunkStorePath).init();
+ if (DEBUG) console.log(`[vectorizer] ChunkStore initialized: ${chunkStorePath}`);
+ } catch (e) {
+ if (DEBUG) console.log(`[vectorizer] ChunkStore init failed: ${e.message || e}`);
+ this.chunkStore = null;
+ }
+
  // Graph DB — only if graph is enabled in config
  // Non-fatal: if LevelDB lock fails (parallel access), search works without graph
  if (GRAPH_CONFIG.enabled) {
@@ -512,6 +524,11 @@ class CodebaseIndexer {
  }
  this._bm25Rows = null;
  this.metrics = null;
+ // Close ChunkStore
+ if (this.chunkStore) {
+ try { this.chunkStore.close(); } catch { /* best effort */ }
+ this.chunkStore = null;
+ }
  // Close graph DB to release LevelDB lock
  if (this.graphDB) {
  try { await this.graphDB.close(); } catch { /* best effort */ }
@@ -612,7 +629,161 @@ class CodebaseIndexer {
  return this.hashes[relPath] !== currentHash;
  }

- // ── Index a single file (v2: cleaning + semantic chunking + metadata) ─────
+ // ── Phase 1: Prepare file (chunk + graph, NO embedding) ─────────────────
+ // Returns prepared chunk data ready for embedding, or null if skipped.
+
+ async prepareFile(filePath) {
+ const relPath = path.relative(this.root, filePath);
+
+ let content;
+ try {
+ content = await fs.readFile(filePath, "utf8");
+ } catch {
+ return null;
+ }
+
+ const hash = this.fileHash(content);
+ if (this.hashes[relPath] === hash) {
+ return null; // unchanged
+ }
+
+ // Extract metadata
+ const fileMeta = await extractFileMetadata(filePath, content);
+ const archived = this.isArchived(relPath, content);
+
+ // Clean content before chunking
+ const cleaned = cleanContent(content, fileMeta.file_type, CLEANING_CONFIG);
+
+ // Semantic chunking
+ const chunks = chunkContent(cleaned, fileMeta.file_type, fileMeta.language, CHUNKING_CONFIG);
+
+ // Assign chunk IDs
+ const chunksWithIds = this.graphBuilder
+ ? this.graphBuilder.assignChunkIds(relPath, chunks)
+ : chunks.map((c, i) => ({ ...c, chunk_id: `chunk:${relPath}::_chunk_${i}` }));
+
+ // Build graph edges (Phase 1 — no embedding needed)
+ if (this.graphBuilder && this.graphDB) {
+ await this.graphDB.deleteByFile(relPath);
+ const edgesBuilt = await this.graphBuilder.buildEdges(relPath, content, chunksWithIds, fileMeta.file_type);
+
+ if (edgesBuilt > 0 || DEBUG) {
+ const timestamp = new Date().toISOString().slice(11, 19);
+ const logMsg = `${timestamp} Graph built: ${relPath} (${chunksWithIds.length} chunks)`;
+ if (DEBUG) console.log(`[vectorizer] ${logMsg}`);
+ try {
+ const logPath = path.join(this.root, ".opencode", "indexer.log");
+ const fsSync = await import("fs");
+ fsSync.appendFileSync(logPath, `${logMsg}\n`);
+ } catch { /* non-fatal */ }
+ }
+
+ try {
+ await this.graphDB.setFileMeta(relPath, hash, Date.now());
+ } catch { /* non-fatal */ }
+ }
+
+ // Return prepared rows (without vector — Phase 2 fills it)
+ const rows = chunksWithIds.map((chunk, i) => ({
+ chunk_id: chunk.chunk_id,
+ file: relPath,
+ chunk_index: i,
+ content: chunk.content,
+ archived,
+ file_type: fileMeta.file_type,
+ language: fileMeta.language,
+ last_modified: fileMeta.last_modified,
+ file_size: fileMeta.file_size,
+ heading_context: chunk.heading_context || "",
+ function_name: chunk.function_name || "",
+ class_name: chunk.class_name || "",
+ tags: (fileMeta.tags || []).join(","),
+ start_line: chunk.start_line ?? -1,
+ end_line: chunk.end_line ?? -1,
+ }));
+
+ // Store chunks in ChunkStore (Phase 1 — BM25 available immediately)
+ if (this.chunkStore) {
+ try {
+ this.chunkStore.deleteByFile(relPath);
+ this.chunkStore.storeChunks(rows);
+ } catch (e) {
+ if (DEBUG) console.log(`[vectorizer] ChunkStore write failed for ${relPath}: ${e.message || e}`);
+ }
+ }
+
+ return { relPath, hash, rows };
+ }
+
+ // ── Phase 2: Batch embed + store ──────────────────────────────────────────
+ // Takes prepared rows from prepareFile(), embeds in batches, stores in LanceDB.
+
+ async embedAndStore(preparedFiles, batchSize = 32, onProgress = null) {
+ if (preparedFiles.length === 0) return 0;
+
+ // Collect all rows with their content for batch embedding
+ const allRows = [];
+ for (const pf of preparedFiles) {
+ for (const row of pf.rows) {
+ allRows.push(row);
+ }
+ }
+
+ if (allRows.length === 0) return 0;
+
+ // Load model once
+ const model = await this.loadModel();
+
+ // Batch embed
+ const allData = [];
+ for (let i = 0; i < allRows.length; i += batchSize) {
+ const batch = allRows.slice(i, i + batchSize);
+ const texts = batch.map(r => r.content);
+
+ // Embed batch — @xenova/transformers processes array inputs efficiently
+ const embeddings = [];
+ for (const text of texts) {
+ const result = await model(text, { pooling: "mean", normalize: true });
+ embeddings.push(Array.from(result.data));
+ }
+
+ for (let j = 0; j < batch.length; j++) {
+ allData.push({ ...batch[j], vector: embeddings[j] });
+ }
+
+ if (onProgress) {
+ onProgress(Math.min(i + batchSize, allRows.length), allRows.length, "embedding");
+ }
+ }
+
+ // Bulk store in LanceDB
+ const tableName = "chunks";
+ const tables = await this.db.tableNames();
+ if (tables.includes(tableName)) {
+ const table = await this.db.openTable(tableName);
+ await table.add(allData);
+ } else {
+ await this.db.createTable(tableName, allData);
+ }
+
+ // Update hashes + mark vectorized in ChunkStore
+ for (const pf of preparedFiles) {
+ this.hashes[pf.relPath] = pf.hash;
+ if (this.chunkStore) {
+ try { this.chunkStore.markVectorized(pf.relPath); } catch { /* non-fatal */ }
+ }
+ }
+ await this.saveHashes();
+
+ // Invalidate caches
+ if (this.bm25) { this.bm25.clear(); this.bm25 = null; }
+ this._bm25Rows = null;
+ this._chunkCache = null;
+
+ return allData.length;
+ }
+
+ // ── Index a single file (legacy — used by freshen/on-change) ───────────

  async indexFile(filePath) {
  const relPath = path.relative(this.root, filePath);
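The handoff between the two phases is the { relPath, hash, rows } record returned by prepareFile(): rows carry everything except the vector, which embedAndStore() attaches. A hedged sketch of the flow — indexer is assumed to be an already-initialized CodebaseIndexer, and the file paths are made up:

// Phase 1: chunk + graph edges + ChunkStore write, no model loaded yet
const prepared = []
for (const file of ["/repo/src/a.ts", "/repo/src/b.ts"]) {
  const pf = await indexer.prepareFile(file)  // null ⇒ unchanged or unreadable
  if (pf) prepared.push(pf)
}
// At this point ChunkStore already serves BM25 search for these files.

// Phase 2: one model load, batched forward passes, bulk LanceDB write
const stored = await indexer.embedAndStore(prepared, 32, (done, total, phase) => {
  console.log(`${phase}: ${done}/${total} chunks`)  // phase is "embedding" per the code above
})
console.log(`${stored} chunks embedded`)
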
@@ -701,6 +872,24 @@ class CodebaseIndexer {

  if (data.length === 0) return false;

+ // Store in ChunkStore (Phase 1 data) + mark vectorized (has embedding)
+ if (this.chunkStore) {
+ try {
+ this.chunkStore.deleteByFile(relPath);
+ this.chunkStore.storeChunks(data.map(d => ({
+ chunk_id: d.chunk_id, file: d.file, chunk_index: d.chunk_index,
+ content: d.content, file_type: d.file_type, language: d.language,
+ last_modified: d.last_modified, file_size: d.file_size,
+ heading_context: d.heading_context, function_name: d.function_name,
+ class_name: d.class_name, tags: d.tags,
+ start_line: d.start_line, end_line: d.end_line, archived: d.archived,
+ })));
+ this.chunkStore.markVectorized(relPath);
+ } catch (e) {
+ if (DEBUG) console.log(`[vectorizer] ChunkStore write failed for ${relPath}: ${e.message || e}`);
+ }
+ }
+
  const tableName = "chunks";
  const tables = await this.db.tableNames();
  if (tables.includes(tableName)) {
@@ -728,6 +917,35 @@ class CodebaseIndexer {
  async ensureBM25() {
  if (this.bm25) return this.bm25;

+ // Primary source: ChunkStore (SQLite) — available after Phase 1, no vectors needed
+ if (this.chunkStore) {
+ try {
+ const allChunks = this.chunkStore.getAllChunks();
+ if (allChunks.length > 0) {
+ // Sort for stable ID mapping between builds
+ allChunks.sort((a, b) => {
+ const ka = `${a.file}:${a.chunk_index}`;
+ const kb = `${b.file}:${b.chunk_index}`;
+ return ka.localeCompare(kb);
+ });
+
+ // Release previous data before rebuilding
+ if (this.bm25) this.bm25.clear();
+ this._bm25Rows = null;
+
+ this.bm25 = new BM25Index();
+ this.bm25.build(allChunks.map((r) => r.content));
+ this._bm25Rows = allChunks;
+
+ if (DEBUG) console.log(`[vectorizer] BM25 built from ChunkStore (${allChunks.length} chunks)`);
+ return this.bm25;
+ }
+ } catch (e) {
+ if (DEBUG) console.log("[vectorizer] BM25 from ChunkStore failed, trying LanceDB:", e.message);
+ }
+ }
+
+ // Fallback: LanceDB (legacy — for indexes without ChunkStore)
  const tableName = "chunks";
  const tables = await this.db.tableNames();
  if (!tables.includes(tableName)) return null;
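The sort before build() matters because BM25 hits identify documents by position: the index is built from an array of contents, and search() returns hits whose id indexes into that array (the { id, score } hit shape is inferred from how the fallback path below consumes br.id and br.score). A small sketch of the contract, with a stub in place of the plugin's BM25Index class:

import { ChunkStore } from "./chunk-store.ts"

// Stub with the same build()/search() surface the code above relies on.
class StubIndex {
  private docs: string[] = []
  build(docs: string[]) { this.docs = docs }
  search(q: string, k: number): Array<{ id: number; score: number }> {
    return this.docs
      .map((d, id) => ({ id, score: d.includes(q) ? 1 : 0 }))
      .filter(h => h.score > 0)
      .slice(0, k)
  }
}

const store = await new ChunkStore("/tmp/demo-chunks").init()
const rows = store.getAllChunks()
// Deterministic order on every rebuild, so hit.id keeps pointing at the same row
rows.sort((a, b) => `${a.file}:${a.chunk_index}`.localeCompare(`${b.file}:${b.chunk_index}`))

const index = new StubIndex()
index.build(rows.map(r => r.content))        // doc i ↔ rows[i]
for (const hit of index.search("main", 10)) {
  console.log(rows[hit.id].file, hit.score)  // hit.id indexes into the sorted rows
}
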
@@ -761,15 +979,170 @@ class CodebaseIndexer {
  return this.bm25;
  }

- // ── Search (v2: hybrid + metadata filters + metrics) ──────────────────────
+ // ── Shared helpers for search paths ─────────────────────────────────────────
+
+ _applyMetadataFilters(results, includeArchived, options) {
+ if (!includeArchived) {
+ results = results.filter((r) => !r.archived);
+ }
+ if (options.fileType) {
+ results = results.filter((r) => r.file_type === options.fileType);
+ }
+ if (options.language) {
+ results = results.filter((r) => r.language === options.language);
+ }
+ if (options.modifiedAfter) {
+ const after = new Date(options.modifiedAfter).getTime();
+ results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() >= after);
+ }
+ if (options.modifiedBefore) {
+ const before = new Date(options.modifiedBefore).getTime();
+ results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() <= before);
+ }
+ if (options.tags && options.tags.length > 0) {
+ results = results.filter((r) => {
+ const rowTags = (r.tags || "").split(",").filter(Boolean);
+ return options.tags.some((t) => rowTags.includes(t));
+ });
+ }
+ return results;
+ }
+
+ async _expandGraphContext(finalResults, queryEmbedding, query) {
+ if (!this.graphDB) return;
+
+ for (const result of finalResults) {
+ if (!result.chunk_id) continue;
+
+ const outgoing = await this.graphDB.getOutgoing(result.chunk_id);
+ const incoming = await this.graphDB.getIncoming(result.chunk_id);
+ const allEdges = [...outgoing, ...incoming].filter(
+ e => e.predicate !== "belongs_to" && e.predicate !== "graph_built" && !isStructuralPredicate(e.predicate)
+ );
+
+ const neighbors = [];
+ for (const edge of allEdges) {
+ const neighborId = edge.subject === result.chunk_id ? edge.object : edge.subject;
+ const neighborChunk = await this.findChunkById(neighborId);
+ if (!neighborChunk) continue;
+
+ let score;
+ if (queryEmbedding && neighborChunk.vector) {
+ const similarity = this.cosineSimilarity(neighborChunk.vector, queryEmbedding);
+ score = edge.weight * similarity;
+ } else {
+ // No vectors — use edge weight only (BM25-only fallback)
+ score = edge.weight * 0.7; // dampen without cosine confirmation
+ }
+
+ neighbors.push({
+ chunk_id: neighborId,
+ file: neighborChunk.file,
+ content: neighborChunk.content,
+ relation: edge.predicate,
+ score,
+ via: edge.source
+ });
+ }
+
+ neighbors.sort((a, b) => b.score - a.score);
+ const minRelevance = GRAPH_CONFIG.min_relevance ?? 0.5;
+ const maxRelated = GRAPH_CONFIG.max_related ?? 4;
+ result.relatedContext = neighbors
+ .filter(n => n.score >= minRelevance)
+ .slice(0, maxRelated);
+
+ if (this.usageTracker) {
+ for (const n of result.relatedContext) {
+ this.usageTracker.recordProvenance(query, result.chunk_id, n.chunk_id, n.relation);
+ }
+ }
+ }
+
+ // Record usage counts for all returned chunks
+ if (this.usageTracker) {
+ const allChunkIds = [];
+ for (const r of finalResults) {
+ if (r.chunk_id) allChunkIds.push(r.chunk_id);
+ if (r.relatedContext) {
+ for (const rc of r.relatedContext) {
+ if (rc.chunk_id) allChunkIds.push(rc.chunk_id);
+ }
+ }
+ }
+ this.usageTracker.recordSearchResults(allChunkIds);
+ this.usageTracker.save().catch(() => {});
+ }
+ }
+
+ // ── Search (v3: hybrid + BM25-only fallback + metadata filters + metrics) ──

  async search(query, limit = 5, includeArchived = false, options = {}) {
  const tableName = "chunks";
  const tables = await this.db.tableNames();
- if (!tables.includes(tableName)) {
- return [];
+
+ const indexConfig = INDEX_PRESETS[this.indexName];
+ const indexHybridEnabled = indexConfig?.hybrid ?? false;
+ const indexBM25Weight = indexConfig?.bm25_weight ?? HYBRID_CONFIG.bm25_weight;
+ const isHybrid = indexHybridEnabled || options.hybrid;
+
+ // ── Detect if vectors are available ──────────────────────────────────────
+ const hasVectorTable = tables.includes(tableName);
+ let hasVectors = false;
+ if (hasVectorTable) {
+ try {
+ const table = await this.db.openTable(tableName);
+ const rowCount = await table.countRows();
+ hasVectors = rowCount > 0;
+ } catch {
+ hasVectors = false;
+ }
  }

+ // ── BM25-only fallback (Phase 1 complete, Phase 2 not yet) ──────────────
+ if (!hasVectors) {
+ const bm25 = await this.ensureBM25();
+ if (!bm25 || !this._bm25Rows) {
+ // No vectors AND no chunks — nothing indexed yet
+ return [];
+ }
+
+ if (DEBUG) console.log("[vectorizer] BM25-only search (no vectors yet)");
+
+ const fetchLimit = Math.max(limit * 3, 50);
+ const bm25Results = bm25.search(query, fetchLimit);
+
+ // Normalize BM25 scores to [0, 1]
+ let maxBM25 = 0;
+ for (const r of bm25Results) {
+ if (r.score > maxBM25) maxBM25 = r.score;
+ }
+
+ let results = [];
+ for (const br of bm25Results) {
+ if (br.id < this._bm25Rows.length) {
+ const row = this._bm25Rows[br.id];
+ const normScore = maxBM25 > 0 ? br.score / maxBM25 : 0;
+ results.push({
+ ...row,
+ _combinedScore: normScore,
+ _distance: null, // no vector distance available
+ _bm25Only: true,
+ });
+ }
+ }
+
+ // Apply metadata filters then return (graph context added below)
+ results = this._applyMetadataFilters(results, includeArchived, options);
+ const finalResults = results.slice(0, limit);
+
+ // Graph context expansion (same as vector path)
+ await this._expandGraphContext(finalResults, null, query);
+
+ return finalResults;
+ }
+
+ // ── Vector search (Phase 2 complete) ─────────────────────────────────────
  const queryEmbedding = await this.embedQuery(query);
  const table = await this.db.openTable(tableName);

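From the caller's side the fallback is transparent: the same search() call returns rows, and _bm25Only / _distance tell the two modes apart. A sketch — indexer, the query, and the limit are illustrative:

const results = await indexer.search("chunk store sqlite", 5)
for (const r of results) {
  if (r._bm25Only) {
    // _distance is null; _combinedScore is the max-normalized BM25 score in [0, 1]
    console.log(`bm25 ${(r._combinedScore ?? 0).toFixed(2)}  ${r.file}`)
  } else {
    console.log(`dist ${(r._distance ?? 0).toFixed(3)}  ${r.file}`)
  }
}
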
@@ -777,10 +1150,6 @@ class CodebaseIndexer {
  const hasFilters = !includeArchived || options.fileType || options.language ||
  options.modifiedAfter || options.modifiedBefore ||
  (options.tags && options.tags.length > 0);
- const indexConfig = INDEX_PRESETS[this.indexName];
- const indexHybridEnabled = indexConfig?.hybrid ?? false;
- const indexBM25Weight = indexConfig?.bm25_weight ?? HYBRID_CONFIG.bm25_weight;
- const isHybrid = indexHybridEnabled || options.hybrid;
  const fetchLimit = (hasFilters || isHybrid) ? Math.max(limit *3, 50) : limit;
  let results;
  try {
@@ -864,35 +1233,7 @@ class CodebaseIndexer {
  }

  // ── Metadata filters ──────────────────────────────────────────────────
- if (!includeArchived) {
- results = results.filter((r) => !r.archived);
- }
-
- if (options.fileType) {
- results = results.filter((r) => r.file_type === options.fileType);
- }
-
- if (options.language) {
- results = results.filter((r) => r.language === options.language);
- }
-
- if (options.modifiedAfter) {
- const after = new Date(options.modifiedAfter).getTime();
- results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() >= after);
- }
-
- if (options.modifiedBefore) {
- const before = new Date(options.modifiedBefore).getTime();
- results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() <= before);
- }
-
- if (options.tags && options.tags.length > 0) {
- results = results.filter((r) => {
- const rowTags = (r.tags || "").split(",").filter(Boolean);
- return options.tags.some((t) => rowTags.includes(t));
- });
- }
-
+ results = this._applyMetadataFilters(results, includeArchived, options);
  const finalResults = results.slice(0, limit);

  // ── Metrics tracking ────────────────────────────────────────────────────
@@ -917,68 +1258,7 @@ class CodebaseIndexer {
  }

  // ── Graph context expansion (v3) ───────────────────────────────────────
- if (this.graphDB) {
- for (const result of finalResults) {
- if (!result.chunk_id) continue;
-
- const outgoing = await this.graphDB.getOutgoing(result.chunk_id);
- const incoming = await this.graphDB.getIncoming(result.chunk_id);
- // Filter out structural and meta edges — only relation edges are useful for context
- const allEdges = [...outgoing, ...incoming].filter(
- e => e.predicate !== "belongs_to" && e.predicate !== "graph_built" && !isStructuralPredicate(e.predicate)
- );
-
- const neighbors = [];
- for (const edge of allEdges) {
- const neighborId = edge.subject === result.chunk_id ? edge.object : edge.subject;
- const neighborChunk = await this.findChunkById(neighborId);
- if (!neighborChunk) continue;
-
- const similarity = this.cosineSimilarity(neighborChunk.vector, queryEmbedding);
- const score = edge.weight * similarity;
-
- neighbors.push({
- chunk_id: neighborId,
- file: neighborChunk.file,
- content: neighborChunk.content,
- relation: edge.predicate,
- score,
- via: edge.source
- });
- }
-
- // Apply min_relevance filter, then cap at max_related
- neighbors.sort((a, b) => b.score - a.score);
- const minRelevance = GRAPH_CONFIG.min_relevance ?? 0.5;
- const maxRelated = GRAPH_CONFIG.max_related ?? 4;
- result.relatedContext = neighbors
- .filter(n => n.score >= minRelevance)
- .slice(0, maxRelated);
-
- // FR-060: Record provenance for each attached chunk
- if (this.usageTracker) {
- for (const n of result.relatedContext) {
- this.usageTracker.recordProvenance(query, result.chunk_id, n.chunk_id, n.relation);
- }
- }
- }
- }
-
- // FR-061: Record usage counts for all returned chunks (main + attached)
- if (this.usageTracker) {
- const allChunkIds = [];
- for (const r of finalResults) {
- if (r.chunk_id) allChunkIds.push(r.chunk_id);
- if (r.relatedContext) {
- for (const rc of r.relatedContext) {
- if (rc.chunk_id) allChunkIds.push(rc.chunk_id);
- }
- }
- }
- this.usageTracker.recordSearchResults(allChunkIds);
- // Save asynchronously (non-blocking)
- this.usageTracker.save().catch(() => {});
- }
+ await this._expandGraphContext(finalResults, queryEmbedding, query);

  return finalResults;
  }
@@ -987,22 +1267,39 @@ class CodebaseIndexer {
  // Lazy-build an in-memory Map keyed by chunk_id on first call.
  // The cache lives until unloadModel() clears it.
  if (!this._chunkCache) {
+ this._chunkCache = new Map();
+
+ // Primary: LanceDB (has vectors for cosine similarity in graph expansion)
  const tableName = "chunks";
  const tables = await this.db.tableNames();
- if (!tables.includes(tableName)) return null;
-
- const table = await this.db.openTable(tableName);
- let rows;
- try {
- rows = await table.filter("true").limit(100000).execute();
- } catch (e) {
- if (DEBUG) console.log("[vectorizer] Chunk cache build failed (corrupted table?):", e.message);
- return null;
+ if (tables.includes(tableName)) {
+ try {
+ const table = await this.db.openTable(tableName);
+ const rows = await table.filter("true").limit(100000).execute();
+ for (const row of rows) {
+ if (row.chunk_id) {
+ this._chunkCache.set(row.chunk_id, row);
+ }
+ }
+ } catch (e) {
+ if (DEBUG) console.log("[vectorizer] Chunk cache from LanceDB failed:", e.message);
+ }
  }
- this._chunkCache = new Map();
- for (const row of rows) {
- if (row.chunk_id) {
- this._chunkCache.set(row.chunk_id, row);
+
+ // Fallback: ChunkStore (no vectors, but has content for BM25-only mode)
+ if (this._chunkCache.size === 0 && this.chunkStore) {
+ try {
+ const allChunks = this.chunkStore.getAllChunks();
+ for (const chunk of allChunks) {
+ if (chunk.chunk_id) {
+ this._chunkCache.set(chunk.chunk_id, chunk);
+ }
+ }
+ if (DEBUG && allChunks.length > 0) {
+ console.log(`[vectorizer] Chunk cache from ChunkStore (${allChunks.length} chunks, no vectors)`);
+ }
+ } catch (e) {
+ if (DEBUG) console.log("[vectorizer] Chunk cache from ChunkStore failed:", e.message);
  }
  }
  }
@@ -1094,6 +1391,9 @@ class CodebaseIndexer {
  // best effort
  }
  }
+ if (this.chunkStore) {
+ try { this.chunkStore.deleteByFile(relPath); } catch { /* best effort */ }
+ }
  delete this.hashes[relPath];
  deleted++;
  }
@@ -1156,6 +1456,11 @@ class CodebaseIndexer {
  }
  }

+ // Delete chunks from ChunkStore
+ if (this.chunkStore) {
+ try { this.chunkStore.deleteByFile(relPath); } catch { /* best effort */ }
+ }
+
  delete this.hashes[relPath];
  removed++;
  }
@@ -1170,31 +1475,64 @@
  }
  }

- let indexed = 0;
- let skipped = 0;
  const total = files.length;
+ const CONCURRENCY = 5;

- for (let i = 0; i < files.length; i++) {
- const relPath = files[i];
- const filePath = path.join(this.root, relPath);
- try {
- const wasIndexed = await this.indexFile(filePath);
- if (wasIndexed) {
- indexed++;
- // FR-053: progress indicator includes graph building phase
- if (onProgress) onProgress(indexed, total, relPath, i + 1);
+ // ══════════════════════════════════════════════════════════════════════════
+ // Phase 1: Prepare files in parallel (chunk + graph, no embedding)
+ // ══════════════════════════════════════════════════════════════════════════
+ const preparedFiles = [];
+ let prepared = 0;
+ let skipped = 0;
+
+ // Process in batches of CONCURRENCY
+ for (let i = 0; i < files.length; i += CONCURRENCY) {
+ const batch = files.slice(i, i + CONCURRENCY);
+ const promises = batch.map(async (relPath) => {
+ const filePath = path.join(this.root, relPath);
+ try {
+ const result = await this.prepareFile(filePath);
+ return result;
+ } catch {
+ return null;
+ }
+ });
+
+ const results = await Promise.all(promises);
+ for (let j = 0; j < results.length; j++) {
+ if (results[j]) {
+ preparedFiles.push(results[j]);
+ prepared++;
+ if (onProgress) onProgress(prepared, total, results[j].relPath, i + j + 1, "prepare");
  } else {
  skipped++;
  }
- } catch {
- skipped++;
  }
  }

+ if (DEBUG) console.log(`[vectorizer] Phase 1 done: ${prepared} files prepared, ${skipped} skipped`);
+
+ // ══════════════════════════════════════════════════════════════════════════
+ // Phase 2: Batch embed + store (sequential, batch forward pass)
+ // ══════════════════════════════════════════════════════════════════════════
+ let chunksEmbedded = 0;
+ if (preparedFiles.length > 0) {
+ const totalChunks = preparedFiles.reduce((sum, pf) => sum + pf.rows.length, 0);
+ if (DEBUG) console.log(`[vectorizer] Phase 2: embedding ${totalChunks} chunks from ${preparedFiles.length} files`);
+
+ chunksEmbedded = await this.embedAndStore(preparedFiles, 32, (done, embedTotal, phase) => {
+ if (onProgress) onProgress(done, embedTotal, `embedding`, done, "embed");
+ });
+
+ if (DEBUG) console.log(`[vectorizer] Phase 2 done: ${chunksEmbedded} chunks embedded and stored`);
+ }
+
+ const indexed = prepared; // file count for backward compat
+
  // FR-005: Build semantic similarity edges as post-pass
  // Disabled by default (O(n²) — slow on large repos). Enable via graph.semantic_edges: true
  let semanticEdges = 0;
- if (indexed > 0 && this.graphBuilder && this.graphDB && GRAPH_CONFIG.semantic_edges) {
+ if (chunksEmbedded > 0 && this.graphBuilder && this.graphDB && GRAPH_CONFIG.semantic_edges) {
  try {
  const tableName = "chunks";
  const tables = await this.db.tableNames();
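Note the onProgress signature grew a trailing phase tag: the Phase 1 loop passes "prepare" and the Phase 2 callback passes "embed". A sketch of a consumer, with the parameter meanings inferred from the two call sites above:

type Phase = "prepare" | "embed"

// done/total are files during Phase 1 and chunks during Phase 2; label is a
// relative path or the literal "embedding"; position is the overall index.
function onProgress(done: number, total: number, label: string, position: number, phase: Phase): void {
  const tag = phase === "prepare" ? "Phase 1" : "Phase 2"
  console.log(`[${tag}] ${done}/${total} — ${label}`)
}
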
@@ -1237,23 +1575,34 @@

  async getStats() {
  const fileCount = Object.keys(this.hashes).length;
- let chunkCount = 0;
+ let vectorChunkCount = 0;
+ let totalChunkCount = 0;
+ let hasUnvectorized = false;

  try {
  const tables = await this.db.tableNames();
  if (tables.includes("chunks")) {
  const table = await this.db.openTable("chunks");
- chunkCount = await table.countRows();
+ vectorChunkCount = await table.countRows();
  }
  } catch {}

+ if (this.chunkStore) {
+ try {
+ totalChunkCount = this.chunkStore.count();
+ hasUnvectorized = this.chunkStore.hasUnvectorizedChunks();
+ } catch {}
+ }
+
  const preset = INDEX_PRESETS[this.indexName];
  return {
  indexName: this.indexName,
  description: preset?.description || "Custom index",
  model: EMBEDDING_MODEL,
  fileCount,
- chunkCount,
+ chunkCount: totalChunkCount || vectorChunkCount,
+ vectorizedChunks: vectorChunkCount,
+ pendingEmbedding: hasUnvectorized,
  features: {
  chunking: CHUNKING_CONFIG.strategy,
  hybrid: preset?.hybrid ?? false,
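
With these fields a caller can tell a fully embedded index from one still in BM25-only mode. A sketch, assuming the same initialized indexer instance as in the earlier examples:

const stats = await indexer.getStats()
if (stats.pendingEmbedding) {
  const pct = stats.chunkCount > 0
    ? Math.round((stats.vectorizedChunks / stats.chunkCount) * 100)
    : 0
  console.log(`BM25-only: ${stats.vectorizedChunks}/${stats.chunkCount} chunks embedded (${pct}%)`)
}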