@comfanion/usethis_search 3.0.0-dev.27 → 3.0.0-dev.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@comfanion/usethis_search",
3
- "version": "3.0.0-dev.27",
3
+ "version": "3.0.0-dev.28",
4
4
  "description": "OpenCode plugin: semantic search with graph-based context (v3: graph relations, 1-hop context, LSP + regex analyzers)",
5
5
  "type": "module",
6
6
  "main": "./index.ts",
@@ -32,6 +32,7 @@
32
32
  "vectorizer/query-cache.ts",
33
33
  "vectorizer/search-metrics.ts",
34
34
  "vectorizer/graph-db.ts",
35
+ "vectorizer/chunk-store.ts",
35
36
  "vectorizer/usage-tracker.ts",
36
37
  "vectorizer/graph-builder.ts",
37
38
  "vectorizer/analyzers/regex-analyzer.ts",
package/tools/search.ts CHANGED
@@ -218,9 +218,12 @@ Examples:
218
218
  // ── Reranking — boost results where query keywords appear in text ──────
219
219
  const queryKeywords = args.query.toLowerCase().split(/\s+/).filter((w: string) => w.length > 2)
220
220
  for (const r of allResults) {
221
+ const isBM25Only = !!r._bm25Only
221
222
  const vectorScore = r._distance != null ? Math.max(0, 1 - r._distance / 2) : 0
222
223
  r._vectorScore = vectorScore
223
- r._bm25Component = r._combinedScore != null ? Math.max(0, r._combinedScore - vectorScore) : 0
224
+ r._bm25Component = isBM25Only
225
+ ? (r._combinedScore ?? 0)
226
+ : (r._combinedScore != null ? Math.max(0, r._combinedScore - vectorScore) : 0)
224
227
  const baseScore = r._combinedScore ?? vectorScore
225
228
 
226
229
  const text = (r.content || "").toLowerCase()
@@ -264,10 +267,15 @@ Examples:
264
267
 
265
268
  // ── Confidence signal ──────────────────────────────────────────────────
266
269
  const topScore = sortedGroups[0].best._finalScore ?? 0
270
+ const hasBM25Only = allResults.some((r: any) => r._bm25Only)
267
271
  const scope = args.searchAll ? "all indexes" : `index "${indexName}"`
268
272
  const filterLabel = args.filter ? ` filter:"${args.filter}"` : ""
269
273
  let output = `## Search Results for: "${args.query}" (${scope}${filterLabel})\n\n`
270
274
 
275
+ if (hasBM25Only) {
276
+ output += `> **BM25-only mode** — vector embeddings not yet available. Results are keyword-based. Quality will improve after embedding completes.\n\n`
277
+ }
278
+
271
279
  if (topScore < 0.45) {
272
280
  output += `> **Low confidence results.** Best score: ${topScore.toFixed(3)}. These results may not be relevant to your query.\n> Try more specific keywords or different phrasing.\n\n`
273
281
  }
@@ -287,8 +295,10 @@ Examples:
287
295
  const metaLine = metaParts.length > 0 ? ` (${metaParts.join(", ")})` : ""
288
296
 
289
297
  // Score breakdown
290
- const breakdownParts: string[] = [`vec: ${(r._vectorScore ?? 0).toFixed(2)}`]
291
- if (r._bm25Component > 0.005) breakdownParts.push(`bm25: +${r._bm25Component.toFixed(2)}`)
298
+ const breakdownParts: string[] = r._bm25Only
299
+ ? [`bm25: ${(r._bm25Component ?? 0).toFixed(2)}`]
300
+ : [`vec: ${(r._vectorScore ?? 0).toFixed(2)}`]
301
+ if (!r._bm25Only && r._bm25Component > 0.005) breakdownParts.push(`bm25: +${r._bm25Component.toFixed(2)}`)
292
302
  if (r._keywordBonus > 0.005) breakdownParts.push(`kw: +${r._keywordBonus.toFixed(2)}`)
293
303
  const breakdown = breakdownParts.join(", ")
294
304
 
@@ -0,0 +1,207 @@
1
+ /**
2
+ * ChunkStore — SQLite-based persistent chunk storage.
3
+ * Populated by Phase 1 (no vectors needed). Provides BM25 search
4
+ * and metadata queries immediately, before embedding is complete.
5
+ *
6
+ * Uses bun:sqlite with WAL mode for concurrent read access.
7
+ */
8
+
9
+ import { Database } from "bun:sqlite"
10
+
11
/**
 * One row of the `chunks` table as returned by ChunkStore queries.
 *
 * The two flag fields are stored as INTEGER 0/1 in SQLite and converted
 * to booleans when a row is read back.
 */
export interface StoredChunk {
  // ── Identity ──────────────────────────────────────────────────────────
  /** Primary key of the row. */
  chunk_id: string
  /** File the chunk belongs to; the key used for per-file lookup and deletion. */
  file: string
  /** Position of the chunk within its file (column default 0). */
  chunk_index: number

  // ── Payload ───────────────────────────────────────────────────────────
  /** Chunk text; this is what the BM25 index is built from. */
  content: string

  // ── File-level metadata ───────────────────────────────────────────────
  file_type: string
  language: string
  /** Timestamp string; the search filters parse it with `new Date(...)`. */
  last_modified: string
  file_size: number

  // ── Chunk-level metadata ──────────────────────────────────────────────
  heading_context: string
  function_name: string
  class_name: string
  /** Comma-separated tag list (the search filters split it on ","). */
  tags: string
  /** First line of the chunk; the column default is -1. */
  start_line: number
  /** Last line of the chunk; the column default is -1. */
  end_line: number

  // ── State flags ───────────────────────────────────────────────────────
  archived: boolean
  /** True once markVectorized() has been called for the chunk's file. */
  vectorized: boolean
}
29
+
30
/**
 * SQLite-backed store for chunk rows.
 *
 * Lifecycle: construct, `await init()`, use, `close()`. Write methods throw
 * when the store is not initialized; read methods return an empty result
 * instead so callers can treat a missing store as "nothing indexed".
 */
export class ChunkStore {
  /** Open database handle; `null` before init() and after close(). */
  private db: Database | null = null

  // Prepared statements — created in init(), finalized in close().
  // Kept as `any` so this file needs no additional type import from bun:sqlite.
  private _stmtInsert: any = null
  private _stmtByFile: any = null
  private _stmtDeleteByFile: any = null
  private _stmtAll: any = null
  private _stmtByChunkId: any = null
  private _stmtMarkVectorized: any = null
  private _stmtCountUnvectorized: any = null
  private _stmtCount: any = null

  /**
   * @param dbPath Path of the SQLite file; ".db" is appended by init() when missing.
   */
  constructor(private dbPath: string) {}

  /**
   * Open the database, create the schema if needed and prepare all statements.
   *
   * Calling init() on an already-open store closes the previous handle first.
   * If any step fails, the freshly opened handle is closed again and the error
   * is rethrown, so a failed init() never leaves a half-initialized store.
   *
   * @returns this store, for chaining (`await new ChunkStore(p).init()`).
   * @throws whatever the SQLite driver throws while opening or preparing.
   */
  async init(): Promise<this> {
    // Re-initialization must not leak the previously opened handle.
    this.close()

    const fullPath = this.dbPath.endsWith(".db") ? this.dbPath : this.dbPath + ".db"
    const db = new Database(fullPath)

    try {
      db.exec("PRAGMA journal_mode = WAL")
      db.exec("PRAGMA synchronous = NORMAL")
      db.exec("PRAGMA cache_size = -4000") // 4MB cache

      db.exec(`
        CREATE TABLE IF NOT EXISTS chunks (
          chunk_id TEXT PRIMARY KEY,
          file TEXT NOT NULL,
          chunk_index INTEGER NOT NULL DEFAULT 0,
          content TEXT NOT NULL,
          file_type TEXT NOT NULL DEFAULT '',
          language TEXT NOT NULL DEFAULT '',
          last_modified TEXT NOT NULL DEFAULT '',
          file_size INTEGER NOT NULL DEFAULT 0,
          heading_context TEXT NOT NULL DEFAULT '',
          function_name TEXT NOT NULL DEFAULT '',
          class_name TEXT NOT NULL DEFAULT '',
          tags TEXT NOT NULL DEFAULT '',
          start_line INTEGER NOT NULL DEFAULT -1,
          end_line INTEGER NOT NULL DEFAULT -1,
          archived INTEGER NOT NULL DEFAULT 0,
          vectorized INTEGER NOT NULL DEFAULT 0
        )
      `)

      db.exec("CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file)")
      db.exec("CREATE INDEX IF NOT EXISTS idx_chunks_vectorized ON chunks(vectorized)")
      db.exec("CREATE INDEX IF NOT EXISTS idx_chunks_language ON chunks(language)")

      // The trailing literal 0 means every (re)inserted row starts un-vectorized.
      this._stmtInsert = db.prepare(`
        INSERT OR REPLACE INTO chunks
        (chunk_id, file, chunk_index, content, file_type, language, last_modified, file_size,
        heading_context, function_name, class_name, tags, start_line, end_line, archived, vectorized)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 0)
      `)
      this._stmtByFile = db.prepare("SELECT * FROM chunks WHERE file = ?")
      this._stmtDeleteByFile = db.prepare("DELETE FROM chunks WHERE file = ?")
      this._stmtAll = db.prepare("SELECT * FROM chunks")
      this._stmtByChunkId = db.prepare("SELECT * FROM chunks WHERE chunk_id = ?")
      this._stmtMarkVectorized = db.prepare("UPDATE chunks SET vectorized = 1 WHERE file = ?")
      this._stmtCountUnvectorized = db.prepare("SELECT COUNT(*) as cnt FROM chunks WHERE vectorized = 0")
      this._stmtCount = db.prepare("SELECT COUNT(*) as cnt FROM chunks")
    } catch (err) {
      // Undo the partial setup; the original failure is the one worth reporting.
      this.releaseStatements()
      try { db.close() } catch { /* keep the original error */ }
      throw err
    }

    // Publish the handle only once the store is fully usable.
    this.db = db
    return this
  }

  /**
   * Insert or replace a batch of chunk rows inside a single transaction.
   * Every stored row gets `vectorized = 0`, including rows that replace an
   * existing chunk_id.
   *
   * @param rows Chunk rows without the `vectorized` flag.
   * @throws Error when the store is not initialized.
   */
  storeChunks(rows: Array<Omit<StoredChunk, "vectorized">>): void {
    if (!this.db) throw new Error("ChunkStore not initialized")
    if (rows.length === 0) return // nothing to write, skip the empty transaction

    const insertMany = this.db.transaction((items: typeof rows) => {
      for (const r of items) {
        this._stmtInsert.run(
          r.chunk_id, r.file, r.chunk_index, r.content,
          r.file_type, r.language, r.last_modified, r.file_size,
          r.heading_context, r.function_name, r.class_name, r.tags,
          r.start_line, r.end_line, r.archived ? 1 : 0
        )
      }
    })
    insertMany(rows)
  }

  /**
   * Delete all chunks for a file (before re-indexing).
   *
   * @throws Error when the store is not initialized.
   */
  deleteByFile(filePath: string): void {
    if (!this.db) throw new Error("ChunkStore not initialized")
    this._stmtDeleteByFile.run(filePath)
  }

  /**
   * Mark all chunks for a file as vectorized (Phase 2 complete).
   *
   * @throws Error when the store is not initialized.
   */
  markVectorized(filePath: string): void {
    if (!this.db) throw new Error("ChunkStore not initialized")
    this._stmtMarkVectorized.run(filePath)
  }

  /**
   * Report whether at least one stored chunk still has `vectorized = 0`.
   *
   * @returns true when some chunk is not yet vectorized; false when every
   *          chunk is vectorized, the table is empty, or the store is closed.
   */
  hasUnvectorizedChunks(): boolean {
    if (!this.db) return false
    const row = this._stmtCountUnvectorized.get() as { cnt: number }
    return row.cnt > 0
  }

  /**
   * Get all chunks (for BM25 index building).
   * Returns an empty array when the store is not initialized.
   */
  getAllChunks(): StoredChunk[] {
    if (!this.db) return []
    return this._stmtAll.all().map((r: any) => this.toChunk(r))
  }

  /**
   * Get chunks for a specific file.
   * Returns an empty array when the store is not initialized.
   */
  getChunksByFile(filePath: string): StoredChunk[] {
    if (!this.db) return []
    return this._stmtByFile.all(filePath).map((r: any) => this.toChunk(r))
  }

  /**
   * Get a single chunk by ID, or null when it does not exist or the store
   * is not initialized.
   */
  getChunkById(chunkId: string): StoredChunk | null {
    if (!this.db) return null
    const row = this._stmtByChunkId.get(chunkId)
    return row ? this.toChunk(row) : null
  }

  /**
   * Total chunk count (0 when the store is not initialized).
   */
  count(): number {
    if (!this.db) return 0
    const row = this._stmtCount.get() as { cnt: number }
    return row.cnt
  }

  /**
   * Release the prepared statements and close the database.
   * Safe to call repeatedly and on a store that was never initialized.
   */
  close(): void {
    const db = this.db
    if (!db) return
    // Clear the field first so the store reads as closed even if close() throws.
    this.db = null
    this.releaseStatements()
    db.close()
  }

  /** Finalize every prepared statement (best effort) and drop the references. */
  private releaseStatements(): void {
    const statements = [
      this._stmtInsert, this._stmtByFile, this._stmtDeleteByFile, this._stmtAll,
      this._stmtByChunkId, this._stmtMarkVectorized, this._stmtCountUnvectorized, this._stmtCount,
    ]
    for (const stmt of statements) {
      try { stmt?.finalize?.() } catch { /* statement already released */ }
    }
    this._stmtInsert = null
    this._stmtByFile = null
    this._stmtDeleteByFile = null
    this._stmtAll = null
    this._stmtByChunkId = null
    this._stmtMarkVectorized = null
    this._stmtCountUnvectorized = null
    this._stmtCount = null
  }

  /** Convert a raw table row into a StoredChunk (INTEGER flags become booleans). */
  private toChunk(row: any): StoredChunk {
    return {
      chunk_id: row.chunk_id,
      file: row.file,
      chunk_index: row.chunk_index,
      content: row.content,
      file_type: row.file_type,
      language: row.language,
      last_modified: row.last_modified,
      file_size: row.file_size,
      heading_context: row.heading_context,
      function_name: row.function_name,
      class_name: row.class_name,
      tags: row.tags,
      start_line: row.start_line,
      end_line: row.end_line,
      archived: !!row.archived,
      vectorized: !!row.vectorized,
    }
  }
}
@@ -18,6 +18,7 @@ import { SearchMetrics } from "./search-metrics.ts";
18
18
  import { GraphDB } from "./graph-db.ts";
19
19
  import { GraphBuilder, isStructuralPredicate } from "./graph-builder.ts";
20
20
  import { UsageTracker } from "./usage-tracker.ts";
21
+ import { ChunkStore } from "./chunk-store.ts";
21
22
 
22
23
  // Suppress transformers.js logs unless DEBUG is set
23
24
  const DEBUG = process.env.DEBUG?.includes("vectorizer") || process.env.DEBUG === "*";
@@ -448,6 +449,7 @@ class CodebaseIndexer {
448
449
  this.graphBuilder = null; // Graph builder orchestrator
449
450
  this._chunkCache = null; // Lazy Map<chunk_id, row> for findChunkById
450
451
  this.usageTracker = null; // Usage tracking & provenance (v3)
452
+ this.chunkStore = null; // SQLite chunk store (BM25 without vectors)
451
453
  }
452
454
 
453
455
  async init() {
@@ -459,6 +461,16 @@ class CodebaseIndexer {
459
461
  this.db = await lancedb.connect(path.join(this.cacheDir, "lancedb"));
460
462
  await this.loadHashes();
461
463
 
464
+ // ChunkStore — SQLite store for BM25 search without vectors
465
+ try {
466
+ const chunkStorePath = path.join(this.cacheDir, "chunks.db");
467
+ this.chunkStore = await new ChunkStore(chunkStorePath).init();
468
+ if (DEBUG) console.log(`[vectorizer] ChunkStore initialized: ${chunkStorePath}`);
469
+ } catch (e) {
470
+ if (DEBUG) console.log(`[vectorizer] ChunkStore init failed: ${e.message || e}`);
471
+ this.chunkStore = null;
472
+ }
473
+
462
474
  // Graph DB — only if graph is enabled in config
463
475
  // Non-fatal: if LevelDB lock fails (parallel access), search works without graph
464
476
  if (GRAPH_CONFIG.enabled) {
@@ -512,6 +524,11 @@ class CodebaseIndexer {
512
524
  }
513
525
  this._bm25Rows = null;
514
526
  this.metrics = null;
527
+ // Close ChunkStore
528
+ if (this.chunkStore) {
529
+ try { this.chunkStore.close(); } catch { /* best effort */ }
530
+ this.chunkStore = null;
531
+ }
515
532
  // Close graph DB to release LevelDB lock
516
533
  if (this.graphDB) {
517
534
  try { await this.graphDB.close(); } catch { /* best effort */ }
@@ -685,6 +702,16 @@ class CodebaseIndexer {
685
702
  end_line: chunk.end_line ?? -1,
686
703
  }));
687
704
 
705
+ // Store chunks in ChunkStore (Phase 1 — BM25 available immediately)
706
+ if (this.chunkStore) {
707
+ try {
708
+ this.chunkStore.deleteByFile(relPath);
709
+ this.chunkStore.storeChunks(rows);
710
+ } catch (e) {
711
+ if (DEBUG) console.log(`[vectorizer] ChunkStore write failed for ${relPath}: ${e.message || e}`);
712
+ }
713
+ }
714
+
688
715
  return { relPath, hash, rows };
689
716
  }
690
717
 
@@ -739,9 +766,12 @@ class CodebaseIndexer {
739
766
  await this.db.createTable(tableName, allData);
740
767
  }
741
768
 
742
- // Update hashes for all prepared files
769
+ // Update hashes + mark vectorized in ChunkStore
743
770
  for (const pf of preparedFiles) {
744
771
  this.hashes[pf.relPath] = pf.hash;
772
+ if (this.chunkStore) {
773
+ try { this.chunkStore.markVectorized(pf.relPath); } catch { /* non-fatal */ }
774
+ }
745
775
  }
746
776
  await this.saveHashes();
747
777
 
@@ -842,6 +872,24 @@ class CodebaseIndexer {
842
872
 
843
873
  if (data.length === 0) return false;
844
874
 
875
+ // Store in ChunkStore (Phase 1 data) + mark vectorized (has embedding)
876
+ if (this.chunkStore) {
877
+ try {
878
+ this.chunkStore.deleteByFile(relPath);
879
+ this.chunkStore.storeChunks(data.map(d => ({
880
+ chunk_id: d.chunk_id, file: d.file, chunk_index: d.chunk_index,
881
+ content: d.content, file_type: d.file_type, language: d.language,
882
+ last_modified: d.last_modified, file_size: d.file_size,
883
+ heading_context: d.heading_context, function_name: d.function_name,
884
+ class_name: d.class_name, tags: d.tags,
885
+ start_line: d.start_line, end_line: d.end_line, archived: d.archived,
886
+ })));
887
+ this.chunkStore.markVectorized(relPath);
888
+ } catch (e) {
889
+ if (DEBUG) console.log(`[vectorizer] ChunkStore write failed for ${relPath}: ${e.message || e}`);
890
+ }
891
+ }
892
+
845
893
  const tableName = "chunks";
846
894
  const tables = await this.db.tableNames();
847
895
  if (tables.includes(tableName)) {
@@ -869,6 +917,35 @@ class CodebaseIndexer {
869
917
  async ensureBM25() {
870
918
  if (this.bm25) return this.bm25;
871
919
 
920
+ // Primary source: ChunkStore (SQLite) — available after Phase 1, no vectors needed
921
+ if (this.chunkStore) {
922
+ try {
923
+ const allChunks = this.chunkStore.getAllChunks();
924
+ if (allChunks.length > 0) {
925
+ // Sort for stable ID mapping between builds
926
+ allChunks.sort((a, b) => {
927
+ const ka = `${a.file}:${a.chunk_index}`;
928
+ const kb = `${b.file}:${b.chunk_index}`;
929
+ return ka.localeCompare(kb);
930
+ });
931
+
932
+ // Release previous data before rebuilding
933
+ if (this.bm25) this.bm25.clear();
934
+ this._bm25Rows = null;
935
+
936
+ this.bm25 = new BM25Index();
937
+ this.bm25.build(allChunks.map((r) => r.content));
938
+ this._bm25Rows = allChunks;
939
+
940
+ if (DEBUG) console.log(`[vectorizer] BM25 built from ChunkStore (${allChunks.length} chunks)`);
941
+ return this.bm25;
942
+ }
943
+ } catch (e) {
944
+ if (DEBUG) console.log("[vectorizer] BM25 from ChunkStore failed, trying LanceDB:", e.message);
945
+ }
946
+ }
947
+
948
+ // Fallback: LanceDB (legacy — for indexes without ChunkStore)
872
949
  const tableName = "chunks";
873
950
  const tables = await this.db.tableNames();
874
951
  if (!tables.includes(tableName)) return null;
@@ -902,15 +979,170 @@ class CodebaseIndexer {
902
979
  return this.bm25;
903
980
  }
904
981
 
905
- // ── Search (v2: hybrid + metadata filters + metrics) ──────────────────────
982
+ // ── Shared helpers for search paths ─────────────────────────────────────────
983
+
984
/**
 * Narrow a result list by archive status and the optional metadata filters.
 *
 * Filters are applied one after another in a fixed order: archived,
 * fileType, language, modifiedAfter, modifiedBefore, tags. A row passes the
 * tags filter when it carries at least one of the requested tags. When no
 * filter is active the input array itself is returned.
 *
 * @param results rows to filter (not mutated)
 * @param includeArchived when false, rows with a truthy `archived` are dropped
 * @param options filter settings: fileType, language, modifiedAfter,
 *                modifiedBefore, tags
 * @returns the rows that satisfy every active filter, in their original order
 */
_applyMetadataFilters(results, includeArchived, options) {
  const activeFilters = [];

  if (!includeArchived) {
    activeFilters.push((row) => !row.archived);
  }
  if (options.fileType) {
    activeFilters.push((row) => row.file_type === options.fileType);
  }
  if (options.language) {
    activeFilters.push((row) => row.language === options.language);
  }
  if (options.modifiedAfter) {
    const lowerBound = new Date(options.modifiedAfter).getTime();
    // Rows without a last_modified value never match a date filter.
    activeFilters.push((row) => row.last_modified && new Date(row.last_modified).getTime() >= lowerBound);
  }
  if (options.modifiedBefore) {
    const upperBound = new Date(options.modifiedBefore).getTime();
    activeFilters.push((row) => row.last_modified && new Date(row.last_modified).getTime() <= upperBound);
  }
  if (options.tags && options.tags.length > 0) {
    activeFilters.push((row) => {
      // Row tags are a comma-separated string; empty segments are ignored.
      const ownTags = (row.tags || "").split(",").filter(Boolean);
      return options.tags.some((wanted) => ownTags.includes(wanted));
    });
  }

  // Fold the filters over the list; with no active filter this returns `results` as-is.
  return activeFilters.reduce((remaining, keep) => remaining.filter(keep), results);
}
1010
+
1011
/**
 * Attach 1-hop graph neighbours to each result and record usage statistics.
 *
 * For every result with a chunk_id, the outgoing and incoming edges are
 * loaded, structural/meta edges are dropped, and each remaining neighbour is
 * scored: edge weight × cosine similarity to the query when both a query
 * embedding and a neighbour vector exist, otherwise edge weight × 0.7.
 * Neighbours scoring at least GRAPH_CONFIG.min_relevance (default 0.5) are
 * kept, best first, capped at GRAPH_CONFIG.max_related (default 4), and
 * stored on `result.relatedContext`.
 *
 * Usage counts are recorded for all returned chunks whenever a usage tracker
 * exists, even if no graph DB is available. Before this helper was extracted
 * the recording sat outside the graph-DB check, so it must not depend on it.
 *
 * @param finalResults results to enrich; mutated in place
 * @param queryEmbedding query vector, or null in BM25-only mode
 * @param query original query text, used for provenance records
 */
async _expandGraphContext(finalResults, queryEmbedding, query) {
  if (this.graphDB) {
    // Loop-invariant configuration, read once.
    const minRelevance = GRAPH_CONFIG.min_relevance ?? 0.5;
    const maxRelated = GRAPH_CONFIG.max_related ?? 4;

    for (const result of finalResults) {
      if (!result.chunk_id) continue;

      const outgoing = await this.graphDB.getOutgoing(result.chunk_id);
      const incoming = await this.graphDB.getIncoming(result.chunk_id);
      // Filter out structural and meta edges — only relation edges are useful for context
      const allEdges = [...outgoing, ...incoming].filter(
        e => e.predicate !== "belongs_to" && e.predicate !== "graph_built" && !isStructuralPredicate(e.predicate)
      );

      const neighbors = [];
      for (const edge of allEdges) {
        // The neighbour is whichever end of the edge is not the result itself.
        const neighborId = edge.subject === result.chunk_id ? edge.object : edge.subject;
        const neighborChunk = await this.findChunkById(neighborId);
        if (!neighborChunk) continue;

        let score;
        if (queryEmbedding && neighborChunk.vector) {
          const similarity = this.cosineSimilarity(neighborChunk.vector, queryEmbedding);
          score = edge.weight * similarity;
        } else {
          // No vectors — use edge weight only (BM25-only fallback)
          score = edge.weight * 0.7; // dampen without cosine confirmation
        }

        neighbors.push({
          chunk_id: neighborId,
          file: neighborChunk.file,
          content: neighborChunk.content,
          relation: edge.predicate,
          score,
          via: edge.source
        });
      }

      // Apply min_relevance filter, then cap at max_related
      neighbors.sort((a, b) => b.score - a.score);
      result.relatedContext = neighbors
        .filter(n => n.score >= minRelevance)
        .slice(0, maxRelated);

      // Record provenance for each attached chunk
      if (this.usageTracker) {
        for (const n of result.relatedContext) {
          this.usageTracker.recordProvenance(query, result.chunk_id, n.chunk_id, n.relation);
        }
      }
    }
  }

  // Record usage counts for all returned chunks (main + attached).
  // Deliberately outside the graphDB check: results are returned either way.
  if (this.usageTracker) {
    const allChunkIds = [];
    for (const r of finalResults) {
      if (r.chunk_id) allChunkIds.push(r.chunk_id);
      if (r.relatedContext) {
        for (const rc of r.relatedContext) {
          if (rc.chunk_id) allChunkIds.push(rc.chunk_id);
        }
      }
    }
    this.usageTracker.recordSearchResults(allChunkIds);
    // Save asynchronously (non-blocking); a failed save must not fail the search.
    this.usageTracker.save().catch(() => {});
  }
}
1077
+
1078
+ // ── Search (v3: hybrid + BM25-only fallback + metadata filters + metrics) ──
906
1079
 
907
1080
  async search(query, limit = 5, includeArchived = false, options = {}) {
908
1081
  const tableName = "chunks";
909
1082
  const tables = await this.db.tableNames();
910
- if (!tables.includes(tableName)) {
911
- return [];
1083
+
1084
+ const indexConfig = INDEX_PRESETS[this.indexName];
1085
+ const indexHybridEnabled = indexConfig?.hybrid ?? false;
1086
+ const indexBM25Weight = indexConfig?.bm25_weight ?? HYBRID_CONFIG.bm25_weight;
1087
+ const isHybrid = indexHybridEnabled || options.hybrid;
1088
+
1089
+ // ── Detect if vectors are available ──────────────────────────────────────
1090
+ const hasVectorTable = tables.includes(tableName);
1091
+ let hasVectors = false;
1092
+ if (hasVectorTable) {
1093
+ try {
1094
+ const table = await this.db.openTable(tableName);
1095
+ const rowCount = await table.countRows();
1096
+ hasVectors = rowCount > 0;
1097
+ } catch {
1098
+ hasVectors = false;
1099
+ }
1100
+ }
1101
+
1102
+ // ── BM25-only fallback (Phase 1 complete, Phase 2 not yet) ──────────────
1103
+ if (!hasVectors) {
1104
+ const bm25 = await this.ensureBM25();
1105
+ if (!bm25 || !this._bm25Rows) {
1106
+ // No vectors AND no chunks — nothing indexed yet
1107
+ return [];
1108
+ }
1109
+
1110
+ if (DEBUG) console.log("[vectorizer] BM25-only search (no vectors yet)");
1111
+
1112
+ const fetchLimit = Math.max(limit * 3, 50);
1113
+ const bm25Results = bm25.search(query, fetchLimit);
1114
+
1115
+ // Normalize BM25 scores to [0, 1]
1116
+ let maxBM25 = 0;
1117
+ for (const r of bm25Results) {
1118
+ if (r.score > maxBM25) maxBM25 = r.score;
1119
+ }
1120
+
1121
+ let results = [];
1122
+ for (const br of bm25Results) {
1123
+ if (br.id < this._bm25Rows.length) {
1124
+ const row = this._bm25Rows[br.id];
1125
+ const normScore = maxBM25 > 0 ? br.score / maxBM25 : 0;
1126
+ results.push({
1127
+ ...row,
1128
+ _combinedScore: normScore,
1129
+ _distance: null, // no vector distance available
1130
+ _bm25Only: true,
1131
+ });
1132
+ }
1133
+ }
1134
+
1135
+ // Apply metadata filters then return (graph context added below)
1136
+ results = this._applyMetadataFilters(results, includeArchived, options);
1137
+ const finalResults = results.slice(0, limit);
1138
+
1139
+ // Graph context expansion (same as vector path)
1140
+ await this._expandGraphContext(finalResults, null, query);
1141
+
1142
+ return finalResults;
912
1143
  }
913
1144
 
1145
+ // ── Vector search (Phase 2 complete) ─────────────────────────────────────
914
1146
  const queryEmbedding = await this.embedQuery(query);
915
1147
  const table = await this.db.openTable(tableName);
916
1148
 
@@ -918,10 +1150,6 @@ class CodebaseIndexer {
918
1150
  const hasFilters = !includeArchived || options.fileType || options.language ||
919
1151
  options.modifiedAfter || options.modifiedBefore ||
920
1152
  (options.tags && options.tags.length > 0);
921
- const indexConfig = INDEX_PRESETS[this.indexName];
922
- const indexHybridEnabled = indexConfig?.hybrid ?? false;
923
- const indexBM25Weight = indexConfig?.bm25_weight ?? HYBRID_CONFIG.bm25_weight;
924
- const isHybrid = indexHybridEnabled || options.hybrid;
925
1153
  const fetchLimit = (hasFilters || isHybrid) ? Math.max(limit *3, 50) : limit;
926
1154
  let results;
927
1155
  try {
@@ -1005,35 +1233,7 @@ class CodebaseIndexer {
1005
1233
  }
1006
1234
 
1007
1235
  // ── Metadata filters ──────────────────────────────────────────────────
1008
- if (!includeArchived) {
1009
- results = results.filter((r) => !r.archived);
1010
- }
1011
-
1012
- if (options.fileType) {
1013
- results = results.filter((r) => r.file_type === options.fileType);
1014
- }
1015
-
1016
- if (options.language) {
1017
- results = results.filter((r) => r.language === options.language);
1018
- }
1019
-
1020
- if (options.modifiedAfter) {
1021
- const after = new Date(options.modifiedAfter).getTime();
1022
- results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() >= after);
1023
- }
1024
-
1025
- if (options.modifiedBefore) {
1026
- const before = new Date(options.modifiedBefore).getTime();
1027
- results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() <= before);
1028
- }
1029
-
1030
- if (options.tags && options.tags.length > 0) {
1031
- results = results.filter((r) => {
1032
- const rowTags = (r.tags || "").split(",").filter(Boolean);
1033
- return options.tags.some((t) => rowTags.includes(t));
1034
- });
1035
- }
1036
-
1236
+ results = this._applyMetadataFilters(results, includeArchived, options);
1037
1237
  const finalResults = results.slice(0, limit);
1038
1238
 
1039
1239
  // ── Metrics tracking ────────────────────────────────────────────────────
@@ -1058,68 +1258,7 @@ class CodebaseIndexer {
1058
1258
  }
1059
1259
 
1060
1260
  // ── Graph context expansion (v3) ───────────────────────────────────────
1061
- if (this.graphDB) {
1062
- for (const result of finalResults) {
1063
- if (!result.chunk_id) continue;
1064
-
1065
- const outgoing = await this.graphDB.getOutgoing(result.chunk_id);
1066
- const incoming = await this.graphDB.getIncoming(result.chunk_id);
1067
- // Filter out structural and meta edges — only relation edges are useful for context
1068
- const allEdges = [...outgoing, ...incoming].filter(
1069
- e => e.predicate !== "belongs_to" && e.predicate !== "graph_built" && !isStructuralPredicate(e.predicate)
1070
- );
1071
-
1072
- const neighbors = [];
1073
- for (const edge of allEdges) {
1074
- const neighborId = edge.subject === result.chunk_id ? edge.object : edge.subject;
1075
- const neighborChunk = await this.findChunkById(neighborId);
1076
- if (!neighborChunk) continue;
1077
-
1078
- const similarity = this.cosineSimilarity(neighborChunk.vector, queryEmbedding);
1079
- const score = edge.weight * similarity;
1080
-
1081
- neighbors.push({
1082
- chunk_id: neighborId,
1083
- file: neighborChunk.file,
1084
- content: neighborChunk.content,
1085
- relation: edge.predicate,
1086
- score,
1087
- via: edge.source
1088
- });
1089
- }
1090
-
1091
- // Apply min_relevance filter, then cap at max_related
1092
- neighbors.sort((a, b) => b.score - a.score);
1093
- const minRelevance = GRAPH_CONFIG.min_relevance ?? 0.5;
1094
- const maxRelated = GRAPH_CONFIG.max_related ?? 4;
1095
- result.relatedContext = neighbors
1096
- .filter(n => n.score >= minRelevance)
1097
- .slice(0, maxRelated);
1098
-
1099
- // FR-060: Record provenance for each attached chunk
1100
- if (this.usageTracker) {
1101
- for (const n of result.relatedContext) {
1102
- this.usageTracker.recordProvenance(query, result.chunk_id, n.chunk_id, n.relation);
1103
- }
1104
- }
1105
- }
1106
- }
1107
-
1108
- // FR-061: Record usage counts for all returned chunks (main + attached)
1109
- if (this.usageTracker) {
1110
- const allChunkIds = [];
1111
- for (const r of finalResults) {
1112
- if (r.chunk_id) allChunkIds.push(r.chunk_id);
1113
- if (r.relatedContext) {
1114
- for (const rc of r.relatedContext) {
1115
- if (rc.chunk_id) allChunkIds.push(rc.chunk_id);
1116
- }
1117
- }
1118
- }
1119
- this.usageTracker.recordSearchResults(allChunkIds);
1120
- // Save asynchronously (non-blocking)
1121
- this.usageTracker.save().catch(() => {});
1122
- }
1261
+ await this._expandGraphContext(finalResults, queryEmbedding, query);
1123
1262
 
1124
1263
  return finalResults;
1125
1264
  }
@@ -1128,22 +1267,39 @@ class CodebaseIndexer {
1128
1267
  // Lazy-build an in-memory Map keyed by chunk_id on first call.
1129
1268
  // The cache lives until unloadModel() clears it.
1130
1269
  if (!this._chunkCache) {
1270
+ this._chunkCache = new Map();
1271
+
1272
+ // Primary: LanceDB (has vectors for cosine similarity in graph expansion)
1131
1273
  const tableName = "chunks";
1132
1274
  const tables = await this.db.tableNames();
1133
- if (!tables.includes(tableName)) return null;
1134
-
1135
- const table = await this.db.openTable(tableName);
1136
- let rows;
1137
- try {
1138
- rows = await table.filter("true").limit(100000).execute();
1139
- } catch (e) {
1140
- if (DEBUG) console.log("[vectorizer] Chunk cache build failed (corrupted table?):", e.message);
1141
- return null;
1275
+ if (tables.includes(tableName)) {
1276
+ try {
1277
+ const table = await this.db.openTable(tableName);
1278
+ const rows = await table.filter("true").limit(100000).execute();
1279
+ for (const row of rows) {
1280
+ if (row.chunk_id) {
1281
+ this._chunkCache.set(row.chunk_id, row);
1282
+ }
1283
+ }
1284
+ } catch (e) {
1285
+ if (DEBUG) console.log("[vectorizer] Chunk cache from LanceDB failed:", e.message);
1286
+ }
1142
1287
  }
1143
- this._chunkCache = new Map();
1144
- for (const row of rows) {
1145
- if (row.chunk_id) {
1146
- this._chunkCache.set(row.chunk_id, row);
1288
+
1289
+ // Fallback: ChunkStore (no vectors, but has content for BM25-only mode)
1290
+ if (this._chunkCache.size === 0 && this.chunkStore) {
1291
+ try {
1292
+ const allChunks = this.chunkStore.getAllChunks();
1293
+ for (const chunk of allChunks) {
1294
+ if (chunk.chunk_id) {
1295
+ this._chunkCache.set(chunk.chunk_id, chunk);
1296
+ }
1297
+ }
1298
+ if (DEBUG && allChunks.length > 0) {
1299
+ console.log(`[vectorizer] Chunk cache from ChunkStore (${allChunks.length} chunks, no vectors)`);
1300
+ }
1301
+ } catch (e) {
1302
+ if (DEBUG) console.log("[vectorizer] Chunk cache from ChunkStore failed:", e.message);
1147
1303
  }
1148
1304
  }
1149
1305
  }
@@ -1235,6 +1391,9 @@ class CodebaseIndexer {
1235
1391
  // best effort
1236
1392
  }
1237
1393
  }
1394
+ if (this.chunkStore) {
1395
+ try { this.chunkStore.deleteByFile(relPath); } catch { /* best effort */ }
1396
+ }
1238
1397
  delete this.hashes[relPath];
1239
1398
  deleted++;
1240
1399
  }
@@ -1297,6 +1456,11 @@ class CodebaseIndexer {
1297
1456
  }
1298
1457
  }
1299
1458
 
1459
+ // Delete chunks from ChunkStore
1460
+ if (this.chunkStore) {
1461
+ try { this.chunkStore.deleteByFile(relPath); } catch { /* best effort */ }
1462
+ }
1463
+
1300
1464
  delete this.hashes[relPath];
1301
1465
  removed++;
1302
1466
  }
@@ -1411,23 +1575,34 @@ class CodebaseIndexer {
1411
1575
 
1412
1576
  async getStats() {
1413
1577
  const fileCount = Object.keys(this.hashes).length;
1414
- let chunkCount = 0;
1578
+ let vectorChunkCount = 0;
1579
+ let totalChunkCount = 0;
1580
+ let hasUnvectorized = false;
1415
1581
 
1416
1582
  try {
1417
1583
  const tables = await this.db.tableNames();
1418
1584
  if (tables.includes("chunks")) {
1419
1585
  const table = await this.db.openTable("chunks");
1420
- chunkCount = await table.countRows();
1586
+ vectorChunkCount = await table.countRows();
1421
1587
  }
1422
1588
  } catch {}
1423
1589
 
1590
+ if (this.chunkStore) {
1591
+ try {
1592
+ totalChunkCount = this.chunkStore.count();
1593
+ hasUnvectorized = this.chunkStore.hasUnvectorizedChunks();
1594
+ } catch {}
1595
+ }
1596
+
1424
1597
  const preset = INDEX_PRESETS[this.indexName];
1425
1598
  return {
1426
1599
  indexName: this.indexName,
1427
1600
  description: preset?.description || "Custom index",
1428
1601
  model: EMBEDDING_MODEL,
1429
1602
  fileCount,
1430
- chunkCount,
1603
+ chunkCount: totalChunkCount || vectorChunkCount,
1604
+ vectorizedChunks: vectorChunkCount,
1605
+ pendingEmbedding: hasUnvectorized,
1431
1606
  features: {
1432
1607
  chunking: CHUNKING_CONFIG.strategy,
1433
1608
  hybrid: preset?.hybrid ?? false,