@comfanion/usethis_search 3.0.0-dev.27 → 3.0.0-dev.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -1
- package/tools/search.ts +13 -3
- package/vectorizer/chunk-store.ts +207 -0
- package/vectorizer/index.ts +290 -115
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@comfanion/usethis_search",
-  "version": "3.0.0-dev.27",
+  "version": "3.0.0-dev.28",
   "description": "OpenCode plugin: semantic search with graph-based context (v3: graph relations, 1-hop context, LSP + regex analyzers)",
   "type": "module",
   "main": "./index.ts",
@@ -32,6 +32,7 @@
     "vectorizer/query-cache.ts",
     "vectorizer/search-metrics.ts",
     "vectorizer/graph-db.ts",
+    "vectorizer/chunk-store.ts",
     "vectorizer/usage-tracker.ts",
     "vectorizer/graph-builder.ts",
     "vectorizer/analyzers/regex-analyzer.ts",
package/tools/search.ts
CHANGED
@@ -218,9 +218,12 @@ Examples:
   // ── Reranking — boost results where query keywords appear in text ──────
   const queryKeywords = args.query.toLowerCase().split(/\s+/).filter((w: string) => w.length > 2)
   for (const r of allResults) {
+    const isBM25Only = !!r._bm25Only
     const vectorScore = r._distance != null ? Math.max(0, 1 - r._distance / 2) : 0
     r._vectorScore = vectorScore
-    r._bm25Component = r._combinedScore != null ? Math.max(0, r._combinedScore - vectorScore) : 0
+    r._bm25Component = isBM25Only
+      ? (r._combinedScore ?? 0)
+      : (r._combinedScore != null ? Math.max(0, r._combinedScore - vectorScore) : 0)
     const baseScore = r._combinedScore ?? vectorScore

     const text = (r.content || "").toLowerCase()
@@ -264,10 +267,15 @@ Examples:

   // ── Confidence signal ──────────────────────────────────────────────────
   const topScore = sortedGroups[0].best._finalScore ?? 0
+  const hasBM25Only = allResults.some((r: any) => r._bm25Only)
   const scope = args.searchAll ? "all indexes" : `index "${indexName}"`
   const filterLabel = args.filter ? ` filter:"${args.filter}"` : ""
   let output = `## Search Results for: "${args.query}" (${scope}${filterLabel})\n\n`

+  if (hasBM25Only) {
+    output += `> **BM25-only mode** — vector embeddings not yet available. Results are keyword-based. Quality will improve after embedding completes.\n\n`
+  }
+
   if (topScore < 0.45) {
     output += `> **Low confidence results.** Best score: ${topScore.toFixed(3)}. These results may not be relevant to your query.\n> Try more specific keywords or different phrasing.\n\n`
   }
@@ -287,8 +295,10 @@ Examples:
     const metaLine = metaParts.length > 0 ? ` (${metaParts.join(", ")})` : ""

     // Score breakdown
-    const breakdownParts: string[] = [`vec: ${(r._vectorScore ?? 0).toFixed(2)}`]
-    if (r._bm25Component > 0.005) breakdownParts.push(`bm25: +${r._bm25Component.toFixed(2)}`)
+    const breakdownParts: string[] = r._bm25Only
+      ? [`bm25: ${(r._bm25Component ?? 0).toFixed(2)}`]
+      : [`vec: ${(r._vectorScore ?? 0).toFixed(2)}`]
+    if (!r._bm25Only && r._bm25Component > 0.005) breakdownParts.push(`bm25: +${r._bm25Component.toFixed(2)}`)
     if (r._keywordBonus > 0.005) breakdownParts.push(`kw: +${r._keywordBonus.toFixed(2)}`)
     const breakdown = breakdownParts.join(", ")

package/vectorizer/chunk-store.ts
ADDED
@@ -0,0 +1,207 @@
+/**
+ * ChunkStore — SQLite-based persistent chunk storage.
+ * Populated by Phase 1 (no vectors needed). Provides BM25 search
+ * and metadata queries immediately, before embedding is complete.
+ *
+ * Uses bun:sqlite with WAL mode for concurrent read access.
+ */
+
+import { Database } from "bun:sqlite"
+
+export interface StoredChunk {
+  chunk_id: string
+  file: string
+  chunk_index: number
+  content: string
+  file_type: string
+  language: string
+  last_modified: string
+  file_size: number
+  heading_context: string
+  function_name: string
+  class_name: string
+  tags: string
+  start_line: number
+  end_line: number
+  archived: boolean
+  vectorized: boolean
+}
+
+export class ChunkStore {
+  private db: Database | null = null
+
+  // Prepared statements
+  private _stmtInsert: any = null
+  private _stmtByFile: any = null
+  private _stmtDeleteByFile: any = null
+  private _stmtAll: any = null
+  private _stmtByChunkId: any = null
+  private _stmtMarkVectorized: any = null
+  private _stmtHasVectors: any = null
+  private _stmtCount: any = null
+  private _stmtSearch: any = null
+
+  constructor(private dbPath: string) {}
+
+  async init(): Promise<this> {
+    const fullPath = this.dbPath.endsWith(".db") ? this.dbPath : this.dbPath + ".db"
+    this.db = new Database(fullPath)
+
+    this.db.exec("PRAGMA journal_mode = WAL")
+    this.db.exec("PRAGMA synchronous = NORMAL")
+    this.db.exec("PRAGMA cache_size = -4000") // 4MB cache
+
+    this.db.exec(`
+      CREATE TABLE IF NOT EXISTS chunks (
+        chunk_id TEXT PRIMARY KEY,
+        file TEXT NOT NULL,
+        chunk_index INTEGER NOT NULL DEFAULT 0,
+        content TEXT NOT NULL,
+        file_type TEXT NOT NULL DEFAULT '',
+        language TEXT NOT NULL DEFAULT '',
+        last_modified TEXT NOT NULL DEFAULT '',
+        file_size INTEGER NOT NULL DEFAULT 0,
+        heading_context TEXT NOT NULL DEFAULT '',
+        function_name TEXT NOT NULL DEFAULT '',
+        class_name TEXT NOT NULL DEFAULT '',
+        tags TEXT NOT NULL DEFAULT '',
+        start_line INTEGER NOT NULL DEFAULT -1,
+        end_line INTEGER NOT NULL DEFAULT -1,
+        archived INTEGER NOT NULL DEFAULT 0,
+        vectorized INTEGER NOT NULL DEFAULT 0
+      )
+    `)
+
+    this.db.exec("CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file)")
+    this.db.exec("CREATE INDEX IF NOT EXISTS idx_chunks_vectorized ON chunks(vectorized)")
+    this.db.exec("CREATE INDEX IF NOT EXISTS idx_chunks_language ON chunks(language)")
+
+    // Prepare statements
+    this._stmtInsert = this.db.prepare(`
+      INSERT OR REPLACE INTO chunks
+        (chunk_id, file, chunk_index, content, file_type, language, last_modified, file_size,
+         heading_context, function_name, class_name, tags, start_line, end_line, archived, vectorized)
+      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 0)
+    `)
+    this._stmtByFile = this.db.prepare("SELECT * FROM chunks WHERE file = ?")
+    this._stmtDeleteByFile = this.db.prepare("DELETE FROM chunks WHERE file = ?")
+    this._stmtAll = this.db.prepare("SELECT * FROM chunks")
+    this._stmtByChunkId = this.db.prepare("SELECT * FROM chunks WHERE chunk_id = ?")
+    this._stmtMarkVectorized = this.db.prepare("UPDATE chunks SET vectorized = 1 WHERE file = ?")
+    this._stmtHasVectors = this.db.prepare("SELECT COUNT(*) as cnt FROM chunks WHERE vectorized = 0")
+    this._stmtCount = this.db.prepare("SELECT COUNT(*) as cnt FROM chunks")
+
+    return this
+  }
+
+  /**
+   * Store chunks from Phase 1 (batch, in transaction).
+   */
+  storeChunks(rows: Array<{
+    chunk_id: string, file: string, chunk_index: number, content: string,
+    file_type: string, language: string, last_modified: string, file_size: number,
+    heading_context: string, function_name: string, class_name: string, tags: string,
+    start_line: number, end_line: number, archived: boolean
+  }>): void {
+    if (!this.db) throw new Error("ChunkStore not initialized")
+
+    const insertMany = this.db.transaction((items: typeof rows) => {
+      for (const r of items) {
+        this._stmtInsert.run(
+          r.chunk_id, r.file, r.chunk_index, r.content,
+          r.file_type, r.language, r.last_modified, r.file_size,
+          r.heading_context, r.function_name, r.class_name, r.tags,
+          r.start_line, r.end_line, r.archived ? 1 : 0
+        )
+      }
+    })
+    insertMany(rows)
+  }
+
+  /**
+   * Delete all chunks for a file (before re-indexing).
+   */
+  deleteByFile(filePath: string): void {
+    if (!this.db) throw new Error("ChunkStore not initialized")
+    this._stmtDeleteByFile.run(filePath)
+  }
+
+  /**
+   * Mark all chunks for a file as vectorized (Phase 2 complete).
+   */
+  markVectorized(filePath: string): void {
+    if (!this.db) throw new Error("ChunkStore not initialized")
+    this._stmtMarkVectorized.run(filePath)
+  }
+
+  /**
+   * Check if all chunks have vectors.
+   */
+  hasUnvectorizedChunks(): boolean {
+    if (!this.db) return false
+    const row = this._stmtHasVectors.get() as { cnt: number }
+    return row.cnt > 0
+  }
+
+  /**
+   * Get all chunks (for BM25 index building).
+   */
+  getAllChunks(): StoredChunk[] {
+    if (!this.db) return []
+    return this._stmtAll.all().map((r: any) => this.toChunk(r))
+  }
+
+  /**
+   * Get chunks for a specific file.
+   */
+  getChunksByFile(filePath: string): StoredChunk[] {
+    if (!this.db) return []
+    return this._stmtByFile.all(filePath).map((r: any) => this.toChunk(r))
+  }
+
+  /**
+   * Get a single chunk by ID.
+   */
+  getChunkById(chunkId: string): StoredChunk | null {
+    if (!this.db) return null
+    const row = this._stmtByChunkId.get(chunkId)
+    return row ? this.toChunk(row) : null
+  }
+
+  /**
+   * Total chunk count.
+   */
+  count(): number {
+    if (!this.db) return 0
+    const row = this._stmtCount.get() as { cnt: number }
+    return row.cnt
+  }
+
+  close(): void {
+    if (this.db) {
+      this.db.close()
+      this.db = null
+    }
+  }
+
+  private toChunk(row: any): StoredChunk {
+    return {
+      chunk_id: row.chunk_id,
+      file: row.file,
+      chunk_index: row.chunk_index,
+      content: row.content,
+      file_type: row.file_type,
+      language: row.language,
+      last_modified: row.last_modified,
+      file_size: row.file_size,
+      heading_context: row.heading_context,
+      function_name: row.function_name,
+      class_name: row.class_name,
+      tags: row.tags,
+      start_line: row.start_line,
+      end_line: row.end_line,
+      archived: !!row.archived,
+      vectorized: !!row.vectorized,
+    }
+  }
+}
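The new module above is the core of this release: chunk text and metadata now live in SQLite before any embeddings exist. A rough usage sketch, based only on the methods shown in the diff (the path and row values are made up for illustration):

  import { ChunkStore } from "./vectorizer/chunk-store.ts"

  // Phase 1: persist chunk text + metadata; rows start with vectorized = 0
  const store = await new ChunkStore("/tmp/example-index/chunks").init() // opens chunks.db in WAL mode
  store.deleteByFile("src/app.ts")
  store.storeChunks([{
    chunk_id: "src/app.ts:0", file: "src/app.ts", chunk_index: 0,
    content: "export function main() {}", file_type: "code", language: "typescript",
    last_modified: new Date().toISOString(), file_size: 128,
    heading_context: "", function_name: "main", class_name: "", tags: "",
    start_line: 1, end_line: 1, archived: false,
  }])

  // Phase 2: once embeddings for the file land in LanceDB, flip the flag
  store.markVectorized("src/app.ts")

  console.log(store.count())                  // 1
  console.log(store.hasUnvectorizedChunks())  // false — everything is embedded
  store.close()

Because inserts go through INSERT OR REPLACE inside a single transaction, re-indexing a file is idempotent, and the WAL pragma lets a concurrent reader keep querying while Phase 1 writes.

The tools/search.ts hunks earlier consume the same state on the display side: when a result carries _bm25Only, the score breakdown switches from a "vec: …" label to a plain "bm25: …" label and the output gets the BM25-only banner. A small sketch of that arithmetic, using the field names from the diff with hypothetical numbers:

  // Hybrid result — vector distance present, BM25 contribution is whatever exceeds it.
  const hybrid = { _distance: 0.6, _combinedScore: 0.82 }
  const vectorScore = Math.max(0, 1 - hybrid._distance / 2)          // 0.70 → "vec: 0.70"
  const bm25Part = Math.max(0, hybrid._combinedScore - vectorScore)  // 0.12 → "bm25: +0.12"

  // BM25-only result — no distance, so the whole combined score is the BM25 score.
  const keywordOnly = { _distance: null, _combinedScore: 0.91, _bm25Only: true }
  // vectorScore would be 0, so the breakdown shows "bm25: 0.91" instead of "vec: …".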
package/vectorizer/index.ts
CHANGED
@@ -18,6 +18,7 @@ import { SearchMetrics } from "./search-metrics.ts";
 import { GraphDB } from "./graph-db.ts";
 import { GraphBuilder, isStructuralPredicate } from "./graph-builder.ts";
 import { UsageTracker } from "./usage-tracker.ts";
+import { ChunkStore } from "./chunk-store.ts";

 // Suppress transformers.js logs unless DEBUG is set
 const DEBUG = process.env.DEBUG?.includes("vectorizer") || process.env.DEBUG === "*";
@@ -448,6 +449,7 @@ class CodebaseIndexer {
     this.graphBuilder = null; // Graph builder orchestrator
     this._chunkCache = null; // Lazy Map<chunk_id, row> for findChunkById
     this.usageTracker = null; // Usage tracking & provenance (v3)
+    this.chunkStore = null; // SQLite chunk store (BM25 without vectors)
   }

   async init() {
@@ -459,6 +461,16 @@
     this.db = await lancedb.connect(path.join(this.cacheDir, "lancedb"));
     await this.loadHashes();

+    // ChunkStore — SQLite store for BM25 search without vectors
+    try {
+      const chunkStorePath = path.join(this.cacheDir, "chunks.db");
+      this.chunkStore = await new ChunkStore(chunkStorePath).init();
+      if (DEBUG) console.log(`[vectorizer] ChunkStore initialized: ${chunkStorePath}`);
+    } catch (e) {
+      if (DEBUG) console.log(`[vectorizer] ChunkStore init failed: ${e.message || e}`);
+      this.chunkStore = null;
+    }
+
     // Graph DB — only if graph is enabled in config
     // Non-fatal: if LevelDB lock fails (parallel access), search works without graph
     if (GRAPH_CONFIG.enabled) {
@@ -512,6 +524,11 @@
     }
     this._bm25Rows = null;
     this.metrics = null;
+    // Close ChunkStore
+    if (this.chunkStore) {
+      try { this.chunkStore.close(); } catch { /* best effort */ }
+      this.chunkStore = null;
+    }
     // Close graph DB to release LevelDB lock
     if (this.graphDB) {
       try { await this.graphDB.close(); } catch { /* best effort */ }
@@ -685,6 +702,16 @@
       end_line: chunk.end_line ?? -1,
     }));

+    // Store chunks in ChunkStore (Phase 1 — BM25 available immediately)
+    if (this.chunkStore) {
+      try {
+        this.chunkStore.deleteByFile(relPath);
+        this.chunkStore.storeChunks(rows);
+      } catch (e) {
+        if (DEBUG) console.log(`[vectorizer] ChunkStore write failed for ${relPath}: ${e.message || e}`);
+      }
+    }
+
     return { relPath, hash, rows };
   }

@@ -739,9 +766,12 @@
       await this.db.createTable(tableName, allData);
     }

-    // Update hashes
+    // Update hashes + mark vectorized in ChunkStore
     for (const pf of preparedFiles) {
       this.hashes[pf.relPath] = pf.hash;
+      if (this.chunkStore) {
+        try { this.chunkStore.markVectorized(pf.relPath); } catch { /* non-fatal */ }
+      }
     }
     await this.saveHashes();

@@ -842,6 +872,24 @@

     if (data.length === 0) return false;

+    // Store in ChunkStore (Phase 1 data) + mark vectorized (has embedding)
+    if (this.chunkStore) {
+      try {
+        this.chunkStore.deleteByFile(relPath);
+        this.chunkStore.storeChunks(data.map(d => ({
+          chunk_id: d.chunk_id, file: d.file, chunk_index: d.chunk_index,
+          content: d.content, file_type: d.file_type, language: d.language,
+          last_modified: d.last_modified, file_size: d.file_size,
+          heading_context: d.heading_context, function_name: d.function_name,
+          class_name: d.class_name, tags: d.tags,
+          start_line: d.start_line, end_line: d.end_line, archived: d.archived,
+        })));
+        this.chunkStore.markVectorized(relPath);
+      } catch (e) {
+        if (DEBUG) console.log(`[vectorizer] ChunkStore write failed for ${relPath}: ${e.message || e}`);
+      }
+    }
+
     const tableName = "chunks";
     const tables = await this.db.tableNames();
     if (tables.includes(tableName)) {
@@ -869,6 +917,35 @@
   async ensureBM25() {
     if (this.bm25) return this.bm25;

+    // Primary source: ChunkStore (SQLite) — available after Phase 1, no vectors needed
+    if (this.chunkStore) {
+      try {
+        const allChunks = this.chunkStore.getAllChunks();
+        if (allChunks.length > 0) {
+          // Sort for stable ID mapping between builds
+          allChunks.sort((a, b) => {
+            const ka = `${a.file}:${a.chunk_index}`;
+            const kb = `${b.file}:${b.chunk_index}`;
+            return ka.localeCompare(kb);
+          });
+
+          // Release previous data before rebuilding
+          if (this.bm25) this.bm25.clear();
+          this._bm25Rows = null;
+
+          this.bm25 = new BM25Index();
+          this.bm25.build(allChunks.map((r) => r.content));
+          this._bm25Rows = allChunks;
+
+          if (DEBUG) console.log(`[vectorizer] BM25 built from ChunkStore (${allChunks.length} chunks)`);
+          return this.bm25;
+        }
+      } catch (e) {
+        if (DEBUG) console.log("[vectorizer] BM25 from ChunkStore failed, trying LanceDB:", e.message);
+      }
+    }
+
+    // Fallback: LanceDB (legacy — for indexes without ChunkStore)
     const tableName = "chunks";
     const tables = await this.db.tableNames();
     if (!tables.includes(tableName)) return null;
@@ -902,15 +979,170 @@
     return this.bm25;
   }

-  // ──
+  // ── Shared helpers for search paths ─────────────────────────────────────────
+
+  _applyMetadataFilters(results, includeArchived, options) {
+    if (!includeArchived) {
+      results = results.filter((r) => !r.archived);
+    }
+    if (options.fileType) {
+      results = results.filter((r) => r.file_type === options.fileType);
+    }
+    if (options.language) {
+      results = results.filter((r) => r.language === options.language);
+    }
+    if (options.modifiedAfter) {
+      const after = new Date(options.modifiedAfter).getTime();
+      results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() >= after);
+    }
+    if (options.modifiedBefore) {
+      const before = new Date(options.modifiedBefore).getTime();
+      results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() <= before);
+    }
+    if (options.tags && options.tags.length > 0) {
+      results = results.filter((r) => {
+        const rowTags = (r.tags || "").split(",").filter(Boolean);
+        return options.tags.some((t) => rowTags.includes(t));
+      });
+    }
+    return results;
+  }
+
+  async _expandGraphContext(finalResults, queryEmbedding, query) {
+    if (!this.graphDB) return;
+
+    for (const result of finalResults) {
+      if (!result.chunk_id) continue;
+
+      const outgoing = await this.graphDB.getOutgoing(result.chunk_id);
+      const incoming = await this.graphDB.getIncoming(result.chunk_id);
+      const allEdges = [...outgoing, ...incoming].filter(
+        e => e.predicate !== "belongs_to" && e.predicate !== "graph_built" && !isStructuralPredicate(e.predicate)
+      );

+      const neighbors = [];
+      for (const edge of allEdges) {
+        const neighborId = edge.subject === result.chunk_id ? edge.object : edge.subject;
+        const neighborChunk = await this.findChunkById(neighborId);
+        if (!neighborChunk) continue;
+
+        let score;
+        if (queryEmbedding && neighborChunk.vector) {
+          const similarity = this.cosineSimilarity(neighborChunk.vector, queryEmbedding);
+          score = edge.weight * similarity;
+        } else {
+          // No vectors — use edge weight only (BM25-only fallback)
+          score = edge.weight * 0.7; // dampen without cosine confirmation
+        }
+
+        neighbors.push({
+          chunk_id: neighborId,
+          file: neighborChunk.file,
+          content: neighborChunk.content,
+          relation: edge.predicate,
+          score,
+          via: edge.source
+        });
+      }
+
+      neighbors.sort((a, b) => b.score - a.score);
+      const minRelevance = GRAPH_CONFIG.min_relevance ?? 0.5;
+      const maxRelated = GRAPH_CONFIG.max_related ?? 4;
+      result.relatedContext = neighbors
+        .filter(n => n.score >= minRelevance)
+        .slice(0, maxRelated);
+
+      if (this.usageTracker) {
+        for (const n of result.relatedContext) {
+          this.usageTracker.recordProvenance(query, result.chunk_id, n.chunk_id, n.relation);
+        }
+      }
+    }
+
+    // Record usage counts for all returned chunks
+    if (this.usageTracker) {
+      const allChunkIds = [];
+      for (const r of finalResults) {
+        if (r.chunk_id) allChunkIds.push(r.chunk_id);
+        if (r.relatedContext) {
+          for (const rc of r.relatedContext) {
+            if (rc.chunk_id) allChunkIds.push(rc.chunk_id);
+          }
+        }
+      }
+      this.usageTracker.recordSearchResults(allChunkIds);
+      this.usageTracker.save().catch(() => {});
+    }
+  }
+
+  // ── Search (v3: hybrid + BM25-only fallback + metadata filters + metrics) ──

   async search(query, limit = 5, includeArchived = false, options = {}) {
     const tableName = "chunks";
     const tables = await this.db.tableNames();
-
-
+
+    const indexConfig = INDEX_PRESETS[this.indexName];
+    const indexHybridEnabled = indexConfig?.hybrid ?? false;
+    const indexBM25Weight = indexConfig?.bm25_weight ?? HYBRID_CONFIG.bm25_weight;
+    const isHybrid = indexHybridEnabled || options.hybrid;
+
+    // ── Detect if vectors are available ──────────────────────────────────────
+    const hasVectorTable = tables.includes(tableName);
+    let hasVectors = false;
+    if (hasVectorTable) {
+      try {
+        const table = await this.db.openTable(tableName);
+        const rowCount = await table.countRows();
+        hasVectors = rowCount > 0;
+      } catch {
+        hasVectors = false;
+      }
+    }
+
+    // ── BM25-only fallback (Phase 1 complete, Phase 2 not yet) ──────────────
+    if (!hasVectors) {
+      const bm25 = await this.ensureBM25();
+      if (!bm25 || !this._bm25Rows) {
+        // No vectors AND no chunks — nothing indexed yet
+        return [];
+      }
+
+      if (DEBUG) console.log("[vectorizer] BM25-only search (no vectors yet)");
+
+      const fetchLimit = Math.max(limit * 3, 50);
+      const bm25Results = bm25.search(query, fetchLimit);
+
+      // Normalize BM25 scores to [0, 1]
+      let maxBM25 = 0;
+      for (const r of bm25Results) {
+        if (r.score > maxBM25) maxBM25 = r.score;
+      }
+
+      let results = [];
+      for (const br of bm25Results) {
+        if (br.id < this._bm25Rows.length) {
+          const row = this._bm25Rows[br.id];
+          const normScore = maxBM25 > 0 ? br.score / maxBM25 : 0;
+          results.push({
+            ...row,
+            _combinedScore: normScore,
+            _distance: null, // no vector distance available
+            _bm25Only: true,
+          });
+        }
+      }
+
+      // Apply metadata filters then return (graph context added below)
+      results = this._applyMetadataFilters(results, includeArchived, options);
+      const finalResults = results.slice(0, limit);
+
+      // Graph context expansion (same as vector path)
+      await this._expandGraphContext(finalResults, null, query);
+
+      return finalResults;
     }

+    // ── Vector search (Phase 2 complete) ─────────────────────────────────────
     const queryEmbedding = await this.embedQuery(query);
     const table = await this.db.openTable(tableName);

@@ -918,10 +1150,6 @@
     const hasFilters = !includeArchived || options.fileType || options.language ||
       options.modifiedAfter || options.modifiedBefore ||
       (options.tags && options.tags.length > 0);
-    const indexConfig = INDEX_PRESETS[this.indexName];
-    const indexHybridEnabled = indexConfig?.hybrid ?? false;
-    const indexBM25Weight = indexConfig?.bm25_weight ?? HYBRID_CONFIG.bm25_weight;
-    const isHybrid = indexHybridEnabled || options.hybrid;
     const fetchLimit = (hasFilters || isHybrid) ? Math.max(limit *3, 50) : limit;
     let results;
     try {
@@ -1005,35 +1233,7 @@
     }

     // ── Metadata filters ──────────────────────────────────────────────────
-    if (!includeArchived) {
-      results = results.filter((r) => !r.archived);
-    }
-
-    if (options.fileType) {
-      results = results.filter((r) => r.file_type === options.fileType);
-    }
-
-    if (options.language) {
-      results = results.filter((r) => r.language === options.language);
-    }
-
-    if (options.modifiedAfter) {
-      const after = new Date(options.modifiedAfter).getTime();
-      results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() >= after);
-    }
-
-    if (options.modifiedBefore) {
-      const before = new Date(options.modifiedBefore).getTime();
-      results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() <= before);
-    }
-
-    if (options.tags && options.tags.length > 0) {
-      results = results.filter((r) => {
-        const rowTags = (r.tags || "").split(",").filter(Boolean);
-        return options.tags.some((t) => rowTags.includes(t));
-      });
-    }
-
+    results = this._applyMetadataFilters(results, includeArchived, options);
     const finalResults = results.slice(0, limit);

     // ── Metrics tracking ────────────────────────────────────────────────────
@@ -1058,68 +1258,7 @@
     }

     // ── Graph context expansion (v3) ───────────────────────────────────────
-    if (this.graphDB) {
-      for (const result of finalResults) {
-        if (!result.chunk_id) continue;
-
-        const outgoing = await this.graphDB.getOutgoing(result.chunk_id);
-        const incoming = await this.graphDB.getIncoming(result.chunk_id);
-        // Filter out structural and meta edges — only relation edges are useful for context
-        const allEdges = [...outgoing, ...incoming].filter(
-          e => e.predicate !== "belongs_to" && e.predicate !== "graph_built" && !isStructuralPredicate(e.predicate)
-        );
-
-        const neighbors = [];
-        for (const edge of allEdges) {
-          const neighborId = edge.subject === result.chunk_id ? edge.object : edge.subject;
-          const neighborChunk = await this.findChunkById(neighborId);
-          if (!neighborChunk) continue;
-
-          const similarity = this.cosineSimilarity(neighborChunk.vector, queryEmbedding);
-          const score = edge.weight * similarity;
-
-          neighbors.push({
-            chunk_id: neighborId,
-            file: neighborChunk.file,
-            content: neighborChunk.content,
-            relation: edge.predicate,
-            score,
-            via: edge.source
-          });
-        }
-
-        // Apply min_relevance filter, then cap at max_related
-        neighbors.sort((a, b) => b.score - a.score);
-        const minRelevance = GRAPH_CONFIG.min_relevance ?? 0.5;
-        const maxRelated = GRAPH_CONFIG.max_related ?? 4;
-        result.relatedContext = neighbors
-          .filter(n => n.score >= minRelevance)
-          .slice(0, maxRelated);
-
-        // FR-060: Record provenance for each attached chunk
-        if (this.usageTracker) {
-          for (const n of result.relatedContext) {
-            this.usageTracker.recordProvenance(query, result.chunk_id, n.chunk_id, n.relation);
-          }
-        }
-      }
-    }
-
-    // FR-061: Record usage counts for all returned chunks (main + attached)
-    if (this.usageTracker) {
-      const allChunkIds = [];
-      for (const r of finalResults) {
-        if (r.chunk_id) allChunkIds.push(r.chunk_id);
-        if (r.relatedContext) {
-          for (const rc of r.relatedContext) {
-            if (rc.chunk_id) allChunkIds.push(rc.chunk_id);
-          }
-        }
-      }
-      this.usageTracker.recordSearchResults(allChunkIds);
-      // Save asynchronously (non-blocking)
-      this.usageTracker.save().catch(() => {});
-    }
+    await this._expandGraphContext(finalResults, queryEmbedding, query);

     return finalResults;
   }
@@ -1128,22 +1267,39 @@
     // Lazy-build an in-memory Map keyed by chunk_id on first call.
     // The cache lives until unloadModel() clears it.
     if (!this._chunkCache) {
+      this._chunkCache = new Map();
+
+      // Primary: LanceDB (has vectors for cosine similarity in graph expansion)
       const tableName = "chunks";
       const tables = await this.db.tableNames();
-      if (
-
-
-
-
-
-
-
-
+      if (tables.includes(tableName)) {
+        try {
+          const table = await this.db.openTable(tableName);
+          const rows = await table.filter("true").limit(100000).execute();
+          for (const row of rows) {
+            if (row.chunk_id) {
+              this._chunkCache.set(row.chunk_id, row);
+            }
+          }
+        } catch (e) {
+          if (DEBUG) console.log("[vectorizer] Chunk cache from LanceDB failed:", e.message);
+        }
       }
-
-
-
-
+
+      // Fallback: ChunkStore (no vectors, but has content for BM25-only mode)
+      if (this._chunkCache.size === 0 && this.chunkStore) {
+        try {
+          const allChunks = this.chunkStore.getAllChunks();
+          for (const chunk of allChunks) {
+            if (chunk.chunk_id) {
+              this._chunkCache.set(chunk.chunk_id, chunk);
+            }
+          }
+          if (DEBUG && allChunks.length > 0) {
+            console.log(`[vectorizer] Chunk cache from ChunkStore (${allChunks.length} chunks, no vectors)`);
+          }
+        } catch (e) {
+          if (DEBUG) console.log("[vectorizer] Chunk cache from ChunkStore failed:", e.message);
       }
     }
   }
@@ -1235,6 +1391,9 @@
         // best effort
       }
     }
+    if (this.chunkStore) {
+      try { this.chunkStore.deleteByFile(relPath); } catch { /* best effort */ }
+    }
     delete this.hashes[relPath];
     deleted++;
   }
@@ -1297,6 +1456,11 @@
       }
     }

+    // Delete chunks from ChunkStore
+    if (this.chunkStore) {
+      try { this.chunkStore.deleteByFile(relPath); } catch { /* best effort */ }
+    }
+
    delete this.hashes[relPath];
     removed++;
   }
@@ -1411,23 +1575,34 @@

   async getStats() {
     const fileCount = Object.keys(this.hashes).length;
-    let chunkCount = 0;
+    let vectorChunkCount = 0;
+    let totalChunkCount = 0;
+    let hasUnvectorized = false;

     try {
       const tables = await this.db.tableNames();
       if (tables.includes("chunks")) {
         const table = await this.db.openTable("chunks");
-        chunkCount = await table.countRows();
+        vectorChunkCount = await table.countRows();
       }
     } catch {}

+    if (this.chunkStore) {
+      try {
+        totalChunkCount = this.chunkStore.count();
+        hasUnvectorized = this.chunkStore.hasUnvectorizedChunks();
+      } catch {}
+    }
+
     const preset = INDEX_PRESETS[this.indexName];
     return {
       indexName: this.indexName,
       description: preset?.description || "Custom index",
       model: EMBEDDING_MODEL,
       fileCount,
-      chunkCount,
+      chunkCount: totalChunkCount || vectorChunkCount,
+      vectorizedChunks: vectorChunkCount,
+      pendingEmbedding: hasUnvectorized,
       features: {
         chunking: CHUNKING_CONFIG.strategy,
         hybrid: preset?.hybrid ?? false,