@comfanion/usethis_search 3.0.0-dev.26 → 3.0.0-dev.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -1
- package/tools/search.ts +13 -3
- package/vectorizer/chunk-store.ts +207 -0
- package/vectorizer/index.ts +478 -129
package/package.json  CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@comfanion/usethis_search",
-  "version": "3.0.0-dev.26",
+  "version": "3.0.0-dev.28",
   "description": "OpenCode plugin: semantic search with graph-based context (v3: graph relations, 1-hop context, LSP + regex analyzers)",
   "type": "module",
   "main": "./index.ts",
@@ -32,6 +32,7 @@
     "vectorizer/query-cache.ts",
     "vectorizer/search-metrics.ts",
     "vectorizer/graph-db.ts",
+    "vectorizer/chunk-store.ts",
     "vectorizer/usage-tracker.ts",
     "vectorizer/graph-builder.ts",
     "vectorizer/analyzers/regex-analyzer.ts",
package/tools/search.ts  CHANGED

@@ -218,9 +218,12 @@ Examples:
   // ── Reranking — boost results where query keywords appear in text ──────
   const queryKeywords = args.query.toLowerCase().split(/\s+/).filter((w: string) => w.length > 2)
   for (const r of allResults) {
+    const isBM25Only = !!r._bm25Only
     const vectorScore = r._distance != null ? Math.max(0, 1 - r._distance / 2) : 0
     r._vectorScore = vectorScore
-    r._bm25Component =
+    r._bm25Component = isBM25Only
+      ? (r._combinedScore ?? 0)
+      : (r._combinedScore != null ? Math.max(0, r._combinedScore - vectorScore) : 0)
     const baseScore = r._combinedScore ?? vectorScore

     const text = (r.content || "").toLowerCase()
@@ -264,10 +267,15 @@ Examples:

   // ── Confidence signal ──────────────────────────────────────────────────
   const topScore = sortedGroups[0].best._finalScore ?? 0
+  const hasBM25Only = allResults.some((r: any) => r._bm25Only)
   const scope = args.searchAll ? "all indexes" : `index "${indexName}"`
   const filterLabel = args.filter ? ` filter:"${args.filter}"` : ""
   let output = `## Search Results for: "${args.query}" (${scope}${filterLabel})\n\n`

+  if (hasBM25Only) {
+    output += `> **BM25-only mode** — vector embeddings not yet available. Results are keyword-based. Quality will improve after embedding completes.\n\n`
+  }
+
   if (topScore < 0.45) {
     output += `> **Low confidence results.** Best score: ${topScore.toFixed(3)}. These results may not be relevant to your query.\n> Try more specific keywords or different phrasing.\n\n`
   }
@@ -287,8 +295,10 @@ Examples:
   const metaLine = metaParts.length > 0 ? ` (${metaParts.join(", ")})` : ""

   // Score breakdown
-  const breakdownParts: string[] =
-
+  const breakdownParts: string[] = r._bm25Only
+    ? [`bm25: ${(r._bm25Component ?? 0).toFixed(2)}`]
+    : [`vec: ${(r._vectorScore ?? 0).toFixed(2)}`]
+  if (!r._bm25Only && r._bm25Component > 0.005) breakdownParts.push(`bm25: +${r._bm25Component.toFixed(2)}`)
   if (r._keywordBonus > 0.005) breakdownParts.push(`kw: +${r._keywordBonus.toFixed(2)}`)
   const breakdown = breakdownParts.join(", ")
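Aside on the reranking change above: each result's score is now split into a vector component and a BM25 component so the breakdown line can label them separately, while BM25-only rows keep their entire score under the `bm25:` label. A minimal sketch of that split, using a hypothetical `Result` type and `scoreBreakdown` helper (the shipped code computes these fields inline on its result rows):

```typescript
// Sketch only — `Result` and `scoreBreakdown` are illustrative names.
interface Result {
  _distance: number | null   // LanceDB distance; null in BM25-only mode
  _combinedScore?: number    // hybrid score, or normalized BM25 score when _bm25Only
  _bm25Only?: boolean        // set by the indexer when no vectors exist yet
}

function scoreBreakdown(r: Result): { vectorScore: number; bm25Component: number } {
  // Distance 0 → score 1; distance 2 (opposite unit vectors) → score 0.
  const vectorScore = r._distance != null ? Math.max(0, 1 - r._distance / 2) : 0
  // BM25-only: the whole combined score is keyword-based.
  // Hybrid: the BM25 share is whatever the combined score adds beyond the vector score.
  const bm25Component = r._bm25Only
    ? (r._combinedScore ?? 0)
    : (r._combinedScore != null ? Math.max(0, r._combinedScore - vectorScore) : 0)
  return { vectorScore, bm25Component }
}

// A hybrid hit at distance 0.6 with combined score 0.82:
// vectorScore = 0.70, bm25Component = 0.12 → rendered as "vec: 0.70, bm25: +0.12"
console.log(scoreBreakdown({ _distance: 0.6, _combinedScore: 0.82 }))
```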
package/vectorizer/chunk-store.ts  ADDED

@@ -0,0 +1,207 @@
+/**
+ * ChunkStore — SQLite-based persistent chunk storage.
+ * Populated by Phase 1 (no vectors needed). Provides BM25 search
+ * and metadata queries immediately, before embedding is complete.
+ *
+ * Uses bun:sqlite with WAL mode for concurrent read access.
+ */
+
+import { Database } from "bun:sqlite"
+
+export interface StoredChunk {
+  chunk_id: string
+  file: string
+  chunk_index: number
+  content: string
+  file_type: string
+  language: string
+  last_modified: string
+  file_size: number
+  heading_context: string
+  function_name: string
+  class_name: string
+  tags: string
+  start_line: number
+  end_line: number
+  archived: boolean
+  vectorized: boolean
+}
+
+export class ChunkStore {
+  private db: Database | null = null
+
+  // Prepared statements
+  private _stmtInsert: any = null
+  private _stmtByFile: any = null
+  private _stmtDeleteByFile: any = null
+  private _stmtAll: any = null
+  private _stmtByChunkId: any = null
+  private _stmtMarkVectorized: any = null
+  private _stmtHasVectors: any = null
+  private _stmtCount: any = null
+  private _stmtSearch: any = null
+
+  constructor(private dbPath: string) {}
+
+  async init(): Promise<this> {
+    const fullPath = this.dbPath.endsWith(".db") ? this.dbPath : this.dbPath + ".db"
+    this.db = new Database(fullPath)
+
+    this.db.exec("PRAGMA journal_mode = WAL")
+    this.db.exec("PRAGMA synchronous = NORMAL")
+    this.db.exec("PRAGMA cache_size = -4000") // 4MB cache
+
+    this.db.exec(`
+      CREATE TABLE IF NOT EXISTS chunks (
+        chunk_id TEXT PRIMARY KEY,
+        file TEXT NOT NULL,
+        chunk_index INTEGER NOT NULL DEFAULT 0,
+        content TEXT NOT NULL,
+        file_type TEXT NOT NULL DEFAULT '',
+        language TEXT NOT NULL DEFAULT '',
+        last_modified TEXT NOT NULL DEFAULT '',
+        file_size INTEGER NOT NULL DEFAULT 0,
+        heading_context TEXT NOT NULL DEFAULT '',
+        function_name TEXT NOT NULL DEFAULT '',
+        class_name TEXT NOT NULL DEFAULT '',
+        tags TEXT NOT NULL DEFAULT '',
+        start_line INTEGER NOT NULL DEFAULT -1,
+        end_line INTEGER NOT NULL DEFAULT -1,
+        archived INTEGER NOT NULL DEFAULT 0,
+        vectorized INTEGER NOT NULL DEFAULT 0
+      )
+    `)
+
+    this.db.exec("CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file)")
+    this.db.exec("CREATE INDEX IF NOT EXISTS idx_chunks_vectorized ON chunks(vectorized)")
+    this.db.exec("CREATE INDEX IF NOT EXISTS idx_chunks_language ON chunks(language)")
+
+    // Prepare statements
+    this._stmtInsert = this.db.prepare(`
+      INSERT OR REPLACE INTO chunks
+        (chunk_id, file, chunk_index, content, file_type, language, last_modified, file_size,
+         heading_context, function_name, class_name, tags, start_line, end_line, archived, vectorized)
+      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 0)
+    `)
+    this._stmtByFile = this.db.prepare("SELECT * FROM chunks WHERE file = ?")
+    this._stmtDeleteByFile = this.db.prepare("DELETE FROM chunks WHERE file = ?")
+    this._stmtAll = this.db.prepare("SELECT * FROM chunks")
+    this._stmtByChunkId = this.db.prepare("SELECT * FROM chunks WHERE chunk_id = ?")
+    this._stmtMarkVectorized = this.db.prepare("UPDATE chunks SET vectorized = 1 WHERE file = ?")
+    this._stmtHasVectors = this.db.prepare("SELECT COUNT(*) as cnt FROM chunks WHERE vectorized = 0")
+    this._stmtCount = this.db.prepare("SELECT COUNT(*) as cnt FROM chunks")
+
+    return this
+  }
+
+  /**
+   * Store chunks from Phase 1 (batch, in transaction).
+   */
+  storeChunks(rows: Array<{
+    chunk_id: string, file: string, chunk_index: number, content: string,
+    file_type: string, language: string, last_modified: string, file_size: number,
+    heading_context: string, function_name: string, class_name: string, tags: string,
+    start_line: number, end_line: number, archived: boolean
+  }>): void {
+    if (!this.db) throw new Error("ChunkStore not initialized")
+
+    const insertMany = this.db.transaction((items: typeof rows) => {
+      for (const r of items) {
+        this._stmtInsert.run(
+          r.chunk_id, r.file, r.chunk_index, r.content,
+          r.file_type, r.language, r.last_modified, r.file_size,
+          r.heading_context, r.function_name, r.class_name, r.tags,
+          r.start_line, r.end_line, r.archived ? 1 : 0
+        )
+      }
+    })
+    insertMany(rows)
+  }
+
+  /**
+   * Delete all chunks for a file (before re-indexing).
+   */
+  deleteByFile(filePath: string): void {
+    if (!this.db) throw new Error("ChunkStore not initialized")
+    this._stmtDeleteByFile.run(filePath)
+  }
+
+  /**
+   * Mark all chunks for a file as vectorized (Phase 2 complete).
+   */
+  markVectorized(filePath: string): void {
+    if (!this.db) throw new Error("ChunkStore not initialized")
+    this._stmtMarkVectorized.run(filePath)
+  }
+
+  /**
+   * Check if all chunks have vectors.
+   */
+  hasUnvectorizedChunks(): boolean {
+    if (!this.db) return false
+    const row = this._stmtHasVectors.get() as { cnt: number }
+    return row.cnt > 0
+  }
+
+  /**
+   * Get all chunks (for BM25 index building).
+   */
+  getAllChunks(): StoredChunk[] {
+    if (!this.db) return []
+    return this._stmtAll.all().map((r: any) => this.toChunk(r))
+  }
+
+  /**
+   * Get chunks for a specific file.
+   */
+  getChunksByFile(filePath: string): StoredChunk[] {
+    if (!this.db) return []
+    return this._stmtByFile.all(filePath).map((r: any) => this.toChunk(r))
+  }
+
+  /**
+   * Get a single chunk by ID.
+   */
+  getChunkById(chunkId: string): StoredChunk | null {
+    if (!this.db) return null
+    const row = this._stmtByChunkId.get(chunkId)
+    return row ? this.toChunk(row) : null
+  }
+
+  /**
+   * Total chunk count.
+   */
+  count(): number {
+    if (!this.db) return 0
+    const row = this._stmtCount.get() as { cnt: number }
+    return row.cnt
+  }
+
+  close(): void {
+    if (this.db) {
+      this.db.close()
+      this.db = null
+    }
+  }
+
+  private toChunk(row: any): StoredChunk {
+    return {
+      chunk_id: row.chunk_id,
+      file: row.file,
+      chunk_index: row.chunk_index,
+      content: row.content,
+      file_type: row.file_type,
+      language: row.language,
+      last_modified: row.last_modified,
+      file_size: row.file_size,
+      heading_context: row.heading_context,
+      function_name: row.function_name,
+      class_name: row.class_name,
+      tags: row.tags,
+      start_line: row.start_line,
+      end_line: row.end_line,
+      archived: !!row.archived,
+      vectorized: !!row.vectorized,
+    }
+  }
+}
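Taken together, the new module gives the indexer a chunk database that is queryable before any embedding has run. A hedged usage sketch — the path and the sample row are invented for illustration; the wiring in index.ts below is the real integration:

```typescript
import { ChunkStore } from "./chunk-store.ts"

// Phase 1: persist chunks as soon as they are produced — no model required.
const store = await new ChunkStore("/tmp/example-index/chunks").init() // ".db" appended
store.deleteByFile("src/app.ts") // makes re-indexing idempotent
store.storeChunks([{
  chunk_id: "chunk:src/app.ts::_chunk_0", file: "src/app.ts", chunk_index: 0,
  content: "export function main() { /* ... */ }",
  file_type: "code", language: "typescript",
  last_modified: new Date().toISOString(), file_size: 42,
  heading_context: "", function_name: "main", class_name: "",
  tags: "", start_line: 1, end_line: 1, archived: false,
}])

console.log(store.count())                 // 1 — BM25 can be built right away
console.log(store.hasUnvectorizedChunks()) // true — Phase 2 hasn't run yet

// Phase 2: once embeddings for the file land in LanceDB, flip the flag.
store.markVectorized("src/app.ts")
console.log(store.hasUnvectorizedChunks()) // false
store.close()
```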
package/vectorizer/index.ts  CHANGED

@@ -18,6 +18,7 @@ import { SearchMetrics } from "./search-metrics.ts";
 import { GraphDB } from "./graph-db.ts";
 import { GraphBuilder, isStructuralPredicate } from "./graph-builder.ts";
 import { UsageTracker } from "./usage-tracker.ts";
+import { ChunkStore } from "./chunk-store.ts";

 // Suppress transformers.js logs unless DEBUG is set
 const DEBUG = process.env.DEBUG?.includes("vectorizer") || process.env.DEBUG === "*";
@@ -448,6 +449,7 @@ class CodebaseIndexer {
     this.graphBuilder = null; // Graph builder orchestrator
     this._chunkCache = null; // Lazy Map<chunk_id, row> for findChunkById
     this.usageTracker = null; // Usage tracking & provenance (v3)
+    this.chunkStore = null; // SQLite chunk store (BM25 without vectors)
   }

   async init() {
@@ -459,6 +461,16 @@ class CodebaseIndexer {
     this.db = await lancedb.connect(path.join(this.cacheDir, "lancedb"));
     await this.loadHashes();

+    // ChunkStore — SQLite store for BM25 search without vectors
+    try {
+      const chunkStorePath = path.join(this.cacheDir, "chunks.db");
+      this.chunkStore = await new ChunkStore(chunkStorePath).init();
+      if (DEBUG) console.log(`[vectorizer] ChunkStore initialized: ${chunkStorePath}`);
+    } catch (e) {
+      if (DEBUG) console.log(`[vectorizer] ChunkStore init failed: ${e.message || e}`);
+      this.chunkStore = null;
+    }
+
     // Graph DB — only if graph is enabled in config
     // Non-fatal: if LevelDB lock fails (parallel access), search works without graph
     if (GRAPH_CONFIG.enabled) {
@@ -512,6 +524,11 @@ class CodebaseIndexer {
     }
     this._bm25Rows = null;
     this.metrics = null;
+    // Close ChunkStore
+    if (this.chunkStore) {
+      try { this.chunkStore.close(); } catch { /* best effort */ }
+      this.chunkStore = null;
+    }
     // Close graph DB to release LevelDB lock
     if (this.graphDB) {
       try { await this.graphDB.close(); } catch { /* best effort */ }
@@ -612,7 +629,161 @@ class CodebaseIndexer {
     return this.hashes[relPath] !== currentHash;
   }

-  // ──
+  // ── Phase 1: Prepare file (chunk + graph, NO embedding) ─────────────────
+  // Returns prepared chunk data ready for embedding, or null if skipped.
+
+  async prepareFile(filePath) {
+    const relPath = path.relative(this.root, filePath);
+
+    let content;
+    try {
+      content = await fs.readFile(filePath, "utf8");
+    } catch {
+      return null;
+    }
+
+    const hash = this.fileHash(content);
+    if (this.hashes[relPath] === hash) {
+      return null; // unchanged
+    }
+
+    // Extract metadata
+    const fileMeta = await extractFileMetadata(filePath, content);
+    const archived = this.isArchived(relPath, content);
+
+    // Clean content before chunking
+    const cleaned = cleanContent(content, fileMeta.file_type, CLEANING_CONFIG);
+
+    // Semantic chunking
+    const chunks = chunkContent(cleaned, fileMeta.file_type, fileMeta.language, CHUNKING_CONFIG);
+
+    // Assign chunk IDs
+    const chunksWithIds = this.graphBuilder
+      ? this.graphBuilder.assignChunkIds(relPath, chunks)
+      : chunks.map((c, i) => ({ ...c, chunk_id: `chunk:${relPath}::_chunk_${i}` }));
+
+    // Build graph edges (Phase 1 — no embedding needed)
+    if (this.graphBuilder && this.graphDB) {
+      await this.graphDB.deleteByFile(relPath);
+      const edgesBuilt = await this.graphBuilder.buildEdges(relPath, content, chunksWithIds, fileMeta.file_type);
+
+      if (edgesBuilt > 0 || DEBUG) {
+        const timestamp = new Date().toISOString().slice(11, 19);
+        const logMsg = `${timestamp} Graph built: ${relPath} (${chunksWithIds.length} chunks)`;
+        if (DEBUG) console.log(`[vectorizer] ${logMsg}`);
+        try {
+          const logPath = path.join(this.root, ".opencode", "indexer.log");
+          const fsSync = await import("fs");
+          fsSync.appendFileSync(logPath, `${logMsg}\n`);
+        } catch { /* non-fatal */ }
+      }
+
+      try {
+        await this.graphDB.setFileMeta(relPath, hash, Date.now());
+      } catch { /* non-fatal */ }
+    }
+
+    // Return prepared rows (without vector — Phase 2 fills it)
+    const rows = chunksWithIds.map((chunk, i) => ({
+      chunk_id: chunk.chunk_id,
+      file: relPath,
+      chunk_index: i,
+      content: chunk.content,
+      archived,
+      file_type: fileMeta.file_type,
+      language: fileMeta.language,
+      last_modified: fileMeta.last_modified,
+      file_size: fileMeta.file_size,
+      heading_context: chunk.heading_context || "",
+      function_name: chunk.function_name || "",
+      class_name: chunk.class_name || "",
+      tags: (fileMeta.tags || []).join(","),
+      start_line: chunk.start_line ?? -1,
+      end_line: chunk.end_line ?? -1,
+    }));
+
+    // Store chunks in ChunkStore (Phase 1 — BM25 available immediately)
+    if (this.chunkStore) {
+      try {
+        this.chunkStore.deleteByFile(relPath);
+        this.chunkStore.storeChunks(rows);
+      } catch (e) {
+        if (DEBUG) console.log(`[vectorizer] ChunkStore write failed for ${relPath}: ${e.message || e}`);
+      }
+    }
+
+    return { relPath, hash, rows };
+  }
+
+  // ── Phase 2: Batch embed + store ──────────────────────────────────────────
+  // Takes prepared rows from prepareFile(), embeds in batches, stores in LanceDB.
+
+  async embedAndStore(preparedFiles, batchSize = 32, onProgress = null) {
+    if (preparedFiles.length === 0) return 0;
+
+    // Collect all rows with their content for batch embedding
+    const allRows = [];
+    for (const pf of preparedFiles) {
+      for (const row of pf.rows) {
+        allRows.push(row);
+      }
+    }
+
+    if (allRows.length === 0) return 0;
+
+    // Load model once
+    const model = await this.loadModel();
+
+    // Batch embed
+    const allData = [];
+    for (let i = 0; i < allRows.length; i += batchSize) {
+      const batch = allRows.slice(i, i + batchSize);
+      const texts = batch.map(r => r.content);
+
+      // Embed batch — @xenova/transformers processes array inputs efficiently
+      const embeddings = [];
+      for (const text of texts) {
+        const result = await model(text, { pooling: "mean", normalize: true });
+        embeddings.push(Array.from(result.data));
+      }
+
+      for (let j = 0; j < batch.length; j++) {
+        allData.push({ ...batch[j], vector: embeddings[j] });
+      }

+      if (onProgress) {
+        onProgress(Math.min(i + batchSize, allRows.length), allRows.length, "embedding");
+      }
+    }
+
+    // Bulk store in LanceDB
+    const tableName = "chunks";
+    const tables = await this.db.tableNames();
+    if (tables.includes(tableName)) {
+      const table = await this.db.openTable(tableName);
+      await table.add(allData);
+    } else {
+      await this.db.createTable(tableName, allData);
+    }
+
+    // Update hashes + mark vectorized in ChunkStore
+    for (const pf of preparedFiles) {
+      this.hashes[pf.relPath] = pf.hash;
+      if (this.chunkStore) {
+        try { this.chunkStore.markVectorized(pf.relPath); } catch { /* non-fatal */ }
+      }
+    }
+    await this.saveHashes();
+
+    // Invalidate caches
+    if (this.bm25) { this.bm25.clear(); this.bm25 = null; }
+    this._bm25Rows = null;
+    this._chunkCache = null;
+
+    return allData.length;
+  }
+
+  // ── Index a single file (legacy — used by freshen/on-change) ───────────

   async indexFile(filePath) {
     const relPath = path.relative(this.root, filePath);
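The two methods above split indexing into a cheap preparation pass and a model-bound embedding pass; BM25 search works as soon as the first pass finishes. A sketch of how they compose — the `reindex` driver is hypothetical, and the updated `indexAll()` later in this diff does the equivalent internally with a concurrency of 5:

```typescript
// Hypothetical driver; assumes an initialized CodebaseIndexer-like `indexer`
// and absolute file paths (prepareFile derives relPath from indexer.root).
async function reindex(indexer: any, filePaths: string[]): Promise<void> {
  // Phase 1 — chunk + graph edges. Unchanged or unreadable files return null.
  // After this loop, ChunkStore is populated and BM25-only search works.
  const prepared = []
  for (const filePath of filePaths) {
    const p = await indexer.prepareFile(filePath)
    if (p) prepared.push(p) // { relPath, hash, rows }
  }

  // Phase 2 — embed prepared chunks in batches of 32, write vectors to
  // LanceDB, then mark each file vectorized in ChunkStore.
  const embedded = await indexer.embedAndStore(prepared, 32,
    (done: number, total: number) => console.log(`embedded ${done}/${total} chunks`))
  console.log(`${embedded} chunks embedded across ${prepared.length} files`)
}
```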
@@ -701,6 +872,24 @@ class CodebaseIndexer {

     if (data.length === 0) return false;

+    // Store in ChunkStore (Phase 1 data) + mark vectorized (has embedding)
+    if (this.chunkStore) {
+      try {
+        this.chunkStore.deleteByFile(relPath);
+        this.chunkStore.storeChunks(data.map(d => ({
+          chunk_id: d.chunk_id, file: d.file, chunk_index: d.chunk_index,
+          content: d.content, file_type: d.file_type, language: d.language,
+          last_modified: d.last_modified, file_size: d.file_size,
+          heading_context: d.heading_context, function_name: d.function_name,
+          class_name: d.class_name, tags: d.tags,
+          start_line: d.start_line, end_line: d.end_line, archived: d.archived,
+        })));
+        this.chunkStore.markVectorized(relPath);
+      } catch (e) {
+        if (DEBUG) console.log(`[vectorizer] ChunkStore write failed for ${relPath}: ${e.message || e}`);
+      }
+    }
+
     const tableName = "chunks";
     const tables = await this.db.tableNames();
     if (tables.includes(tableName)) {
@@ -728,6 +917,35 @@ class CodebaseIndexer {
   async ensureBM25() {
     if (this.bm25) return this.bm25;

+    // Primary source: ChunkStore (SQLite) — available after Phase 1, no vectors needed
+    if (this.chunkStore) {
+      try {
+        const allChunks = this.chunkStore.getAllChunks();
+        if (allChunks.length > 0) {
+          // Sort for stable ID mapping between builds
+          allChunks.sort((a, b) => {
+            const ka = `${a.file}:${a.chunk_index}`;
+            const kb = `${b.file}:${b.chunk_index}`;
+            return ka.localeCompare(kb);
+          });
+
+          // Release previous data before rebuilding
+          if (this.bm25) this.bm25.clear();
+          this._bm25Rows = null;
+
+          this.bm25 = new BM25Index();
+          this.bm25.build(allChunks.map((r) => r.content));
+          this._bm25Rows = allChunks;
+
+          if (DEBUG) console.log(`[vectorizer] BM25 built from ChunkStore (${allChunks.length} chunks)`);
+          return this.bm25;
+        }
+      } catch (e) {
+        if (DEBUG) console.log("[vectorizer] BM25 from ChunkStore failed, trying LanceDB:", e.message);
+      }
+    }
+
+    // Fallback: LanceDB (legacy — for indexes without ChunkStore)
     const tableName = "chunks";
     const tables = await this.db.tableNames();
     if (!tables.includes(tableName)) return null;
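One subtlety in the `ensureBM25()` change: BM25 hits come back as positional document ids, so `_bm25Rows[i]` must be exactly the chunk whose content was passed to `build()` at position `i` — hence the `file:chunk_index` sort before building. A toy illustration of that contract (`MiniBM25` is a stand-in with the same `build()`/`search()` shape as the package's `BM25Index`, not its real scoring):

```typescript
// Stand-in index: scores a doc by how many query terms it contains.
class MiniBM25 {
  private docs: string[] = []
  build(texts: string[]): void { this.docs = texts }
  search(query: string, limit: number): Array<{ id: number; score: number }> {
    const terms = query.toLowerCase().split(/\s+/)
    return this.docs
      .map((d, id) => ({ id, score: terms.filter(t => d.toLowerCase().includes(t)).length }))
      .filter(r => r.score > 0)
      .sort((a, b) => b.score - a.score)
      .slice(0, limit)
  }
}

const rows = [
  { file: "a.ts", chunk_index: 0, content: "parse config file" },
  { file: "b.ts", chunk_index: 0, content: "open sqlite database" },
]
const bm25 = new MiniBM25()
bm25.build(rows.map(r => r.content)) // doc id i ↔ rows[i]: order must be stable
for (const hit of bm25.search("sqlite database", 5)) {
  console.log(rows[hit.id].file, hit.score) // "b.ts" 2
}
```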
@@ -761,15 +979,170 @@ class CodebaseIndexer {
     return this.bm25;
   }

-  // ──
+  // ── Shared helpers for search paths ─────────────────────────────────────────
+
+  _applyMetadataFilters(results, includeArchived, options) {
+    if (!includeArchived) {
+      results = results.filter((r) => !r.archived);
+    }
+    if (options.fileType) {
+      results = results.filter((r) => r.file_type === options.fileType);
+    }
+    if (options.language) {
+      results = results.filter((r) => r.language === options.language);
+    }
+    if (options.modifiedAfter) {
+      const after = new Date(options.modifiedAfter).getTime();
+      results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() >= after);
+    }
+    if (options.modifiedBefore) {
+      const before = new Date(options.modifiedBefore).getTime();
+      results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() <= before);
+    }
+    if (options.tags && options.tags.length > 0) {
+      results = results.filter((r) => {
+        const rowTags = (r.tags || "").split(",").filter(Boolean);
+        return options.tags.some((t) => rowTags.includes(t));
+      });
+    }
+    return results;
+  }
+
+  async _expandGraphContext(finalResults, queryEmbedding, query) {
+    if (!this.graphDB) return;
+
+    for (const result of finalResults) {
+      if (!result.chunk_id) continue;
+
+      const outgoing = await this.graphDB.getOutgoing(result.chunk_id);
+      const incoming = await this.graphDB.getIncoming(result.chunk_id);
+      const allEdges = [...outgoing, ...incoming].filter(
+        e => e.predicate !== "belongs_to" && e.predicate !== "graph_built" && !isStructuralPredicate(e.predicate)
+      );
+
+      const neighbors = [];
+      for (const edge of allEdges) {
+        const neighborId = edge.subject === result.chunk_id ? edge.object : edge.subject;
+        const neighborChunk = await this.findChunkById(neighborId);
+        if (!neighborChunk) continue;
+
+        let score;
+        if (queryEmbedding && neighborChunk.vector) {
+          const similarity = this.cosineSimilarity(neighborChunk.vector, queryEmbedding);
+          score = edge.weight * similarity;
+        } else {
+          // No vectors — use edge weight only (BM25-only fallback)
+          score = edge.weight * 0.7; // dampen without cosine confirmation
+        }
+
+        neighbors.push({
+          chunk_id: neighborId,
+          file: neighborChunk.file,
+          content: neighborChunk.content,
+          relation: edge.predicate,
+          score,
+          via: edge.source
+        });
+      }
+
+      neighbors.sort((a, b) => b.score - a.score);
+      const minRelevance = GRAPH_CONFIG.min_relevance ?? 0.5;
+      const maxRelated = GRAPH_CONFIG.max_related ?? 4;
+      result.relatedContext = neighbors
+        .filter(n => n.score >= minRelevance)
+        .slice(0, maxRelated);
+
+      if (this.usageTracker) {
+        for (const n of result.relatedContext) {
+          this.usageTracker.recordProvenance(query, result.chunk_id, n.chunk_id, n.relation);
+        }
+      }
+    }
+
+    // Record usage counts for all returned chunks
+    if (this.usageTracker) {
+      const allChunkIds = [];
+      for (const r of finalResults) {
+        if (r.chunk_id) allChunkIds.push(r.chunk_id);
+        if (r.relatedContext) {
+          for (const rc of r.relatedContext) {
+            if (rc.chunk_id) allChunkIds.push(rc.chunk_id);
+          }
+        }
+      }
+      this.usageTracker.recordSearchResults(allChunkIds);
+      this.usageTracker.save().catch(() => {});
+    }
+  }
+
+  // ── Search (v3: hybrid + BM25-only fallback + metadata filters + metrics) ──

   async search(query, limit = 5, includeArchived = false, options = {}) {
     const tableName = "chunks";
     const tables = await this.db.tableNames();
-
-
+
+    const indexConfig = INDEX_PRESETS[this.indexName];
+    const indexHybridEnabled = indexConfig?.hybrid ?? false;
+    const indexBM25Weight = indexConfig?.bm25_weight ?? HYBRID_CONFIG.bm25_weight;
+    const isHybrid = indexHybridEnabled || options.hybrid;
+
+    // ── Detect if vectors are available ──────────────────────────────────────
+    const hasVectorTable = tables.includes(tableName);
+    let hasVectors = false;
+    if (hasVectorTable) {
+      try {
+        const table = await this.db.openTable(tableName);
+        const rowCount = await table.countRows();
+        hasVectors = rowCount > 0;
+      } catch {
+        hasVectors = false;
+      }
     }

+    // ── BM25-only fallback (Phase 1 complete, Phase 2 not yet) ──────────────
+    if (!hasVectors) {
+      const bm25 = await this.ensureBM25();
+      if (!bm25 || !this._bm25Rows) {
+        // No vectors AND no chunks — nothing indexed yet
+        return [];
+      }
+
+      if (DEBUG) console.log("[vectorizer] BM25-only search (no vectors yet)");
+
+      const fetchLimit = Math.max(limit * 3, 50);
+      const bm25Results = bm25.search(query, fetchLimit);
+
+      // Normalize BM25 scores to [0, 1]
+      let maxBM25 = 0;
+      for (const r of bm25Results) {
+        if (r.score > maxBM25) maxBM25 = r.score;
+      }
+
+      let results = [];
+      for (const br of bm25Results) {
+        if (br.id < this._bm25Rows.length) {
+          const row = this._bm25Rows[br.id];
+          const normScore = maxBM25 > 0 ? br.score / maxBM25 : 0;
+          results.push({
+            ...row,
+            _combinedScore: normScore,
+            _distance: null, // no vector distance available
+            _bm25Only: true,
+          });
+        }
+      }
+
+      // Apply metadata filters then return (graph context added below)
+      results = this._applyMetadataFilters(results, includeArchived, options);
+      const finalResults = results.slice(0, limit);
+
+      // Graph context expansion (same as vector path)
+      await this._expandGraphContext(finalResults, null, query);
+
+      return finalResults;
+    }
+
+    // ── Vector search (Phase 2 complete) ─────────────────────────────────────
     const queryEmbedding = await this.embedQuery(query);
     const table = await this.db.openTable(tableName);

@@ -777,10 +1150,6 @@ class CodebaseIndexer {
     const hasFilters = !includeArchived || options.fileType || options.language ||
                        options.modifiedAfter || options.modifiedBefore ||
                        (options.tags && options.tags.length > 0);
-    const indexConfig = INDEX_PRESETS[this.indexName];
-    const indexHybridEnabled = indexConfig?.hybrid ?? false;
-    const indexBM25Weight = indexConfig?.bm25_weight ?? HYBRID_CONFIG.bm25_weight;
-    const isHybrid = indexHybridEnabled || options.hybrid;
     const fetchLimit = (hasFilters || isHybrid) ? Math.max(limit *3, 50) : limit;
     let results;
     try {
@@ -864,35 +1233,7 @@ class CodebaseIndexer {
     }

     // ── Metadata filters ──────────────────────────────────────────────────
-    if (!includeArchived) {
-      results = results.filter((r) => !r.archived);
-    }
-
-    if (options.fileType) {
-      results = results.filter((r) => r.file_type === options.fileType);
-    }
-
-    if (options.language) {
-      results = results.filter((r) => r.language === options.language);
-    }
-
-    if (options.modifiedAfter) {
-      const after = new Date(options.modifiedAfter).getTime();
-      results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() >= after);
-    }
-
-    if (options.modifiedBefore) {
-      const before = new Date(options.modifiedBefore).getTime();
-      results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() <= before);
-    }
-
-    if (options.tags && options.tags.length > 0) {
-      results = results.filter((r) => {
-        const rowTags = (r.tags || "").split(",").filter(Boolean);
-        return options.tags.some((t) => rowTags.includes(t));
-      });
-    }
-
+    results = this._applyMetadataFilters(results, includeArchived, options);
     const finalResults = results.slice(0, limit);

     // ── Metrics tracking ────────────────────────────────────────────────────
@@ -917,68 +1258,7 @@ class CodebaseIndexer {
     }

     // ── Graph context expansion (v3) ───────────────────────────────────────
-    if (this.graphDB) {
-      for (const result of finalResults) {
-        if (!result.chunk_id) continue;
-
-        const outgoing = await this.graphDB.getOutgoing(result.chunk_id);
-        const incoming = await this.graphDB.getIncoming(result.chunk_id);
-        // Filter out structural and meta edges — only relation edges are useful for context
-        const allEdges = [...outgoing, ...incoming].filter(
-          e => e.predicate !== "belongs_to" && e.predicate !== "graph_built" && !isStructuralPredicate(e.predicate)
-        );
-
-        const neighbors = [];
-        for (const edge of allEdges) {
-          const neighborId = edge.subject === result.chunk_id ? edge.object : edge.subject;
-          const neighborChunk = await this.findChunkById(neighborId);
-          if (!neighborChunk) continue;
-
-          const similarity = this.cosineSimilarity(neighborChunk.vector, queryEmbedding);
-          const score = edge.weight * similarity;
-
-          neighbors.push({
-            chunk_id: neighborId,
-            file: neighborChunk.file,
-            content: neighborChunk.content,
-            relation: edge.predicate,
-            score,
-            via: edge.source
-          });
-        }
-
-        // Apply min_relevance filter, then cap at max_related
-        neighbors.sort((a, b) => b.score - a.score);
-        const minRelevance = GRAPH_CONFIG.min_relevance ?? 0.5;
-        const maxRelated = GRAPH_CONFIG.max_related ?? 4;
-        result.relatedContext = neighbors
-          .filter(n => n.score >= minRelevance)
-          .slice(0, maxRelated);
-
-        // FR-060: Record provenance for each attached chunk
-        if (this.usageTracker) {
-          for (const n of result.relatedContext) {
-            this.usageTracker.recordProvenance(query, result.chunk_id, n.chunk_id, n.relation);
-          }
-        }
-      }
-    }
-
-    // FR-061: Record usage counts for all returned chunks (main + attached)
-    if (this.usageTracker) {
-      const allChunkIds = [];
-      for (const r of finalResults) {
-        if (r.chunk_id) allChunkIds.push(r.chunk_id);
-        if (r.relatedContext) {
-          for (const rc of r.relatedContext) {
-            if (rc.chunk_id) allChunkIds.push(rc.chunk_id);
-          }
-        }
-      }
-      this.usageTracker.recordSearchResults(allChunkIds);
-      // Save asynchronously (non-blocking)
-      this.usageTracker.save().catch(() => {});
-    }
+    await this._expandGraphContext(finalResults, queryEmbedding, query);

     return finalResults;
   }
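In the BM25-only fallback above, raw BM25 scores are unbounded and corpus-dependent, so they are normalized against the top hit before being exposed as `_combinedScore`; the `_bm25Only: true` flag is what later drives the banner and the `bm25:`-only breakdown in tools/search.ts. A small worked example with invented numbers:

```typescript
// Hypothetical raw BM25 hits (scores are unbounded):
const hits = [
  { id: 0, score: 7.4 },
  { id: 1, score: 3.7 },
]

// Normalize to [0, 1] by the maximum score, as the fallback path does:
const maxScore = Math.max(...hits.map(h => h.score), 0)
const results = hits.map(h => ({
  id: h.id,
  _combinedScore: maxScore > 0 ? h.score / maxScore : 0, // 1.0 and 0.5 here
  _distance: null,   // no vector distance in BM25-only mode
  _bm25Only: true,   // downstream: banner + "bm25:" score breakdown
}))
console.log(results)
```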
@@ -987,22 +1267,39 @@ class CodebaseIndexer {
     // Lazy-build an in-memory Map keyed by chunk_id on first call.
     // The cache lives until unloadModel() clears it.
     if (!this._chunkCache) {
+      this._chunkCache = new Map();
+
+      // Primary: LanceDB (has vectors for cosine similarity in graph expansion)
       const tableName = "chunks";
       const tables = await this.db.tableNames();
-      if (
-
-
-
-
-
-
-
-
+      if (tables.includes(tableName)) {
+        try {
+          const table = await this.db.openTable(tableName);
+          const rows = await table.filter("true").limit(100000).execute();
+          for (const row of rows) {
+            if (row.chunk_id) {
+              this._chunkCache.set(row.chunk_id, row);
+            }
+          }
+        } catch (e) {
+          if (DEBUG) console.log("[vectorizer] Chunk cache from LanceDB failed:", e.message);
+        }
       }
-
-
-
-
+
+      // Fallback: ChunkStore (no vectors, but has content for BM25-only mode)
+      if (this._chunkCache.size === 0 && this.chunkStore) {
+        try {
+          const allChunks = this.chunkStore.getAllChunks();
+          for (const chunk of allChunks) {
+            if (chunk.chunk_id) {
+              this._chunkCache.set(chunk.chunk_id, chunk);
+            }
+          }
+          if (DEBUG && allChunks.length > 0) {
+            console.log(`[vectorizer] Chunk cache from ChunkStore (${allChunks.length} chunks, no vectors)`);
+          }
+        } catch (e) {
+          if (DEBUG) console.log("[vectorizer] Chunk cache from ChunkStore failed:", e.message);
         }
       }
     }
@@ -1094,6 +1391,9 @@ class CodebaseIndexer {
         // best effort
       }
     }
+    if (this.chunkStore) {
+      try { this.chunkStore.deleteByFile(relPath); } catch { /* best effort */ }
+    }
     delete this.hashes[relPath];
     deleted++;
   }
@@ -1156,6 +1456,11 @@ class CodebaseIndexer {
       }
     }

+    // Delete chunks from ChunkStore
+    if (this.chunkStore) {
+      try { this.chunkStore.deleteByFile(relPath); } catch { /* best effort */ }
+    }
+
     delete this.hashes[relPath];
     removed++;
   }
@@ -1170,31 +1475,64 @@ class CodebaseIndexer {
     }
   }

-    let indexed = 0;
-    let skipped = 0;
     const total = files.length;
+    const CONCURRENCY = 5;

-
-
-
-
-
-
-
-
-
+    // ══════════════════════════════════════════════════════════════════════════
+    // Phase 1: Prepare files in parallel (chunk + graph, no embedding)
+    // ══════════════════════════════════════════════════════════════════════════
+    const preparedFiles = [];
+    let prepared = 0;
+    let skipped = 0;
+
+    // Process in batches of CONCURRENCY
+    for (let i = 0; i < files.length; i += CONCURRENCY) {
+      const batch = files.slice(i, i + CONCURRENCY);
+      const promises = batch.map(async (relPath) => {
+        const filePath = path.join(this.root, relPath);
+        try {
+          const result = await this.prepareFile(filePath);
+          return result;
+        } catch {
+          return null;
+        }
+      });
+
+      const results = await Promise.all(promises);
+      for (let j = 0; j < results.length; j++) {
+        if (results[j]) {
+          preparedFiles.push(results[j]);
+          prepared++;
+          if (onProgress) onProgress(prepared, total, results[j].relPath, i + j + 1, "prepare");
         } else {
           skipped++;
         }
-      } catch {
-        skipped++;
       }
     }

+    if (DEBUG) console.log(`[vectorizer] Phase 1 done: ${prepared} files prepared, ${skipped} skipped`);
+
+    // ══════════════════════════════════════════════════════════════════════════
+    // Phase 2: Batch embed + store (sequential, batch forward pass)
+    // ══════════════════════════════════════════════════════════════════════════
+    let chunksEmbedded = 0;
+    if (preparedFiles.length > 0) {
+      const totalChunks = preparedFiles.reduce((sum, pf) => sum + pf.rows.length, 0);
+      if (DEBUG) console.log(`[vectorizer] Phase 2: embedding ${totalChunks} chunks from ${preparedFiles.length} files`);
+
+      chunksEmbedded = await this.embedAndStore(preparedFiles, 32, (done, embedTotal, phase) => {
+        if (onProgress) onProgress(done, embedTotal, `embedding`, done, "embed");
+      });
+
+      if (DEBUG) console.log(`[vectorizer] Phase 2 done: ${chunksEmbedded} chunks embedded and stored`);
+    }
+
+    const indexed = prepared; // file count for backward compat
+
     // FR-005: Build semantic similarity edges as post-pass
     // Disabled by default (O(n²) — slow on large repos). Enable via graph.semantic_edges: true
     let semanticEdges = 0;
-    if (
+    if (chunksEmbedded > 0 && this.graphBuilder && this.graphDB && GRAPH_CONFIG.semantic_edges) {
       try {
         const tableName = "chunks";
         const tables = await this.db.tableNames();
@@ -1237,23 +1575,34 @@ class CodebaseIndexer {

   async getStats() {
     const fileCount = Object.keys(this.hashes).length;
-    let chunkCount = 0;
+    let vectorChunkCount = 0;
+    let totalChunkCount = 0;
+    let hasUnvectorized = false;

     try {
       const tables = await this.db.tableNames();
       if (tables.includes("chunks")) {
         const table = await this.db.openTable("chunks");
-        chunkCount = await table.countRows();
+        vectorChunkCount = await table.countRows();
       }
     } catch {}

+    if (this.chunkStore) {
+      try {
+        totalChunkCount = this.chunkStore.count();
+        hasUnvectorized = this.chunkStore.hasUnvectorizedChunks();
+      } catch {}
+    }
+
    const preset = INDEX_PRESETS[this.indexName];
    return {
      indexName: this.indexName,
      description: preset?.description || "Custom index",
      model: EMBEDDING_MODEL,
      fileCount,
-      chunkCount,
+      chunkCount: totalChunkCount || vectorChunkCount,
+      vectorizedChunks: vectorChunkCount,
+      pendingEmbedding: hasUnvectorized,
      features: {
        chunking: CHUNKING_CONFIG.strategy,
        hybrid: preset?.hybrid ?? false,
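With the new `getStats()` fields, a caller can tell a half-indexed repo (Phase 1 done, embeddings pending) apart from a fully embedded one. An illustrative snapshot while Phase 2 is still running — all values invented, and `model` comes from the package's `EMBEDDING_MODEL` constant:

```typescript
// Hypothetical getStats() result mid-way through embedding:
const stats = {
  indexName: "code",
  description: "Custom index",
  model: "<EMBEDDING_MODEL>",  // whatever the package's constant resolves to
  fileCount: 120,
  chunkCount: 980,        // ChunkStore total, falling back to the LanceDB count
  vectorizedChunks: 410,  // rows that already have embeddings in LanceDB
  pendingEmbedding: true, // ChunkStore still holds rows with vectorized = 0
  // features: { chunking, hybrid, ... }
}

if (stats.pendingEmbedding) {
  console.log("BM25-only mode — search works now, quality improves after embedding")
}
```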