@comfanion/usethis_search 3.0.0-dev.25 → 3.0.0-dev.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -3
- package/vectorizer/graph-db.ts +122 -148
- package/vectorizer/index.ts +189 -15
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@comfanion/usethis_search",
|
|
3
|
-
"version": "3.0.0-dev.
|
|
3
|
+
"version": "3.0.0-dev.27",
|
|
4
4
|
"description": "OpenCode plugin: semantic search with graph-based context (v3: graph relations, 1-hop context, LSP + regex analyzers)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./index.ts",
|
|
@@ -48,8 +48,6 @@
|
|
|
48
48
|
"@opencode-ai/plugin": ">=1.1.0",
|
|
49
49
|
"@xenova/transformers": "^2.17.0",
|
|
50
50
|
"glob": "^10.3.10",
|
|
51
|
-
"level": "^8.0.1",
|
|
52
|
-
"levelgraph": "^4.0.0",
|
|
53
51
|
"vectordb": "^0.4.0"
|
|
54
52
|
},
|
|
55
53
|
"peerDependencies": {
|
package/vectorizer/graph-db.ts
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
|
-
import
|
|
2
|
-
import { Level } from "level"
|
|
1
|
+
import { Database } from "bun:sqlite"
|
|
3
2
|
import { filePathFromNodeId, isStructuralPredicate } from "./graph-builder"
|
|
4
3
|
|
|
5
4
|
export interface Triple {
|
|
@@ -13,98 +12,137 @@ export interface Triple {
|
|
|
13
12
|
}
|
|
14
13
|
|
|
15
14
|
export class GraphDB {
|
|
16
|
-
private db:
|
|
15
|
+
private db: Database | null = null
|
|
17
16
|
private initialized: boolean = false
|
|
18
17
|
|
|
18
|
+
// Prepared statements (cached for performance)
|
|
19
|
+
private _stmtInsert: any = null
|
|
20
|
+
private _stmtBySubject: any = null
|
|
21
|
+
private _stmtByObject: any = null
|
|
22
|
+
private _stmtByFile: any = null
|
|
23
|
+
private _stmtDeleteByFile: any = null
|
|
24
|
+
private _stmtBySubjectPredicate: any = null
|
|
25
|
+
private _stmtByPredicate: any = null
|
|
26
|
+
private _stmtAll: any = null
|
|
27
|
+
|
|
19
28
|
constructor(private dbPath: string) {}
|
|
20
29
|
|
|
21
30
|
/**
 * Open (or create) the SQLite triple store, create the schema and indexes,
 * and prepare the statements cached on this instance.
 *
 * The database file is `dbPath`, with ".db" appended when the path does not
 * already end in ".db". Calling init() on an instance that is already
 * initialized is a no-op, so an open handle is never overwritten and leaked.
 *
 * @returns this instance, for chaining
 * @throws any error raised while opening or configuring the database; the
 *         partially opened handle is closed before the error is rethrown
 */
async init(): Promise<this> {
  // Re-running the setup would replace this.db without closing the old handle.
  if (this.initialized && this.db) return this

  // bun:sqlite uses a file path; append .db if not already
  const fullPath = this.dbPath.endsWith(".db") ? this.dbPath : this.dbPath + ".db"
  const db = new Database(fullPath)

  try {
    // WAL mode for concurrent readers
    db.exec("PRAGMA journal_mode = WAL")
    db.exec("PRAGMA synchronous = NORMAL") // faster writes, safe with WAL
    db.exec("PRAGMA cache_size = -2000") // ~2MB cache (negative value = size in KiB)

    // Create triples table
    db.exec(`
      CREATE TABLE IF NOT EXISTS triples (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        subject TEXT NOT NULL,
        predicate TEXT NOT NULL,
        object TEXT NOT NULL,
        weight REAL NOT NULL DEFAULT 0,
        source TEXT NOT NULL DEFAULT '',
        file TEXT NOT NULL DEFAULT '',
        line INTEGER
      )
    `)

    // Indexes for fast lookups
    db.exec("CREATE INDEX IF NOT EXISTS idx_subject ON triples(subject)")
    db.exec("CREATE INDEX IF NOT EXISTS idx_object ON triples(object)")
    db.exec("CREATE INDEX IF NOT EXISTS idx_file ON triples(file)")
    db.exec("CREATE INDEX IF NOT EXISTS idx_predicate ON triples(predicate)")
    db.exec("CREATE INDEX IF NOT EXISTS idx_subject_predicate ON triples(subject, predicate)")

    // Prepare statements
    this._stmtInsert = db.prepare(
      "INSERT INTO triples (subject, predicate, object, weight, source, file, line) VALUES (?, ?, ?, ?, ?, ?, ?)"
    )
    this._stmtBySubject = db.prepare("SELECT * FROM triples WHERE subject = ?")
    this._stmtByObject = db.prepare("SELECT * FROM triples WHERE object = ?")
    this._stmtByFile = db.prepare("SELECT * FROM triples WHERE file = ?")
    this._stmtDeleteByFile = db.prepare("DELETE FROM triples WHERE file = ?")
    this._stmtBySubjectPredicate = db.prepare("SELECT * FROM triples WHERE subject = ? AND predicate = ?")
    this._stmtByPredicate = db.prepare("SELECT * FROM triples WHERE predicate = ?")
    this._stmtAll = db.prepare("SELECT * FROM triples")
  } catch (err) {
    // Setup failed part-way: drop any statements already prepared and release
    // the handle, because close() only acts on a fully initialized instance.
    this._stmtInsert = null
    this._stmtBySubject = null
    this._stmtByObject = null
    this._stmtByFile = null
    this._stmtDeleteByFile = null
    this._stmtBySubjectPredicate = null
    this._stmtByPredicate = null
    this._stmtAll = null
    try {
      db.close()
    } catch {
      // keep the original setup error as the one reported to the caller
    }
    throw err
  }

  this.db = db
  this.initialized = true
  return this
}
|
|
27
76
|
|
|
77
|
+
/**
 * Convert a raw row of the `triples` table into a Triple.
 * A NULL `line` column is surfaced as `undefined`.
 */
private toTriple(row: any): Triple {
  const { subject, predicate, object, weight, source, file } = row
  return { subject, predicate, object, weight, source, file, line: row.line ?? undefined }
}
|
|
88
|
+
|
|
28
89
|
/**
 * Insert all given triples inside a single transaction.
 * A missing `line` is stored as NULL.
 *
 * @param triples edges to insert
 * @throws Error if init() has not been called
 */
async putEdges(triples: Triple[]): Promise<void> {
  if (!(this.initialized && this.db)) {
    throw new Error("GraphDB not initialized. Call init() first.")
  }

  // One transaction for the whole batch — much faster than individual inserts
  const writeBatch = this.db.transaction((edges: Triple[]) => {
    for (const edge of edges) {
      const lineOrNull = edge.line ?? null
      this._stmtInsert.run(
        edge.subject,
        edge.predicate,
        edge.object,
        edge.weight,
        edge.source,
        edge.file,
        lineOrNull
      )
    }
  })

  writeBatch(triples)
}
|
|
39
102
|
|
|
40
103
|
/**
 * Return every triple whose subject equals `chunkId`.
 *
 * @throws Error if init() has not been called
 */
async getOutgoing(chunkId: string): Promise<Triple[]> {
  if (!(this.initialized && this.db)) {
    throw new Error("GraphDB not initialized. Call init() first.")
  }

  const outgoing: Triple[] = []
  for (const row of this._stmtBySubject.all(chunkId)) {
    outgoing.push(this.toTriple(row))
  }
  return outgoing
}
|
|
51
110
|
|
|
52
111
|
/**
 * Return every triple whose object equals `chunkId`.
 *
 * @throws Error if init() has not been called
 */
async getIncoming(chunkId: string): Promise<Triple[]> {
  if (!(this.initialized && this.db)) {
    throw new Error("GraphDB not initialized. Call init() first.")
  }

  const incoming: Triple[] = []
  for (const row of this._stmtByObject.all(chunkId)) {
    incoming.push(this.toTriple(row))
  }
  return incoming
}
|
|
63
118
|
|
|
64
119
|
/**
 * Delete every triple whose `file` column equals `filePath`.
 *
 * @throws Error if init() has not been called
 */
async deleteByFile(filePath: string): Promise<void> {
  const ready = this.initialized && this.db
  if (!ready) {
    throw new Error("GraphDB not initialized. Call init() first.")
  }

  this._stmtDeleteByFile.run(filePath)
}
|
|
86
125
|
|
|
87
126
|
/**
 * Close the database and drop all cached statements.
 * Does nothing when the instance is not initialized.
 */
async close(): Promise<void> {
  if (!this.initialized || !this.db) return

  this.db.close()
  this.db = null

  // Statements belong to the closed handle; forget them all.
  this._stmtInsert = this._stmtBySubject = this._stmtByObject = this._stmtByFile = null
  this._stmtDeleteByFile = this._stmtBySubjectPredicate = this._stmtByPredicate = this._stmtAll = null

  this.initialized = false
}
|
|
98
141
|
|
|
99
142
|
// ---- FR-054: File metadata triples for incremental updates -----------------
|
|
100
143
|
|
|
101
|
-
/**
|
|
102
|
-
* Store graph build metadata for a file as a special triple.
|
|
103
|
-
* Subject: `meta:<filePath>`, Predicate: `graph_built`, Object: `<hash>`.
|
|
104
|
-
* Weight encodes the timestamp (seconds since epoch).
|
|
105
|
-
*/
|
|
106
144
|
async setFileMeta(filePath: string, hash: string, timestamp: number): Promise<void> {
|
|
107
|
-
if (!this.initialized) throw new Error("GraphDB not initialized. Call init() first.")
|
|
145
|
+
if (!this.initialized || !this.db) throw new Error("GraphDB not initialized. Call init() first.")
|
|
108
146
|
|
|
109
147
|
// Remove old meta triple for this file first
|
|
110
148
|
await this.deleteFileMeta(filePath)
|
|
@@ -113,111 +151,61 @@ export class GraphDB {
|
|
|
113
151
|
subject: `meta:${filePath}`,
|
|
114
152
|
predicate: "graph_built",
|
|
115
153
|
object: hash,
|
|
116
|
-
weight: Math.floor(timestamp / 1000),
|
|
154
|
+
weight: Math.floor(timestamp / 1000),
|
|
117
155
|
source: "meta",
|
|
118
156
|
file: filePath,
|
|
119
157
|
}
|
|
120
158
|
await this.putEdges([triple])
|
|
121
159
|
}
|
|
122
160
|
|
|
123
|
-
/**
|
|
124
|
-
* Get the stored graph build metadata for a file.
|
|
125
|
-
* Returns { hash, timestamp } or null if not found.
|
|
126
|
-
*/
|
|
127
161
|
/**
 * Look up the graph build metadata stored for a file
 * (subject `meta:<filePath>`, predicate `graph_built`).
 *
 * @returns `{ hash, timestamp }` taken from the first matching row, where
 *          `timestamp` is the stored weight multiplied by 1000, or null when
 *          no row exists
 * @throws Error if init() has not been called
 */
async getFileMeta(filePath: string): Promise<{ hash: string; timestamp: number } | null> {
  if (!(this.initialized && this.db)) throw new Error("GraphDB not initialized. Call init() first.")

  const metaRows = this._stmtBySubjectPredicate.all(`meta:${filePath}`, "graph_built")
  if (metaRows.length === 0) return null

  const first = metaRows[0]
  return { hash: first.object, timestamp: first.weight * 1000 }
}
|
|
146
171
|
|
|
147
|
-
/**
|
|
148
|
-
* Delete file meta triple.
|
|
149
|
-
*/
|
|
150
172
|
/**
 * Remove the `graph_built` meta triple stored for `filePath`, if any.
 *
 * Best-effort: an error raised by the DELETE itself is swallowed and never
 * reaches the caller.
 *
 * @param filePath path used to build the `meta:<filePath>` subject
 * @throws Error if init() has not been called
 */
async deleteFileMeta(filePath: string): Promise<void> {
  if (!this.initialized || !this.db) throw new Error("GraphDB not initialized. Call init() first.")

  try {
    // query() caches the compiled statement on the Database instance, so
    // repeated calls reuse it instead of compiling a new statement each time
    // as prepare() did. The guard above already narrows this.db to non-null.
    this.db
      .query("DELETE FROM triples WHERE subject = ? AND predicate = ?")
      .run(`meta:${filePath}`, "graph_built")
  } catch {
    // Silently ignore errors (kept for backward compatibility).
    // NOTE(review): a DELETE that matches no rows is not an error in SQLite,
    // so this only hides genuine failures — consider surfacing them.
  }
}
|
|
176
182
|
|
|
177
|
-
/**
|
|
178
|
-
* Get all file metadata triples (for validation / stats).
|
|
179
|
-
*/
|
|
180
183
|
/**
 * List the build metadata of every file that has a `graph_built` triple.
 *
 * @returns one entry per row: the subject with its leading `meta:` removed,
 *          the stored hash, and the stored weight multiplied by 1000
 * @throws Error if init() has not been called
 */
async getAllFileMeta(): Promise<Array<{ filePath: string; hash: string; timestamp: number }>> {
  if (!(this.initialized && this.db)) throw new Error("GraphDB not initialized. Call init() first.")

  const entries: Array<{ filePath: string; hash: string; timestamp: number }> = []
  for (const row of this._stmtByPredicate.all("graph_built")) {
    entries.push({
      filePath: row.subject.replace(/^meta:/, ""),
      hash: row.object,
      timestamp: row.weight * 1000,
    })
  }
  return entries
}
|
|
196
193
|
|
|
197
|
-
/**
|
|
198
|
-
* Get all triples in the graph (for validation/stats).
|
|
199
|
-
* Excludes meta, anchor, and structural triples by default.
|
|
200
|
-
* Pass includeStructural=true to also get structural edges.
|
|
201
|
-
*/
|
|
202
194
|
/**
 * Return the triples of the graph, for validation and stats.
 *
 * Triples with predicate `graph_built` or `belongs_to` are always left out.
 * Triples whose predicate satisfies isStructuralPredicate() are left out
 * unless `includeStructural` is true.
 *
 * @throws Error if init() has not been called
 */
async getAllTriples(includeStructural: boolean = false): Promise<Triple[]> {
  if (!(this.initialized && this.db)) throw new Error("GraphDB not initialized. Call init() first.")

  const kept: Triple[] = []
  for (const row of this._stmtAll.all()) {
    const triple = this.toTriple(row)

    const isBookkeeping = triple.predicate === "graph_built" || triple.predicate === "belongs_to"
    if (isBookkeeping) continue
    if (!includeStructural && isStructuralPredicate(triple.predicate)) continue

    kept.push(triple)
  }
  return kept
}
|
|
218
206
|
|
|
219
207
|
async getRelatedFiles(chunkId: string, maxDepth: number = 1): Promise<{path: string, relation: string, weight: number}[]> {
|
|
220
|
-
if (!this.initialized) {
|
|
208
|
+
if (!this.initialized || !this.db) {
|
|
221
209
|
throw new Error("GraphDB not initialized. Call init() first.")
|
|
222
210
|
}
|
|
223
211
|
|
|
@@ -225,7 +213,6 @@ export class GraphDB {
|
|
|
225
213
|
const visited = new Set<string>()
|
|
226
214
|
const self = this
|
|
227
215
|
|
|
228
|
-
// Resolve the caller's file directly from the node ID
|
|
229
216
|
const callerFile = filePathFromNodeId(chunkId)
|
|
230
217
|
|
|
231
218
|
async function traverse(currentId: string, currentDepth: number, currentRelation: string) {
|
|
@@ -236,19 +223,12 @@ export class GraphDB {
|
|
|
236
223
|
visited.add(currentId)
|
|
237
224
|
|
|
238
225
|
try {
|
|
239
|
-
const outgoing =
|
|
240
|
-
self.db.get({ subject: currentId }, (err: Error | undefined, triples: Triple[]) => {
|
|
241
|
-
if (err) reject(err)
|
|
242
|
-
else resolve(triples || [])
|
|
243
|
-
})
|
|
244
|
-
})
|
|
226
|
+
const outgoing = self._stmtBySubject.all(currentId).map((r: any) => self.toTriple(r))
|
|
245
227
|
|
|
246
228
|
for (const triple of outgoing) {
|
|
247
|
-
// Skip meta, anchor, and structural-only edges
|
|
248
229
|
if (triple.predicate === "graph_built" || triple.predicate === "belongs_to") continue
|
|
249
230
|
if (isStructuralPredicate(triple.predicate)) continue
|
|
250
231
|
|
|
251
|
-
// Resolve file for the target node directly from its ID
|
|
252
232
|
const targetFile = filePathFromNodeId(triple.object)
|
|
253
233
|
if (!targetFile) continue
|
|
254
234
|
|
|
@@ -267,12 +247,7 @@ export class GraphDB {
|
|
|
267
247
|
}
|
|
268
248
|
}
|
|
269
249
|
|
|
270
|
-
const incoming =
|
|
271
|
-
self.db.get({ object: currentId }, (err: Error | undefined, triples: Triple[]) => {
|
|
272
|
-
if (err) reject(err)
|
|
273
|
-
else resolve(triples || [])
|
|
274
|
-
})
|
|
275
|
-
})
|
|
250
|
+
const incoming = self._stmtByObject.all(currentId).map((r: any) => self.toTriple(r))
|
|
276
251
|
|
|
277
252
|
for (const triple of incoming) {
|
|
278
253
|
if (triple.predicate === "graph_built" || triple.predicate === "belongs_to") continue
|
|
@@ -298,7 +273,6 @@ export class GraphDB {
|
|
|
298
273
|
|
|
299
274
|
await traverse(chunkId, 0, "")
|
|
300
275
|
|
|
301
|
-
// Remove the caller's own file from results
|
|
302
276
|
if (callerFile) relatedFiles.delete(callerFile)
|
|
303
277
|
|
|
304
278
|
return Array.from(relatedFiles.entries())
|
package/vectorizer/index.ts
CHANGED
|
@@ -612,7 +612,148 @@ class CodebaseIndexer {
|
|
|
612
612
|
return this.hashes[relPath] !== currentHash;
|
|
613
613
|
}
|
|
614
614
|
|
|
615
|
-
// ──
|
|
615
|
+
// ── Phase 1: Prepare file (chunk + graph, NO embedding) ─────────────────
|
|
616
|
+
// Returns prepared chunk data ready for embedding, or null if skipped.
|
|
617
|
+
|
|
618
|
+
/**
 * Phase 1 for a single file: read, chunk and (when a graph builder and graph
 * DB are configured) rebuild its graph edges. No embedding happens here.
 *
 * @param filePath path of the file; its path relative to `this.root` is used
 *                 as the key for hashes, chunk rows and graph entries
 * @returns `{ relPath, hash, rows }` where `rows` are chunk records without a
 *          `vector` field, or null when the file cannot be read or its hash
 *          matches the one already recorded in `this.hashes`
 */
async prepareFile(filePath) {
  const relPath = path.relative(this.root, filePath);

  let source;
  try {
    source = await fs.readFile(filePath, "utf8");
  } catch {
    return null; // unreadable — skip
  }

  const hash = this.fileHash(source);
  const unchanged = this.hashes[relPath] === hash;
  if (unchanged) return null;

  // Metadata first: the file type and language drive cleaning and chunking.
  const fileMeta = await extractFileMetadata(filePath, source);
  const archived = this.isArchived(relPath, source);

  const cleaned = cleanContent(source, fileMeta.file_type, CLEANING_CONFIG);
  const pieces = chunkContent(cleaned, fileMeta.file_type, fileMeta.language, CHUNKING_CONFIG);

  // Chunk IDs come from the graph builder when there is one, otherwise from the chunk position.
  let chunksWithIds;
  if (this.graphBuilder) {
    chunksWithIds = this.graphBuilder.assignChunkIds(relPath, pieces);
  } else {
    chunksWithIds = pieces.map((piece, idx) => ({ ...piece, chunk_id: `chunk:${relPath}::_chunk_${idx}` }));
  }

  if (this.graphBuilder && this.graphDB) {
    // Replace whatever graph data was stored for this file.
    await this.graphDB.deleteByFile(relPath);
    const edgeCount = await this.graphBuilder.buildEdges(relPath, source, chunksWithIds, fileMeta.file_type);

    if (edgeCount > 0 || DEBUG) {
      const clock = new Date().toISOString().slice(11, 19); // HH:MM:SS part of the ISO timestamp
      const logMsg = `${clock} Graph built: ${relPath} (${chunksWithIds.length} chunks)`;
      if (DEBUG) console.log(`[vectorizer] ${logMsg}`);
      try {
        const logFile = path.join(this.root, ".opencode", "indexer.log");
        const nodeFs = await import("fs");
        nodeFs.appendFileSync(logFile, `${logMsg}\n`);
      } catch {
        /* logging is non-fatal */
      }
    }

    try {
      await this.graphDB.setFileMeta(relPath, hash, Date.now());
    } catch {
      /* non-fatal */
    }
  }

  // Rows are complete except for `vector`, which the embedding phase adds.
  const rows = chunksWithIds.map((chunk, chunkIndex) => {
    return {
      chunk_id: chunk.chunk_id,
      file: relPath,
      chunk_index: chunkIndex,
      content: chunk.content,
      archived,
      file_type: fileMeta.file_type,
      language: fileMeta.language,
      last_modified: fileMeta.last_modified,
      file_size: fileMeta.file_size,
      heading_context: chunk.heading_context || "",
      function_name: chunk.function_name || "",
      class_name: chunk.class_name || "",
      tags: (fileMeta.tags || []).join(","),
      start_line: chunk.start_line ?? -1,
      end_line: chunk.end_line ?? -1,
    };
  });

  return { relPath, hash, rows };
}
|
|
690
|
+
|
|
691
|
+
// ── Phase 2: Batch embed + store ──────────────────────────────────────────
|
|
692
|
+
// Takes prepared rows from prepareFile(), embeds in batches, stores in LanceDB.
|
|
693
|
+
|
|
694
|
+
/**
 * Phase 2: embed the rows produced by prepareFile() and append them to the
 * "chunks" table, then record the file hashes and invalidate search caches.
 *
 * @param preparedFiles results of prepareFile(), each `{ relPath, hash, rows }`
 * @param batchSize number of rows processed between progress callbacks;
 *                  values that are not a finite number >= 1 fall back to 32
 * @param onProgress optional callback `(done, total, "embedding")`, invoked
 *                   after each batch
 * @returns the number of rows embedded and stored
 */
async embedAndStore(preparedFiles, batchSize = 32, onProgress = null) {
  if (preparedFiles.length === 0) return 0;

  // Flatten the rows of all files into one list
  const allRows = [];
  for (const pf of preparedFiles) {
    for (const row of pf.rows) {
      allRows.push(row);
    }
  }

  if (allRows.length === 0) return 0;

  // A step of 0 would never advance the loop below, and NaN would end it
  // after embedding nothing while the hashes are still saved as "indexed".
  const step = Number.isFinite(batchSize) && batchSize >= 1 ? Math.floor(batchSize) : 32;

  // Load model once
  const model = await this.loadModel();

  const allData = [];
  for (let i = 0; i < allRows.length; i += step) {
    const batch = allRows.slice(i, i + step);

    // Texts are embedded one at a time; the batch only groups work between
    // progress callbacks, it is not a single batched model call.
    for (const row of batch) {
      const result = await model(row.content, { pooling: "mean", normalize: true });
      allData.push({ ...row, vector: Array.from(result.data) });
    }

    if (onProgress) {
      onProgress(Math.min(i + step, allRows.length), allRows.length, "embedding");
    }
  }

  // Bulk store in LanceDB
  // NOTE(review): add() appends; nothing here removes rows stored earlier for
  // the same files — confirm callers clear stale chunks of changed files.
  const tableName = "chunks";
  const tables = await this.db.tableNames();
  if (tables.includes(tableName)) {
    const table = await this.db.openTable(tableName);
    await table.add(allData);
  } else {
    await this.db.createTable(tableName, allData);
  }

  // Update hashes for all prepared files
  for (const pf of preparedFiles) {
    this.hashes[pf.relPath] = pf.hash;
  }
  await this.saveHashes();

  // Invalidate caches
  if (this.bm25) { this.bm25.clear(); this.bm25 = null; }
  this._bm25Rows = null;
  this._chunkCache = null;

  return allData.length;
}
|
|
755
|
+
|
|
756
|
+
// ── Index a single file (legacy — used by freshen/on-change) ───────────
|
|
616
757
|
|
|
617
758
|
async indexFile(filePath) {
|
|
618
759
|
const relPath = path.relative(this.root, filePath);
|
|
@@ -1170,31 +1311,64 @@ class CodebaseIndexer {
|
|
|
1170
1311
|
}
|
|
1171
1312
|
}
|
|
1172
1313
|
|
|
1173
|
-
let indexed = 0;
|
|
1174
|
-
let skipped = 0;
|
|
1175
1314
|
const total = files.length;
|
|
1315
|
+
const CONCURRENCY = 5;
|
|
1176
1316
|
|
|
1177
|
-
|
|
1178
|
-
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1317
|
+
// ══════════════════════════════════════════════════════════════════════════
|
|
1318
|
+
// Phase 1: Prepare files in parallel (chunk + graph, no embedding)
|
|
1319
|
+
// ══════════════════════════════════════════════════════════════════════════
|
|
1320
|
+
const preparedFiles = [];
|
|
1321
|
+
let prepared = 0;
|
|
1322
|
+
let skipped = 0;
|
|
1323
|
+
|
|
1324
|
+
// Process in batches of CONCURRENCY
|
|
1325
|
+
for (let i = 0; i < files.length; i += CONCURRENCY) {
|
|
1326
|
+
const batch = files.slice(i, i + CONCURRENCY);
|
|
1327
|
+
const promises = batch.map(async (relPath) => {
|
|
1328
|
+
const filePath = path.join(this.root, relPath);
|
|
1329
|
+
try {
|
|
1330
|
+
const result = await this.prepareFile(filePath);
|
|
1331
|
+
return result;
|
|
1332
|
+
} catch {
|
|
1333
|
+
return null;
|
|
1334
|
+
}
|
|
1335
|
+
});
|
|
1336
|
+
|
|
1337
|
+
const results = await Promise.all(promises);
|
|
1338
|
+
for (let j = 0; j < results.length; j++) {
|
|
1339
|
+
if (results[j]) {
|
|
1340
|
+
preparedFiles.push(results[j]);
|
|
1341
|
+
prepared++;
|
|
1342
|
+
if (onProgress) onProgress(prepared, total, results[j].relPath, i + j + 1, "prepare");
|
|
1186
1343
|
} else {
|
|
1187
1344
|
skipped++;
|
|
1188
1345
|
}
|
|
1189
|
-
} catch {
|
|
1190
|
-
skipped++;
|
|
1191
1346
|
}
|
|
1192
1347
|
}
|
|
1193
1348
|
|
|
1349
|
+
if (DEBUG) console.log(`[vectorizer] Phase 1 done: ${prepared} files prepared, ${skipped} skipped`);
|
|
1350
|
+
|
|
1351
|
+
// ══════════════════════════════════════════════════════════════════════════
|
|
1352
|
+
// Phase 2: Embed + store (sequential; embedAndStore embeds chunks one at a time, in groups of 32)
|
|
1353
|
+
// ══════════════════════════════════════════════════════════════════════════
|
|
1354
|
+
let chunksEmbedded = 0;
|
|
1355
|
+
if (preparedFiles.length > 0) {
|
|
1356
|
+
const totalChunks = preparedFiles.reduce((sum, pf) => sum + pf.rows.length, 0);
|
|
1357
|
+
if (DEBUG) console.log(`[vectorizer] Phase 2: embedding ${totalChunks} chunks from ${preparedFiles.length} files`);
|
|
1358
|
+
|
|
1359
|
+
chunksEmbedded = await this.embedAndStore(preparedFiles, 32, (done, embedTotal, phase) => {
|
|
1360
|
+
if (onProgress) onProgress(done, embedTotal, `embedding`, done, "embed");
|
|
1361
|
+
});
|
|
1362
|
+
|
|
1363
|
+
if (DEBUG) console.log(`[vectorizer] Phase 2 done: ${chunksEmbedded} chunks embedded and stored`);
|
|
1364
|
+
}
|
|
1365
|
+
|
|
1366
|
+
const indexed = prepared; // file count for backward compat
|
|
1367
|
+
|
|
1194
1368
|
// FR-005: Build semantic similarity edges as post-pass
|
|
1195
1369
|
// Disabled by default (O(n²) — slow on large repos). Enable via graph.semantic_edges: true
|
|
1196
1370
|
let semanticEdges = 0;
|
|
1197
|
-
if (
|
|
1371
|
+
if (chunksEmbedded > 0 && this.graphBuilder && this.graphDB && GRAPH_CONFIG.semantic_edges) {
|
|
1198
1372
|
try {
|
|
1199
1373
|
const tableName = "chunks";
|
|
1200
1374
|
const tables = await this.db.tableNames();
|