@neuralsea/workspace-indexer 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +356 -0
- package/dist/chunk-QPQCSCBN.js +2374 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +91 -0
- package/dist/index.d.ts +505 -0
- package/dist/index.js +22 -0
- package/package.json +45 -0
@@ -0,0 +1,2374 @@
// src/vector/factory.ts
import path4 from "path";
import { pathToFileURL } from "url";

// src/util.ts
import crypto from "crypto";
import os from "os";
import path from "path";
function sha256Hex(data) {
  return crypto.createHash("sha256").update(data).digest("hex");
}
function defaultCacheDir() {
  const home = os.homedir();
  return path.join(home, ".cache", "petri");
}
function fromPosixPath(p) {
  return p.split("/").join(path.sep);
}
function estimateTokens(text) {
  return Math.max(1, Math.ceil(text.length / 4));
}
function normalise(vec) {
  let sumSq = 0;
  for (let i = 0; i < vec.length; i++) sumSq += vec[i] * vec[i];
  const norm = Math.sqrt(sumSq) || 1;
  const out = new Float32Array(vec.length);
  for (let i = 0; i < vec.length; i++) out[i] = vec[i] / norm;
  return out;
}
function dot(a, b) {
  const n = Math.min(a.length, b.length);
  let s = 0;
  for (let i = 0; i < n; i++) s += a[i] * b[i];
  return s;
}
function clamp(x, lo, hi) {
  return Math.min(hi, Math.max(lo, x));
}
function makePreview(text, maxLen = 240) {
  const oneLine = text.replace(/\s+/g, " ").trim();
  return oneLine.length <= maxLen ? oneLine : oneLine.slice(0, maxLen - 1) + "\u2026";
}
function nowMs() {
  return Date.now();
}
function recencyScore(fileMtimeMs, halfLifeDays = 14) {
  const ageMs = Math.max(0, nowMs() - fileMtimeMs);
  const halfLifeMs = halfLifeDays * 24 * 60 * 60 * 1e3;
  const score = Math.pow(0.5, ageMs / Math.max(1, halfLifeMs));
  return clamp(score, 0, 1);
}
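// recencyScore applies exponential half-life decay: a file modified
// `halfLifeDays` ago scores 0.5, two half-lives ago 0.25, and a file touched
// just now scores 1.0. Example (illustrative, default 14-day half-life):
//   recencyScore(Date.now() - 14 * 24 * 60 * 60 * 1e3)  // => ~0.5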

// src/vector/bruteforce.ts
var BruteForceVectorIndex = class {
  kind = "bruteforce";
  metric = "cosine";
  dimension = 0;
  ids = [];
  vecs = [];
  pos = /* @__PURE__ */ new Map();
  async init(init) {
    this.metric = init.metric;
    this.dimension = init.dimension;
  }
  async upsert(points) {
    for (const p of points) this.upsertOne(p.id, p.vector);
  }
  upsertOne(id, vec) {
    const v = normalise(vec);
    const existing = this.pos.get(id);
    if (existing !== void 0) {
      this.vecs[existing] = v;
      return;
    }
    this.pos.set(id, this.ids.length);
    this.ids.push(id);
    this.vecs.push(v);
  }
  async remove(ids) {
    for (const id of ids) this.removeOne(id);
  }
  removeOne(id) {
    const i = this.pos.get(id);
    if (i === void 0) return;
    const last = this.ids.length - 1;
    if (i !== last) {
      const lastId = this.ids[last];
      this.ids[i] = lastId;
      this.vecs[i] = this.vecs[last];
      this.pos.set(lastId, i);
    }
    this.ids.pop();
    this.vecs.pop();
    this.pos.delete(id);
  }
  async rebuild(points) {
    this.ids = [];
    this.vecs = [];
    this.pos = /* @__PURE__ */ new Map();
    for (const p of points) this.upsertOne(p.id, p.vector);
  }
  async search(query, k) {
    const q = normalise(query);
    const top = [];
    for (let i = 0; i < this.ids.length; i++) {
      const s = dot(q, this.vecs[i]);
      if (top.length < k) {
        top.push({ id: this.ids[i], score: s });
        top.sort((a, b) => b.score - a.score);
      } else if (s > top[top.length - 1].score) {
        top[top.length - 1] = { id: this.ids[i], score: s };
        top.sort((a, b) => b.score - a.score);
      }
    }
    return top;
  }
  async count() {
    return this.ids.length;
  }
  async flush() {
  }
  async close() {
  }
};
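// The brute-force index keeps normalised vectors in parallel arrays and scans
// them all on every search, so it needs no native dependencies; removal swaps
// the last element into the freed slot. Usage sketch (illustrative; only
// metric and dimension are read by this provider's init):
//   const idx = new BruteForceVectorIndex();
//   await idx.init({ metric: "cosine", dimension: 3 });
//   await idx.upsert([{ id: "a", vector: new Float32Array([1, 0, 0]) }]);
//   await idx.search(new Float32Array([1, 0, 0]), 5);  // => [{ id: "a", score: ~1 }]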

// src/vector/faiss.ts
import fs from "fs";
import path2 from "path";
async function importFaiss() {
  const modName = "faiss-node";
  try {
    const mod = await import(modName);
    const ns = mod?.Index ? mod : mod?.default;
    if (!ns?.Index) throw new Error("faiss-node did not export Index");
    return ns;
  } catch (e) {
    const hint = "To use the 'faiss' provider, install the optional dependency: npm i faiss-node";
    throw new Error(`${String(e?.message ?? e)}
${hint}`);
  }
}
function safeReadJson(p) {
  try {
    return JSON.parse(fs.readFileSync(p, "utf8"));
  } catch {
    return null;
  }
}
var FaissVectorIndex = class {
  kind = "faiss";
  metric = "cosine";
  dimension = 0;
  cfg;
  index = null;
  labelToId = [];
  points = /* @__PURE__ */ new Map();
  dirty = false;
  readyForIncremental = false;
  indexPath = "";
  mapPath = "";
  constructor(cfg = {}) {
    this.cfg = {
      descriptor: cfg.descriptor ?? "HNSW,Flat",
      persist: cfg.persist ?? true,
      persistDebounceMs: cfg.persistDebounceMs ?? 2e3,
      rebuildStrategy: cfg.rebuildStrategy ?? "lazy"
    };
  }
  async init(init) {
    this.metric = init.metric;
    this.dimension = init.dimension;
    const baseDir = path2.join(init.cacheDir, "vector", init.repoId);
    fs.mkdirSync(baseDir, { recursive: true });
    this.indexPath = path2.join(baseDir, `${init.commit}.faiss.idx`);
    this.mapPath = path2.join(baseDir, `${init.commit}.faiss.map.json`);
    if (this.cfg.persist && fs.existsSync(this.indexPath) && fs.existsSync(this.mapPath)) {
      const map = safeReadJson(this.mapPath);
      if (map && map.version === 1 && map.dimension === this.dimension && map.metric === this.metric) {
        const { Index } = await importFaiss();
        this.index = Index.read(this.indexPath);
        this.labelToId = map.labelToId;
      }
    }
  }
  async rebuild(points) {
    this.points = new Map(points.map((p) => [p.id, p.vector]));
    this.readyForIncremental = true;
    await this.rebuildFromPoints();
  }
  async rebuildFromPoints() {
    const { Index, MetricType } = await importFaiss();
    const metric = this.metric === "l2" ? MetricType.METRIC_L2 : MetricType.METRIC_INNER_PRODUCT;
    const idx = Index.fromFactory(this.dimension, this.cfg.descriptor, metric);
    const ids = [];
    const flat = [];
    for (const [id, v] of this.points.entries()) {
      ids.push(id);
      const vec = this.metric === "cosine" ? normalise(v) : v;
      for (let i = 0; i < vec.length; i++) flat.push(vec[i]);
    }
    if (flat.length > 0) {
      if (typeof idx.isTrained === "function" && !idx.isTrained()) {
        idx.train(flat);
      }
      idx.add(flat);
    }
    this.index = idx;
    this.labelToId = ids;
    this.dirty = false;
    if (this.cfg.persist) {
      const map = {
        version: 1,
        dimension: this.dimension,
        metric: this.metric,
        labelToId: ids
      };
      fs.writeFileSync(this.mapPath, JSON.stringify(map));
      idx.write(this.indexPath);
    }
  }
  async upsert(points) {
    if (!this.readyForIncremental) {
      throw new Error("FAISS provider requires an initial rebuild() before incremental writes.");
    }
    for (const p of points) this.points.set(p.id, p.vector);
    this.dirty = true;
    if (this.cfg.rebuildStrategy === "eager") await this.rebuildFromPoints();
  }
  async remove(ids) {
    if (!this.readyForIncremental) {
      throw new Error("FAISS provider requires an initial rebuild() before incremental writes.");
    }
    for (const id of ids) this.points.delete(id);
    this.dirty = true;
    if (this.cfg.rebuildStrategy === "eager") await this.rebuildFromPoints();
  }
  async search(query, k) {
    if (this.dirty && this.cfg.rebuildStrategy === "lazy") {
      await this.rebuildFromPoints();
    }
    if (!this.index) return [];
    const q = this.metric === "cosine" ? Array.from(normalise(query)) : Array.from(query);
    const res = this.index.search(q, k);
    const out = [];
    for (let i = 0; i < res.labels.length; i++) {
      const label = res.labels[i];
      if (label < 0) continue;
      const id = this.labelToId[label];
      if (!id) continue;
      const d = res.distances[i];
      const score = this.metric === "l2" ? -d : d;
      out.push({ id, score });
    }
    out.sort((a, b) => b.score - a.score);
    return out;
  }
  async count() {
    if (this.index && typeof this.index.ntotal === "function") return this.index.ntotal();
    return this.labelToId.length;
  }
  async flush() {
  }
  async close() {
    if (this.dirty && this.cfg.rebuildStrategy === "lazy") {
      try {
        await this.rebuildFromPoints();
      } catch {
      }
    }
    this.index = null;
  }
};
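// The FAISS provider keeps the full point set in memory and rebuilds the
// native index when it changes: "lazy" rebuilds on the next search()/close(),
// "eager" rebuilds on every upsert()/remove(). Incremental writes throw until
// rebuild() has seeded the point set, and the index plus its label map are
// persisted under <cacheDir>/vector/<repoId>/<commit>.faiss.*.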

// src/vector/hnswlib.ts
import fs2 from "fs";
import path3 from "path";
async function importHnswlib() {
  const modName = "hnswlib-node";
  try {
    const mod = await import(modName);
    const ns = mod?.HierarchicalNSW ? mod : mod?.default;
    if (!ns?.HierarchicalNSW) {
      throw new Error("hnswlib-node did not export HierarchicalNSW");
    }
    return ns;
  } catch (e) {
    const hint = "To use the 'hnswlib' provider, install the optional dependency: npm i hnswlib-node";
    throw new Error(`${String(e?.message ?? e)}
${hint}`);
  }
}
function safeReadJson2(p) {
  try {
    return JSON.parse(fs2.readFileSync(p, "utf8"));
  } catch {
    return null;
  }
}
var HnswlibVectorIndex = class {
  kind = "hnswlib";
  metric = "cosine";
  dimension = 0;
  cfg;
  index = null;
  idToLabel = /* @__PURE__ */ new Map();
  labelToId = /* @__PURE__ */ new Map();
  nextLabel = 0;
  indexPath = "";
  mapPath = "";
  dirty = false;
  constructor(cfg = {}) {
    this.cfg = {
      maxElements: cfg.maxElements ?? 5e4,
      m: cfg.m ?? 16,
      efConstruction: cfg.efConstruction ?? 200,
      efSearch: cfg.efSearch ?? 64,
      persist: cfg.persist ?? true,
      persistDebounceMs: cfg.persistDebounceMs ?? 2e3
    };
  }
  async init(init) {
    this.metric = init.metric;
    this.dimension = init.dimension;
    const baseDir = path3.join(init.cacheDir, "vector", init.repoId);
    fs2.mkdirSync(baseDir, { recursive: true });
    this.indexPath = path3.join(baseDir, `${init.commit}.hnsw.dat`);
    this.mapPath = path3.join(baseDir, `${init.commit}.hnsw.map.json`);
    if (this.cfg.persist && fs2.existsSync(this.indexPath) && fs2.existsSync(this.mapPath)) {
      const map = safeReadJson2(this.mapPath);
      if (map && map.version === 1 && map.dimension === this.dimension && map.metric === this.metric) {
        const { HierarchicalNSW } = await importHnswlib();
        const space = this.toHnswSpace(this.metric);
        const idx = new HierarchicalNSW(space, this.dimension);
        idx.readIndexSync(this.indexPath, true);
        if (typeof idx.setEf === "function") idx.setEf(this.cfg.efSearch);
        this.index = idx;
        this.idToLabel = new Map(Object.entries(map.idToLabel).map(([id, label]) => [id, label]));
        this.labelToId = new Map(Array.from(this.idToLabel.entries()).map(([id, label]) => [label, id]));
        this.nextLabel = Math.max(-1, ...Array.from(this.labelToId.keys())) + 1;
        return;
      }
    }
    await this.resetEmpty();
  }
  async resetEmpty() {
    const { HierarchicalNSW } = await importHnswlib();
    const idx = new HierarchicalNSW(this.toHnswSpace(this.metric), this.dimension);
    idx.initIndex({
      maxElements: this.cfg.maxElements,
      m: this.cfg.m,
      efConstruction: this.cfg.efConstruction,
      allowReplaceDeleted: true
    });
    if (typeof idx.setEf === "function") idx.setEf(this.cfg.efSearch);
    this.index = idx;
    this.idToLabel = /* @__PURE__ */ new Map();
    this.labelToId = /* @__PURE__ */ new Map();
    this.nextLabel = 0;
    this.dirty = false;
  }
  async rebuild(points) {
    const desired = Math.max(this.cfg.maxElements, Math.ceil(points.length * 1.1) + 1024);
    const { HierarchicalNSW } = await importHnswlib();
    const idx = new HierarchicalNSW(this.toHnswSpace(this.metric), this.dimension);
    idx.initIndex({
      maxElements: desired,
      m: this.cfg.m,
      efConstruction: this.cfg.efConstruction,
      allowReplaceDeleted: true
    });
    if (typeof idx.setEf === "function") idx.setEf(this.cfg.efSearch);
    this.index = idx;
    this.idToLabel = /* @__PURE__ */ new Map();
    this.labelToId = /* @__PURE__ */ new Map();
    this.nextLabel = 0;
    for (const p of points) {
      const label = this.nextLabel++;
      this.idToLabel.set(p.id, label);
      this.labelToId.set(label, p.id);
      idx.addPoint(Array.from(normalise(p.vector)), label, true);
    }
    this.markDirty();
    await this.flush();
  }
  async upsert(points) {
    if (!this.index) throw new Error("HNSW index not initialised");
    for (const p of points) {
      let label = this.idToLabel.get(p.id);
      if (label === void 0) {
        label = this.nextLabel++;
        this.idToLabel.set(p.id, label);
        this.labelToId.set(label, p.id);
      } else {
        if (typeof this.index.unmarkDelete === "function") {
          try {
            this.index.unmarkDelete(label);
          } catch {
          }
        }
      }
      if (typeof this.index.getMaxElements === "function" && typeof this.index.getCurrentCount === "function") {
        const max = this.index.getMaxElements();
        const cur = this.index.getCurrentCount();
        if (cur + 1 > max && typeof this.index.resizeIndex === "function") {
          this.index.resizeIndex(Math.ceil((cur + 1) * 1.2) + 1024);
        }
      }
      this.index.addPoint(Array.from(normalise(p.vector)), label, true);
    }
    this.markDirty();
  }
  async remove(ids) {
    if (!this.index) throw new Error("HNSW index not initialised");
    for (const id of ids) {
      const label = this.idToLabel.get(id);
      if (label === void 0) continue;
      if (typeof this.index.markDelete === "function") {
        try {
          this.index.markDelete(label);
        } catch {
        }
      }
    }
    this.markDirty();
  }
  async search(query, k) {
    if (!this.index) throw new Error("HNSW index not initialised");
    const q = Array.from(normalise(query));
    const res = this.index.searchKnn(q, k);
    const out = [];
    for (let i = 0; i < res.neighbors.length; i++) {
      const label = res.neighbors[i];
      if (label < 0) continue;
      const id = this.labelToId.get(label);
      if (!id) continue;
      const d = res.distances[i];
      const score = this.metric === "cosine" ? 1 - d : -d;
      out.push({ id, score });
    }
    out.sort((a, b) => b.score - a.score);
    return out;
  }
  async count() {
    if (!this.index) return 0;
    if (typeof this.index.getCurrentCount === "function") return this.index.getCurrentCount();
    return this.idToLabel.size;
  }
  markDirty() {
    if (!this.cfg.persist) return;
    this.dirty = true;
  }
  async flush() {
    if (!this.cfg.persist) return;
    if (!this.dirty) return;
    if (!this.index) return;
    this.dirty = false;
    const map = {
      version: 1,
      dimension: this.dimension,
      metric: this.metric,
      idToLabel: Object.fromEntries(this.idToLabel.entries())
    };
    fs2.writeFileSync(this.mapPath, JSON.stringify(map));
    this.index.writeIndexSync(this.indexPath);
  }
  async close() {
    await this.flush();
    this.index = null;
  }
  toHnswSpace(metric) {
    if (metric === "l2") return "l2";
    if (metric === "ip") return "ip";
    return "cosine";
  }
};
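// hnswlib-node reports distances, so search() converts them to a
// higher-is-better score: cosine distance d becomes 1 - d, while l2/ip
// distances are negated. Deletions only mark labels as removed; flush()
// persists the graph and the id<->label map into the repo cache directory.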

// src/vector/qdrant.ts
import crypto2 from "crypto";
async function importQdrant() {
  const modName = "@qdrant/js-client-rest";
  try {
    const mod = await import(modName);
    const ns = mod?.QdrantClient ? mod : mod?.default;
    if (!ns?.QdrantClient) throw new Error("@qdrant/js-client-rest did not export QdrantClient");
    return ns;
  } catch (e) {
    const hint = "To use the 'qdrant' provider, install the optional dependency: npm i @qdrant/js-client-rest";
    throw new Error(`${String(e?.message ?? e)}
${hint}`);
  }
}
var PETRI_UUID_NAMESPACE = "b0f67f3b-2c75-44b8-9b4d-8f71a8a2f3f2";
function uuidToBytes(uuid) {
  const hex = uuid.replace(/-/g, "");
  if (hex.length !== 32) throw new Error(`Invalid UUID: ${uuid}`);
  const out = new Uint8Array(16);
  for (let i = 0; i < 16; i++) out[i] = parseInt(hex.slice(i * 2, i * 2 + 2), 16);
  return out;
}
function bytesToUuid(b) {
  const hex = Array.from(b).map((x) => x.toString(16).padStart(2, "0")).join("");
  return `${hex.slice(0, 8)}-${hex.slice(8, 12)}-${hex.slice(12, 16)}-${hex.slice(16, 20)}-${hex.slice(20)}`;
}
function uuidv5(name, namespace) {
  const ns = uuidToBytes(namespace);
  const nameBytes = Buffer.from(name, "utf8");
  const hash = crypto2.createHash("sha1").update(ns).update(nameBytes).digest();
  const out = new Uint8Array(hash.subarray(0, 16));
  out[6] = out[6] & 15 | 80;
  out[8] = out[8] & 63 | 128;
  return bytesToUuid(out);
}
function sanitizeCollection(s) {
  return s.toLowerCase().replace(/[^a-z0-9_\-]/g, "_").replace(/_+/g, "_").replace(/^_+|_+$/g, "").slice(0, 60);
}
var QdrantVectorIndex = class {
  kind = "qdrant";
  metric = "cosine";
  dimension = 0;
  cfg;
  client = null;
  collection = "";
  commit = "";
  collectionMode = "commit";
  constructor(cfg = {}) {
    this.cfg = {
      url: cfg.url ?? "",
      host: cfg.host ?? "localhost",
      port: cfg.port ?? 6333,
      apiKey: cfg.apiKey ?? "",
      collectionPrefix: cfg.collectionPrefix ?? "petri",
      collectionMode: cfg.collectionMode ?? "commit",
      recreateOnRebuild: cfg.recreateOnRebuild ?? true
    };
  }
  async init(init) {
    this.metric = init.metric;
    this.dimension = init.dimension;
    this.commit = init.commit;
    this.collectionMode = this.cfg.collectionMode;
    const { QdrantClient } = await importQdrant();
    const args = this.cfg.url ? { url: this.cfg.url, apiKey: this.cfg.apiKey || void 0 } : { host: this.cfg.host, port: this.cfg.port, apiKey: this.cfg.apiKey || void 0 };
    this.client = new QdrantClient(args);
    const repoPart = sanitizeCollection(init.repoId);
    const metricPart = this.metric === "cosine" ? "cos" : this.metric;
    const dimPart = String(this.dimension);
    const commitPart = init.commit.slice(0, 8);
    const prefix = sanitizeCollection(this.cfg.collectionPrefix);
    this.collection = this.collectionMode === "commit" ? sanitizeCollection(`${prefix}_${repoPart}_${commitPart}_${dimPart}_${metricPart}`) : sanitizeCollection(`${prefix}_${repoPart}_${dimPart}_${metricPart}`);
    await this.ensureCollection();
  }
  pointId(chunkId) {
    const name = this.collectionMode === "commit" ? chunkId : `${this.commit}:${chunkId}`;
    return uuidv5(name, PETRI_UUID_NAMESPACE);
  }
  commitFilter() {
    return {
      must: [{ key: "commit", match: { value: this.commit } }]
    };
  }
  distanceName(metric) {
    if (metric === "l2") return "Euclid";
    if (metric === "ip") return "Dot";
    return "Cosine";
  }
  async ensureCollection() {
    if (!this.client) throw new Error("Qdrant client not initialised");
    const existing = await this.client.getCollections();
    const names = (existing?.collections ?? existing?.result?.collections ?? []).map((c) => c.name);
    if (names.includes(this.collection)) return;
    await this.client.createCollection(this.collection, {
      vectors: { size: this.dimension, distance: this.distanceName(this.metric) }
    });
  }
  async rebuild(points) {
    if (!this.client) throw new Error("Qdrant client not initialised");
    if (this.collectionMode === "commit") {
      if (this.cfg.recreateOnRebuild) {
        try {
          await this.client.deleteCollection(this.collection);
        } catch {
        }
      } else {
        try {
          await this.client.delete(this.collection, { filter: {} });
        } catch {
        }
      }
      await this.ensureCollection();
    } else {
      try {
        await this.client.delete(this.collection, { filter: this.commitFilter() });
      } catch {
      }
    }
    const batchSize = 256;
    for (let i = 0; i < points.length; i += batchSize) {
      const batch = points.slice(i, i + batchSize);
      await this.upsert(batch);
    }
  }
  async upsert(points) {
    if (!this.client) throw new Error("Qdrant client not initialised");
    if (points.length === 0) return;
    const qPoints = points.map((p) => {
      const payload = { ...p.payload ?? {}, cid: p.id };
      if (this.collectionMode === "repo") payload.commit = this.commit;
      return {
        id: this.pointId(p.id),
        vector: Array.from(normalise(p.vector)),
        payload
      };
    });
    await this.client.upsert(this.collection, { points: qPoints });
  }
  async remove(ids) {
    if (!this.client) throw new Error("Qdrant client not initialised");
    if (ids.length === 0) return;
    const pointIds = ids.map((id) => this.pointId(id));
    await this.client.delete(this.collection, { points: pointIds });
  }
  async search(query, k) {
    if (!this.client) throw new Error("Qdrant client not initialised");
    const q = Array.from(normalise(query));
    const req = {
      query: q,
      limit: k,
      with_payload: true
    };
    if (this.collectionMode === "repo") req.filter = this.commitFilter();
    let res;
    if (typeof this.client.query === "function") {
      res = await this.client.query(this.collection, req);
    } else if (typeof this.client.search === "function") {
      res = await this.client.search(this.collection, { vector: q, limit: k, with_payload: true, filter: req.filter });
    } else {
      throw new Error("Qdrant client missing query/search method");
    }
    const hits = res?.points ?? res?.result ?? res ?? [];
    const out = [];
    for (const h of hits) {
      const payload = h.payload ?? h?.payloads;
      const cid = payload?.cid;
      if (!cid) continue;
      const score = typeof h.score === "number" ? h.score : typeof h?.result?.score === "number" ? h.result.score : 0;
      out.push({ id: String(cid), score });
    }
    out.sort((a, b) => b.score - a.score);
    return out;
  }
  async count() {
    if (!this.client) return 0;
    try {
      if (this.collectionMode === "repo" && typeof this.client.count === "function") {
        const res = await this.client.count(this.collection, { filter: this.commitFilter(), exact: false });
        return Number(res?.count ?? res?.result?.count ?? 0);
      }
      const info = await this.client.getCollection(this.collection);
      const c = info?.points_count ?? info?.result?.points_count;
      if (typeof c === "number") return c;
    } catch {
    }
    return 0;
  }
  async flush() {
  }
  async close() {
    this.client = null;
  }
};
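// Qdrant point ids are deterministic UUIDv5 values derived from the chunk id
// (plus the commit when collectionMode is "repo"), so re-upserting the same
// chunk overwrites its point instead of duplicating it. The original chunk id
// travels in the payload as `cid` and is recovered on search.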

// src/vector/factory.ts
function isVectorIndex(obj) {
  return obj && typeof obj.init === "function" && typeof obj.rebuild === "function" && typeof obj.upsert === "function" && typeof obj.remove === "function" && typeof obj.search === "function" && typeof obj.count === "function" && typeof obj.flush === "function" && typeof obj.close === "function";
}
async function createVectorIndex(vector) {
  const provider = vector?.provider ?? "bruteforce";
  if (provider === "bruteforce") return new BruteForceVectorIndex();
  if (provider === "hnswlib") return new HnswlibVectorIndex(vector?.hnswlib);
  if (provider === "qdrant") return new QdrantVectorIndex(vector?.qdrant);
  if (provider === "faiss") return new FaissVectorIndex(vector?.faiss);
  if (provider === "custom") {
    const c = vector?.custom;
    if (!c?.module) {
      throw new Error("vector.provider=custom requires vector.custom.module");
    }
    const abs = path4.isAbsolute(c.module) ? c.module : path4.resolve(process.cwd(), c.module);
    const url = pathToFileURL(abs).href;
    const mod = await import(url);
    const expName = c.export ?? "default";
    const exp = mod[expName] ?? mod.default ?? mod;
    const instance = typeof exp === "function" ? (
      // If it's a class, `new` works; if it's a factory function, it should return a VectorIndex.
      (() => {
        try {
          return new exp(c.options);
        } catch {
          return exp(c.options);
        }
      })()
    ) : exp;
    if (!isVectorIndex(instance)) {
      throw new Error(
        `Custom vector provider '${abs}' export '${expName}' did not produce a VectorIndex implementation.`
      );
    }
    return instance;
  }
  if (provider === "auto") {
    if (vector?.qdrant?.url || vector?.qdrant?.host) {
      try {
        const q = "@qdrant/js-client-rest";
        await import(q);
        return new QdrantVectorIndex(vector.qdrant);
      } catch {
      }
    }
    try {
      const modName = "hnswlib-node";
      await import(modName);
      return new HnswlibVectorIndex(vector?.hnswlib);
    } catch {
      return new BruteForceVectorIndex();
    }
  }
  return new BruteForceVectorIndex();
}
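// Provider selection sketch (illustrative values): "auto" prefers Qdrant when
// a url/host is configured and the client is installed, then hnswlib-node,
// and falls back to the dependency-free brute-force index.
//   const index = await createVectorIndex({ provider: "auto", hnswlib: { efSearch: 64 } });
//   await index.init({ metric: "cosine", dimension: 384, cacheDir, repoId, commit });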

// src/indexer/repoIndexer.ts
import fs6 from "fs";
import path8 from "path";
import chokidar from "chokidar";
import pLimit from "p-limit";

// src/git.ts
import { execFile } from "child_process";
import { promisify } from "util";
var execFileAsync = promisify(execFile);
async function git(repoRoot, args) {
  const { stdout } = await execFileAsync("git", args, { cwd: repoRoot, encoding: "utf8" });
  return stdout;
}
async function getHeadCommit(repoRoot) {
  return (await git(repoRoot, ["rev-parse", "HEAD"])).trim();
}
async function getBranchName(repoRoot) {
  return (await git(repoRoot, ["rev-parse", "--abbrev-ref", "HEAD"])).trim();
}
async function listWorkingFiles(repoRoot) {
  const out = await git(repoRoot, ["ls-files", "-z", "--cached", "--others", "--exclude-standard"]);
  const parts = out.split("\0").map((s) => s.trim()).filter(Boolean);
  const seen = /* @__PURE__ */ new Set();
  const files = [];
  for (const p of parts) {
    if (!seen.has(p)) {
      seen.add(p);
      files.push(p);
    }
  }
  return files;
}
async function listChangedFiles(repoRoot, baseRef = "HEAD~1") {
  const out = await git(repoRoot, ["diff", "--name-only", "-z", `${baseRef}...HEAD`]);
  return out.split("\0").map((s) => s.trim()).filter(Boolean);
}
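// The git helpers shell out via execFile: listWorkingFiles() combines tracked
// and untracked-but-not-ignored paths (NUL-separated so paths with spaces
// survive) and de-duplicates them, while listChangedFiles() diffs
// `<baseRef>...HEAD` to find files touched since a base revision.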

// src/ignore.ts
import fs3 from "fs";
import path5 from "path";
import ignore from "ignore";
function loadExtraIgnore(repoRoot, ignoreFiles) {
  const ig = ignore();
  for (const name of ignoreFiles) {
    const p = path5.join(repoRoot, name);
    if (!fs3.existsSync(p)) continue;
    const raw = fs3.readFileSync(p, "utf8");
    ig.add(raw.split(/\r?\n/));
  }
  return (posixRelPath) => ig.ignores(posixRelPath);
}
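// Extra ignore files are layered on top of git's own exclusions. Example
// (illustrative): with a `.petriignore` containing `dist/`, the returned
// predicate reports loadExtraIgnore(root, [".petriignore"])("dist/index.js") === true.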

// src/optionalTypescript.ts
import { createRequire } from "module";
var cached = null;
var didTryLoad = false;
function getTypeScript() {
  if (didTryLoad) return cached;
  try {
    const require2 = createRequire(import.meta.url);
    cached = require2("typescript");
    didTryLoad = true;
    return cached;
  } catch {
    cached = null;
    didTryLoad = true;
    return null;
  }
}

// src/chunker.ts
function languageFromPath(posixPath) {
  const ext = posixPath.toLowerCase().split(".").pop() ?? "";
  if (["ts", "tsx"].includes(ext)) return "typescript";
  if (["js", "jsx", "mjs", "cjs"].includes(ext)) return "javascript";
  if (["py"].includes(ext)) return "python";
  if (["go"].includes(ext)) return "go";
  if (["rs"].includes(ext)) return "rust";
  if (["java"].includes(ext)) return "java";
  if (["kt", "kts"].includes(ext)) return "kotlin";
  if (["cs"].includes(ext)) return "csharp";
  if (["md"].includes(ext)) return "markdown";
  if (["json", "yaml", "yml", "toml"].includes(ext)) return "config";
  return "text";
}
function chunkByLines(sourceText, cfg) {
  const lines = sourceText.split(/\r?\n/);
  const out = [];
  const step = Math.max(1, cfg.maxLines - cfg.overlapLines);
  for (let start = 0; start < lines.length; start += step) {
    const end = Math.min(lines.length, start + cfg.maxLines);
    const text = lines.slice(start, end).join("\n");
    if (!text.trim()) continue;
    out.push({
      startLine: start + 1,
      endLine: end,
      text,
      contentHash: sha256Hex(text),
      tokens: estimateTokens(text)
    });
  }
  return out;
}
function splitIfTooLarge(ch, cfg) {
  if (ch.text.length <= cfg.maxChars) return [ch];
  return chunkByLines(ch.text, cfg).map((sub) => ({
    ...sub,
    startLine: ch.startLine + (sub.startLine - 1),
    endLine: ch.startLine + (sub.endLine - 1)
  }));
}
function isTopLevelChunkable(tsMod, stmt) {
  return tsMod.isFunctionDeclaration(stmt) || tsMod.isClassDeclaration(stmt) || tsMod.isInterfaceDeclaration(stmt) || tsMod.isTypeAliasDeclaration(stmt) || tsMod.isEnumDeclaration(stmt) || tsMod.isModuleDeclaration(stmt) || tsMod.isVariableStatement(stmt) || tsMod.isExportAssignment(stmt) || tsMod.isExportDeclaration(stmt);
}
function chunkTypeScriptLike(sourceText, virtualFileName, cfg) {
  const tsMod = getTypeScript();
  if (!tsMod) return chunkByLines(sourceText, cfg);
  const sf = tsMod.createSourceFile(
    virtualFileName,
    sourceText,
    tsMod.ScriptTarget.Latest,
    true
  );
  const chunks = [];
  for (const stmt of sf.statements) {
    if (!isTopLevelChunkable(tsMod, stmt)) continue;
    const startPos = stmt.getFullStart();
    const endPos = stmt.end;
    const startLC = tsMod.getLineAndCharacterOfPosition(sf, startPos);
    const endLC = tsMod.getLineAndCharacterOfPosition(sf, endPos);
    const text = sourceText.slice(startPos, endPos).trimEnd();
    if (!text.trim()) continue;
    const base = {
      startLine: startLC.line + 1,
      endLine: endLC.line + 1,
      text,
      contentHash: sha256Hex(text),
      tokens: estimateTokens(text)
    };
    chunks.push(...splitIfTooLarge(base, cfg));
  }
  if (chunks.length === 0) return chunkByLines(sourceText, cfg);
  return chunks;
}
function chunkSource(posixPath, sourceText, cfg) {
  const language = languageFromPath(posixPath);
  if (language === "typescript" || language === "javascript") {
    return { language, chunks: chunkTypeScriptLike(sourceText, posixPath, cfg) };
  }
  return { language, chunks: chunkByLines(sourceText, cfg) };
}
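// Chunking strategy: TypeScript/JavaScript files are split per top-level
// statement using the optional `typescript` compiler (falling back to line
// windows when it is not installed); everything else uses fixed windows of
// cfg.maxLines with cfg.overlapLines of overlap. Example (illustrative):
// maxLines 240 / overlapLines 40 gives windows covering lines 1-240,
// 201-440, 401-640, and so on.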

// src/relations.ts
function extractTsRelations(virtualFileName, sourceText) {
  const tsMod = getTypeScript();
  if (!tsMod) return { imports: [], exports: [] };
  const sf = tsMod.createSourceFile(virtualFileName, sourceText, tsMod.ScriptTarget.Latest, true);
  const imports = [];
  const exports = [];
  const isExported2 = (node) => {
    if (!tsMod.canHaveModifiers(node)) return false;
    const mods = tsMod.getModifiers(node);
    return !!mods?.some((m) => m.kind === tsMod.SyntaxKind.ExportKeyword);
  };
  for (const stmt of sf.statements) {
    if (tsMod.isImportDeclaration(stmt) && tsMod.isStringLiteral(stmt.moduleSpecifier)) {
      imports.push(stmt.moduleSpecifier.text);
    }
    if (tsMod.isExportDeclaration(stmt)) {
      if (stmt.exportClause && tsMod.isNamedExports(stmt.exportClause)) {
        for (const el of stmt.exportClause.elements) exports.push(el.name.text);
      } else {
        exports.push("*");
      }
    }
    if (tsMod.isFunctionDeclaration(stmt) && isExported2(stmt) && stmt.name) {
      exports.push(stmt.name.text);
    }
    if (tsMod.isClassDeclaration(stmt) && isExported2(stmt) && stmt.name) {
      exports.push(stmt.name.text);
    }
    if (tsMod.isVariableStatement(stmt) && isExported2(stmt)) {
      for (const decl of stmt.declarationList.declarations) {
        if (tsMod.isIdentifier(decl.name)) exports.push(decl.name.text);
      }
    }
  }
  return { imports, exports };
}

// src/synopsis.ts
function isExported(tsMod, node) {
  if (!tsMod.canHaveModifiers(node)) return false;
  const mods = tsMod.getModifiers(node);
  return !!mods?.some((m) => m.kind === tsMod.SyntaxKind.ExportKeyword);
}
function leadingCommentExcerpt(sourceText, maxChars = 600) {
  const t = sourceText.trimStart();
  const block = t.match(/^\/\*[\s\S]*?\*\//);
  if (block) return block[0].slice(0, maxChars);
  const lines = t.split(/\r?\n/);
  const commentLines = [];
  for (const line of lines) {
    const m = line.match(/^\s*\/\/(.*)$/);
    if (!m) break;
    commentLines.push(line.trim());
    if (commentLines.join("\n").length >= maxChars) break;
  }
  if (commentLines.length) return commentLines.join("\n").slice(0, maxChars);
  return "";
}
function uniq(xs) {
  const out = [];
  const s = /* @__PURE__ */ new Set();
  for (const x of xs) {
    if (!x) continue;
    if (s.has(x)) continue;
    s.add(x);
    out.push(x);
  }
  return out;
}
function renderGenericSynopsis(input) {
  const lead = leadingCommentExcerpt(input.sourceText);
  const parts = [];
  parts.push(`File synopsis`);
  parts.push(`path: ${input.posixPath}`);
  parts.push(`language: ${input.language}`);
  if (lead) parts.push(`comment: ${lead.replace(/\s+/g, " ").trim()}`);
  return parts.join("\n").slice(0, 1200);
}
function buildTsLikeSynopsis(input) {
  const tsMod = getTypeScript();
  if (!tsMod) return renderGenericSynopsis(input);
  const lead = leadingCommentExcerpt(input.sourceText);
  const sf = tsMod.createSourceFile(input.posixPath, input.sourceText, tsMod.ScriptTarget.Latest, true);
  const imports = [];
  const exports = [];
  const topLevel = [];
  for (const stmt of sf.statements) {
    if (tsMod.isImportDeclaration(stmt) && tsMod.isStringLiteral(stmt.moduleSpecifier)) {
      imports.push(stmt.moduleSpecifier.text);
    }
    const isExportedStmt = isExported(tsMod, stmt);
    if (tsMod.isFunctionDeclaration(stmt) && stmt.name) topLevel.push(`fn ${stmt.name.text}${isExportedStmt ? " (export)" : ""}`);
    if (tsMod.isClassDeclaration(stmt) && stmt.name) topLevel.push(`class ${stmt.name.text}${isExportedStmt ? " (export)" : ""}`);
    if (tsMod.isInterfaceDeclaration(stmt) && stmt.name) topLevel.push(`interface ${stmt.name.text}${isExportedStmt ? " (export)" : ""}`);
    if (tsMod.isTypeAliasDeclaration(stmt) && stmt.name) topLevel.push(`type ${stmt.name.text}${isExportedStmt ? " (export)" : ""}`);
    if (tsMod.isEnumDeclaration(stmt) && stmt.name) topLevel.push(`enum ${stmt.name.text}${isExportedStmt ? " (export)" : ""}`);
    if (tsMod.isModuleDeclaration(stmt) && stmt.name && tsMod.isIdentifier(stmt.name)) topLevel.push(`namespace ${stmt.name.text}${isExportedStmt ? " (export)" : ""}`);
    if (tsMod.isVariableStatement(stmt)) {
      const isExp = isExported(tsMod, stmt);
      for (const decl of stmt.declarationList.declarations) {
        if (tsMod.isIdentifier(decl.name)) topLevel.push(`var ${decl.name.text}${isExp ? " (export)" : ""}`);
      }
    }
    if (tsMod.isExportDeclaration(stmt)) {
      if (stmt.exportClause && tsMod.isNamedExports(stmt.exportClause)) {
        for (const el of stmt.exportClause.elements) exports.push(el.name.text);
      } else {
        exports.push("*");
      }
    }
    if ((tsMod.isFunctionDeclaration(stmt) || tsMod.isClassDeclaration(stmt)) && isExportedStmt && stmt.name) {
      exports.push(stmt.name.text);
    }
  }
  const importsU = uniq(imports).slice(0, 30);
  const exportsU = uniq(exports).slice(0, 40);
  const topU = uniq(topLevel).slice(0, 60);
  const parts = [];
  parts.push(`File synopsis`);
  parts.push(`path: ${input.posixPath}`);
  parts.push(`language: ${input.language}`);
  if (importsU.length) parts.push(`imports: ${importsU.join(", ")}`);
  if (exportsU.length) parts.push(`exports: ${exportsU.join(", ")}`);
  if (topU.length) parts.push(`top-level: ${topU.join(" | ")}`);
  if (lead) parts.push(`comment: ${lead.replace(/\s+/g, " ").trim()}`);
  return parts.join("\n").slice(0, 1800);
}
var builtInStrategies = [
  {
    name: "ts-like",
    supports: ({ language }) => language === "typescript" || language === "javascript",
    build: buildTsLikeSynopsis
  },
  {
    name: "markdown",
    supports: ({ language }) => language === "markdown",
    build: (input) => {
      const lead = leadingCommentExcerpt(input.sourceText);
      const parts = [];
      parts.push(`File synopsis`);
      parts.push(`path: ${input.posixPath}`);
      parts.push(`language: ${input.language}`);
      if (lead) parts.push(`comment: ${lead.replace(/\s+/g, " ").trim()}`);
      const headings = input.sourceText.split(/\r?\n/).filter((l) => /^#{1,3}\s+/.test(l)).slice(0, 12);
      if (headings.length) parts.push(`headings: ${headings.map((h) => h.replace(/\s+/g, " ").trim()).join(" | ")}`);
      return parts.join("\n").slice(0, 1200);
    }
  },
  {
    name: "generic",
    supports: () => true,
    build: renderGenericSynopsis
  }
];
var customStrategies = [];
function buildSynopsis(posixPath, language, sourceText) {
  const input = { posixPath, language, sourceText };
  const all = [...customStrategies, ...builtInStrategies];
  const strategy = all.find((s) => s.supports(input)) ?? builtInStrategies[builtInStrategies.length - 1];
  try {
    return strategy.build(input);
  } catch {
    return renderGenericSynopsis(input);
  }
}
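// buildSynopsis() renders a compact plain-text digest of a file; for TS/JS
// sources the output looks roughly like (illustrative):
//   File synopsis
//   path: src/util.ts
//   language: typescript
//   imports: crypto, os, path
//   top-level: fn sha256Hex | fn clamp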

// src/store/embeddingCache.ts
import fs4 from "fs";
import path6 from "path";
import Database from "better-sqlite3";
var EmbeddingCache = class {
  db;
  constructor(cacheFilePath) {
    fs4.mkdirSync(path6.dirname(cacheFilePath), { recursive: true });
    this.db = new Database(cacheFilePath);
    this.db.pragma("journal_mode = WAL");
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS embedding_cache (
        provider_id TEXT NOT NULL,
        content_hash TEXT NOT NULL,
        embedding BLOB NOT NULL,
        dim INTEGER NOT NULL,
        created_at INTEGER NOT NULL,
        PRIMARY KEY(provider_id, content_hash)
      );
    `);
  }
  get(providerId, contentHash) {
    const row = this.db.prepare(
      `SELECT embedding, dim FROM embedding_cache WHERE provider_id = ? AND content_hash = ?`
    ).get(providerId, contentHash);
    if (!row) return null;
    const buf = row.embedding;
    const view = new Float32Array(buf.buffer, buf.byteOffset, buf.byteLength / 4);
    return new Float32Array(view);
  }
  put(providerId, contentHash, embedding) {
    const buf = Buffer.from(embedding.buffer, embedding.byteOffset, embedding.byteLength);
    this.db.prepare(
      `INSERT OR REPLACE INTO embedding_cache(provider_id, content_hash, embedding, dim, created_at)
       VALUES (?, ?, ?, ?, ?)`
    ).run(providerId, contentHash, buf, embedding.length, Date.now());
  }
  close() {
    this.db.close();
  }
};
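// Embeddings are cached by (provider_id, content_hash), so chunks whose text
// has not changed are never re-embedded across runs. Vectors are stored as raw
// Float32 bytes in a BLOB column and copied into a fresh Float32Array on read.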

// src/store/repoStore.ts
import fs5 from "fs";
import path7 from "path";
import Database2 from "better-sqlite3";
var RepoStore = class {
  db;
  constructor(dbPath) {
    fs5.mkdirSync(path7.dirname(dbPath), { recursive: true });
    this.db = new Database2(dbPath);
    this.db.pragma("journal_mode = WAL");
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS meta (
        k TEXT PRIMARY KEY,
        v TEXT NOT NULL
      );

      CREATE TABLE IF NOT EXISTS files (
        path TEXT PRIMARY KEY,
        hash TEXT NOT NULL,
        mtime INTEGER NOT NULL,
        language TEXT NOT NULL,
        size INTEGER NOT NULL
      );

      CREATE TABLE IF NOT EXISTS chunks (
        id TEXT PRIMARY KEY,
        path TEXT NOT NULL,
        language TEXT NOT NULL,
        kind TEXT NOT NULL DEFAULT 'chunk',
        start_line INTEGER NOT NULL,
        end_line INTEGER NOT NULL,
        content_hash TEXT NOT NULL,
        tokens INTEGER NOT NULL,
        file_mtime INTEGER NOT NULL,
        text TEXT NOT NULL,
        embedding BLOB NOT NULL
      );

      CREATE INDEX IF NOT EXISTS idx_chunks_path ON chunks(path);
      CREATE INDEX IF NOT EXISTS idx_chunks_kind_path ON chunks(kind, path);

      CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
        id UNINDEXED,
        path,
        language,
        kind,
        text,
        tokenize='unicode61'
      );

      CREATE TABLE IF NOT EXISTS edges (
        from_path TEXT NOT NULL,
        kind TEXT NOT NULL,
        value TEXT NOT NULL,
        PRIMARY KEY(from_path, kind, value)
      );

      CREATE INDEX IF NOT EXISTS idx_edges_from ON edges(from_path);
    `);
    if (this.getMeta("storeVersion") === null) this.setMeta("storeVersion", "0");
  }
  /** Monotonically increases whenever the chunk-store is mutated. */
  getStoreVersion() {
    const v = this.getMeta("storeVersion");
    return v ? Number(v) : 0;
  }
  /** Internal: bump store version (call inside the same transaction that mutates chunks). */
  bumpStoreVersion() {
    const next = this.getStoreVersion() + 1;
    this.setMeta("storeVersion", String(next));
  }
  /** Vector index sync marker (per backend kind). */
  getVectorIndexVersion(kind) {
    const v = this.getMeta(`vector.${kind}.storeVersion`);
    return v ? Number(v) : 0;
  }
  setVectorIndexVersion(kind, storeVersion) {
    this.setMeta(`vector.${kind}.storeVersion`, String(storeVersion));
  }
  setMeta(k, v) {
    this.db.prepare(`INSERT OR REPLACE INTO meta(k, v) VALUES (?, ?)`).run(k, v);
  }
  getMeta(k) {
    const row = this.db.prepare(`SELECT v FROM meta WHERE k = ?`).get(k);
    return row?.v ?? null;
  }
  getFileHash(posixPath) {
    const row = this.db.prepare(`SELECT hash FROM files WHERE path = ?`).get(posixPath);
    return row?.hash ?? null;
  }
  getFileMtime(posixPath) {
    const row = this.db.prepare(`SELECT mtime FROM files WHERE path = ?`).get(posixPath);
    return row?.mtime ?? null;
  }
  upsertFile(posixPath, hash, mtime, language, size) {
    this.db.prepare(`
      INSERT INTO files(path, hash, mtime, language, size)
      VALUES (?, ?, ?, ?, ?)
      ON CONFLICT(path) DO UPDATE SET
        hash = excluded.hash,
        mtime = excluded.mtime,
        language = excluded.language,
        size = excluded.size
    `).run(posixPath, hash, mtime, language, size);
  }
  deleteFile(posixPath) {
    const tx = this.db.transaction(() => {
      this.db.prepare(`DELETE FROM chunks WHERE path = ?`).run(posixPath);
      this.db.prepare(`DELETE FROM chunks_fts WHERE path = ?`).run(posixPath);
      this.db.prepare(`DELETE FROM edges WHERE from_path = ?`).run(posixPath);
      this.db.prepare(`DELETE FROM files WHERE path = ?`).run(posixPath);
      this.bumpStoreVersion();
    });
    tx();
  }
  replaceChunksForFile(posixPath, rows) {
    const tx = this.db.transaction(() => {
      this.db.prepare(`DELETE FROM chunks WHERE path = ?`).run(posixPath);
      this.db.prepare(`DELETE FROM chunks_fts WHERE path = ?`).run(posixPath);
      const ins = this.db.prepare(`
        INSERT INTO chunks(id, path, language, kind, start_line, end_line, content_hash, tokens, file_mtime, text, embedding)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
      `);
      const insFts = this.db.prepare(`
        INSERT INTO chunks_fts(id, path, language, kind, text)
        VALUES (?, ?, ?, ?, ?)
      `);
      for (const r of rows) {
        const buf = Buffer.from(r.embedding.buffer, r.embedding.byteOffset, r.embedding.byteLength);
        ins.run(r.id, posixPath, r.language, r.kind, r.startLine, r.endLine, r.contentHash, r.tokens, r.fileMtime, r.text, buf);
        insFts.run(r.id, posixPath, r.language, r.kind, r.ftsText);
      }
      this.bumpStoreVersion();
    });
    tx();
  }
  setEdges(fromPath, kind, values) {
    const tx = this.db.transaction(() => {
      this.db.prepare(`DELETE FROM edges WHERE from_path = ? AND kind = ?`).run(fromPath, kind);
      const ins = this.db.prepare(`INSERT OR REPLACE INTO edges(from_path, kind, value) VALUES (?, ?, ?)`);
      for (const v of values) ins.run(fromPath, kind, v);
    });
    tx();
  }
  listEdges(fromPath, kind) {
    const rows = this.db.prepare(`SELECT value FROM edges WHERE from_path = ? AND kind = ?`).all(fromPath, kind);
    return rows.map((r) => r.value);
  }
  listAllFiles() {
    const rows = this.db.prepare(`SELECT path FROM files`).all();
    return rows.map((r) => r.path);
  }
  countChunks() {
    const row = this.db.prepare(`SELECT COUNT(*) AS c FROM chunks`).get();
    return row.c;
  }
  /**
   * Returns the embedding dimension if any chunks exist, otherwise null.
   * Efficient (doesn't load all embeddings).
   */
  getAnyEmbeddingDimension() {
    const row = this.db.prepare(`SELECT embedding FROM chunks LIMIT 1`).get();
    if (!row) return null;
    return Math.floor(row.embedding.byteLength / 4);
  }
  loadAllChunkEmbeddings() {
    const rows = this.db.prepare(`SELECT id, embedding FROM chunks`).all();
    return rows.map((r) => {
      const view = new Float32Array(r.embedding.buffer, r.embedding.byteOffset, r.embedding.byteLength / 4);
      return { id: r.id, embedding: new Float32Array(view) };
    });
  }
  getChunkById(id) {
    const row = this.db.prepare(`SELECT * FROM chunks WHERE id = ?`).get(id);
    return row ?? null;
  }
  listChunksForFile(posixPath, kind) {
    if (kind) {
      return this.db.prepare(`SELECT id, start_line, end_line, kind FROM chunks WHERE path = ? AND kind = ? ORDER BY start_line ASC`).all(posixPath, kind);
    }
    return this.db.prepare(`SELECT id, start_line, end_line, kind FROM chunks WHERE path = ? ORDER BY kind DESC, start_line ASC`).all(posixPath);
  }
  /**
   * Best-effort lexical search using SQLite FTS5.
   * Returns ids with bm25 values (lower is better).
   */
  searchFts(ftq, limit, includePaths) {
    try {
      if (includePaths && includePaths.length > 0) {
        const placeholders = includePaths.map(() => "?").join(", ");
        const sql2 = `
          SELECT id, bm25(chunks_fts) AS bm25
          FROM chunks_fts
          WHERE chunks_fts MATCH ? AND path IN (${placeholders})
          ORDER BY bm25 ASC
          LIMIT ?
        `;
        const args = [ftq, ...includePaths, limit];
        return this.db.prepare(sql2).all(...args);
      }
      const sql = `
        SELECT id, bm25(chunks_fts) AS bm25
        FROM chunks_fts
        WHERE chunks_fts MATCH ?
        ORDER BY bm25 ASC
        LIMIT ?
      `;
      return this.db.prepare(sql).all(ftq, limit);
    } catch {
      return [];
    }
  }
  close() {
    this.db.close();
  }
};
|
|
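The store methods above read chunk embeddings back as Float32Array views over SQLite BLOB columns (four bytes per component, hence byteLength / 4). As an illustrative sketch, not part of the package, this is the round-trip the schema assumes:

// Illustrative sketch (not part of the package): a Float32 embedding
// round-tripped through a BLOB column, matching getAnyEmbeddingDimension()
// and loadAllChunkEmbeddings() above.
const sampleVec = Float32Array.from([0.1, 0.2, 0.3]);
const blob = Buffer.from(sampleVec.buffer, sampleVec.byteOffset, sampleVec.byteLength); // value written to the column
// ...later, reading the row back:
const view = new Float32Array(blob.buffer, blob.byteOffset, blob.byteLength / 4);
const restored = new Float32Array(view);   // detached copy, as in loadAllChunkEmbeddings
console.log(restored.length);              // 3 === blob.byteLength / 4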
1307
|
+
|
|
1308
|
+
// src/retrieval/tokens.ts
|
|
1309
|
+
function uniq2(xs) {
|
|
1310
|
+
const out = [];
|
|
1311
|
+
const s = /* @__PURE__ */ new Set();
|
|
1312
|
+
for (const x of xs) {
|
|
1313
|
+
if (!x) continue;
|
|
1314
|
+
if (s.has(x)) continue;
|
|
1315
|
+
s.add(x);
|
|
1316
|
+
out.push(x);
|
|
1317
|
+
}
|
|
1318
|
+
return out;
|
|
1319
|
+
}
|
|
1320
|
+
function extractLexicalTokens(text, maxTokens = 2500) {
|
|
1321
|
+
const toks = text.match(/[A-Za-z_][A-Za-z0-9_]{1,}/g) ?? [];
|
|
1322
|
+
const u = uniq2(toks).slice(0, maxTokens);
|
|
1323
|
+
return u.join(" ");
|
|
1324
|
+
}
|
|
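For the "tokens" FTS mode used later in the indexer, extractLexicalTokens keeps only deduplicated identifier-like tokens instead of full chunk text. A quick illustration of its output:

// Illustration of extractLexicalTokens (defined above):
// identifier-like tokens of length >= 2, deduplicated, capped at maxTokens.
const tokensOut = extractLexicalTokens("const foo = foo + barBaz; // x");
console.log(tokensOut); // "const foo barBaz"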
1325
|
+
|
|
1326
|
+
// src/retrieval/fts.ts
|
|
1327
|
+
function ftsQueryFromText(input) {
|
|
1328
|
+
const tokens = (input.match(/[A-Za-z0-9_]{2,}/g) ?? []).slice(0, 24);
|
|
1329
|
+
if (tokens.length === 0) return "";
|
|
1330
|
+
const parts = tokens.map((t) => {
|
|
1331
|
+
const safe = t.replace(/"/g, '""');
|
|
1332
|
+
return `${safe}*`;
|
|
1333
|
+
});
|
|
1334
|
+
return parts.join(" OR ");
|
|
1335
|
+
}
|
|
1336
|
+
function bm25ToScore01(bm25) {
|
|
1337
|
+
if (!Number.isFinite(bm25)) return 0;
|
|
1338
|
+
const s = 1 / (1 + Math.max(0, bm25));
|
|
1339
|
+
return clamp(s, 0, 1);
|
|
1340
|
+
}
|
|
1341
|
+
function vectorCosineToScore01(cosine) {
|
|
1342
|
+
if (!Number.isFinite(cosine)) return 0;
|
|
1343
|
+
return clamp((cosine + 1) / 2, 0, 1);
|
|
1344
|
+
}
|
|
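ftsQueryFromText turns free text into a prefix-OR FTS5 MATCH expression, and the two helpers map bm25 (lower is better) and cosine similarity onto a common 0 to 1 scale so the two signals can be blended later. For illustration:

// Illustration of the query/score helpers defined above.
console.log(ftsQueryFromText("vector index flush")); // "vector* OR index* OR flush*"
console.log(bm25ToScore01(0));          // 1    (best possible bm25)
console.log(bm25ToScore01(3));          // 0.25 (larger bm25 -> smaller score)
console.log(vectorCosineToScore01(1));  // 1
console.log(vectorCosineToScore01(0));  // 0.5
console.log(vectorCosineToScore01(-1)); // 0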
1345
|
+
|
|
1346
|
+
// src/indexer/repoIndexer.ts
|
|
1347
|
+
function repoIdFromRoot(repoRoot) {
|
|
1348
|
+
return sha256Hex(path8.resolve(repoRoot)).slice(0, 16);
|
|
1349
|
+
}
|
|
1350
|
+
function looksBinary(buf) {
|
|
1351
|
+
let nul = 0;
|
|
1352
|
+
const n = Math.min(buf.length, 4096);
|
|
1353
|
+
for (let i = 0; i < n; i++) if (buf[i] === 0) nul++;
|
|
1354
|
+
return nul > 2;
|
|
1355
|
+
}
|
|
1356
|
+
var RepoIndexer = class {
|
|
1357
|
+
// chunkCache (declared below): chunk id -> row cache
|
|
1358
|
+
constructor(repoRoot, embedder, config = {}) {
|
|
1359
|
+
this.embedder = embedder;
|
|
1360
|
+
this.repoRoot = path8.resolve(repoRoot);
|
|
1361
|
+
this.repoId = repoIdFromRoot(this.repoRoot);
|
|
1362
|
+
const cacheDir = config.cacheDir ?? defaultCacheDir();
|
|
1363
|
+
this.config = {
|
|
1364
|
+
cacheDir,
|
|
1365
|
+
includeExtensions: config.includeExtensions ?? [
|
|
1366
|
+
".ts",
|
|
1367
|
+
".tsx",
|
|
1368
|
+
".js",
|
|
1369
|
+
".jsx",
|
|
1370
|
+
".mjs",
|
|
1371
|
+
".cjs",
|
|
1372
|
+
".py",
|
|
1373
|
+
".go",
|
|
1374
|
+
".rs",
|
|
1375
|
+
".java",
|
|
1376
|
+
".kt",
|
|
1377
|
+
".kts",
|
|
1378
|
+
".cs",
|
|
1379
|
+
".md",
|
|
1380
|
+
".json",
|
|
1381
|
+
".yml",
|
|
1382
|
+
".yaml",
|
|
1383
|
+
".toml"
|
|
1384
|
+
],
|
|
1385
|
+
maxFileBytes: config.maxFileBytes ?? 2e6,
|
|
1386
|
+
chunk: {
|
|
1387
|
+
maxChars: config.chunk?.maxChars ?? 12e3,
|
|
1388
|
+
maxLines: config.chunk?.maxLines ?? 240,
|
|
1389
|
+
overlapLines: config.chunk?.overlapLines ?? 40
|
|
1390
|
+
},
|
|
1391
|
+
embed: {
|
|
1392
|
+
batchSize: config.embed?.batchSize ?? 32,
|
|
1393
|
+
concurrency: config.embed?.concurrency ?? 4
|
|
1394
|
+
},
|
|
1395
|
+
watch: {
|
|
1396
|
+
debounceMs: config.watch?.debounceMs ?? 250
|
|
1397
|
+
},
|
|
1398
|
+
ignoreFiles: config.ignoreFiles ?? [".petriignore", ".augmentignore"],
|
|
1399
|
+
redact: {
|
|
1400
|
+
enabled: config.redact?.enabled ?? true,
|
|
1401
|
+
skipPathSubstrings: config.redact?.skipPathSubstrings ?? [".env", "id_rsa", ".pem", ".p12", "secrets"],
|
|
1402
|
+
patterns: config.redact?.patterns ?? [
|
|
1403
|
+
{ name: "private_key_block", regex: /-----BEGIN [A-Z ]+PRIVATE KEY-----[\s\S]*?-----END [A-Z ]+PRIVATE KEY-----/g, replaceWith: "[REDACTED_PRIVATE_KEY]" },
|
|
1404
|
+
{ name: "aws_access_key_id", regex: /\bAKIA[0-9A-Z]{16}\b/g, replaceWith: "[REDACTED_AWS_KEY]" }
|
|
1405
|
+
]
|
|
1406
|
+
},
|
|
1407
|
+
storage: {
|
|
1408
|
+
storeText: config.storage?.storeText ?? true,
|
|
1409
|
+
ftsMode: config.storage?.ftsMode ?? "full"
|
|
1410
|
+
},
|
|
1411
|
+
vector: {
|
|
1412
|
+
provider: config.vector?.provider ?? "bruteforce",
|
|
1413
|
+
metric: config.vector?.metric ?? "cosine",
|
|
1414
|
+
hnswlib: config.vector?.hnswlib,
|
|
1415
|
+
qdrant: config.vector?.qdrant,
|
|
1416
|
+
faiss: config.vector?.faiss,
|
|
1417
|
+
custom: config.vector?.custom
|
|
1418
|
+
},
|
|
1419
|
+
profiles: config.profiles ?? {}
|
|
1420
|
+
};
|
|
1421
|
+
this.embeddingCache = new EmbeddingCache(path8.join(cacheDir, "embedding-cache.sqlite"));
|
|
1422
|
+
}
|
|
1423
|
+
repoRoot;
|
|
1424
|
+
repoId;
|
|
1425
|
+
config;
|
|
1426
|
+
store = null;
|
|
1427
|
+
vec = null;
|
|
1428
|
+
vecFlushTimer = null;
|
|
1429
|
+
vecFlushInFlight = null;
|
|
1430
|
+
embeddingCache;
|
|
1431
|
+
currentCommit = null;
|
|
1432
|
+
currentBranch = null;
|
|
1433
|
+
watcher = null;
|
|
1434
|
+
fileIgnore = null;
|
|
1435
|
+
serial = pLimit(1);
|
|
1436
|
+
chunkCache = /* @__PURE__ */ new Map();
|
|
1437
|
+
getCommit() {
|
|
1438
|
+
return this.currentCommit;
|
|
1439
|
+
}
|
|
1440
|
+
getBranch() {
|
|
1441
|
+
return this.currentBranch;
|
|
1442
|
+
}
|
|
1443
|
+
getStore() {
|
|
1444
|
+
return this.store;
|
|
1445
|
+
}
|
|
1446
|
+
dbPathForCommit(commit) {
|
|
1447
|
+
return path8.join(this.config.cacheDir, "index", this.repoId, `${commit}.sqlite`);
|
|
1448
|
+
}
|
|
1449
|
+
shouldIndexPath(posixRelPath) {
|
|
1450
|
+
const lower = posixRelPath.toLowerCase();
|
|
1451
|
+
const ext = "." + (lower.split(".").pop() ?? "");
|
|
1452
|
+
if (!this.config.includeExtensions.includes(ext)) return false;
|
|
1453
|
+
if (this.config.redact.enabled) {
|
|
1454
|
+
for (const s of this.config.redact.skipPathSubstrings) {
|
|
1455
|
+
if (lower.includes(s.toLowerCase())) return false;
|
|
1456
|
+
}
|
|
1457
|
+
}
|
|
1458
|
+
if (this.fileIgnore && this.fileIgnore(posixRelPath)) return false;
|
|
1459
|
+
return true;
|
|
1460
|
+
}
|
|
1461
|
+
applyRedactions(text) {
|
|
1462
|
+
if (!this.config.redact.enabled) return text;
|
|
1463
|
+
let t = text;
|
|
1464
|
+
for (const p of this.config.redact.patterns) t = t.replace(p.regex, p.replaceWith);
|
|
1465
|
+
return t;
|
|
1466
|
+
}
|
|
1467
|
+
vectorMetric() {
|
|
1468
|
+
return this.config.vector.metric ?? "cosine";
|
|
1469
|
+
}
|
|
1470
|
+
vectorFlushDebounceMs() {
|
|
1471
|
+
const kind = this.vec?.kind;
|
|
1472
|
+
if (!kind) return 0;
|
|
1473
|
+
if (kind === "hnswlib") return this.config.vector.hnswlib?.persistDebounceMs ?? 2e3;
|
|
1474
|
+
if (kind === "faiss") return this.config.vector.faiss?.persistDebounceMs ?? 2e3;
|
|
1475
|
+
return 0;
|
|
1476
|
+
}
|
|
1477
|
+
scheduleVectorFlush() {
|
|
1478
|
+
if (!this.vec || !this.store) return;
|
|
1479
|
+
const ms = this.vectorFlushDebounceMs();
|
|
1480
|
+
if (this.vecFlushTimer) {
|
|
1481
|
+
clearTimeout(this.vecFlushTimer);
|
|
1482
|
+
this.vecFlushTimer = null;
|
|
1483
|
+
}
|
|
1484
|
+
if (ms <= 0) {
|
|
1485
|
+
void this.flushVectorNow();
|
|
1486
|
+
return;
|
|
1487
|
+
}
|
|
1488
|
+
this.vecFlushTimer = setTimeout(() => {
|
|
1489
|
+
this.vecFlushTimer = null;
|
|
1490
|
+
void this.flushVectorNow();
|
|
1491
|
+
}, ms);
|
|
1492
|
+
}
|
|
1493
|
+
async flushVectorNow() {
|
|
1494
|
+
if (!this.vec || !this.store) return;
|
|
1495
|
+
if (this.vecFlushTimer) {
|
|
1496
|
+
clearTimeout(this.vecFlushTimer);
|
|
1497
|
+
this.vecFlushTimer = null;
|
|
1498
|
+
}
|
|
1499
|
+
if (this.vecFlushInFlight) return this.vecFlushInFlight;
|
|
1500
|
+
const work = (async () => {
|
|
1501
|
+
await this.vec.flush();
|
|
1502
|
+
const sv = this.store.getStoreVersion();
|
|
1503
|
+
this.store.setVectorIndexVersion(this.vec.kind, sv);
|
|
1504
|
+
})();
|
|
1505
|
+
this.vecFlushInFlight = work.finally(() => {
|
|
1506
|
+
this.vecFlushInFlight = null;
|
|
1507
|
+
});
|
|
1508
|
+
return this.vecFlushInFlight;
|
|
1509
|
+
}
|
|
1510
|
+
async ensureVectorIndex(dimension) {
|
|
1511
|
+
if (!this.currentCommit || !this.currentBranch) return null;
|
|
1512
|
+
if (this.vec && this.vec.dimension === dimension) return this.vec;
|
|
1513
|
+
if (this.vec) {
|
|
1514
|
+
try {
|
|
1515
|
+
await this.vec.close();
|
|
1516
|
+
} catch {
|
|
1517
|
+
}
|
|
1518
|
+
this.vec = null;
|
|
1519
|
+
}
|
|
1520
|
+
const vec = await createVectorIndex(this.config.vector);
|
|
1521
|
+
await vec.init({
|
|
1522
|
+
repoId: this.repoId,
|
|
1523
|
+
repoRoot: this.repoRoot,
|
|
1524
|
+
commit: this.currentCommit,
|
|
1525
|
+
branch: this.currentBranch,
|
|
1526
|
+
cacheDir: this.config.cacheDir,
|
|
1527
|
+
dimension,
|
|
1528
|
+
metric: this.vectorMetric()
|
|
1529
|
+
});
|
|
1530
|
+
this.vec = vec;
|
|
1531
|
+
return vec;
|
|
1532
|
+
}
|
|
1533
|
+
async openForCurrentHead() {
|
|
1534
|
+
const commit = await getHeadCommit(this.repoRoot);
|
|
1535
|
+
const branch = await getBranchName(this.repoRoot);
|
|
1536
|
+
if (this.currentCommit === commit && this.store) return;
|
|
1537
|
+
this.currentCommit = commit;
|
|
1538
|
+
this.currentBranch = branch;
|
|
1539
|
+
this.chunkCache.clear();
|
|
1540
|
+
if (this.vecFlushTimer) {
|
|
1541
|
+
clearTimeout(this.vecFlushTimer);
|
|
1542
|
+
this.vecFlushTimer = null;
|
|
1543
|
+
}
|
|
1544
|
+
this.vecFlushInFlight = null;
|
|
1545
|
+
if (this.vec) {
|
|
1546
|
+
try {
|
|
1547
|
+
await this.vec.close();
|
|
1548
|
+
} catch {
|
|
1549
|
+
}
|
|
1550
|
+
this.vec = null;
|
|
1551
|
+
}
|
|
1552
|
+
if (this.store) this.store.close();
|
|
1553
|
+
this.fileIgnore = loadExtraIgnore(this.repoRoot, this.config.ignoreFiles);
|
|
1554
|
+
this.store = new RepoStore(this.dbPathForCommit(commit));
|
|
1555
|
+
this.store.setMeta("repoRoot", this.repoRoot);
|
|
1556
|
+
this.store.setMeta("repoId", this.repoId);
|
|
1557
|
+
this.store.setMeta("commit", commit);
|
|
1558
|
+
this.store.setMeta("branch", branch);
|
|
1559
|
+
this.store.setMeta("embedderId", this.embedder.id);
|
|
1560
|
+
const dim = this.embedder.dimension ?? this.store.getAnyEmbeddingDimension();
|
|
1561
|
+
if (!dim) return;
|
|
1562
|
+
const vec = await this.ensureVectorIndex(dim);
|
|
1563
|
+
if (!vec) return;
|
|
1564
|
+
const storeCount = this.store.countChunks();
|
|
1565
|
+
const storeVersion = this.store.getStoreVersion();
|
|
1566
|
+
const vecVersion = this.store.getVectorIndexVersion(vec.kind);
|
|
1567
|
+
const vecCount = await vec.count();
|
|
1568
|
+
if (storeCount > 0 && (vecVersion !== storeVersion || vecCount !== storeCount || vec.kind === "faiss")) {
|
|
1569
|
+
await vec.rebuild(
|
|
1570
|
+
this.store.loadAllChunkEmbeddings().map((e) => ({ id: e.id, vector: e.embedding }))
|
|
1571
|
+
);
|
|
1572
|
+
await this.flushVectorNow();
|
|
1573
|
+
}
|
|
1574
|
+
}
|
|
1575
|
+
async indexAll() {
|
|
1576
|
+
await this.openForCurrentHead();
|
|
1577
|
+
if (!this.store) throw new Error("RepoStore not initialised");
|
|
1578
|
+
const files = (await listWorkingFiles(this.repoRoot)).filter((p) => this.shouldIndexPath(p));
|
|
1579
|
+
const currentSet = new Set(files);
|
|
1580
|
+
for (const known of this.store.listAllFiles()) {
|
|
1581
|
+
if (!currentSet.has(known)) {
|
|
1582
|
+
await this.deleteFile(known);
|
|
1583
|
+
}
|
|
1584
|
+
}
|
|
1585
|
+
const limit = pLimit(this.config.embed.concurrency);
|
|
1586
|
+
await Promise.all(files.map((f) => limit(() => this.indexFile(f))));
|
|
1587
|
+
if (this.vec) await this.flushVectorNow();
|
|
1588
|
+
}
|
|
1589
|
+
async indexFile(posixRelPath) {
|
|
1590
|
+
return this.serial(async () => {
|
|
1591
|
+
await this.openForCurrentHead();
|
|
1592
|
+
if (!this.store) throw new Error("RepoStore not initialised");
|
|
1593
|
+
if (!this.shouldIndexPath(posixRelPath)) return;
|
|
1594
|
+
const abs = path8.join(this.repoRoot, fromPosixPath(posixRelPath));
|
|
1595
|
+
let stat;
|
|
1596
|
+
try {
|
|
1597
|
+
stat = fs6.statSync(abs);
|
|
1598
|
+
} catch {
|
|
1599
|
+
await this.deleteFileInner(posixRelPath);
|
|
1600
|
+
return;
|
|
1601
|
+
}
|
|
1602
|
+
if (!stat.isFile()) return;
|
|
1603
|
+
if (stat.size > this.config.maxFileBytes) return;
|
|
1604
|
+
const buf = fs6.readFileSync(abs);
|
|
1605
|
+
if (looksBinary(buf)) return;
|
|
1606
|
+
const raw = buf.toString("utf8");
|
|
1607
|
+
const redacted = this.applyRedactions(raw);
|
|
1608
|
+
const fileHash = sha256Hex(redacted);
|
|
1609
|
+
const prev = this.store.getFileHash(posixRelPath);
|
|
1610
|
+
if (prev === fileHash) return;
|
|
1611
|
+
const { language, chunks } = chunkSource(posixRelPath, redacted, this.config.chunk);
|
|
1612
|
+
if (language === "typescript" || language === "javascript") {
|
|
1613
|
+
const rel = extractTsRelations(posixRelPath, redacted);
|
|
1614
|
+
this.store.setEdges(posixRelPath, "import", rel.imports);
|
|
1615
|
+
this.store.setEdges(posixRelPath, "export", rel.exports);
|
|
1616
|
+
} else {
|
|
1617
|
+
this.store.setEdges(posixRelPath, "import", []);
|
|
1618
|
+
this.store.setEdges(posixRelPath, "export", []);
|
|
1619
|
+
}
|
|
1620
|
+
const synopsisText = buildSynopsis(posixRelPath, language, redacted);
|
|
1621
|
+
const synopsis = synopsisText.trim() ? [{
|
|
1622
|
+
startLine: 1,
|
|
1623
|
+
endLine: 1,
|
|
1624
|
+
text: synopsisText,
|
|
1625
|
+
contentHash: sha256Hex(synopsisText),
|
|
1626
|
+
tokens: Math.max(1, Math.ceil(synopsisText.length / 4)),
|
|
1627
|
+
kind: "synopsis"
|
|
1628
|
+
}] : [];
|
|
1629
|
+
const headerLines = Math.min(80, this.config.chunk.maxLines);
|
|
1630
|
+
const headerText = redacted.split(/\r?\n/).slice(0, headerLines).join("\n").trimEnd();
|
|
1631
|
+
const headerChunk = headerText.trim() ? [{
|
|
1632
|
+
startLine: 1,
|
|
1633
|
+
endLine: headerLines,
|
|
1634
|
+
text: headerText,
|
|
1635
|
+
contentHash: sha256Hex(headerText),
|
|
1636
|
+
tokens: Math.max(1, Math.ceil(headerText.length / 4)),
|
|
1637
|
+
kind: "chunk"
|
|
1638
|
+
}] : [];
|
|
1639
|
+
const combined = [
|
|
1640
|
+
...synopsis,
|
|
1641
|
+
...headerChunk,
|
|
1642
|
+
...chunks.map((c) => ({ ...c, kind: "chunk" }))
|
|
1643
|
+
];
|
|
1644
|
+
const embedTexts = [];
|
|
1645
|
+
const embedPlan = [];
|
|
1646
|
+
const embeddings = combined.map(() => null);
|
|
1647
|
+
for (let i = 0; i < combined.length; i++) {
|
|
1648
|
+
const ch = combined[i];
|
|
1649
|
+
const cached2 = this.embeddingCache.get(this.embedder.id, ch.contentHash);
|
|
1650
|
+
if (cached2) {
|
|
1651
|
+
embeddings[i] = cached2;
|
|
1652
|
+
continue;
|
|
1653
|
+
}
|
|
1654
|
+
embedTexts.push(
|
|
1655
|
+
`repo:${path8.basename(this.repoRoot)}
|
|
1656
|
+
path:${posixRelPath}
|
|
1657
|
+
language:${language}
|
|
1658
|
+
kind:${ch.kind}
|
|
1659
|
+
lines:${ch.startLine}-${ch.endLine}
|
|
1660
|
+
---
|
|
1661
|
+
${ch.text}`
|
|
1662
|
+
);
|
|
1663
|
+
embedPlan.push({ chunkIdx: i, contentHash: ch.contentHash });
|
|
1664
|
+
}
|
|
1665
|
+
const batchSize = this.config.embed.batchSize;
|
|
1666
|
+
for (let start = 0; start < embedTexts.length; start += batchSize) {
|
|
1667
|
+
const end = Math.min(embedTexts.length, start + batchSize);
|
|
1668
|
+
const batch = embedTexts.slice(start, end);
|
|
1669
|
+
const vecs = await this.embedder.embed(batch);
|
|
1670
|
+
for (let j = 0; j < vecs.length; j++) {
|
|
1671
|
+
const plan = embedPlan[start + j];
|
|
1672
|
+
embeddings[plan.chunkIdx] = vecs[j];
|
|
1673
|
+
this.embeddingCache.put(this.embedder.id, plan.contentHash, vecs[j]);
|
|
1674
|
+
}
|
|
1675
|
+
}
|
|
1676
|
+
const fileMtime = stat.mtimeMs;
|
|
1677
|
+
const ftsMode = this.config.storage.ftsMode;
|
|
1678
|
+
const storeText = this.config.storage.storeText;
|
|
1679
|
+
const oldChunkIds = this.store.listChunksForFile(posixRelPath).map((r) => r.id);
|
|
1680
|
+
const points = [];
|
|
1681
|
+
const rows = combined.map((ch, i) => {
|
|
1682
|
+
const id = sha256Hex(`${this.repoId}:${posixRelPath}:${ch.kind}:${ch.startLine}:${ch.endLine}:${ch.contentHash}`).slice(0, 32);
|
|
1683
|
+
const emb = embeddings[i];
|
|
1684
|
+
if (!emb) throw new Error("Embedding missing unexpectedly");
|
|
1685
|
+
points.push({ id, vector: emb });
|
|
1686
|
+
const textToStore = ch.kind === "synopsis" ? ch.text : storeText ? ch.text : "";
|
|
1687
|
+
const ftsText = ftsMode === "off" ? "" : ftsMode === "tokens" ? extractLexicalTokens(ch.text) : ch.text;
|
|
1688
|
+
return {
|
|
1689
|
+
id,
|
|
1690
|
+
language,
|
|
1691
|
+
kind: ch.kind,
|
|
1692
|
+
startLine: ch.startLine,
|
|
1693
|
+
endLine: ch.endLine,
|
|
1694
|
+
contentHash: ch.contentHash,
|
|
1695
|
+
text: textToStore,
|
|
1696
|
+
ftsText,
|
|
1697
|
+
tokens: ch.tokens,
|
|
1698
|
+
fileMtime,
|
|
1699
|
+
embedding: emb
|
|
1700
|
+
};
|
|
1701
|
+
});
|
|
1702
|
+
this.store.upsertFile(posixRelPath, fileHash, fileMtime, language, stat.size);
|
|
1703
|
+
this.store.replaceChunksForFile(posixRelPath, rows);
|
|
1704
|
+
const dim = points[0]?.vector.length;
|
|
1705
|
+
if (dim) {
|
|
1706
|
+
await this.ensureVectorIndex(dim);
|
|
1707
|
+
if (this.vec) {
|
|
1708
|
+
await this.vec.remove(oldChunkIds);
|
|
1709
|
+
await this.vec.upsert(points);
|
|
1710
|
+
this.scheduleVectorFlush();
|
|
1711
|
+
}
|
|
1712
|
+
}
|
|
1713
|
+
});
|
|
1714
|
+
}
|
|
1715
|
+
async deleteFileInner(posixRelPath) {
|
|
1716
|
+
if (!this.store) return;
|
|
1717
|
+
const chunkIds = this.store.listChunksForFile(posixRelPath).map((r) => r.id);
|
|
1718
|
+
this.store.deleteFile(posixRelPath);
|
|
1719
|
+
if (this.vec && chunkIds.length > 0) {
|
|
1720
|
+
await this.vec.remove(chunkIds);
|
|
1721
|
+
this.scheduleVectorFlush();
|
|
1722
|
+
}
|
|
1723
|
+
}
|
|
1724
|
+
async deleteFile(posixRelPath) {
|
|
1725
|
+
return this.serial(async () => {
|
|
1726
|
+
await this.openForCurrentHead();
|
|
1727
|
+
await this.deleteFileInner(posixRelPath);
|
|
1728
|
+
});
|
|
1729
|
+
}
|
|
1730
|
+
async vectorCandidates(queryEmbedding, k, includePaths) {
|
|
1731
|
+
await this.openForCurrentHead();
|
|
1732
|
+
if (!this.store) throw new Error("RepoStore not initialised");
|
|
1733
|
+
if (!this.vec) {
|
|
1734
|
+
const dim = this.embedder.dimension ?? this.store.getAnyEmbeddingDimension();
|
|
1735
|
+
if (dim) await this.ensureVectorIndex(dim);
|
|
1736
|
+
}
|
|
1737
|
+
if (!this.vec) return [];
|
|
1738
|
+
if (!includePaths || includePaths.length === 0) {
|
|
1739
|
+
return await this.vec.search(queryEmbedding, k);
|
|
1740
|
+
}
|
|
1741
|
+
const tries = [k * 5, k * 15, k * 40].map((x) => Math.max(k, x));
|
|
1742
|
+
for (const kk of tries) {
|
|
1743
|
+
const cand2 = await this.vec.search(queryEmbedding, kk);
|
|
1744
|
+
const filtered2 = [];
|
|
1745
|
+
for (const c of cand2) {
|
|
1746
|
+
const row = this.getChunkRowCached(c.id);
|
|
1747
|
+
if (row && includePaths.includes(row.path)) filtered2.push(c);
|
|
1748
|
+
if (filtered2.length >= k) break;
|
|
1749
|
+
}
|
|
1750
|
+
if (filtered2.length >= k) return filtered2.slice(0, k);
|
|
1751
|
+
}
|
|
1752
|
+
const cand = await this.vec.search(queryEmbedding, tries[tries.length - 1]);
|
|
1753
|
+
const filtered = cand.filter((c) => {
|
|
1754
|
+
const row = this.getChunkRowCached(c.id);
|
|
1755
|
+
return row ? includePaths.includes(row.path) : false;
|
|
1756
|
+
});
|
|
1757
|
+
return filtered.slice(0, k);
|
|
1758
|
+
}
|
|
1759
|
+
async lexicalCandidates(queryText, k, includePaths) {
|
|
1760
|
+
await this.openForCurrentHead();
|
|
1761
|
+
if (!this.store) throw new Error("RepoStore not initialised");
|
|
1762
|
+
if (this.config.storage.ftsMode === "off") return [];
|
|
1763
|
+
const ftq = ftsQueryFromText(queryText);
|
|
1764
|
+
if (!ftq) return [];
|
|
1765
|
+
const rows = this.store.searchFts(ftq, k, includePaths);
|
|
1766
|
+
return rows.map((r) => ({ id: r.id, score: bm25ToScore01(r.bm25) }));
|
|
1767
|
+
}
|
|
1768
|
+
getChunkRowCached(id) {
|
|
1769
|
+
const cached2 = this.chunkCache.get(id);
|
|
1770
|
+
if (cached2) return cached2;
|
|
1771
|
+
if (!this.store) return null;
|
|
1772
|
+
const row = this.store.getChunkById(id);
|
|
1773
|
+
if (row) this.chunkCache.set(id, row);
|
|
1774
|
+
return row ?? null;
|
|
1775
|
+
}
|
|
1776
|
+
readChunkTextFallback(row) {
|
|
1777
|
+
const abs = path8.join(this.repoRoot, fromPosixPath(row.path));
|
|
1778
|
+
try {
|
|
1779
|
+
const raw = fs6.readFileSync(abs, "utf8");
|
|
1780
|
+
const lines = raw.split(/\r?\n/);
|
|
1781
|
+
const start = Math.max(1, row.start_line);
|
|
1782
|
+
const end = Math.max(start, row.end_line);
|
|
1783
|
+
return lines.slice(start - 1, end).join("\n");
|
|
1784
|
+
} catch {
|
|
1785
|
+
return "";
|
|
1786
|
+
}
|
|
1787
|
+
}
|
|
1788
|
+
getChunkRecord(id) {
|
|
1789
|
+
const row = this.getChunkRowCached(id);
|
|
1790
|
+
if (!row) return null;
|
|
1791
|
+
const text = row.text && row.text.trim().length > 0 ? row.text : this.readChunkTextFallback(row);
|
|
1792
|
+
return {
|
|
1793
|
+
id: row.id,
|
|
1794
|
+
repoId: this.repoId,
|
|
1795
|
+
repoRoot: this.repoRoot,
|
|
1796
|
+
path: row.path,
|
|
1797
|
+
language: row.language,
|
|
1798
|
+
startLine: row.start_line,
|
|
1799
|
+
endLine: row.end_line,
|
|
1800
|
+
contentHash: row.content_hash,
|
|
1801
|
+
text,
|
|
1802
|
+
tokens: row.tokens,
|
|
1803
|
+
fileMtimeMs: row.file_mtime,
|
|
1804
|
+
kind: row.kind === "synopsis" ? "synopsis" : "chunk"
|
|
1805
|
+
};
|
|
1806
|
+
}
|
|
1807
|
+
getChunkMeta(id) {
|
|
1808
|
+
const row = this.getChunkRowCached(id);
|
|
1809
|
+
if (!row) return null;
|
|
1810
|
+
return {
|
|
1811
|
+
id: row.id,
|
|
1812
|
+
repoId: this.repoId,
|
|
1813
|
+
repoRoot: this.repoRoot,
|
|
1814
|
+
path: row.path,
|
|
1815
|
+
language: row.language,
|
|
1816
|
+
startLine: row.start_line,
|
|
1817
|
+
endLine: row.end_line,
|
|
1818
|
+
contentHash: row.content_hash,
|
|
1819
|
+
tokens: row.tokens,
|
|
1820
|
+
fileMtimeMs: row.file_mtime,
|
|
1821
|
+
kind: row.kind === "synopsis" ? "synopsis" : "chunk"
|
|
1822
|
+
};
|
|
1823
|
+
}
|
|
1824
|
+
getChunkText(id) {
|
|
1825
|
+
const row = this.getChunkRowCached(id);
|
|
1826
|
+
if (!row) return "";
|
|
1827
|
+
return row.text && row.text.trim().length > 0 ? row.text : this.readChunkTextFallback(row);
|
|
1828
|
+
}
|
|
1829
|
+
getChunkPreview(id) {
|
|
1830
|
+
const r = this.getChunkRecord(id);
|
|
1831
|
+
return r ? makePreview(r.text) : "";
|
|
1832
|
+
}
|
|
1833
|
+
/**
|
|
1834
|
+
* Expand context around a hit:
|
|
1835
|
+
* - adjacency (previous/next chunks in file)
|
|
1836
|
+
* - relative imports (pull in synopses/headers of the imported files)
|
|
1837
|
+
*/
|
|
1838
|
+
async expandContext(chunkId, opts) {
|
|
1839
|
+
await this.openForCurrentHead();
|
|
1840
|
+
if (!this.store) throw new Error("RepoStore not initialised");
|
|
1841
|
+
const row = this.getChunkRowCached(chunkId);
|
|
1842
|
+
if (!row) return [];
|
|
1843
|
+
const out = [];
|
|
1844
|
+
if (opts.includeFileSynopsis) {
|
|
1845
|
+
const synopsis = this.store.listChunksForFile(row.path, "synopsis")[0];
|
|
1846
|
+
if (synopsis) out.push({ id: synopsis.id, reason: "file synopsis" });
|
|
1847
|
+
}
|
|
1848
|
+
const adj = Math.max(0, opts.adjacentChunks);
|
|
1849
|
+
if (adj > 0) {
|
|
1850
|
+
const fileChunks = this.store.listChunksForFile(row.path).filter((c) => c.kind !== "synopsis");
|
|
1851
|
+
const idx = fileChunks.findIndex((c) => c.id === chunkId);
|
|
1852
|
+
if (idx >= 0) {
|
|
1853
|
+
for (let d = 1; d <= adj; d++) {
|
|
1854
|
+
for (const j of [idx - d, idx + d]) {
|
|
1855
|
+
if (j < 0 || j >= fileChunks.length) continue;
|
|
1856
|
+
out.push({ id: fileChunks[j].id, reason: `adjacent chunk (\xB1${d})` });
|
|
1857
|
+
}
|
|
1858
|
+
}
|
|
1859
|
+
}
|
|
1860
|
+
}
|
|
1861
|
+
const follow = Math.max(0, opts.followImports);
|
|
1862
|
+
if (follow > 0) {
|
|
1863
|
+
const imports = this.store.listEdges(row.path, "import").filter((s) => s.startsWith("."));
|
|
1864
|
+
for (const spec of imports.slice(0, follow)) {
|
|
1865
|
+
const candidates = [
|
|
1866
|
+
spec,
|
|
1867
|
+
`${spec}.ts`,
|
|
1868
|
+
`${spec}.tsx`,
|
|
1869
|
+
`${spec}.js`,
|
|
1870
|
+
`${spec}.jsx`,
|
|
1871
|
+
`${spec}/index.ts`,
|
|
1872
|
+
`${spec}/index.tsx`,
|
|
1873
|
+
`${spec}/index.js`,
|
|
1874
|
+
`${spec}/index.jsx`
|
|
1875
|
+
].map((s) => path8.posix.normalize(path8.posix.join(path8.posix.dirname(row.path), s)));
|
|
1876
|
+
for (const c of candidates) {
|
|
1877
|
+
const syn = this.store.listChunksForFile(c, "synopsis")[0];
|
|
1878
|
+
if (syn) {
|
|
1879
|
+
out.push({ id: syn.id, reason: `imported file synopsis (${spec})` });
|
|
1880
|
+
break;
|
|
1881
|
+
}
|
|
1882
|
+
const header = this.store.listChunksForFile(c).find((x) => x.kind !== "synopsis");
|
|
1883
|
+
if (header) {
|
|
1884
|
+
out.push({ id: header.id, reason: `imported file header (${spec})` });
|
|
1885
|
+
break;
|
|
1886
|
+
}
|
|
1887
|
+
}
|
|
1888
|
+
}
|
|
1889
|
+
}
|
|
1890
|
+
const seen = /* @__PURE__ */ new Set();
|
|
1891
|
+
const deduped = [];
|
|
1892
|
+
for (const x of out) {
|
|
1893
|
+
if (seen.has(x.id)) continue;
|
|
1894
|
+
seen.add(x.id);
|
|
1895
|
+
deduped.push(x);
|
|
1896
|
+
}
|
|
1897
|
+
return deduped;
|
|
1898
|
+
}
|
|
1899
|
+
async watch() {
|
|
1900
|
+
await this.openForCurrentHead();
|
|
1901
|
+
const debounceMs = this.config.watch.debounceMs;
|
|
1902
|
+
let timer = null;
|
|
1903
|
+
const schedule = (fn) => {
|
|
1904
|
+
if (timer) clearTimeout(timer);
|
|
1905
|
+
timer = setTimeout(fn, debounceMs);
|
|
1906
|
+
};
|
|
1907
|
+
const headPath = path8.join(this.repoRoot, ".git", "HEAD");
|
|
1908
|
+
this.watcher = chokidar.watch([this.repoRoot, headPath], {
|
|
1909
|
+
ignoreInitial: true,
|
|
1910
|
+
ignored: (p) => {
|
|
1911
|
+
const rel = path8.relative(this.repoRoot, p);
|
|
1912
|
+
if (!rel) return false;
|
|
1913
|
+
const posix = rel.split(path8.sep).join("/");
|
|
1914
|
+
if (posix.startsWith(".git/")) return true;
|
|
1915
|
+
if (posix.includes("node_modules/")) return true;
|
|
1916
|
+
if (posix.includes("/.cache/")) return true;
|
|
1917
|
+
return false;
|
|
1918
|
+
}
|
|
1919
|
+
});
|
|
1920
|
+
this.watcher.on("change", (p) => {
|
|
1921
|
+
const rel = path8.relative(this.repoRoot, p);
|
|
1922
|
+
const posix = rel.split(path8.sep).join("/");
|
|
1923
|
+
if (posix === ".git/HEAD") {
|
|
1924
|
+
schedule(() => this.indexAll().catch(() => void 0));
|
|
1925
|
+
return;
|
|
1926
|
+
}
|
|
1927
|
+
schedule(() => this.indexFile(posix).catch(() => void 0));
|
|
1928
|
+
});
|
|
1929
|
+
this.watcher.on("add", (p) => {
|
|
1930
|
+
const rel = path8.relative(this.repoRoot, p);
|
|
1931
|
+
const posix = rel.split(path8.sep).join("/");
|
|
1932
|
+
schedule(() => this.indexFile(posix).catch(() => void 0));
|
|
1933
|
+
});
|
|
1934
|
+
this.watcher.on("unlink", (p) => {
|
|
1935
|
+
const rel = path8.relative(this.repoRoot, p);
|
|
1936
|
+
const posix = rel.split(path8.sep).join("/");
|
|
1937
|
+
schedule(() => this.deleteFile(posix).catch(() => void 0));
|
|
1938
|
+
});
|
|
1939
|
+
}
|
|
1940
|
+
async closeAsync() {
|
|
1941
|
+
if (this.vecFlushTimer) {
|
|
1942
|
+
clearTimeout(this.vecFlushTimer);
|
|
1943
|
+
this.vecFlushTimer = null;
|
|
1944
|
+
}
|
|
1945
|
+
if (this.vec && this.store) {
|
|
1946
|
+
try {
|
|
1947
|
+
await this.flushVectorNow();
|
|
1948
|
+
} catch {
|
|
1949
|
+
}
|
|
1950
|
+
}
|
|
1951
|
+
if (this.vec) {
|
|
1952
|
+
try {
|
|
1953
|
+
await this.vec.close();
|
|
1954
|
+
} catch {
|
|
1955
|
+
}
|
|
1956
|
+
this.vec = null;
|
|
1957
|
+
}
|
|
1958
|
+
await this.watcher?.close().catch(() => void 0);
|
|
1959
|
+
this.watcher = null;
|
|
1960
|
+
this.store?.close();
|
|
1961
|
+
this.store = null;
|
|
1962
|
+
this.embeddingCache.close();
|
|
1963
|
+
this.chunkCache.clear();
|
|
1964
|
+
}
|
|
1965
|
+
close() {
|
|
1966
|
+
void this.closeAsync();
|
|
1967
|
+
}
|
|
1968
|
+
};
|
|
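RepoIndexer is exported, so it can be driven directly when only a single repository is involved. A minimal sketch, assuming ./my-repo is a local git checkout and using the offline HashEmbeddingsProvider defined further down:

// Minimal single-repo sketch (assumes ./my-repo is a git checkout).
const embedder = new HashEmbeddingsProvider(384);
const repoIdx = new RepoIndexer("./my-repo", embedder, { storage: { ftsMode: "tokens" } });
await repoIdx.indexAll();

const [qVec] = await embedder.embed(["where is the retry logic?"]);
const vectorHits = await repoIdx.vectorCandidates(qVec, 10);
const lexicalHits = await repoIdx.lexicalCandidates("retry logic", 10);
console.log(vectorHits.length, lexicalHits.length);
for (const hit of vectorHits) {
  const rec = repoIdx.getChunkRecord(hit.id);
  if (rec) console.log(rec.path, `${rec.startLine}-${rec.endLine}`, hit.score.toFixed(3));
}
await repoIdx.closeAsync();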
1969
|
+
|
|
1970
|
+
// src/profiles.ts
|
|
1971
|
+
var DEFAULT_PROFILES = {
|
|
1972
|
+
search: {
|
|
1973
|
+
name: "search",
|
|
1974
|
+
k: 10,
|
|
1975
|
+
weights: { vector: 0.65, lexical: 0.35, recency: 0 },
|
|
1976
|
+
expand: { adjacentChunks: 0, followImports: 0, includeFileSynopsis: false },
|
|
1977
|
+
candidates: { vectorK: 25, lexicalK: 25, maxMergedCandidates: 60 }
|
|
1978
|
+
},
|
|
1979
|
+
refactor: {
|
|
1980
|
+
name: "refactor",
|
|
1981
|
+
k: 15,
|
|
1982
|
+
weights: { vector: 0.55, lexical: 0.35, recency: 0.1 },
|
|
1983
|
+
expand: { adjacentChunks: 1, followImports: 2, includeFileSynopsis: true },
|
|
1984
|
+
candidates: { vectorK: 60, lexicalK: 40, maxMergedCandidates: 140 }
|
|
1985
|
+
},
|
|
1986
|
+
review: {
|
|
1987
|
+
name: "review",
|
|
1988
|
+
k: 20,
|
|
1989
|
+
weights: { vector: 0.45, lexical: 0.35, recency: 0.2 },
|
|
1990
|
+
expand: { adjacentChunks: 1, followImports: 1, includeFileSynopsis: true },
|
|
1991
|
+
candidates: { vectorK: 80, lexicalK: 60, maxMergedCandidates: 180 }
|
|
1992
|
+
},
|
|
1993
|
+
architecture: {
|
|
1994
|
+
name: "architecture",
|
|
1995
|
+
k: 20,
|
|
1996
|
+
weights: { vector: 0.7, lexical: 0.2, recency: 0.1 },
|
|
1997
|
+
expand: { adjacentChunks: 0, followImports: 3, includeFileSynopsis: true },
|
|
1998
|
+
candidates: { vectorK: 120, lexicalK: 40, maxMergedCandidates: 220 }
|
|
1999
|
+
},
|
|
2000
|
+
rca: {
|
|
2001
|
+
name: "rca",
|
|
2002
|
+
k: 25,
|
|
2003
|
+
weights: { vector: 0.5, lexical: 0.25, recency: 0.25 },
|
|
2004
|
+
expand: { adjacentChunks: 2, followImports: 1, includeFileSynopsis: true },
|
|
2005
|
+
candidates: { vectorK: 140, lexicalK: 80, maxMergedCandidates: 260 }
|
|
2006
|
+
},
|
|
2007
|
+
custom: {
|
|
2008
|
+
name: "custom",
|
|
2009
|
+
k: 10,
|
|
2010
|
+
weights: { vector: 0.65, lexical: 0.35, recency: 0 },
|
|
2011
|
+
expand: { adjacentChunks: 0, followImports: 0, includeFileSynopsis: false },
|
|
2012
|
+
candidates: { vectorK: 25, lexicalK: 25, maxMergedCandidates: 60 }
|
|
2013
|
+
}
|
|
2014
|
+
};
|
|
2015
|
+
function deepMergeProfile(base, patch) {
|
|
2016
|
+
if (!patch) return base;
|
|
2017
|
+
const merged = {
|
|
2018
|
+
...base,
|
|
2019
|
+
...patch,
|
|
2020
|
+
weights: { ...base.weights, ...patch.weights ?? {} },
|
|
2021
|
+
expand: { ...base.expand, ...patch.expand ?? {} },
|
|
2022
|
+
candidates: { ...base.candidates, ...patch.candidates ?? {} }
|
|
2023
|
+
};
|
|
2024
|
+
return merged;
|
|
2025
|
+
}
|
|
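Each built-in profile bundles a result count, scoring weights, expansion behaviour, and candidate pool sizes; deepMergeProfile lets callers override any subset without restating the rest (weights are re-normalised later, in WorkspaceIndexer.resolveProfile). A small sketch:

// Sketch: derive a custom profile from the built-in "search" profile.
const myProfile = deepMergeProfile(DEFAULT_PROFILES.search, {
  k: 5,
  weights: { recency: 0.2 },               // vector/lexical keep their defaults
  expand: { includeFileSynopsis: true }
});
console.log(myProfile.k);                  // 5
console.log(myProfile.weights);            // { vector: 0.65, lexical: 0.35, recency: 0.2 }
console.log(myProfile.candidates.vectorK); // 25 (inherited from the base profile)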
2026
|
+
|
|
2027
|
+
// src/indexer/workspaceIndexer.ts
|
|
2028
|
+
import fs7 from "fs";
|
|
2029
|
+
import path9 from "path";
|
|
2030
|
+
function findGitRepos(workspaceRoot) {
|
|
2031
|
+
const out = [];
|
|
2032
|
+
const stack = [path9.resolve(workspaceRoot)];
|
|
2033
|
+
while (stack.length) {
|
|
2034
|
+
const dir = stack.pop();
|
|
2035
|
+
if (dir.includes("node_modules")) continue;
|
|
2036
|
+
const gitDir = path9.join(dir, ".git");
|
|
2037
|
+
if (fs7.existsSync(gitDir) && fs7.statSync(gitDir).isDirectory()) {
|
|
2038
|
+
out.push(dir);
|
|
2039
|
+
continue;
|
|
2040
|
+
}
|
|
2041
|
+
let entries;
|
|
2042
|
+
try {
|
|
2043
|
+
entries = fs7.readdirSync(dir, { withFileTypes: true });
|
|
2044
|
+
} catch {
|
|
2045
|
+
continue;
|
|
2046
|
+
}
|
|
2047
|
+
for (const e of entries) {
|
|
2048
|
+
if (!e.isDirectory()) continue;
|
|
2049
|
+
if (e.name.startsWith(".")) continue;
|
|
2050
|
+
stack.push(path9.join(dir, e.name));
|
|
2051
|
+
}
|
|
2052
|
+
}
|
|
2053
|
+
return out;
|
|
2054
|
+
}
|
|
2055
|
+
function halfLifeDaysForProfile(profileName) {
|
|
2056
|
+
if (profileName === "rca") return 7;
|
|
2057
|
+
if (profileName === "review") return 14;
|
|
2058
|
+
if (profileName === "refactor") return 21;
|
|
2059
|
+
return 30;
|
|
2060
|
+
}
|
|
2061
|
+
var WorkspaceIndexer = class {
|
|
2062
|
+
constructor(workspaceRoot, embedder, config = {}) {
|
|
2063
|
+
this.workspaceRoot = workspaceRoot;
|
|
2064
|
+
this.embedder = embedder;
|
|
2065
|
+
this.config = { ...config };
|
|
2066
|
+
if (!this.config.cacheDir) this.config.cacheDir = defaultCacheDir();
|
|
2067
|
+
}
|
|
2068
|
+
repos = [];
|
|
2069
|
+
config;
|
|
2070
|
+
async open() {
|
|
2071
|
+
const repoRoots = findGitRepos(this.workspaceRoot);
|
|
2072
|
+
this.repos = repoRoots.map((r) => new RepoIndexer(r, this.embedder, this.config));
|
|
2073
|
+
}
|
|
2074
|
+
async indexAll() {
|
|
2075
|
+
if (this.repos.length === 0) await this.open();
|
|
2076
|
+
await Promise.all(this.repos.map((r) => r.indexAll()));
|
|
2077
|
+
}
|
|
2078
|
+
async watch() {
|
|
2079
|
+
if (this.repos.length === 0) await this.open();
|
|
2080
|
+
await Promise.all(this.repos.map((r) => r.watch()));
|
|
2081
|
+
}
|
|
2082
|
+
getRepoIndexers() {
|
|
2083
|
+
return this.repos.slice();
|
|
2084
|
+
}
|
|
2085
|
+
resolveProfile(opts) {
|
|
2086
|
+
const name = opts?.profile ?? "search";
|
|
2087
|
+
const base = DEFAULT_PROFILES[name] ?? DEFAULT_PROFILES.search;
|
|
2088
|
+
const configPatch = this.config.profiles?.[name] ?? {};
|
|
2089
|
+
const merged1 = deepMergeProfile(base, configPatch);
|
|
2090
|
+
const merged2 = deepMergeProfile(merged1, opts?.profileOverrides);
|
|
2091
|
+
const w = merged2.weights;
|
|
2092
|
+
const sum = Math.max(1e-6, w.vector + w.lexical + w.recency);
|
|
2093
|
+
merged2.weights = { vector: w.vector / sum, lexical: w.lexical / sum, recency: w.recency / sum };
|
|
2094
|
+
return merged2;
|
|
2095
|
+
}
|
|
2096
|
+
async retrieve(query, opts = {}) {
|
|
2097
|
+
if (this.repos.length === 0) await this.open();
|
|
2098
|
+
const profile = this.resolveProfile(opts);
|
|
2099
|
+
const qVec = (await this.embedder.embed([query]))[0];
|
|
2100
|
+
const vectorK = profile.candidates?.vectorK ?? Math.max(profile.k * 3, 30);
|
|
2101
|
+
const lexicalK = profile.candidates?.lexicalK ?? Math.max(profile.k * 3, 30);
|
|
2102
|
+
const maxMerged = profile.candidates?.maxMergedCandidates ?? Math.max(profile.k * 8, 120);
|
|
2103
|
+
const repoFilters = opts.filters?.repoRoots;
|
|
2104
|
+
const langFilter = opts.filters?.language;
|
|
2105
|
+
const pathPrefix = opts.filters?.pathPrefix;
|
|
2106
|
+
const candidates = [];
|
|
2107
|
+
let vecCount = 0;
|
|
2108
|
+
let lexCount = 0;
|
|
2109
|
+
for (const repo of this.repos) {
|
|
2110
|
+
if (repoFilters && !repoFilters.includes(repo.repoRoot)) continue;
|
|
2111
|
+
let includePaths = opts.scope?.includePaths?.slice();
|
|
2112
|
+
if (opts.scope?.changedOnly) {
|
|
2113
|
+
try {
|
|
2114
|
+
const changed = await listChangedFiles(repo.repoRoot, opts.scope.baseRef ?? "HEAD~1");
|
|
2115
|
+
includePaths = includePaths ? includePaths.filter((p) => changed.includes(p)) : changed;
|
|
2116
|
+
} catch {
|
|
2117
|
+
}
|
|
2118
|
+
}
|
|
2119
|
+
const [vHits, lHits] = await Promise.all([
|
|
2120
|
+
repo.vectorCandidates(qVec, vectorK, includePaths),
|
|
2121
|
+
repo.lexicalCandidates(query, lexicalK, includePaths)
|
|
2122
|
+
]);
|
|
2123
|
+
vecCount += vHits.length;
|
|
2124
|
+
lexCount += lHits.length;
|
|
2125
|
+
const m = /* @__PURE__ */ new Map();
|
|
2126
|
+
for (const vh of vHits) {
|
|
2127
|
+
const id = vh.id;
|
|
2128
|
+
const vector01 = vectorCosineToScore01(vh.score);
|
|
2129
|
+
m.set(id, { repo, id, vector01, combined: 0 });
|
|
2130
|
+
}
|
|
2131
|
+
for (const lh of lHits) {
|
|
2132
|
+
const id = lh.id;
|
|
2133
|
+
const prev = m.get(id);
|
|
2134
|
+
if (prev) prev.lexical01 = lh.score;
|
|
2135
|
+
else m.set(id, { repo, id, lexical01: lh.score, combined: 0 });
|
|
2136
|
+
}
|
|
2137
|
+
const halfLife = halfLifeDaysForProfile(profile.name);
|
|
2138
|
+
for (const c of m.values()) {
|
|
2139
|
+
const meta = repo.getChunkMeta(c.id);
|
|
2140
|
+
if (!meta) continue;
|
|
2141
|
+
if (langFilter && meta.language !== langFilter) continue;
|
|
2142
|
+
if (pathPrefix && !meta.path.startsWith(pathPrefix)) continue;
|
|
2143
|
+
c.recency01 = profile.weights.recency > 0 ? recencyScore(meta.fileMtimeMs, halfLife) : 0;
|
|
2144
|
+
let kindFactor = 1;
|
|
2145
|
+
if (meta.kind === "synopsis" && profile.name === "search") kindFactor = 0.85;
|
|
2146
|
+
if (meta.kind === "synopsis" && profile.name === "architecture") kindFactor = 1.05;
|
|
2147
|
+
const v = c.vector01 ?? 0;
|
|
2148
|
+
const l = c.lexical01 ?? 0;
|
|
2149
|
+
const r = c.recency01 ?? 0;
|
|
2150
|
+
c.combined = clamp(
|
|
2151
|
+
kindFactor * (profile.weights.vector * v + profile.weights.lexical * l + profile.weights.recency * r),
|
|
2152
|
+
0,
|
|
2153
|
+
1
|
|
2154
|
+
);
|
|
2155
|
+
candidates.push(c);
|
|
2156
|
+
}
|
|
2157
|
+
}
|
|
2158
|
+
candidates.sort((a, b) => b.combined - a.combined);
|
|
2159
|
+
const merged = candidates.slice(0, maxMerged);
|
|
2160
|
+
const top = merged.slice(0, profile.k);
|
|
2161
|
+
const hits = top.map((c) => {
|
|
2162
|
+
const meta = c.repo.getChunkMeta(c.id);
|
|
2163
|
+
const preview = makePreview(c.repo.getChunkText(c.id));
|
|
2164
|
+
return {
|
|
2165
|
+
score: c.combined,
|
|
2166
|
+
scoreBreakdown: { vector: c.vector01, lexical: c.lexical01, recency: c.recency01 },
|
|
2167
|
+
chunk: { ...meta, preview }
|
|
2168
|
+
};
|
|
2169
|
+
});
|
|
2170
|
+
const contextBlocks = [];
|
|
2171
|
+
const seenKey = /* @__PURE__ */ new Set();
|
|
2172
|
+
const addBlock = (repoRoot, path11, startLine, endLine, text, reason) => {
|
|
2173
|
+
const key = `${repoRoot}:${path11}:${startLine}:${endLine}:${text.length}:${reason}`;
|
|
2174
|
+
if (seenKey.has(key)) return;
|
|
2175
|
+
seenKey.add(key);
|
|
2176
|
+
if (!text.trim()) return;
|
|
2177
|
+
contextBlocks.push({ repoRoot, path: path11, startLine, endLine, text, reason });
|
|
2178
|
+
};
|
|
2179
|
+
for (const h of hits) {
|
|
2180
|
+
const repo = this.repos.find((r) => r.repoRoot === h.chunk.repoRoot);
|
|
2181
|
+
if (!repo) continue;
|
|
2182
|
+
const text = repo.getChunkText(h.chunk.id);
|
|
2183
|
+
addBlock(h.chunk.repoRoot, h.chunk.path, h.chunk.startLine, h.chunk.endLine, text, "primary hit");
|
|
2184
|
+
const expanded = await repo.expandContext(h.chunk.id, {
|
|
2185
|
+
adjacentChunks: profile.expand.adjacentChunks ?? 0,
|
|
2186
|
+
followImports: profile.expand.followImports ?? 0,
|
|
2187
|
+
includeFileSynopsis: profile.expand.includeFileSynopsis ?? false
|
|
2188
|
+
});
|
|
2189
|
+
for (const ex of expanded) {
|
|
2190
|
+
const meta = repo.getChunkMeta(ex.id);
|
|
2191
|
+
if (!meta) continue;
|
|
2192
|
+
const t = repo.getChunkText(ex.id);
|
|
2193
|
+
addBlock(meta.repoRoot, meta.path, meta.startLine, meta.endLine, t, ex.reason);
|
|
2194
|
+
}
|
|
2195
|
+
}
|
|
2196
|
+
return {
|
|
2197
|
+
hits,
|
|
2198
|
+
context: contextBlocks,
|
|
2199
|
+
stats: {
|
|
2200
|
+
profile: profile.name,
|
|
2201
|
+
reposSearched: this.repos.length,
|
|
2202
|
+
candidates: {
|
|
2203
|
+
vector: vecCount,
|
|
2204
|
+
lexical: lexCount,
|
|
2205
|
+
merged: merged.length,
|
|
2206
|
+
returned: hits.length
|
|
2207
|
+
}
|
|
2208
|
+
}
|
|
2209
|
+
};
|
|
2210
|
+
}
|
|
2211
|
+
async search(query, k = 10) {
|
|
2212
|
+
const bundle = await this.retrieve(query, { profile: "search", profileOverrides: { k } });
|
|
2213
|
+
return bundle.hits;
|
|
2214
|
+
}
|
|
2215
|
+
async closeAsync() {
|
|
2216
|
+
for (const r of this.repos) {
|
|
2217
|
+
await r.closeAsync().catch(() => void 0);
|
|
2218
|
+
}
|
|
2219
|
+
this.repos = [];
|
|
2220
|
+
}
|
|
2221
|
+
close() {
|
|
2222
|
+
void this.closeAsync();
|
|
2223
|
+
}
|
|
2224
|
+
};
|
|
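Putting the pieces together, WorkspaceIndexer discovers git repositories under a root, indexes them, and serves hybrid retrieval. A minimal end-to-end sketch, assuming ~/code contains at least one git checkout and using the offline hash embedder defined below:

// End-to-end sketch (assumes ~/code holds one or more git checkouts).
const ws = new WorkspaceIndexer(process.env.HOME + "/code", new HashEmbeddingsProvider(384));
await ws.indexAll();                       // find repos, chunk, embed, store

const bundle = await ws.retrieve("how are vector index flushes debounced?", {
  profile: "refactor",                     // 0.55 vector / 0.35 lexical / 0.1 recency
  filters: { language: "typescript" },
  profileOverrides: { k: 8 }
});
for (const hit of bundle.hits) {
  console.log(hit.score.toFixed(3), hit.chunk.path, hit.chunk.preview);
}
console.log(bundle.stats);                 // profile, reposSearched, candidate counts

const quick = await ws.search("vector flush", 5); // convenience wrapper over retrieve()
console.log(quick.length);
await ws.closeAsync();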
2225
|
+
|
|
2226
|
+
// src/embeddings/ollama.ts
|
|
2227
|
+
import pLimit2 from "p-limit";
|
|
2228
|
+
var OllamaEmbeddingsProvider = class {
|
|
2229
|
+
id;
|
|
2230
|
+
dimension = null;
|
|
2231
|
+
baseUrl;
|
|
2232
|
+
model;
|
|
2233
|
+
concurrency;
|
|
2234
|
+
constructor(opts) {
|
|
2235
|
+
this.model = opts.model;
|
|
2236
|
+
this.baseUrl = opts.baseUrl ?? "http://localhost:11434";
|
|
2237
|
+
this.concurrency = opts.concurrency ?? 4;
|
|
2238
|
+
this.id = `ollama:${this.model}`;
|
|
2239
|
+
}
|
|
2240
|
+
async tryBatchEndpoint(texts) {
|
|
2241
|
+
const res = await fetch(`${this.baseUrl}/api/embed`, {
|
|
2242
|
+
method: "POST",
|
|
2243
|
+
headers: { "Content-Type": "application/json" },
|
|
2244
|
+
body: JSON.stringify({ model: this.model, input: texts })
|
|
2245
|
+
}).catch(() => null);
|
|
2246
|
+
if (!res || !res.ok) return null;
|
|
2247
|
+
const json = await res.json();
|
|
2248
|
+
const embeddings = json.embeddings;
|
|
2249
|
+
if (!embeddings) return null;
|
|
2250
|
+
const out = embeddings.map((v) => Float32Array.from(v));
|
|
2251
|
+
if (out.length > 0) this.dimension = out[0].length;
|
|
2252
|
+
return out;
|
|
2253
|
+
}
|
|
2254
|
+
async embedOne(text) {
|
|
2255
|
+
const res = await fetch(`${this.baseUrl}/api/embeddings`, {
|
|
2256
|
+
method: "POST",
|
|
2257
|
+
headers: { "Content-Type": "application/json" },
|
|
2258
|
+
body: JSON.stringify({ model: this.model, prompt: text })
|
|
2259
|
+
});
|
|
2260
|
+
if (!res.ok) {
|
|
2261
|
+
const errText = await res.text().catch(() => "");
|
|
2262
|
+
throw new Error(`Ollama embeddings failed: ${res.status} ${res.statusText} ${errText}`);
|
|
2263
|
+
}
|
|
2264
|
+
const json = await res.json();
|
|
2265
|
+
const emb = Float32Array.from(json.embedding);
|
|
2266
|
+
this.dimension = emb.length;
|
|
2267
|
+
return emb;
|
|
2268
|
+
}
|
|
2269
|
+
async embed(texts) {
|
|
2270
|
+
const batch = await this.tryBatchEndpoint(texts);
|
|
2271
|
+
if (batch) return batch;
|
|
2272
|
+
const limit = pLimit2(this.concurrency);
|
|
2273
|
+
const out = await Promise.all(texts.map((t) => limit(() => this.embedOne(t))));
|
|
2274
|
+
return out;
|
|
2275
|
+
}
|
|
2276
|
+
};
|
|
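The Ollama provider first tries the batch /api/embed endpoint and silently falls back to per-text /api/embeddings calls, bounded by concurrency, when the batch route is unavailable. A sketch, assuming a local Ollama daemon; the model name is only an example, not something the package mandates:

// Sketch: local Ollama embeddings ("nomic-embed-text" is only an example model).
const ollama = new OllamaEmbeddingsProvider({
  model: "nomic-embed-text",
  baseUrl: "http://localhost:11434",   // default shown above
  concurrency: 4
});
const vectors = await ollama.embed(["first snippet", "second snippet"]);
console.log(vectors.length, ollama.dimension); // 2, and the dimension reported by the model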
2277
|
+
|
|
2278
|
+
// src/embeddings/openai.ts
|
|
2279
|
+
var OpenAIEmbeddingsProvider = class {
|
|
2280
|
+
constructor(opts) {
|
|
2281
|
+
this.opts = opts;
|
|
2282
|
+
this.id = `openai:${opts.model}`;
|
|
2283
|
+
}
|
|
2284
|
+
id;
|
|
2285
|
+
dimension = null;
|
|
2286
|
+
async embed(texts) {
|
|
2287
|
+
const baseUrl = this.opts.baseUrl ?? "https://api.openai.com";
|
|
2288
|
+
const res = await fetch(`${baseUrl}/v1/embeddings`, {
|
|
2289
|
+
method: "POST",
|
|
2290
|
+
headers: {
|
|
2291
|
+
"Authorization": `Bearer ${this.opts.apiKey}`,
|
|
2292
|
+
"Content-Type": "application/json"
|
|
2293
|
+
},
|
|
2294
|
+
body: JSON.stringify({
|
|
2295
|
+
model: this.opts.model,
|
|
2296
|
+
input: texts
|
|
2297
|
+
})
|
|
2298
|
+
});
|
|
2299
|
+
if (!res.ok) {
|
|
2300
|
+
const errText = await res.text().catch(() => "");
|
|
2301
|
+
throw new Error(`OpenAI embeddings failed: ${res.status} ${res.statusText} ${errText}`);
|
|
2302
|
+
}
|
|
2303
|
+
const json = await res.json();
|
|
2304
|
+
const arr = json.data.map((d) => Float32Array.from(d.embedding));
|
|
2305
|
+
if (arr.length > 0) this.dimension = arr[0].length;
|
|
2306
|
+
return arr;
|
|
2307
|
+
}
|
|
2308
|
+
};
|
|
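The OpenAI provider has the same shape, posting to /v1/embeddings with a bearer token. Sketch (the model name is an example; any OpenAI embedding model is used the same way here):

// Sketch: OpenAI embeddings (model name is an example).
const openai = new OpenAIEmbeddingsProvider({
  apiKey: process.env.OPENAI_API_KEY,
  model: "text-embedding-3-small"
});
const [v] = await openai.embed(["hello world"]);
console.log(v.length, openai.dimension); // both equal the model's embedding size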
2309
|
+
|
|
2310
|
+
// src/embeddings/hash.ts
|
|
2311
|
+
import crypto3 from "crypto";
|
|
2312
|
+
var HashEmbeddingsProvider = class {
|
|
2313
|
+
id;
|
|
2314
|
+
dimension;
|
|
2315
|
+
constructor(dimension = 384) {
|
|
2316
|
+
this.dimension = dimension;
|
|
2317
|
+
this.id = `hash:${dimension}`;
|
|
2318
|
+
}
|
|
2319
|
+
async embed(texts) {
|
|
2320
|
+
return texts.map((t) => this.embedOne(t));
|
|
2321
|
+
}
|
|
2322
|
+
embedOne(text) {
|
|
2323
|
+
const v = new Float32Array(this.dimension);
|
|
2324
|
+
const tokens = text.split(/[^A-Za-z0-9_]+/).filter(Boolean).slice(0, 6e3);
|
|
2325
|
+
for (const tok of tokens) {
|
|
2326
|
+
const h = crypto3.createHash("sha256").update(tok).digest();
|
|
2327
|
+
const idx = h.readUInt32LE(0) % this.dimension;
|
|
2328
|
+
const sign = h[4] & 1 ? 1 : -1;
|
|
2329
|
+
v[idx] += sign;
|
|
2330
|
+
}
|
|
2331
|
+
let sumSq = 0;
|
|
2332
|
+
for (let i = 0; i < v.length; i++) sumSq += v[i] * v[i];
|
|
2333
|
+
const norm = Math.sqrt(sumSq) || 1;
|
|
2334
|
+
for (let i = 0; i < v.length; i++) v[i] /= norm;
|
|
2335
|
+
return v;
|
|
2336
|
+
}
|
|
2337
|
+
};
|
|
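HashEmbeddingsProvider is a deterministic, offline fallback: each identifier-like token is hashed into one of `dimension` buckets with a plus/minus sign, and the result is L2-normalised, so identical texts always produce identical unit vectors. Sketch:

// Sketch: deterministic offline embeddings, suitable for tests or air-gapped use.
const hash = new HashEmbeddingsProvider(384);
const [a, b] = await hash.embed(["flush vector index", "flush vector index"]);
let cos = 0;
for (let i = 0; i < a.length; i++) cos += a[i] * b[i]; // vectors are unit length
console.log(cos.toFixed(3)); // "1.000" -- identical input, identical embedding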
2338
|
+
|
|
2339
|
+
// src/config.ts
|
|
2340
|
+
import fs8 from "fs";
|
|
2341
|
+
import path10 from "path";
|
|
2342
|
+
function loadConfigFile(filePath) {
|
|
2343
|
+
const abs = path10.resolve(filePath);
|
|
2344
|
+
const raw = fs8.readFileSync(abs, "utf8");
|
|
2345
|
+
const json = JSON.parse(raw);
|
|
2346
|
+
const cfg = { ...json };
|
|
2347
|
+
if (json.redact?.patterns && Array.isArray(json.redact.patterns)) {
|
|
2348
|
+
const pats = json.redact.patterns;
|
|
2349
|
+
cfg.redact = cfg.redact ?? {};
|
|
2350
|
+
cfg.redact.patterns = pats.map((p) => {
|
|
2351
|
+
if (p.regex instanceof RegExp) {
|
|
2352
|
+
return { name: p.name, regex: p.regex, replaceWith: p.replaceWith };
|
|
2353
|
+
}
|
|
2354
|
+
return {
|
|
2355
|
+
name: p.name,
|
|
2356
|
+
regex: new RegExp(p.regex, p.flags ?? "g"),
|
|
2357
|
+
replaceWith: p.replaceWith
|
|
2358
|
+
};
|
|
2359
|
+
});
|
|
2360
|
+
}
|
|
2361
|
+
return cfg;
|
|
2362
|
+
}
|
|
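loadConfigFile reads a JSON config and revives redact.patterns entries, turning {regex, flags} strings into RegExp objects; everything else passes through to the indexer config. A sketch of a matching config file (the file name and pattern are illustrative):

// petri.config.json (illustrative file name and contents):
// {
//   "includeExtensions": [".ts", ".md"],
//   "storage": { "ftsMode": "tokens" },
//   "redact": {
//     "patterns": [
//       { "name": "bearer_token", "regex": "Bearer [A-Za-z0-9._-]+", "flags": "g", "replaceWith": "[REDACTED_TOKEN]" }
//     ]
//   }
// }
const cfg = loadConfigFile("./petri.config.json");
console.log(cfg.redact.patterns[0].regex instanceof RegExp); // true -- revived from the string form
const ws2 = new WorkspaceIndexer(process.cwd(), new HashEmbeddingsProvider(), cfg);
await ws2.indexAll();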
2363
|
+
|
|
2364
|
+
export {
|
|
2365
|
+
createVectorIndex,
|
|
2366
|
+
RepoIndexer,
|
|
2367
|
+
DEFAULT_PROFILES,
|
|
2368
|
+
deepMergeProfile,
|
|
2369
|
+
WorkspaceIndexer,
|
|
2370
|
+
OllamaEmbeddingsProvider,
|
|
2371
|
+
OpenAIEmbeddingsProvider,
|
|
2372
|
+
HashEmbeddingsProvider,
|
|
2373
|
+
loadConfigFile
|
|
2374
|
+
};
|