ctxloom-pro 1.5.6 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +25 -3
- package/apps/dashboard/dist/server/index.js +544 -26
- package/apps/dashboard/package.json +2 -2
- package/dist/VectorStore-WDL3H7QT.js +9 -0
- package/dist/chunk-6FGTNOCP.js +397 -0
- package/dist/{chunk-JULFFD7O.js → chunk-7S2ELKNU.js} +123 -3
- package/dist/{chunk-FPMNXF4D.js → chunk-FFCLVZCO.js} +685 -43
- package/dist/{chunk-II2DPYRJ.js → chunk-YHLMQVBV.js} +200 -10
- package/dist/embedder-2JWDJUE2.js +26 -0
- package/dist/index.js +11 -11
- package/dist/setup/postinstall.js +1 -1
- package/dist/{src-DL44T55H.js → src-QAYZWPSL.js} +6 -4
- package/dist/workers/indexerWorker.js +2 -2
- package/package.json +1 -1
- package/dist/VectorStore-2LVECRTY.js +0 -8
- package/dist/chunk-WDX4PJGL.js +0 -214
- package/dist/embedder-3AE4CSR7.js +0 -14
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
"dependencies": {
|
|
16
16
|
"@ctxloom/core": "*",
|
|
17
17
|
"cors": "^2.8.5",
|
|
18
|
-
"express": "^4.
|
|
18
|
+
"express": "^4.22.2",
|
|
19
19
|
"open": "^10.1.0",
|
|
20
20
|
"react-syntax-highlighter": "^16.1.1"
|
|
21
21
|
},
|
|
@@ -36,7 +36,7 @@
|
|
|
36
36
|
"clsx": "^2.1.1",
|
|
37
37
|
"d3": "^7.9.0",
|
|
38
38
|
"jsdom": "^24.0.0",
|
|
39
|
-
"postcss": "^8.
|
|
39
|
+
"postcss": "^8.5.15",
|
|
40
40
|
"react": "^18.3.0",
|
|
41
41
|
"react-dom": "^18.3.0",
|
|
42
42
|
"react-router-dom": "^6.23.0",
|
|
@@ -0,0 +1,397 @@
|
|
|
1
|
+
import {
|
|
2
|
+
logger
|
|
3
|
+
} from "./chunk-TYDMSHV7.js";
|
|
4
|
+
|
|
5
|
+
// packages/core/src/indexer/embedder.ts
|
|
6
|
+
import fs from "fs";
|
|
7
|
+
import path from "path";
|
|
8
|
+
var CHUNK_SIZE = 4096;
|
|
9
|
+
// Built-in embedding models, keyed by the alias accepted in
// CTXLOOM_EMBEDDING_MODEL. `minBytes` is the smallest plausible size of the
// downloaded model file; anything smaller is treated as a truncated download.
var MODEL_REGISTRY = {
  // The historical default. General English, 384-dim, ~90 MB.
  // Kept as the free-tier default so existing users see zero change.
  minilm: {
    hfId: "sentence-transformers/all-MiniLM-L6-v2",
    dim: 384,
    minBytes: 80 * 1024 * 1024,
    description: "General-purpose English sentence embedder (2020). 384-dim. The legacy default."
  },
  // Code-specific embedding model (Jina AI). 768-dim, ~140 MB.
  // Empirically 20-40% better recall on code-similarity queries than
  // MiniLM — the upgrade path recommended in the v1.7.0 analysis.
  // Runs through the same @huggingface/transformers pipeline so the
  // privacy story (fully local, no network at inference time) is
  // preserved.
  "jina-code": {
    hfId: "jinaai/jina-embeddings-v2-base-code",
    dim: 768,
    minBytes: 130 * 1024 * 1024,
    description: "Code-specific embedder (Jina, 2024). 768-dim. Better recall on code-similarity tasks."
  }
};

/**
 * Pick the embedding model configuration from environment variables.
 *
 * Resolution order:
 *   1. CTXLOOM_EMBEDDING_MODEL unset or blank -> the `minilm` registry entry.
 *   2. CTXLOOM_EMBEDDING_MODEL is a registry alias -> that registry entry.
 *   3. Otherwise the value is used as a raw model id, which requires
 *      CTXLOOM_EMBEDDING_DIM to parse to a positive integer.
 *
 * @param {Record<string, string | undefined>} [env=process.env] Environment to read.
 * @returns {{ hfId: string, dim: number, minBytes: number, description: string }}
 * @throws {Error} When the model is not a known alias and no valid
 *   CTXLOOM_EMBEDDING_DIM accompanies it.
 */
function resolveEmbeddingModel(env = process.env) {
  const envModel = env.CTXLOOM_EMBEDDING_MODEL?.trim();
  if (!envModel) return MODEL_REGISTRY.minilm;
  // Own-property check: a bare MODEL_REGISTRY[envModel] lookup also matches
  // inherited keys ("constructor", "toString", ...) and would hand back an
  // Object.prototype member as if it were a model configuration.
  if (Object.prototype.hasOwnProperty.call(MODEL_REGISTRY, envModel)) {
    return MODEL_REGISTRY[envModel];
  }
  const envDim = env.CTXLOOM_EMBEDDING_DIM ? Number.parseInt(env.CTXLOOM_EMBEDDING_DIM, 10) : NaN;
  if (!Number.isFinite(envDim) || envDim <= 0) {
    throw new Error(
      `CTXLOOM_EMBEDDING_MODEL=${envModel} is not a known alias. Either use one of [${Object.keys(MODEL_REGISTRY).join(", ")}] or set CTXLOOM_EMBEDDING_DIM=<vector-length> alongside a raw HF id.`
    );
  }
  return {
    hfId: envModel,
    dim: envDim,
    // Without a known artifact size we can't enforce the truncated-download
    // guard. Use 1 MB as the minimum; the worst case is a redundant retry
    // rather than a hung process.
    minBytes: 1024 * 1024,
    description: `User-supplied model: ${envModel} (${envDim}-dim)`
  };
}
|
|
52
|
+
var ACTIVE_MODEL = resolveEmbeddingModel();
|
|
53
|
+
var EMBEDDING_DIMENSION = ACTIVE_MODEL.dim;
|
|
54
|
+
var MODEL_ID = ACTIVE_MODEL.hfId;
|
|
55
|
+
var MIN_MODEL_BYTES = ACTIVE_MODEL.minBytes;
|
|
56
|
+
var embedder = null;
|
|
57
|
+
var embedderInitInFlight = null;
|
|
58
|
+
/**
 * Create the feature-extraction pipeline for MODEL_ID.
 * The transformers package is imported at call time rather than at module
 * load.
 *
 * @returns {Promise<Function>} The pipeline returned by `pipeline()`.
 */
async function loadEmbedder() {
  const { pipeline: createPipeline } = await import("@huggingface/transformers");
  const options = { dtype: "fp32" };
  const featureExtractor = await createPipeline("feature-extraction", MODEL_ID, options);
  return featureExtractor;
}
|
|
64
|
+
/**
 * Pull the model file path out of a "Load model from <path> failed:Protobuf
 * parsing failed" error message (matched case-insensitively).
 *
 * @param {string} message Error text to inspect.
 * @returns {string | null} The captured path, or null when the message does
 *   not have that shape.
 */
function extractModelPathFromProtobufError(message) {
  const PROTOBUF_LOAD_FAILURE = /Load model from (.+) failed:Protobuf parsing failed/i;
  const hit = PROTOBUF_LOAD_FAILURE.exec(message);
  if (!hit) {
    return null;
  }
  return hit[1];
}
|
|
68
|
+
/**
 * Delete a cached model file when it is smaller than MIN_MODEL_BYTES, so the
 * next load attempt has to fetch it again.
 *
 * Best-effort: any filesystem error is logged as a warning and reported as
 * "nothing removed".
 *
 * @param {string} modelPath Path of the suspected truncated model file.
 * @returns {boolean} true only when the file was actually unlinked.
 */
function tryRemoveTruncatedModel(modelPath) {
  try {
    const { size: sizeBytes } = fs.statSync(modelPath);
    if (sizeBytes >= MIN_MODEL_BYTES) {
      // Large enough to be a complete download; leave it alone.
      return false;
    }
    fs.unlinkSync(modelPath);
    logger.warn("Removed truncated embedding model; next attempt will re-download", {
      path: modelPath,
      sizeBytes,
      minBytes: MIN_MODEL_BYTES
    });
    return true;
  } catch (err) {
    const detail = err instanceof Error ? err.message : String(err);
    logger.warn("Could not inspect/remove suspected truncated model", {
      path: modelPath,
      detail
    });
    return false;
  }
}
|
|
87
|
+
/**
 * Return the shared embedding pipeline, loading it on first use.
 *
 * Concurrent callers share one in-flight load. A load is attempted up to
 * three times, but only errors whose message contains "protobuf parsing
 * failed" are retried: if the model file named in the error is removed as
 * truncated the next attempt starts immediately, otherwise it waits
 * `attempt` seconds first. Any other error, or the third failure, is
 * rethrown and the in-flight slot is cleared so a later call can try again.
 *
 * @returns {Promise<Function>} The loaded pipeline.
 */
async function getEmbedder() {
  if (embedder) return embedder;
  if (embedderInitInFlight) return embedderInitInFlight;

  const MAX_ATTEMPTS = 3;
  const initialise = async () => {
    let failure;
    let attempt = 0;
    while (attempt < MAX_ATTEMPTS) {
      attempt += 1;
      try {
        const pipe = await loadEmbedder();
        embedder = pipe;
        return pipe;
      } catch (err) {
        failure = err;
        const text = err instanceof Error ? err.message : String(err);
        const canRetry = /protobuf parsing failed/i.test(text) && attempt !== MAX_ATTEMPTS;
        if (!canRetry) break;

        const cachedPath = extractModelPathFromProtobufError(text);
        if (cachedPath && tryRemoveTruncatedModel(cachedPath)) {
          logger.warn("Retrying embedding model load after truncated-cache removal", {
            attempt
          });
          continue;
        }

        const delayMs = attempt * 1e3;
        logger.warn("Embedding model load failed; retrying after FS settle", {
          attempt,
          delayMs
        });
        await new Promise((wake) => setTimeout(wake, delayMs));
      }
    }
    // Out of attempts (or a non-retryable error): free the slot before failing.
    embedderInitInFlight = null;
    throw failure;
  };

  embedderInitInFlight = initialise();
  try {
    return await embedderInitInFlight;
  } finally {
    if (embedder) embedderInitInFlight = null;
  }
}
|
|
127
|
+
/**
 * Embed a single text with the shared pipeline (mean pooling, normalized).
 * Only the first CHUNK_SIZE characters of `text` are embedded.
 *
 * @param {string} text Text to embed.
 * @returns {Promise<number[]>} The embedding; when the pipeline output is
 *   nested, its first row is returned.
 */
async function generateEmbedding(text) {
  const run = await getEmbedder();
  const clipped = text.slice(0, CHUNK_SIZE);
  const tensor = await run(clipped, { pooling: "mean", normalize: true });
  const values = tensor.tolist();
  return Array.isArray(values[0]) ? values[0] : values;
}
|
|
139
|
+
/**
 * Embed several texts in one pipeline call (mean pooling, normalized).
 * Each text is cut to its first CHUNK_SIZE characters. An empty input
 * returns an empty array without loading the model.
 *
 * @param {string[]} texts Texts to embed.
 * @returns {Promise<number[][]>} One embedding per input, in input order.
 * @throws {Error} When the pipeline output is not nested (one row per input).
 */
async function generateEmbeddingBatch(texts) {
  if (texts.length === 0) return [];
  const run = await getEmbedder();
  const clipped = [];
  for (const text of texts) {
    clipped.push(text.slice(0, CHUNK_SIZE));
  }
  const tensor = await run(clipped, { pooling: "mean", normalize: true });
  const rows = tensor.tolist();
  if (Array.isArray(rows[0])) {
    return rows;
  }
  throw new Error(
    `generateEmbeddingBatch: pipeline returned 1D tensor for ${texts.length} inputs (expected 2D [batch_size, embedding_dim]). Likely a transformers.js version regression.`
  );
}
|
|
155
|
+
// Directory names that are never descended into while collecting files.
var INDEXER_IGNORED_DIRS = /* @__PURE__ */ new Set([
  // Build artifacts + dependency caches
  "node_modules",
  "dist",
  "build",
  "out",
  "target",
  "coverage",
  ".cache",
  ".turbo",
  ".next",
  ".nuxt",
  // Version control + ctxloom state
  ".git",
  ".ctxloom",
  // Other tools' working state (often contains duplicated source)
  ".claude",
  ".code-review-graph",
  ".vscode-test"
]);
var INDEX_IGNORED_DIRS = INDEXER_IGNORED_DIRS;

// File extensions (as returned by path.extname) that are eligible for indexing.
var INDEX_SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([
  ".ts",
  ".tsx",
  ".js",
  ".jsx",
  ".mjs",
  ".vue",
  ".py",
  ".rs",
  ".go",
  ".java",
  ".cs",
  ".rb",
  ".kt",
  ".kts",
  ".swift",
  ".php",
  ".dart",
  ".c",
  ".cpp",
  ".h",
  ".md",
  ".json",
  ".yaml",
  ".yml",
  ".toml",
  ".ipynb"
]);

/**
 * Recursively collect every indexable file under `dir` into an array.
 *
 * Uses the same ignore list and extension list as collectFilesStream, so the
 * two walkers cannot drift apart, and no longer rebuilds both Sets on every
 * recursive call.
 *
 * @param {string} dir Directory to walk.
 * @param {string[]} [results=[]] Accumulator that matches are pushed onto.
 * @returns {string[]} `results`, containing the path of each matching file.
 */
function collectFiles(dir, results = []) {
  const entries = fs.readdirSync(dir, { withFileTypes: true });
  for (const entry of entries) {
    const fullPath = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      if (!INDEX_IGNORED_DIRS.has(entry.name)) {
        collectFiles(fullPath, results);
      }
    } else if (entry.isFile()) {
      if (INDEX_SUPPORTED_EXTENSIONS.has(path.extname(entry.name))) {
        results.push(fullPath);
      }
    }
  }
  return results;
}

/**
 * Streaming variant of collectFiles: yields each indexable file path as it
 * is found instead of building the full list first.
 *
 * @param {string} dir Directory to walk.
 * @yields {string} Path of each matching file.
 */
async function* collectFilesStream(dir) {
  const entries = fs.readdirSync(dir, { withFileTypes: true });
  for (const entry of entries) {
    const fullPath = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      if (!INDEX_IGNORED_DIRS.has(entry.name)) {
        yield* collectFilesStream(fullPath);
      }
    } else if (entry.isFile()) {
      if (INDEX_SUPPORTED_EXTENSIONS.has(path.extname(entry.name))) {
        yield fullPath;
      }
    }
  }
}
|
|
284
|
+
/**
 * Walk `rootDir`, embed every supported file and upsert the vectors into the
 * store at `<rootDir>/.ctxloom/vectors.lancedb`.
 *
 * Files are streamed from collectFilesStream() and handled CONCURRENCY at a
 * time; embedded records are buffered and written with store.upsertBatch()
 * in groups of BATCH_SIZE. Once init() has succeeded the store is always
 * closed, even when indexing throws.
 *
 * @param {string} rootDir Directory to index; stored file paths are relative to it.
 * @param {(filePath: string, processed: number, total: number) => void} [onProgress]
 *   Called once per successfully embedded file. `total` is always 0 because
 *   files are streamed and never counted up front.
 * @returns {Promise<{ indexed: number, errors: number }>} Count of embedded
 *   files and count of files that failed to read or embed.
 */
async function indexDirectory(rootDir, onProgress) {
  const { VectorStore } = await import("./VectorStore-WDL3H7QT.js");
  const store = new VectorStore(path.join(rootDir, ".ctxloom", "vectors.lancedb"));
  await store.init();

  const MAX_INDEX_SIZE = 5 * 1024 * 1024;
  const CONCURRENCY = 4;
  const BATCH_SIZE = 50;
  const total = 0;
  let indexed = 0;
  let errors = 0;
  let processed = 0;
  let batch = [];

  // Write out whatever is buffered and start a fresh buffer.
  const flushBatch = async () => {
    if (batch.length === 0) return;
    await store.upsertBatch(batch);
    batch = [];
  };

  // Read one file; resolves to null for oversized or blank files.
  // Must be async: the fs calls are synchronous, and in a plain callback a
  // throw (file deleted mid-walk, EACCES, ...) would escape paths.map()
  // before Promise.allSettled ever ran, aborting the whole index instead of
  // being counted as a single per-file error.
  const readForIndexing = async (filePath) => {
    const stat = fs.statSync(filePath);
    if (stat.size > MAX_INDEX_SIZE) {
      logger.warn("Skipping oversized file", { file: filePath, size: stat.size });
      return null;
    }
    const content = fs.readFileSync(filePath, "utf-8");
    if (!content.trim()) return null;
    return { filePath: path.relative(rootDir, filePath), content };
  };

  // Read, embed and buffer one group of files.
  const processChunk = async (paths) => {
    const readResults = await Promise.allSettled(paths.map((filePath) => readForIndexing(filePath)));
    const ready = [];
    readResults.forEach((result, i) => {
      processed++;
      if (result.status === "fulfilled") {
        if (result.value !== null) ready.push(result.value);
        return;
      }
      errors++;
      logger.error("Failed to read file for indexing", {
        file: paths[i],
        detail: result.reason instanceof Error ? result.reason.message : String(result.reason)
      });
    });
    if (ready.length === 0) return;

    let embeddings;
    try {
      embeddings = await generateEmbeddingBatch(ready.map((r) => r.content));
    } catch (err) {
      errors += ready.length;
      logger.error("Batch embedding failed; chunk lost", {
        chunkSize: ready.length,
        detail: err instanceof Error ? err.message : String(err)
      });
      return;
    }
    if (embeddings.length !== ready.length) {
      errors += ready.length;
      logger.error("Embedding batch length mismatch \u2014 chunk lost", {
        expected: ready.length,
        got: embeddings.length
      });
      return;
    }

    for (let i = 0; i < ready.length; i++) {
      batch.push({
        filePath: ready[i].filePath,
        embedding: embeddings[i],
        content: ready[i].content
      });
      indexed++;
      onProgress?.(ready[i].filePath, processed, total);
      if (batch.length >= BATCH_SIZE) {
        await flushBatch();
      }
    }
  };

  try {
    let chunk = [];
    for await (const filePath of collectFilesStream(rootDir)) {
      chunk.push(filePath);
      if (chunk.length < CONCURRENCY) continue;
      await processChunk(chunk);
      chunk = [];
    }
    if (chunk.length > 0) {
      await processChunk(chunk);
    }
    await flushBatch();
  } finally {
    await store.close();
  }
  return { indexed, errors };
}
|
|
376
|
+
var EMBEDDING_MODEL_ID = MODEL_ID;
|
|
377
|
+
/**
 * Describe the embedding model this process resolved at load time.
 *
 * @returns {{ hfId: string, dim: number, description: string }} A fresh
 *   object copied from ACTIVE_MODEL (`minBytes` is not included).
 */
function getActiveEmbeddingModel() {
  const { hfId, dim, description } = ACTIVE_MODEL;
  return { hfId, dim, description };
}
|
|
384
|
+
|
|
385
|
+
export {
|
|
386
|
+
resolveEmbeddingModel,
|
|
387
|
+
EMBEDDING_DIMENSION,
|
|
388
|
+
generateEmbedding,
|
|
389
|
+
generateEmbeddingBatch,
|
|
390
|
+
collectFiles,
|
|
391
|
+
collectFilesStream,
|
|
392
|
+
INDEXER_IGNORED_DIRS,
|
|
393
|
+
indexDirectory,
|
|
394
|
+
EMBEDDING_MODEL_ID,
|
|
395
|
+
getActiveEmbeddingModel
|
|
396
|
+
};
|
|
397
|
+
//# sourceMappingURL=chunk-6FGTNOCP.js.map
|
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
import {
|
|
2
|
+
EMBEDDING_DIMENSION,
|
|
3
|
+
EMBEDDING_MODEL_ID
|
|
4
|
+
} from "./chunk-6FGTNOCP.js";
|
|
1
5
|
import {
|
|
2
6
|
logger
|
|
3
7
|
} from "./chunk-TYDMSHV7.js";
|
|
@@ -34,8 +38,12 @@ var VectorStore = class {
|
|
|
34
38
|
fs.mkdirSync(dir, { recursive: true });
|
|
35
39
|
}
|
|
36
40
|
const lancedb = await import("@lancedb/lancedb");
|
|
41
|
+
const lanceModule = lancedb.default ?? lancedb;
|
|
42
|
+
const lance = lanceModule;
|
|
37
43
|
const { makeArrowTable } = lancedb;
|
|
38
|
-
this.db = await
|
|
44
|
+
this.db = await lance.connect(this.dbPath);
|
|
45
|
+
const markerPath = path.join(path.dirname(this.dbPath), "embedding-model.json");
|
|
46
|
+
this.assertModelCompatibility(markerPath);
|
|
39
47
|
const existingTables = await this.db.tableNames();
|
|
40
48
|
if (existingTables.includes("code_embeddings")) {
|
|
41
49
|
this.table = await this.db.openTable("code_embeddings");
|
|
@@ -44,7 +52,7 @@ var VectorStore = class {
|
|
|
44
52
|
{
|
|
45
53
|
id: "__seed__",
|
|
46
54
|
filePath: "__seed__",
|
|
47
|
-
vector: new Array(
|
|
55
|
+
vector: new Array(EMBEDDING_DIMENSION).fill(0),
|
|
48
56
|
content: ""
|
|
49
57
|
}
|
|
50
58
|
]);
|
|
@@ -53,6 +61,47 @@ var VectorStore = class {
|
|
|
53
61
|
}
|
|
54
62
|
this.initialized = true;
|
|
55
63
|
}
|
|
64
|
+
/**
|
|
65
|
+
* Compare the active embedding model against the marker file written
|
|
66
|
+
* when the index was first built. Three cases:
|
|
67
|
+
*
|
|
68
|
+
* 1. Marker missing → legacy index (pre-v1.7.0). Assume MiniLM
|
|
69
|
+
* and write the marker so future runs are guarded.
|
|
70
|
+
* 2. Marker matches → proceed silently.
|
|
71
|
+
* 3. Marker differs → throw with a clear migration instruction.
|
|
72
|
+
* We don't auto-wipe — silently dropping a user's index because
|
|
73
|
+
* they set an env var is exactly the footgun the marker exists
|
|
74
|
+
* to prevent.
|
|
75
|
+
*/
|
|
76
|
+
assertModelCompatibility(markerPath) {
|
|
77
|
+
const active = { model: EMBEDDING_MODEL_ID, dim: EMBEDDING_DIMENSION };
|
|
78
|
+
let existing = null;
|
|
79
|
+
if (fs.existsSync(markerPath)) {
|
|
80
|
+
try {
|
|
81
|
+
existing = JSON.parse(fs.readFileSync(markerPath, "utf-8"));
|
|
82
|
+
} catch (err) {
|
|
83
|
+
logger.warn("Embedding-model marker is corrupt; treating as missing", {
|
|
84
|
+
path: markerPath,
|
|
85
|
+
detail: err instanceof Error ? err.message : String(err)
|
|
86
|
+
});
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
if (!existing) {
|
|
90
|
+
fs.writeFileSync(markerPath, JSON.stringify(active, null, 2));
|
|
91
|
+
return;
|
|
92
|
+
}
|
|
93
|
+
if (existing.model === active.model && existing.dim === active.dim) {
|
|
94
|
+
return;
|
|
95
|
+
}
|
|
96
|
+
throw new Error(
|
|
97
|
+
`Embedding-model mismatch: vector index at ${this.dbPath} was built with "${existing.model}" (${existing.dim}-dim) but the active model is "${active.model}" (${active.dim}-dim). Re-index required:
|
|
98
|
+
|
|
99
|
+
ctxloom vectors-cleanup --reset
|
|
100
|
+
ctxloom index
|
|
101
|
+
|
|
102
|
+
Or revert CTXLOOM_EMBEDDING_MODEL to "${existing.model}" to keep the existing index.`
|
|
103
|
+
);
|
|
104
|
+
}
|
|
56
105
|
/**
|
|
57
106
|
* Release LanceDB resources (file descriptors held by the underlying
|
|
58
107
|
* connection / table handles). Must be called at the end of long-lived
|
|
@@ -101,6 +150,48 @@ var VectorStore = class {
|
|
|
101
150
|
await this.compact();
|
|
102
151
|
}
|
|
103
152
|
}
|
|
153
|
+
/**
|
|
154
|
+
* Batch upsert — one LanceDB transaction for N file records.
|
|
155
|
+
*
|
|
156
|
+
* Why this exists (v1.7.0 monorepo support): the per-file `upsert()`
|
|
157
|
+
* does 2 LanceDB transactions (delete + add). On a 50k-file repo
|
|
158
|
+
* that's 100k transactions, each writing a manifest + transaction
|
|
159
|
+
* file. Compaction reclaims them every 200 upserts, but during the
|
|
160
|
+
* indexing burst the FD churn is enormous (observed: ~10k FDs/sec
|
|
161
|
+
* peak on Next.js, hitting the 256-default macOS process limit).
|
|
162
|
+
*
|
|
163
|
+
* With BATCH_SIZE=50 callers (the streaming indexer), this drops to
|
|
164
|
+
* ~2k transactions across the same corpus — 50× fewer manifests
|
|
165
|
+
* written, ~5× faster wall time, and the FD ceiling stops mattering.
|
|
166
|
+
*
|
|
167
|
+
* Compaction semantics unchanged: `upsertsSinceCompact` increments by
|
|
168
|
+
* one per RECORD (not per batch), so the compact trigger fires at
|
|
169
|
+
* the same total-record cadence as the per-file path.
|
|
170
|
+
*/
|
|
171
|
+
async upsertBatch(records) {
|
|
172
|
+
if (!this.table) throw new Error("VectorStore not initialized. Call init() first.");
|
|
173
|
+
if (records.length === 0) return;
|
|
174
|
+
const filter = records.map((r) => `filePath = '${sanitizeFilterPath(r.filePath)}'`).join(" OR ");
|
|
175
|
+
try {
|
|
176
|
+
await this.table.delete(filter);
|
|
177
|
+
} catch (err) {
|
|
178
|
+
logger.warn("Batch delete before upsert failed, continuing", {
|
|
179
|
+
detail: err instanceof Error ? err.message : String(err)
|
|
180
|
+
});
|
|
181
|
+
}
|
|
182
|
+
const rows = records.map((r) => ({
|
|
183
|
+
id: r.filePath,
|
|
184
|
+
filePath: r.filePath,
|
|
185
|
+
vector: r.embedding,
|
|
186
|
+
content: r.content.slice(0, 512)
|
|
187
|
+
}));
|
|
188
|
+
await this.table.add(rows);
|
|
189
|
+
this.upsertsSinceCompact += records.length;
|
|
190
|
+
if (this.upsertsSinceCompact >= this.compactEvery) {
|
|
191
|
+
this.upsertsSinceCompact = 0;
|
|
192
|
+
await this.compact();
|
|
193
|
+
}
|
|
194
|
+
}
|
|
104
195
|
/**
|
|
105
196
|
* Merge fragments and prune old LanceDB versions. Idempotent and safe to
|
|
106
197
|
* call mid-flight; the Table API serializes writes internally. Called
|
|
@@ -131,6 +222,35 @@ var VectorStore = class {
|
|
|
131
222
|
});
|
|
132
223
|
}
|
|
133
224
|
}
|
|
225
|
+
/**
|
|
226
|
+
* Retrieve the stored embedding for a known file. Returns null if the
|
|
227
|
+
* file isn't indexed. Used by blast-radius semantic-similarity search
|
|
228
|
+
* (v1.6.x) to find files semantically related to a seed without
|
|
229
|
+
* re-embedding the seed at query time.
|
|
230
|
+
*/
|
|
231
|
+
async findEmbeddingByPath(filePath) {
|
|
232
|
+
if (!this.table) throw new Error("VectorStore not initialized. Call init() first.");
|
|
233
|
+
try {
|
|
234
|
+
const safe = sanitizeFilterPath(filePath);
|
|
235
|
+
const rows = await this.table.query().where(`filePath = '${safe}'`).limit(1).toArray();
|
|
236
|
+
const first = rows[0];
|
|
237
|
+
if (!first) return null;
|
|
238
|
+
const vec = first.vector;
|
|
239
|
+
if (vec === null || vec === void 0) return null;
|
|
240
|
+
if (Array.isArray(vec)) return vec;
|
|
241
|
+
if (vec instanceof Float32Array) return Array.from(vec);
|
|
242
|
+
if (typeof vec.length === "number") {
|
|
243
|
+
return Array.from(vec);
|
|
244
|
+
}
|
|
245
|
+
return null;
|
|
246
|
+
} catch (err) {
|
|
247
|
+
logger.warn("VectorStore.findEmbeddingByPath failed", {
|
|
248
|
+
detail: err instanceof Error ? err.message : String(err),
|
|
249
|
+
filePath
|
|
250
|
+
});
|
|
251
|
+
return null;
|
|
252
|
+
}
|
|
253
|
+
}
|
|
134
254
|
/**
|
|
135
255
|
* Search for the top-K most similar code records using vector search.
|
|
136
256
|
*/
|
|
@@ -187,4 +307,4 @@ var VectorStore = class {
|
|
|
187
307
|
export {
|
|
188
308
|
VectorStore
|
|
189
309
|
};
|
|
190
|
-
//# sourceMappingURL=chunk-
|
|
310
|
+
//# sourceMappingURL=chunk-7S2ELKNU.js.map
|