ctxloom-pro 1.5.6 → 1.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +25 -3
- package/apps/dashboard/dist/server/index.js +544 -26
- package/apps/dashboard/package.json +2 -2
- package/dist/VectorStore-WDL3H7QT.js +9 -0
- package/dist/chunk-6FGTNOCP.js +397 -0
- package/dist/{chunk-FPMNXF4D.js → chunk-6W4DFPP2.js} +685 -43
- package/dist/{chunk-JULFFD7O.js → chunk-7S2ELKNU.js} +123 -3
- package/dist/{chunk-II2DPYRJ.js → chunk-YHLMQVBV.js} +200 -10
- package/dist/embedder-2JWDJUE2.js +26 -0
- package/dist/index.js +11 -11
- package/dist/setup/postinstall.js +1 -1
- package/dist/{src-DL44T55H.js → src-FQQOURSD.js} +6 -4
- package/dist/workers/indexerWorker.js +2 -2
- package/package.json +1 -1
- package/dist/VectorStore-2LVECRTY.js +0 -8
- package/dist/chunk-WDX4PJGL.js +0 -214
- package/dist/embedder-3AE4CSR7.js +0 -14
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
"dependencies": {
|
|
16
16
|
"@ctxloom/core": "*",
|
|
17
17
|
"cors": "^2.8.5",
|
|
18
|
-
"express": "^4.
|
|
18
|
+
"express": "^4.22.2",
|
|
19
19
|
"open": "^10.1.0",
|
|
20
20
|
"react-syntax-highlighter": "^16.1.1"
|
|
21
21
|
},
|
|
@@ -36,7 +36,7 @@
|
|
|
36
36
|
"clsx": "^2.1.1",
|
|
37
37
|
"d3": "^7.9.0",
|
|
38
38
|
"jsdom": "^24.0.0",
|
|
39
|
-
"postcss": "^8.
|
|
39
|
+
"postcss": "^8.5.15",
|
|
40
40
|
"react": "^18.3.0",
|
|
41
41
|
"react-dom": "^18.3.0",
|
|
42
42
|
"react-router-dom": "^6.23.0",
|
|
@@ -0,0 +1,397 @@
|
|
|
1
|
+
import {
|
|
2
|
+
logger
|
|
3
|
+
} from "./chunk-TYDMSHV7.js";
|
|
4
|
+
|
|
5
|
+
// packages/core/src/indexer/embedder.ts
|
|
6
|
+
import fs from "fs";
|
|
7
|
+
import path from "path";
|
|
8
|
+
var CHUNK_SIZE = 4096;
|
|
9
|
+
/**
 * Built-in embedding models, keyed by the alias accepted in the
 * CTXLOOM_EMBEDDING_MODEL environment variable.
 *
 * Each entry carries the Hugging Face model id (`hfId`), the embedding vector
 * length (`dim`), the minimum plausible on-disk size of the model artifact
 * (`minBytes`, used by the truncated-download guard) and a human-readable
 * `description`.
 */
var MODEL_REGISTRY = {
  // The historical default. General English, 384-dim, ~90 MB.
  // Kept as the free-tier default so existing users see zero change.
  minilm: {
    hfId: "sentence-transformers/all-MiniLM-L6-v2",
    dim: 384,
    minBytes: 80 * 1024 * 1024,
    description: "General-purpose English sentence embedder (2020). 384-dim. The legacy default."
  },
  // Code-specific embedding model (Jina AI). 768-dim, ~140 MB.
  // Empirically 20-40% better recall on code-similarity queries than
  // MiniLM — the upgrade path recommended in the v1.7.0 analysis.
  // Runs through the same @huggingface/transformers pipeline so the
  // privacy story (fully local, no network at inference time) is
  // preserved.
  "jina-code": {
    hfId: "jinaai/jina-embeddings-v2-base-code",
    dim: 768,
    minBytes: 130 * 1024 * 1024,
    description: "Code-specific embedder (Jina, 2024). 768-dim. Better recall on code-similarity tasks."
  }
};

/**
 * Pick the embedding model described by the environment.
 *
 * Resolution order:
 *   1. CTXLOOM_EMBEDDING_MODEL unset or blank -> the `minilm` registry entry.
 *   2. CTXLOOM_EMBEDDING_MODEL is a registry alias -> that registry entry.
 *   3. Otherwise the value is treated as a raw Hugging Face id, which requires
 *      CTXLOOM_EMBEDDING_DIM to parse as a positive integer.
 *
 * @param {Record<string, string | undefined>} [env=process.env] Environment to read.
 * @returns {{hfId: string, dim: number, minBytes: number, description: string}}
 * @throws {Error} When the model is not a known alias and no usable
 *   CTXLOOM_EMBEDDING_DIM accompanies it.
 */
function resolveEmbeddingModel(env = process.env) {
  const envModel = env.CTXLOOM_EMBEDDING_MODEL?.trim();
  if (!envModel) return MODEL_REGISTRY.minilm;
  // Own-property check only: a bare `MODEL_REGISTRY[envModel]` lookup would
  // also match inherited members such as "toString", "constructor" or
  // "__proto__" and hand back something that is not a model descriptor.
  if (Object.prototype.hasOwnProperty.call(MODEL_REGISTRY, envModel)) {
    return MODEL_REGISTRY[envModel];
  }
  const envDim = env.CTXLOOM_EMBEDDING_DIM ? Number.parseInt(env.CTXLOOM_EMBEDDING_DIM, 10) : NaN;
  if (!Number.isFinite(envDim) || envDim <= 0) {
    throw new Error(
      `CTXLOOM_EMBEDDING_MODEL=${envModel} is not a known alias. Either use one of [${Object.keys(MODEL_REGISTRY).join(", ")}] or set CTXLOOM_EMBEDDING_DIM=<vector-length> alongside a raw HF id.`
    );
  }
  return {
    hfId: envModel,
    dim: envDim,
    // Without a known artifact size we can't enforce the truncated-download
    // guard. Use 1 MB as the minimum; the worst case is a redundant retry
    // rather than a hung process.
    minBytes: 1024 * 1024,
    description: `User-supplied model: ${envModel} (${envDim}-dim)`
  };
}
|
|
52
|
+
// The model is resolved once, when this module is evaluated, from process.env.
var ACTIVE_MODEL = resolveEmbeddingModel();
// Unpack the descriptor fields the rest of the module reads.
var {
  dim: EMBEDDING_DIMENSION,
  hfId: MODEL_ID,
  minBytes: MIN_MODEL_BYTES
} = ACTIVE_MODEL;
// Loaded pipeline instance, or null until the first successful load.
var embedder = null;
// Promise of a load currently in progress, shared by concurrent callers.
var embedderInitInFlight = null;
|
|
58
|
+
/**
 * Import @huggingface/transformers on demand and build a "feature-extraction"
 * pipeline for MODEL_ID using fp32 weights.
 *
 * @returns {Promise<Function>} The pipeline returned by transformers' `pipeline()`.
 */
async function loadEmbedder() {
  const transformers = await import("@huggingface/transformers");
  const createPipeline = transformers.pipeline;
  const pipe = await createPipeline("feature-extraction", MODEL_ID, { dtype: "fp32" });
  return pipe;
}
|
|
64
|
+
/**
 * Pull the model file path out of a "Load model from <path> failed:Protobuf
 * parsing failed" error message (matched case-insensitively).
 *
 * @param {string} message Error message text to inspect.
 * @returns {string | null} The captured path, or null when the message does
 *   not have that shape.
 */
function extractModelPathFromProtobufError(message) {
  const pattern = /Load model from (.+) failed:Protobuf parsing failed/i;
  const found = pattern.exec(message);
  if (found === null) return null;
  return found[1];
}
|
|
68
|
+
/**
 * Delete the model file at `modelPath` when it is smaller than
 * MIN_MODEL_BYTES, so that a later load attempt can fetch it again.
 *
 * Never throws: any failure while inspecting or deleting the file is logged
 * as a warning and reported as "nothing removed".
 *
 * @param {string} modelPath Path of the suspect model file.
 * @returns {boolean} true when the file was deleted, false otherwise.
 */
function tryRemoveTruncatedModel(modelPath) {
  try {
    const { size } = fs.statSync(modelPath);
    if (size < MIN_MODEL_BYTES) {
      fs.unlinkSync(modelPath);
      logger.warn("Removed truncated embedding model; next attempt will re-download", {
        path: modelPath,
        sizeBytes: size,
        minBytes: MIN_MODEL_BYTES
      });
      return true;
    }
    // File is at least the minimum size: leave it alone.
    return false;
  } catch (err) {
    const detail = err instanceof Error ? err.message : String(err);
    logger.warn("Could not inspect/remove suspected truncated model", {
      path: modelPath,
      detail
    });
    return false;
  }
}
|
|
87
|
+
/**
 * Return the shared embedding pipeline, loading it on first use.
 *
 * Concurrent callers share one in-flight load. A load is attempted up to
 * three times, but only errors whose message contains "protobuf parsing
 * failed" are retried:
 *   - if the error names a model file and that file was removed as truncated,
 *     the next attempt starts immediately;
 *   - otherwise the next attempt waits `attempt` seconds first.
 * When loading ultimately fails the in-flight slot is cleared, so a later
 * call starts a fresh load, and the last error is rethrown.
 *
 * @returns {Promise<Function>} The loaded pipeline.
 */
async function getEmbedder() {
  if (embedder) return embedder;
  if (embedderInitInFlight) return embedderInitInFlight;

  const MAX_ATTEMPTS = 3;
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  const loadWithRetries = async () => {
    let failure;
    // Kept as a `for` loop so that `continue` still advances `attempt`.
    for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
      try {
        const loaded = await loadEmbedder();
        embedder = loaded;
        return loaded;
      } catch (err) {
        failure = err;
        const text = err instanceof Error ? err.message : String(err);
        const retryable = /protobuf parsing failed/i.test(text) && attempt !== MAX_ATTEMPTS;
        if (!retryable) break;

        const cachedPath = extractModelPathFromProtobufError(text);
        if (cachedPath && tryRemoveTruncatedModel(cachedPath)) {
          logger.warn("Retrying embedding model load after truncated-cache removal", {
            attempt
          });
          continue;
        }

        const delayMs = attempt * 1e3;
        logger.warn("Embedding model load failed; retrying after FS settle", {
          attempt,
          delayMs
        });
        await pause(delayMs);
      }
    }
    // Out of attempts (or a non-retryable error): allow a future call to retry.
    embedderInitInFlight = null;
    throw failure;
  };

  embedderInitInFlight = loadWithRetries();
  try {
    return await embedderInitInFlight;
  } finally {
    if (embedder) embedderInitInFlight = null;
  }
}
|
|
127
|
+
/**
 * Embed a single string.
 *
 * Only the first CHUNK_SIZE characters of `text` are passed to the pipeline,
 * which is invoked with mean pooling and normalization enabled.
 *
 * @param {string} text Text to embed.
 * @returns {Promise<number[]>} The first row when the pipeline output converts
 *   to a nested list, otherwise the converted list itself.
 */
async function generateEmbedding(text) {
  const pipe = await getEmbedder();
  const clipped = text.slice(0, CHUNK_SIZE);
  const tensor = await pipe(clipped, { pooling: "mean", normalize: true });
  const values = tensor.tolist();
  const [firstRow] = values;
  return Array.isArray(firstRow) ? firstRow : values;
}
|
|
139
|
+
/**
 * Embed several strings with one pipeline call.
 *
 * Each input is cut to its first CHUNK_SIZE characters; the pipeline is
 * invoked with mean pooling and normalization enabled.
 *
 * @param {string[]} texts Texts to embed. An empty array returns [] without
 *   loading the model.
 * @returns {Promise<number[][]>} The pipeline output converted to nested lists.
 * @throws {Error} When the converted output is not a list of lists.
 */
async function generateEmbeddingBatch(texts) {
  if (texts.length === 0) return [];
  const pipe = await getEmbedder();
  const inputs = texts.map((item) => item.slice(0, CHUNK_SIZE));
  const tensor = await pipe(inputs, { pooling: "mean", normalize: true });
  const rows = tensor.tolist();
  if (Array.isArray(rows[0])) return rows;
  throw new Error(
    `generateEmbeddingBatch: pipeline returned 1D tensor for ${texts.length} inputs (expected 2D [batch_size, embedding_dim]). Likely a transformers.js version regression.`
  );
}
|
|
155
|
+
/**
 * Recursively collect indexable files under `dir`.
 *
 * Directories whose name is in the ignore list are not entered. Regular files
 * are kept when their extension is in the supported list. Entries that are
 * neither a directory nor a regular file are skipped.
 *
 * The two lookup sets are built once per top-level call; an inner walker does
 * the recursion, so the sets are no longer rebuilt for every directory visited.
 *
 * @param {string} dir Directory to scan.
 * @param {string[]} [results=[]] Array that matching paths are appended to.
 * @returns {string[]} `results`, with one joined path per matching file.
 * @throws Propagates any error thrown by `fs.readdirSync`.
 */
function collectFiles(dir, results = []) {
  const IGNORED_DIRS = /* @__PURE__ */ new Set([
    // Build artifacts + dependency caches
    "node_modules",
    "dist",
    "build",
    "out",
    "target",
    "coverage",
    ".cache",
    ".turbo",
    ".next",
    ".nuxt",
    // Version control + ctxloom state
    ".git",
    ".ctxloom",
    // Other tools' working state (often contains duplicated source)
    ".claude",
    ".code-review-graph",
    ".vscode-test"
  ]);
  const SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([
    ".ts",
    ".tsx",
    ".js",
    ".jsx",
    ".mjs",
    ".vue",
    ".py",
    ".rs",
    ".go",
    ".java",
    ".cs",
    ".rb",
    ".kt",
    ".kts",
    ".swift",
    ".php",
    ".dart",
    ".c",
    ".cpp",
    ".h",
    ".md",
    ".json",
    ".yaml",
    ".yml",
    ".toml",
    ".ipynb"
  ]);

  // Depth-first walk in readdir order; appends to the caller-visible array.
  const walk = (currentDir) => {
    const entries = fs.readdirSync(currentDir, { withFileTypes: true });
    for (const entry of entries) {
      const fullPath = path.join(currentDir, entry.name);
      if (entry.isDirectory()) {
        if (!IGNORED_DIRS.has(entry.name)) {
          walk(fullPath);
        }
      } else if (entry.isFile()) {
        if (SUPPORTED_EXTENSIONS.has(path.extname(entry.name))) {
          results.push(fullPath);
        }
      }
    }
  };

  walk(dir);
  return results;
}
|
|
220
|
+
/**
 * Lazily walk `dir` and yield the path of every indexable file.
 *
 * Directories named in INDEX_IGNORED_DIRS are not entered; regular files are
 * yielded when their extension is in INDEX_SUPPORTED_EXTENSIONS. Directory
 * listings are read with the synchronous `fs.readdirSync`.
 *
 * @param {string} dir Directory to scan.
 * @yields {string} Joined path of each matching file.
 */
async function* collectFilesStream(dir) {
  for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
    const fullPath = path.join(dir, entry.name);

    if (entry.isDirectory()) {
      if (INDEX_IGNORED_DIRS.has(entry.name)) continue;
      yield* collectFilesStream(fullPath);
      continue;
    }

    if (!entry.isFile()) continue;

    const extension = path.extname(entry.name);
    if (INDEX_SUPPORTED_EXTENSIONS.has(extension)) {
      yield fullPath;
    }
  }
}
|
|
235
|
+
// Directory names that the streaming file walker never descends into.
var INDEXER_IGNORED_DIRS = /* @__PURE__ */ new Set([
  // Build artifacts + dependency caches
  "node_modules", "dist", "build", "out", "target",
  "coverage", ".cache", ".turbo", ".next", ".nuxt",
  // Version control + ctxloom state
  ".git", ".ctxloom",
  // Other tools' working state (often contains duplicated source)
  ".claude", ".code-review-graph", ".vscode-test"
]);
// Internal alias: the same Set instance under the name the walker uses.
var INDEX_IGNORED_DIRS = INDEXER_IGNORED_DIRS;
// File extensions (leading dot included) that are eligible for indexing.
var INDEX_SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([
  ".ts", ".tsx", ".js", ".jsx", ".mjs", ".vue",
  ".py", ".rs", ".go", ".java", ".cs", ".rb",
  ".kt", ".kts", ".swift", ".php", ".dart",
  ".c", ".cpp", ".h",
  ".md", ".json", ".yaml", ".yml", ".toml", ".ipynb"
]);
|
|
284
|
+
/**
 * Embed every indexable file under `rootDir` and upsert the results into the
 * vector store at `<rootDir>/.ctxloom/vectors.lancedb`.
 *
 * Files are streamed from `collectFilesStream`, read and embedded in groups
 * of CONCURRENCY, and written to the store in batches of BATCH_SIZE. Files
 * larger than 5 MB and files that are empty after trimming are skipped
 * without being counted as errors. The store is always closed, even when
 * indexing throws.
 *
 * @param {string} rootDir Root of the tree to index; stored file paths are
 *   relative to it.
 * @param {(filePath: string, processed: number, total: number) => void} [onProgress]
 *   Called once per file queued for upsert. `total` is always 0 because the
 *   file list is streamed and never counted up front.
 * @returns {Promise<{indexed: number, errors: number}>} Number of files queued
 *   for upsert and number of files that failed to read or embed.
 */
async function indexDirectory(rootDir, onProgress) {
  const { VectorStore } = await import("./VectorStore-WDL3H7QT.js");
  const store = new VectorStore(path.join(rootDir, ".ctxloom", "vectors.lancedb"));
  await store.init();
  let indexed = 0;
  let errors = 0;
  let processed = 0;
  const total = 0;
  const CONCURRENCY = 4;
  const BATCH_SIZE = 50;
  const MAX_INDEX_SIZE = 5 * 1024 * 1024;
  // Rows waiting to be written; flushed whenever it reaches BATCH_SIZE.
  let batch = [];

  // Read one file for indexing. Declared `async` on purpose: a throw from
  // statSync/readFileSync must become a rejected promise that allSettled can
  // record. With a synchronous callback the exception escaped `.map()` and a
  // single unreadable file aborted the whole indexing run.
  const readForIndexing = async (filePath) => {
    const stat = fs.statSync(filePath);
    if (stat.size > MAX_INDEX_SIZE) {
      logger.warn("Skipping oversized file", { file: filePath, size: stat.size });
      return null;
    }
    const content = fs.readFileSync(filePath, "utf-8");
    if (!content.trim()) return null;
    const relPath = path.relative(rootDir, filePath);
    return { filePath: relPath, content };
  };

  // Read, embed and queue one group of files; failures are counted, not thrown.
  const processChunk = async (paths) => {
    const readResults = await Promise.allSettled(paths.map(readForIndexing));
    const ready = [];
    for (const r of readResults) {
      processed++;
      if (r.status === "fulfilled") {
        if (r.value !== null) ready.push(r.value);
      } else {
        errors++;
        logger.error("Failed to read file for indexing", {
          detail: r.reason instanceof Error ? r.reason.message : String(r.reason)
        });
      }
    }
    if (ready.length === 0) return;
    let embeddings;
    try {
      embeddings = await generateEmbeddingBatch(ready.map((r) => r.content));
    } catch (err) {
      errors += ready.length;
      logger.error("Batch embedding failed; chunk lost", {
        chunkSize: ready.length,
        detail: err instanceof Error ? err.message : String(err)
      });
      return;
    }
    if (embeddings.length !== ready.length) {
      errors += ready.length;
      logger.error("Embedding batch length mismatch \u2014 chunk lost", {
        expected: ready.length,
        got: embeddings.length
      });
      return;
    }
    for (let i = 0; i < ready.length; i++) {
      batch.push({
        filePath: ready[i].filePath,
        embedding: embeddings[i],
        content: ready[i].content
      });
      indexed++;
      onProgress?.(ready[i].filePath, processed, total);
      if (batch.length >= BATCH_SIZE) {
        await store.upsertBatch(batch);
        batch = [];
      }
    }
  };

  try {
    let chunk = [];
    for await (const filePath of collectFilesStream(rootDir)) {
      chunk.push(filePath);
      if (chunk.length < CONCURRENCY) continue;
      await processChunk(chunk);
      chunk = [];
    }
    // Trailing group smaller than CONCURRENCY.
    if (chunk.length > 0) {
      await processChunk(chunk);
    }
    // Trailing rows that never filled a whole batch.
    if (batch.length > 0) {
      await store.upsertBatch(batch);
      batch = [];
    }
  } finally {
    await store.close();
  }
  return { indexed, errors };
}
|
|
376
|
+
var EMBEDDING_MODEL_ID = MODEL_ID;
|
|
377
|
+
/**
 * Describe the embedding model selected when this module was loaded.
 *
 * @returns {{hfId: string, dim: number, description: string}} A new object
 *   holding the id, dimension and description copied from ACTIVE_MODEL
 *   (`minBytes` is not included).
 */
function getActiveEmbeddingModel() {
  const { hfId, dim, description } = ACTIVE_MODEL;
  return { hfId, dim, description };
}
|
|
384
|
+
|
|
385
|
+
export {
|
|
386
|
+
resolveEmbeddingModel,
|
|
387
|
+
EMBEDDING_DIMENSION,
|
|
388
|
+
generateEmbedding,
|
|
389
|
+
generateEmbeddingBatch,
|
|
390
|
+
collectFiles,
|
|
391
|
+
collectFilesStream,
|
|
392
|
+
INDEXER_IGNORED_DIRS,
|
|
393
|
+
indexDirectory,
|
|
394
|
+
EMBEDDING_MODEL_ID,
|
|
395
|
+
getActiveEmbeddingModel
|
|
396
|
+
};
|
|
397
|
+
//# sourceMappingURL=chunk-6FGTNOCP.js.map
|