@zuvia-software-solutions/code-mapper 2.4.1 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/analyze.d.ts +0 -1
- package/dist/cli/analyze.js +11 -87
- package/dist/cli/index.js +2 -2
- package/dist/core/embeddings/index.d.ts +2 -3
- package/dist/core/embeddings/index.js +2 -3
- package/dist/core/embeddings/nl-embed-worker.d.ts +8 -0
- package/dist/core/embeddings/nl-embed-worker.js +38 -0
- package/dist/core/embeddings/nl-embedder.d.ts +1 -1
- package/dist/core/embeddings/nl-embedder.js +199 -30
- package/dist/core/incremental/refresh.js +18 -26
- package/dist/mcp/local/local-backend.js +40 -27
- package/dist/mcp/server.js +2 -2
- package/dist/mcp/tools.js +1 -0
- package/package.json +2 -5
- package/models/jina-code-0.5b-mlx/config.json +0 -73
- package/models/jina-code-0.5b-mlx/model.py +0 -127
- package/models/mlx-embedder.py +0 -604
package/dist/cli/analyze.d.ts
CHANGED
package/dist/cli/analyze.js
CHANGED
|
@@ -347,101 +347,25 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
347
347
|
catch { /* some may fail if node was removed, that's fine */ }
|
|
348
348
|
}
|
|
349
349
|
}
|
|
350
|
-
// Phase 4: Embeddings (
|
|
350
|
+
// Phase 4: Embeddings — bge-small NL embeddings (CPU, Node.js, no Python)
|
|
351
|
+
// Extracts natural language from code (comments, names, enums, patterns)
|
|
352
|
+
// and embeds with bge-small-en-v1.5 (33M params, 384-dim, ~6ms/doc).
|
|
351
353
|
const stats = getStats(db);
|
|
352
354
|
let embeddingFailed = false;
|
|
353
355
|
if (options?.embeddings) {
|
|
354
356
|
recordPhase('embeddings');
|
|
355
|
-
updateBar(90, 'Generating embeddings...');
|
|
356
|
-
// Close DB so Python can write to it
|
|
357
|
-
closeDb(dbPath);
|
|
358
|
-
// Run Python embedder in batch mode — reads from SQLite, embeds, writes back.
|
|
359
|
-
// Zero IPC overhead: ~3x faster than Node↔Python JSON streaming.
|
|
360
|
-
const { spawn: spawnChild } = await import('child_process');
|
|
361
|
-
const { fileURLToPath } = await import('url');
|
|
362
|
-
const mlxScript = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..', '..', 'models', 'mlx-embedder.py');
|
|
363
|
-
await new Promise((resolve) => {
|
|
364
|
-
// Use spawn (not execFile) — no internal buffer limit, streams only.
|
|
365
|
-
// execFile buffers all stdout in memory which causes OOM/kill on large codebases.
|
|
366
|
-
const proc = spawnChild('python3', [mlxScript, 'batch', dbPath, '--dims', '256', '--max-tokens', '2048'], {
|
|
367
|
-
stdio: ['ignore', 'pipe', 'pipe'],
|
|
368
|
-
});
|
|
369
|
-
let stderrBuf = '';
|
|
370
|
-
proc.stderr?.on('data', (chunk) => {
|
|
371
|
-
stderrBuf += chunk.toString();
|
|
372
|
-
if (stderrBuf.length > 10240)
|
|
373
|
-
stderrBuf = stderrBuf.slice(-10240);
|
|
374
|
-
});
|
|
375
|
-
proc.on('close', (code) => {
|
|
376
|
-
if (code !== 0) {
|
|
377
|
-
// Non-fatal: index is already saved, just embeddings failed
|
|
378
|
-
console.error(`\n Warning: Embedding failed (exit code ${code}). Index saved without embeddings.`);
|
|
379
|
-
if (stderrBuf.trim())
|
|
380
|
-
console.error(` ${stderrBuf.trim().split('\n').slice(-3).join('\n ')}`);
|
|
381
|
-
embeddingFailed = true;
|
|
382
|
-
}
|
|
383
|
-
resolve();
|
|
384
|
-
});
|
|
385
|
-
proc.on('error', (err) => {
|
|
386
|
-
console.error(`\n Warning: Embedding failed: ${err.message}. Index saved without embeddings.`);
|
|
387
|
-
embeddingFailed = true;
|
|
388
|
-
resolve();
|
|
389
|
-
});
|
|
390
|
-
// Stream progress from Python's JSON lines on stdout
|
|
391
|
-
let lineBuf = '';
|
|
392
|
-
proc.stdout?.on('data', (chunk) => {
|
|
393
|
-
lineBuf += chunk.toString();
|
|
394
|
-
const lines = lineBuf.split('\n');
|
|
395
|
-
lineBuf = lines.pop() || '';
|
|
396
|
-
for (const line of lines) {
|
|
397
|
-
if (!line.trim())
|
|
398
|
-
continue;
|
|
399
|
-
try {
|
|
400
|
-
const msg = JSON.parse(line);
|
|
401
|
-
if (msg.phase === 'downloading' || msg.phase === 'converting') {
|
|
402
|
-
updateBar(90, msg.message);
|
|
403
|
-
}
|
|
404
|
-
else if (msg.phase === 'loaded') {
|
|
405
|
-
updateBar(91, `Model loaded (${msg.load_ms}ms)`);
|
|
406
|
-
}
|
|
407
|
-
else if (msg.phase === 'queried') {
|
|
408
|
-
updateBar(92, `Found ${msg.nodes} embeddable nodes${msg.skipped_tests ? ` (${msg.skipped_tests} test files skipped)` : ''}`);
|
|
409
|
-
}
|
|
410
|
-
else if (msg.phase === 'prepared') {
|
|
411
|
-
updateBar(93, `${msg.to_embed} to embed, ${msg.skipped} cached`);
|
|
412
|
-
}
|
|
413
|
-
else if (msg.phase === 'embedding') {
|
|
414
|
-
const scaled = 93 + Math.round((msg.progress / 100) * 4);
|
|
415
|
-
updateBar(scaled, `Embedding... ${msg.progress}% (${msg.embedded} written)`);
|
|
416
|
-
}
|
|
417
|
-
else if (msg.phase === 'embedded') {
|
|
418
|
-
updateBar(97, `Embedded ${msg.count} nodes (${(msg.ms / 1000).toFixed(1)}s)`);
|
|
419
|
-
}
|
|
420
|
-
else if (msg.phase === 'done') {
|
|
421
|
-
updateBar(98, `Embeddings complete (${msg.embedded} new, ${msg.skipped} cached)`);
|
|
422
|
-
}
|
|
423
|
-
}
|
|
424
|
-
catch { }
|
|
425
|
-
}
|
|
426
|
-
});
|
|
427
|
-
});
|
|
428
|
-
// Reopen DB after Python is done
|
|
429
|
-
db = openDb(dbPath);
|
|
430
|
-
}
|
|
431
|
-
// Phase 4b: NL Embeddings (bge-small, CPU, Node.js)
|
|
432
|
-
if (options?.nlEmbeddings) {
|
|
433
|
-
recordPhase('nl-embeddings');
|
|
434
|
-
updateBar(95, 'Generating NL embeddings (bge-small)...');
|
|
357
|
+
updateBar(90, 'Generating embeddings (bge-small)...');
|
|
435
358
|
const { buildNlEmbeddings } = await import('../core/embeddings/nl-embedder.js');
|
|
436
359
|
try {
|
|
437
|
-
const
|
|
438
|
-
const pct =
|
|
439
|
-
updateBar(pct, `
|
|
360
|
+
const result = await buildNlEmbeddings(db, (current, total) => {
|
|
361
|
+
const pct = 90 + Math.round((current / Math.max(total, 1)) * 8);
|
|
362
|
+
updateBar(pct, `Embeddings (${current}/${total})`, 'Embeddings');
|
|
440
363
|
});
|
|
441
|
-
updateBar(98, `
|
|
364
|
+
updateBar(98, `Embeddings: ${result.embedded} embedded, ${result.skipped} cached (${(result.durationMs / 1000).toFixed(1)}s)`);
|
|
442
365
|
}
|
|
443
366
|
catch (err) {
|
|
444
|
-
console.error(`\n Warning:
|
|
367
|
+
console.error(`\n Warning: Embedding failed: ${err instanceof Error ? err.message : err}`);
|
|
368
|
+
embeddingFailed = true;
|
|
445
369
|
}
|
|
446
370
|
}
|
|
447
371
|
// Phase 5: Finalize (98-100%)
|
|
@@ -535,7 +459,7 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
535
459
|
'search-text': 'Search text',
|
|
536
460
|
fts: 'FTS indexing',
|
|
537
461
|
'restore-embeddings': 'Restore embeddings',
|
|
538
|
-
embeddings: 'Embeddings (
|
|
462
|
+
embeddings: 'Embeddings (bge-small)',
|
|
539
463
|
finalize: 'Finalize & context',
|
|
540
464
|
done: 'Done',
|
|
541
465
|
};
|
package/dist/cli/index.js
CHANGED
|
@@ -22,8 +22,8 @@ program
|
|
|
22
22
|
.command('analyze [path]')
|
|
23
23
|
.description('Index a repository (full analysis)')
|
|
24
24
|
.option('-f, --force', 'Force full re-index even if up to date')
|
|
25
|
-
.option('--embeddings', '
|
|
26
|
-
.option('--
|
|
25
|
+
.option('--embeddings', 'Generate semantic embeddings (bge-small, CPU, fast)')
|
|
26
|
+
.option('--no-embeddings', 'Skip embedding generation')
|
|
27
27
|
.option('--no-tsgo', 'Skip tsgo LSP for call resolution (faster, less accurate)')
|
|
28
28
|
.option('-v, --verbose', 'Enable verbose ingestion warnings (default: false)')
|
|
29
29
|
.addHelpText('after', '\nEnvironment variables:\n CODE_MAPPER_NO_GITIGNORE=1 Skip .gitignore parsing (still reads .code-mapperignore)')
|
|
@@ -1,5 +1,4 @@
|
|
|
1
|
-
/** @file
|
|
1
|
+
/** @file Barrel re-exports for the embedding system (bge-small NL embedder) */
|
|
2
2
|
export * from './types.js';
|
|
3
|
-
export * from './embedder.js';
|
|
4
3
|
export * from './text-generator.js';
|
|
5
|
-
export * from './
|
|
4
|
+
export * from './nl-embedder.js';
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
// code-mapper/src/core/embeddings/index.ts
|
|
2
|
-
/** @file
|
|
2
|
+
/** @file Barrel re-exports for the embedding system (bge-small NL embedder) */
|
|
3
3
|
export * from './types.js';
|
|
4
|
-
export * from './embedder.js';
|
|
5
4
|
export * from './text-generator.js';
|
|
6
|
-
export * from './
|
|
5
|
+
export * from './nl-embedder.js';
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @file Worker process for parallel NL embedding.
|
|
3
|
+
* Spawned by buildNlEmbeddings — loads bge-small independently,
|
|
4
|
+
* embeds texts received via IPC, sends vectors back.
|
|
5
|
+
*
|
|
6
|
+
* Same architecture as parallel tsgo: N processes, each with own model.
|
|
7
|
+
*/
|
|
8
|
+
export {};
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
// code-mapper/src/core/embeddings/nl-embed-worker.ts
|
|
2
|
+
/**
|
|
3
|
+
* @file Worker process for parallel NL embedding.
|
|
4
|
+
* Spawned by buildNlEmbeddings — loads bge-small independently,
|
|
5
|
+
* embeds texts received via IPC, sends vectors back.
|
|
6
|
+
*
|
|
7
|
+
* Same architecture as parallel tsgo: N processes, each with own model.
|
|
8
|
+
*/
|
|
9
|
+
import { pipeline } from '@huggingface/transformers';
|
|
10
|
+
const MODEL_ID = 'Xenova/bge-small-en-v1.5';
|
|
11
|
+
async function main() {
|
|
12
|
+
// Load model
|
|
13
|
+
const extractor = await pipeline('feature-extraction', MODEL_ID, { quantized: true });
|
|
14
|
+
process.send({ type: 'ready' });
|
|
15
|
+
// Process messages from parent
|
|
16
|
+
process.on('message', async (msg) => {
|
|
17
|
+
if (msg.type === 'embed') {
|
|
18
|
+
const results = [];
|
|
19
|
+
for (const item of msg.items) {
|
|
20
|
+
try {
|
|
21
|
+
const result = await extractor(item.text, { pooling: 'cls', normalize: true });
|
|
22
|
+
results.push({ nodeId: item.nodeId, vec: Array.from(result.data) });
|
|
23
|
+
}
|
|
24
|
+
catch {
|
|
25
|
+
// Skip failed embeddings
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
process.send({ type: 'results', results, batchId: msg.batchId });
|
|
29
|
+
}
|
|
30
|
+
else if (msg.type === 'exit') {
|
|
31
|
+
process.exit(0);
|
|
32
|
+
}
|
|
33
|
+
});
|
|
34
|
+
}
|
|
35
|
+
main().catch(err => {
|
|
36
|
+
console.error('NL embed worker failed:', err);
|
|
37
|
+
process.exit(1);
|
|
38
|
+
});
|
|
@@ -14,7 +14,7 @@ export declare function initNlEmbedder(): Promise<void>;
|
|
|
14
14
|
export declare function isNlEmbedderReady(): boolean;
|
|
15
15
|
/** Embed a single text, returns the embedding as a plain number[] */
|
|
16
16
|
export declare function nlEmbed(text: string): Promise<number[]>;
|
|
17
|
-
/** Embed a batch of texts */
|
|
17
|
+
/** Embed a batch of texts (processes in sub-batches for memory efficiency) */
|
|
18
18
|
export declare function nlEmbedBatch(texts: string[]): Promise<number[][]>;
|
|
19
19
|
interface NodeForNl {
|
|
20
20
|
id: string;
|
|
@@ -20,7 +20,11 @@ export async function initNlEmbedder() {
|
|
|
20
20
|
if (loadPromise)
|
|
21
21
|
return loadPromise;
|
|
22
22
|
loadPromise = (async () => {
|
|
23
|
-
const { pipeline } = await import('@huggingface/transformers');
|
|
23
|
+
const { pipeline, env } = await import('@huggingface/transformers');
|
|
24
|
+
// Use all available CPU threads for ONNX inference
|
|
25
|
+
if (env.backends?.onnx?.wasm) {
|
|
26
|
+
env.backends.onnx.wasm.numThreads = Math.max(1, (await import('os')).cpus().length);
|
|
27
|
+
}
|
|
24
28
|
extractor = await pipeline('feature-extraction', MODEL_ID, { quantized: true });
|
|
25
29
|
})();
|
|
26
30
|
return loadPromise;
|
|
@@ -36,14 +40,19 @@ export async function nlEmbed(text) {
|
|
|
36
40
|
const result = await extractor(text, { pooling: 'cls', normalize: true });
|
|
37
41
|
return Array.from(result.data);
|
|
38
42
|
}
|
|
39
|
-
/** Embed a batch of texts */
|
|
43
|
+
/** Embed a batch of texts (processes in sub-batches for memory efficiency) */
|
|
40
44
|
export async function nlEmbedBatch(texts) {
|
|
41
45
|
if (!extractor)
|
|
42
46
|
await initNlEmbedder();
|
|
47
|
+
const BATCH = 32; // sub-batch size — balances throughput vs memory
|
|
43
48
|
const results = [];
|
|
44
|
-
for (
|
|
45
|
-
const
|
|
46
|
-
|
|
49
|
+
for (let i = 0; i < texts.length; i += BATCH) {
|
|
50
|
+
const batch = texts.slice(i, i + BATCH);
|
|
51
|
+
// Process sub-batch — one extractor call per text, awaited together
|
|
52
|
+
const batchResults = await Promise.all(batch.map(text => extractor(text, { pooling: 'cls', normalize: true })));
|
|
53
|
+
for (const result of batchResults) {
|
|
54
|
+
results.push(Array.from(result.data));
|
|
55
|
+
}
|
|
47
56
|
}
|
|
48
57
|
return results;
|
|
49
58
|
}
|
|
@@ -197,9 +206,10 @@ export async function buildNlEmbeddings(db, onProgress) {
|
|
|
197
206
|
const labels = ['Function', 'Class', 'Method', 'Interface', 'Const', 'Enum', 'TypeAlias', 'Namespace', 'Module', 'Struct'];
|
|
198
207
|
const placeholders = labels.map(() => '?').join(',');
|
|
199
208
|
const rows = db.prepare(`SELECT id, name, label, filePath, content, startLine, description FROM nodes WHERE label IN (${placeholders})`).all(...labels);
|
|
200
|
-
//
|
|
201
|
-
|
|
202
|
-
|
|
209
|
+
// NL embeddings include ALL files (including tests) — test names describe
|
|
210
|
+
// functionality in natural language which helps conceptual search.
|
|
211
|
+
// The bge-small model is fast enough (~6ms/doc) that the cost is trivial.
|
|
212
|
+
const filteredRows = rows;
|
|
203
213
|
// Extract NL documents
|
|
204
214
|
const allDocs = [];
|
|
205
215
|
for (const row of filteredRows) {
|
|
@@ -210,10 +220,20 @@ export async function buildNlEmbeddings(db, onProgress) {
|
|
|
210
220
|
if (allDocs.length === 0) {
|
|
211
221
|
return { embedded: 0, skipped: 0, durationMs: Date.now() - t0 };
|
|
212
222
|
}
|
|
223
|
+
// Deduplicate: one embedding per nodeId — prefer 'comment' source over 'name' or 'enum'
|
|
224
|
+
const SOURCE_PRIORITY = { comment: 0, enum: 1, name: 2 };
|
|
225
|
+
const bestByNode = new Map();
|
|
226
|
+
for (const doc of allDocs) {
|
|
227
|
+
const existing = bestByNode.get(doc.nodeId);
|
|
228
|
+
if (!existing || (SOURCE_PRIORITY[doc.source] ?? 9) < (SOURCE_PRIORITY[existing.source] ?? 9)) {
|
|
229
|
+
bestByNode.set(doc.nodeId, doc);
|
|
230
|
+
}
|
|
231
|
+
}
|
|
232
|
+
const uniqueDocs = [...bestByNode.values()];
|
|
213
233
|
// Check existing hashes for skip detection
|
|
214
234
|
const existingHashes = new Map();
|
|
215
235
|
try {
|
|
216
|
-
const hashRows = db.prepare('SELECT nodeId, textHash FROM
|
|
236
|
+
const hashRows = db.prepare('SELECT nodeId, textHash FROM embeddings WHERE textHash IS NOT NULL').all();
|
|
217
237
|
for (const r of hashRows)
|
|
218
238
|
existingHashes.set(r.nodeId + ':' + r.textHash, '1');
|
|
219
239
|
}
|
|
@@ -221,7 +241,7 @@ export async function buildNlEmbeddings(db, onProgress) {
|
|
|
221
241
|
// Filter to docs that need embedding
|
|
222
242
|
const toEmbed = [];
|
|
223
243
|
let skipped = 0;
|
|
224
|
-
for (const doc of
|
|
244
|
+
for (const doc of uniqueDocs) {
|
|
225
245
|
const hash = md5(doc.text);
|
|
226
246
|
if (existingHashes.has(doc.nodeId + ':' + hash)) {
|
|
227
247
|
skipped++;
|
|
@@ -232,31 +252,180 @@ export async function buildNlEmbeddings(db, onProgress) {
|
|
|
232
252
|
if (toEmbed.length === 0) {
|
|
233
253
|
return { embedded: 0, skipped, durationMs: Date.now() - t0 };
|
|
234
254
|
}
|
|
235
|
-
// Clear existing
|
|
236
|
-
db.prepare('DELETE FROM
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
255
|
+
// Clear existing embeddings and rebuild
|
|
256
|
+
db.prepare('DELETE FROM embeddings').run();
|
|
257
|
+
try {
|
|
258
|
+
db.prepare('DELETE FROM nl_embeddings').run();
|
|
259
|
+
}
|
|
260
|
+
catch { /* table may not exist */ }
|
|
261
|
+
// Parallel multi-process embedding — same architecture as tsgo
|
|
262
|
+
// Each worker loads its own bge-small model, embeds independently.
|
|
263
|
+
const os = await import('os');
|
|
264
|
+
const { fork } = await import('child_process');
|
|
265
|
+
const { fileURLToPath } = await import('url');
|
|
266
|
+
const pathMod = await import('path');
|
|
267
|
+
const cpuCount = os.cpus().length;
|
|
268
|
+
const maxByCore = Math.max(1, Math.floor(cpuCount * 0.75));
|
|
269
|
+
const maxByWorkload = Math.max(1, Math.floor(toEmbed.length / 50));
|
|
270
|
+
const workerCount = Math.min(maxByCore, maxByWorkload, 8); // cap at 8 for memory
|
|
271
|
+
// Find worker script path
|
|
272
|
+
const thisDir = pathMod.dirname(fileURLToPath(import.meta.url));
|
|
273
|
+
const workerScript = pathMod.join(thisDir, 'nl-embed-worker.js');
|
|
274
|
+
// Split work across workers
|
|
275
|
+
const ITEMS_PER_BATCH = 50;
|
|
276
|
+
let nextIdx = 0;
|
|
240
277
|
let embedded = 0;
|
|
241
|
-
|
|
278
|
+
const getNextBatch = () => {
|
|
279
|
+
if (nextIdx >= toEmbed.length)
|
|
280
|
+
return null;
|
|
281
|
+
const batch = toEmbed.slice(nextIdx, nextIdx + ITEMS_PER_BATCH);
|
|
282
|
+
nextIdx += ITEMS_PER_BATCH;
|
|
283
|
+
return batch;
|
|
284
|
+
};
|
|
285
|
+
// Prepare DB statements
|
|
286
|
+
const insertStmt = db.prepare('INSERT OR REPLACE INTO embeddings (nodeId, embedding, textHash) VALUES (?, ?, ?)');
|
|
287
|
+
let nlInsertStmt = null;
|
|
242
288
|
try {
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
289
|
+
nlInsertStmt = db.prepare('INSERT INTO nl_embeddings (nodeId, embedding, textHash, source, text) VALUES (?, ?, ?, ?, ?)');
|
|
290
|
+
}
|
|
291
|
+
catch { /* nl_embeddings table may not exist */ }
|
|
292
|
+
// Track doc metadata for nl_embeddings text lookup
|
|
293
|
+
const docMap = new Map();
|
|
294
|
+
for (const doc of toEmbed)
|
|
295
|
+
docMap.set(doc.nodeId, { source: doc.source, text: doc.text, hash: doc.hash });
|
|
296
|
+
if (workerCount <= 1) {
|
|
297
|
+
// Single process — use in-process embedding (small workloads)
|
|
298
|
+
await initNlEmbedder();
|
|
299
|
+
db.exec('BEGIN');
|
|
300
|
+
try {
|
|
301
|
+
for (let i = 0; i < toEmbed.length; i += ITEMS_PER_BATCH) {
|
|
302
|
+
const batch = toEmbed.slice(i, i + ITEMS_PER_BATCH);
|
|
303
|
+
const vecs = await nlEmbedBatch(batch.map(d => d.text));
|
|
304
|
+
for (let j = 0; j < batch.length; j++) {
|
|
305
|
+
const doc = batch[j];
|
|
306
|
+
const vec = vecs[j];
|
|
307
|
+
const blob = Buffer.from(new Float32Array(vec).buffer);
|
|
308
|
+
insertStmt.run(doc.nodeId, blob, doc.hash);
|
|
309
|
+
if (nlInsertStmt) {
|
|
310
|
+
try {
|
|
311
|
+
nlInsertStmt.run(doc.nodeId, blob, doc.hash, doc.source, doc.text);
|
|
312
|
+
}
|
|
313
|
+
catch { }
|
|
314
|
+
}
|
|
315
|
+
embedded++;
|
|
316
|
+
}
|
|
317
|
+
onProgress?.(Math.min(i + ITEMS_PER_BATCH, toEmbed.length), toEmbed.length);
|
|
252
318
|
}
|
|
253
|
-
|
|
319
|
+
db.exec('COMMIT');
|
|
320
|
+
}
|
|
321
|
+
catch (err) {
|
|
322
|
+
db.exec('ROLLBACK');
|
|
323
|
+
throw err;
|
|
254
324
|
}
|
|
255
|
-
db.exec('COMMIT');
|
|
256
325
|
}
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
326
|
+
else {
|
|
327
|
+
// Multi-process: spawn N workers, dynamic dispatch
|
|
328
|
+
const workers = [];
|
|
329
|
+
const workerReady = [];
|
|
330
|
+
for (let i = 0; i < workerCount; i++) {
|
|
331
|
+
const worker = fork(workerScript, [], { stdio: ['pipe', 'pipe', 'pipe', 'ipc'] });
|
|
332
|
+
workers.push(worker);
|
|
333
|
+
workerReady.push(new Promise((resolve) => {
|
|
334
|
+
const handler = (msg) => {
|
|
335
|
+
if (msg?.type === 'ready') {
|
|
336
|
+
worker.removeListener('message', handler);
|
|
337
|
+
resolve();
|
|
338
|
+
}
|
|
339
|
+
};
|
|
340
|
+
worker.on('message', handler);
|
|
341
|
+
// Timeout: if the worker isn't ready within 30s, stop waiting for it
|
|
342
|
+
setTimeout(() => resolve(), 30000);
|
|
343
|
+
}));
|
|
344
|
+
}
|
|
345
|
+
// Wait for all workers to load model
|
|
346
|
+
await Promise.all(workerReady);
|
|
347
|
+
const activeWorkers = workers.filter(w => w.connected);
|
|
348
|
+
if (activeWorkers.length === 0) {
|
|
349
|
+
// Fallback to single process
|
|
350
|
+
await initNlEmbedder();
|
|
351
|
+
db.exec('BEGIN');
|
|
352
|
+
try {
|
|
353
|
+
for (let i = 0; i < toEmbed.length; i += ITEMS_PER_BATCH) {
|
|
354
|
+
const batch = toEmbed.slice(i, i + ITEMS_PER_BATCH);
|
|
355
|
+
const vecs = await nlEmbedBatch(batch.map(d => d.text));
|
|
356
|
+
for (let j = 0; j < batch.length; j++) {
|
|
357
|
+
const doc = batch[j];
|
|
358
|
+
const blob = Buffer.from(new Float32Array(vecs[j]).buffer);
|
|
359
|
+
insertStmt.run(doc.nodeId, blob, doc.hash);
|
|
360
|
+
embedded++;
|
|
361
|
+
}
|
|
362
|
+
onProgress?.(Math.min(i + ITEMS_PER_BATCH, toEmbed.length), toEmbed.length);
|
|
363
|
+
}
|
|
364
|
+
db.exec('COMMIT');
|
|
365
|
+
}
|
|
366
|
+
catch (err) {
|
|
367
|
+
db.exec('ROLLBACK');
|
|
368
|
+
throw err;
|
|
369
|
+
}
|
|
370
|
+
}
|
|
371
|
+
else {
|
|
372
|
+
// Dynamic dispatch: each worker requests next batch when done
|
|
373
|
+
db.exec('BEGIN');
|
|
374
|
+
let batchId = 0;
|
|
375
|
+
const runWorker = (worker) => {
|
|
376
|
+
return new Promise((resolve) => {
|
|
377
|
+
const sendNext = () => {
|
|
378
|
+
const batch = getNextBatch();
|
|
379
|
+
if (!batch) {
|
|
380
|
+
worker.send({ type: 'exit' });
|
|
381
|
+
resolve();
|
|
382
|
+
return;
|
|
383
|
+
}
|
|
384
|
+
worker.send({
|
|
385
|
+
type: 'embed',
|
|
386
|
+
batchId: batchId++,
|
|
387
|
+
items: batch.map(d => ({ nodeId: d.nodeId, text: d.text })),
|
|
388
|
+
});
|
|
389
|
+
};
|
|
390
|
+
worker.on('message', (msg) => {
|
|
391
|
+
if (msg?.type === 'results') {
|
|
392
|
+
// Write results to DB
|
|
393
|
+
for (const r of msg.results) {
|
|
394
|
+
const blob = Buffer.from(new Float32Array(r.vec).buffer);
|
|
395
|
+
const meta = docMap.get(r.nodeId);
|
|
396
|
+
insertStmt.run(r.nodeId, blob, meta?.hash ?? '');
|
|
397
|
+
if (nlInsertStmt && meta) {
|
|
398
|
+
try {
|
|
399
|
+
nlInsertStmt.run(r.nodeId, blob, meta.hash, meta.source, meta.text);
|
|
400
|
+
}
|
|
401
|
+
catch { }
|
|
402
|
+
}
|
|
403
|
+
embedded++;
|
|
404
|
+
}
|
|
405
|
+
onProgress?.(embedded, toEmbed.length);
|
|
406
|
+
sendNext(); // request next batch
|
|
407
|
+
}
|
|
408
|
+
});
|
|
409
|
+
worker.on('exit', () => resolve());
|
|
410
|
+
sendNext(); // start first batch
|
|
411
|
+
});
|
|
412
|
+
};
|
|
413
|
+
try {
|
|
414
|
+
await Promise.all(activeWorkers.map(w => runWorker(w)));
|
|
415
|
+
db.exec('COMMIT');
|
|
416
|
+
}
|
|
417
|
+
catch (err) {
|
|
418
|
+
db.exec('ROLLBACK');
|
|
419
|
+
throw err;
|
|
420
|
+
}
|
|
421
|
+
}
|
|
422
|
+
// Cleanup workers
|
|
423
|
+
for (const w of workers) {
|
|
424
|
+
try {
|
|
425
|
+
w.kill();
|
|
426
|
+
}
|
|
427
|
+
catch { }
|
|
428
|
+
}
|
|
260
429
|
}
|
|
261
430
|
return { embedded, skipped, durationMs: Date.now() - t0 };
|
|
262
431
|
}
|
|
@@ -492,46 +492,38 @@ export async function refreshEmbeddings(db, dirtyFiles, hasEmbeddings) {
|
|
|
492
492
|
}
|
|
493
493
|
if (newNodes.length === 0)
|
|
494
494
|
return;
|
|
495
|
-
// Step 3:
|
|
496
|
-
|
|
497
|
-
const { fetchGraphContext, enrichTextWithGraphContext } = await import('../embeddings/embedding-pipeline.js');
|
|
498
|
-
const { generateEmbeddingText } = await import('../embeddings/text-generator.js');
|
|
499
|
-
const { initEmbedder, embedBatch, embeddingToArray } = await import('../embeddings/embedder.js');
|
|
500
|
-
const graphContext = fetchGraphContext(db, newNodes);
|
|
501
|
-
// Step 4: Generate enriched text + hash for skip detection
|
|
495
|
+
// Step 3: Extract NL text and embed with bge-small (same model as full analyze)
|
|
496
|
+
const { extractNlTexts, initNlEmbedder, nlEmbed } = await import('../embeddings/nl-embedder.js');
|
|
502
497
|
const { createHash } = await import('crypto');
|
|
503
498
|
const { getEmbeddingHashes } = await import('../db/adapter.js');
|
|
504
499
|
const existingHashes = getEmbeddingHashes(db);
|
|
500
|
+
await initNlEmbedder();
|
|
505
501
|
const toEmbed = [];
|
|
506
502
|
for (const node of newNodes) {
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
}
|
|
512
|
-
|
|
513
|
-
|
|
503
|
+
const nlDocs = extractNlTexts({
|
|
504
|
+
id: node.id, name: node.name, label: node.label,
|
|
505
|
+
filePath: node.filePath, content: node.content || '',
|
|
506
|
+
startLine: node.startLine ?? null, description: node.description || '',
|
|
507
|
+
});
|
|
508
|
+
// Pick best doc (prefer comment over name)
|
|
509
|
+
const best = nlDocs.find(d => d.source === 'comment') || nlDocs[0];
|
|
510
|
+
if (!best)
|
|
511
|
+
continue;
|
|
512
|
+
const hash = createHash('md5').update(best.text).digest('hex');
|
|
514
513
|
if (existingHashes.get(node.id) === hash)
|
|
515
514
|
continue;
|
|
516
|
-
toEmbed.push({ node, text, hash });
|
|
515
|
+
toEmbed.push({ nodeId: node.id, text: best.text, hash, source: best.source });
|
|
517
516
|
}
|
|
518
517
|
if (toEmbed.length === 0) {
|
|
519
518
|
console.error(`Code Mapper: All ${newNodes.length} node(s) unchanged (hash skip)`);
|
|
520
519
|
return;
|
|
521
520
|
}
|
|
522
521
|
console.error(`Code Mapper: Embedding ${toEmbed.length}/${newNodes.length} node(s) (${newNodes.length - toEmbed.length} unchanged)`);
|
|
523
|
-
// Step
|
|
524
|
-
await initEmbedder();
|
|
525
|
-
// Step 6: Batch embed only changed nodes
|
|
526
|
-
const embeddings = await embedBatch(toEmbed.map(e => e.text));
|
|
527
|
-
// Step 7: Insert with hashes
|
|
522
|
+
// Step 4: Embed and insert
|
|
528
523
|
const items = [];
|
|
529
|
-
for (
|
|
530
|
-
const
|
|
531
|
-
|
|
532
|
-
if (entry?.node && emb) {
|
|
533
|
-
items.push({ nodeId: toNodeId(entry.node.id), embedding: embeddingToArray(emb), textHash: entry.hash });
|
|
534
|
-
}
|
|
524
|
+
for (const entry of toEmbed) {
|
|
525
|
+
const vec = await nlEmbed(entry.text);
|
|
526
|
+
items.push({ nodeId: toNodeId(entry.nodeId), embedding: vec, textHash: entry.hash });
|
|
535
527
|
}
|
|
536
528
|
insertEmbeddingsBatch(db, items);
|
|
537
529
|
console.error(`Code Mapper: Embedded ${items.length} node(s) incrementally`);
|