@zuvia-software-solutions/code-mapper 2.4.1 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,6 @@
2
2
  export interface AnalyzeOptions {
3
3
  force?: boolean;
4
4
  embeddings?: boolean;
5
- nlEmbeddings?: boolean;
6
5
  tsgo?: boolean;
7
6
  verbose?: boolean;
8
7
  }
@@ -347,101 +347,25 @@ export const analyzeCommand = async (inputPath, options) => {
347
347
  catch { /* some may fail if node was removed, that's fine */ }
348
348
  }
349
349
  }
350
- // Phase 4: Embeddings (90-98%)
350
+ // Phase 4: Embeddings — bge-small NL embeddings (CPU, Node.js, no Python)
351
+ // Extracts natural language from code (comments, names, enums, patterns)
352
+ // and embeds with bge-small-en-v1.5 (33M params, 384-dim, ~6ms/doc).
351
353
  const stats = getStats(db);
352
354
  let embeddingFailed = false;
353
355
  if (options?.embeddings) {
354
356
  recordPhase('embeddings');
355
- updateBar(90, 'Generating embeddings...');
356
- // Close DB so Python can write to it
357
- closeDb(dbPath);
358
- // Run Python embedder in batch mode — reads from SQLite, embeds, writes back.
359
- // Zero IPC overhead: ~3x faster than Node↔Python JSON streaming.
360
- const { spawn: spawnChild } = await import('child_process');
361
- const { fileURLToPath } = await import('url');
362
- const mlxScript = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..', '..', 'models', 'mlx-embedder.py');
363
- await new Promise((resolve) => {
364
- // Use spawn (not execFile) — no internal buffer limit, streams only.
365
- // execFile buffers all stdout in memory which causes OOM/kill on large codebases.
366
- const proc = spawnChild('python3', [mlxScript, 'batch', dbPath, '--dims', '256', '--max-tokens', '2048'], {
367
- stdio: ['ignore', 'pipe', 'pipe'],
368
- });
369
- let stderrBuf = '';
370
- proc.stderr?.on('data', (chunk) => {
371
- stderrBuf += chunk.toString();
372
- if (stderrBuf.length > 10240)
373
- stderrBuf = stderrBuf.slice(-10240);
374
- });
375
- proc.on('close', (code) => {
376
- if (code !== 0) {
377
- // Non-fatal: index is already saved, just embeddings failed
378
- console.error(`\n Warning: Embedding failed (exit code ${code}). Index saved without embeddings.`);
379
- if (stderrBuf.trim())
380
- console.error(` ${stderrBuf.trim().split('\n').slice(-3).join('\n ')}`);
381
- embeddingFailed = true;
382
- }
383
- resolve();
384
- });
385
- proc.on('error', (err) => {
386
- console.error(`\n Warning: Embedding failed: ${err.message}. Index saved without embeddings.`);
387
- embeddingFailed = true;
388
- resolve();
389
- });
390
- // Stream progress from Python's JSON lines on stdout
391
- let lineBuf = '';
392
- proc.stdout?.on('data', (chunk) => {
393
- lineBuf += chunk.toString();
394
- const lines = lineBuf.split('\n');
395
- lineBuf = lines.pop() || '';
396
- for (const line of lines) {
397
- if (!line.trim())
398
- continue;
399
- try {
400
- const msg = JSON.parse(line);
401
- if (msg.phase === 'downloading' || msg.phase === 'converting') {
402
- updateBar(90, msg.message);
403
- }
404
- else if (msg.phase === 'loaded') {
405
- updateBar(91, `Model loaded (${msg.load_ms}ms)`);
406
- }
407
- else if (msg.phase === 'queried') {
408
- updateBar(92, `Found ${msg.nodes} embeddable nodes${msg.skipped_tests ? ` (${msg.skipped_tests} test files skipped)` : ''}`);
409
- }
410
- else if (msg.phase === 'prepared') {
411
- updateBar(93, `${msg.to_embed} to embed, ${msg.skipped} cached`);
412
- }
413
- else if (msg.phase === 'embedding') {
414
- const scaled = 93 + Math.round((msg.progress / 100) * 4);
415
- updateBar(scaled, `Embedding... ${msg.progress}% (${msg.embedded} written)`);
416
- }
417
- else if (msg.phase === 'embedded') {
418
- updateBar(97, `Embedded ${msg.count} nodes (${(msg.ms / 1000).toFixed(1)}s)`);
419
- }
420
- else if (msg.phase === 'done') {
421
- updateBar(98, `Embeddings complete (${msg.embedded} new, ${msg.skipped} cached)`);
422
- }
423
- }
424
- catch { }
425
- }
426
- });
427
- });
428
- // Reopen DB after Python is done
429
- db = openDb(dbPath);
430
- }
431
- // Phase 4b: NL Embeddings (bge-small, CPU, Node.js)
432
- if (options?.nlEmbeddings) {
433
- recordPhase('nl-embeddings');
434
- updateBar(95, 'Generating NL embeddings (bge-small)...');
357
+ updateBar(90, 'Generating embeddings (bge-small)...');
435
358
  const { buildNlEmbeddings } = await import('../core/embeddings/nl-embedder.js');
436
359
  try {
437
- const nlResult = await buildNlEmbeddings(db, (current, total) => {
438
- const pct = 95 + Math.round((current / Math.max(total, 1)) * 3);
439
- updateBar(pct, `NL embeddings (${current}/${total})`, 'NL embeddings');
360
+ const result = await buildNlEmbeddings(db, (current, total) => {
361
+ const pct = 90 + Math.round((current / Math.max(total, 1)) * 8);
362
+ updateBar(pct, `Embeddings (${current}/${total})`, 'Embeddings');
440
363
  });
441
- updateBar(98, `NL embeddings: ${nlResult.embedded} embedded, ${nlResult.skipped} cached (${(nlResult.durationMs / 1000).toFixed(1)}s)`);
364
+ updateBar(98, `Embeddings: ${result.embedded} embedded, ${result.skipped} cached (${(result.durationMs / 1000).toFixed(1)}s)`);
442
365
  }
443
366
  catch (err) {
444
- console.error(`\n Warning: NL embeddings failed: ${err instanceof Error ? err.message : err}`);
367
+ console.error(`\n Warning: Embedding failed: ${err instanceof Error ? err.message : err}`);
368
+ embeddingFailed = true;
445
369
  }
446
370
  }
447
371
  // Phase 5: Finalize (98-100%)
@@ -535,7 +459,7 @@ export const analyzeCommand = async (inputPath, options) => {
535
459
  'search-text': 'Search text',
536
460
  fts: 'FTS indexing',
537
461
  'restore-embeddings': 'Restore embeddings',
538
- embeddings: 'Embeddings (MLX)',
462
+ embeddings: 'Embeddings (bge-small)',
539
463
  finalize: 'Finalize & context',
540
464
  done: 'Done',
541
465
  };
package/dist/cli/index.js CHANGED
@@ -22,8 +22,8 @@ program
22
22
  .command('analyze [path]')
23
23
  .description('Index a repository (full analysis)')
24
24
  .option('-f, --force', 'Force full re-index even if up to date')
25
- .option('--embeddings', 'Enable code embedding generation (Jina/MLX, GPU)', false)
26
- .option('--nl-embeddings', 'Enable NL embedding generation (bge-small, CPU, recommended)', false)
25
+ .option('--embeddings', 'Generate semantic embeddings (bge-small, CPU, fast)')
26
+ .option('--no-embeddings', 'Skip embedding generation')
27
27
  .option('--no-tsgo', 'Skip tsgo LSP for call resolution (faster, less accurate)')
28
28
  .option('-v, --verbose', 'Enable verbose ingestion warnings (default: false)')
29
29
  .addHelpText('after', '\nEnvironment variables:\n CODE_MAPPER_NO_GITIGNORE=1 Skip .gitignore parsing (still reads .code-mapperignore)')
@@ -1,5 +1,4 @@
1
- /** @file index.ts @description Barrel re-exports for the embedding pipeline system */
1
+ /** @file Barrel re-exports for the embedding system (bge-small NL embedder) */
2
2
  export * from './types.js';
3
- export * from './embedder.js';
4
3
  export * from './text-generator.js';
5
- export * from './embedding-pipeline.js';
4
+ export * from './nl-embedder.js';
@@ -1,6 +1,5 @@
1
1
  // code-mapper/src/core/embeddings/index.ts
2
- /** @file index.ts @description Barrel re-exports for the embedding pipeline system */
2
+ /** @file Barrel re-exports for the embedding system (bge-small NL embedder) */
3
3
  export * from './types.js';
4
- export * from './embedder.js';
5
4
  export * from './text-generator.js';
6
- export * from './embedding-pipeline.js';
5
+ export * from './nl-embedder.js';
@@ -0,0 +1,8 @@
1
+ /**
2
+ * @file Worker process for parallel NL embedding.
3
+ * Spawned by buildNlEmbeddings — loads bge-small independently,
4
+ * embeds texts received via IPC, sends vectors back.
5
+ *
6
+ * Same architecture as parallel tsgo: N processes, each with own model.
7
+ */
8
+ export {};
@@ -0,0 +1,38 @@
1
+ // code-mapper/src/core/embeddings/nl-embed-worker.ts
2
+ /**
3
+ * @file Worker process for parallel NL embedding.
4
+ * Spawned by buildNlEmbeddings — loads bge-small independently,
5
+ * embeds texts received via IPC, sends vectors back.
6
+ *
7
+ * Same architecture as parallel tsgo: N processes, each with own model.
8
+ */
9
+ import { pipeline } from '@huggingface/transformers';
10
+ const MODEL_ID = 'Xenova/bge-small-en-v1.5';
11
+ async function main() {
12
+ // Load model
13
+ const extractor = await pipeline('feature-extraction', MODEL_ID, { quantized: true });
14
+ process.send({ type: 'ready' });
15
+ // Process messages from parent
16
+ process.on('message', async (msg) => {
17
+ if (msg.type === 'embed') {
18
+ const results = [];
19
+ for (const item of msg.items) {
20
+ try {
21
+ const result = await extractor(item.text, { pooling: 'cls', normalize: true });
22
+ results.push({ nodeId: item.nodeId, vec: Array.from(result.data) });
23
+ }
24
+ catch {
25
+ // Skip failed embeddings
26
+ }
27
+ }
28
+ process.send({ type: 'results', results, batchId: msg.batchId });
29
+ }
30
+ else if (msg.type === 'exit') {
31
+ process.exit(0);
32
+ }
33
+ });
34
+ }
35
+ main().catch(err => {
36
+ console.error('NL embed worker failed:', err);
37
+ process.exit(1);
38
+ });
@@ -14,7 +14,7 @@ export declare function initNlEmbedder(): Promise<void>;
14
14
  export declare function isNlEmbedderReady(): boolean;
15
15
  /** Embed a single text, returns Float32Array */
16
16
  export declare function nlEmbed(text: string): Promise<number[]>;
17
- /** Embed a batch of texts */
17
+ /** Embed a batch of texts (processes in sub-batches for memory efficiency) */
18
18
  export declare function nlEmbedBatch(texts: string[]): Promise<number[][]>;
19
19
  interface NodeForNl {
20
20
  id: string;
@@ -20,7 +20,11 @@ export async function initNlEmbedder() {
20
20
  if (loadPromise)
21
21
  return loadPromise;
22
22
  loadPromise = (async () => {
23
- const { pipeline } = await import('@huggingface/transformers');
23
+ const { pipeline, env } = await import('@huggingface/transformers');
24
+ // Use all available CPU threads for ONNX inference
25
+ if (env.backends?.onnx?.wasm) {
26
+ env.backends.onnx.wasm.numThreads = Math.max(1, (await import('os')).cpus().length);
27
+ }
24
28
  extractor = await pipeline('feature-extraction', MODEL_ID, { quantized: true });
25
29
  })();
26
30
  return loadPromise;
@@ -36,14 +40,19 @@ export async function nlEmbed(text) {
36
40
  const result = await extractor(text, { pooling: 'cls', normalize: true });
37
41
  return Array.from(result.data);
38
42
  }
39
- /** Embed a batch of texts */
43
+ /** Embed a batch of texts (processes in sub-batches for memory efficiency) */
40
44
  export async function nlEmbedBatch(texts) {
41
45
  if (!extractor)
42
46
  await initNlEmbedder();
47
+ const BATCH = 32; // sub-batch size — balances throughput vs memory
43
48
  const results = [];
44
- for (const text of texts) {
45
- const result = await extractor(text, { pooling: 'cls', normalize: true });
46
- results.push(Array.from(result.data));
49
+ for (let i = 0; i < texts.length; i += BATCH) {
50
+ const batch = texts.slice(i, i + BATCH);
51
+ // Process sub-batch — transformers.js handles arrays
52
+ const batchResults = await Promise.all(batch.map(text => extractor(text, { pooling: 'cls', normalize: true })));
53
+ for (const result of batchResults) {
54
+ results.push(Array.from(result.data));
55
+ }
47
56
  }
48
57
  return results;
49
58
  }
@@ -197,9 +206,10 @@ export async function buildNlEmbeddings(db, onProgress) {
197
206
  const labels = ['Function', 'Class', 'Method', 'Interface', 'Const', 'Enum', 'TypeAlias', 'Namespace', 'Module', 'Struct'];
198
207
  const placeholders = labels.map(() => '?').join(',');
199
208
  const rows = db.prepare(`SELECT id, name, label, filePath, content, startLine, description FROM nodes WHERE label IN (${placeholders})`).all(...labels);
200
- // Skip test files
201
- const testPatterns = ['/test/', '/tests/', '/spec/', '/fixtures/', '/__tests__/', '/__mocks__/', '.test.', '.spec.', '_test.', '_spec.'];
202
- const filteredRows = rows.filter(r => !testPatterns.some(p => r.filePath.includes(p)));
209
+ // NL embeddings include ALL files (including tests) — test names describe
210
+ // functionality in natural language which helps conceptual search.
211
+ // The bge-small model is fast enough (~6ms/doc) that the cost is trivial.
212
+ const filteredRows = rows;
203
213
  // Extract NL documents
204
214
  const allDocs = [];
205
215
  for (const row of filteredRows) {
@@ -210,10 +220,20 @@ export async function buildNlEmbeddings(db, onProgress) {
210
220
  if (allDocs.length === 0) {
211
221
  return { embedded: 0, skipped: 0, durationMs: Date.now() - t0 };
212
222
  }
223
+ // Deduplicate: one embedding per nodeId — prefer 'comment' source over 'name' or 'enum'
224
+ const SOURCE_PRIORITY = { comment: 0, enum: 1, name: 2 };
225
+ const bestByNode = new Map();
226
+ for (const doc of allDocs) {
227
+ const existing = bestByNode.get(doc.nodeId);
228
+ if (!existing || (SOURCE_PRIORITY[doc.source] ?? 9) < (SOURCE_PRIORITY[existing.source] ?? 9)) {
229
+ bestByNode.set(doc.nodeId, doc);
230
+ }
231
+ }
232
+ const uniqueDocs = [...bestByNode.values()];
213
233
  // Check existing hashes for skip detection
214
234
  const existingHashes = new Map();
215
235
  try {
216
- const hashRows = db.prepare('SELECT nodeId, textHash FROM nl_embeddings WHERE textHash IS NOT NULL').all();
236
+ const hashRows = db.prepare('SELECT nodeId, textHash FROM embeddings WHERE textHash IS NOT NULL').all();
217
237
  for (const r of hashRows)
218
238
  existingHashes.set(r.nodeId + ':' + r.textHash, '1');
219
239
  }
@@ -221,7 +241,7 @@ export async function buildNlEmbeddings(db, onProgress) {
221
241
  // Filter to docs that need embedding
222
242
  const toEmbed = [];
223
243
  let skipped = 0;
224
- for (const doc of allDocs) {
244
+ for (const doc of uniqueDocs) {
225
245
  const hash = md5(doc.text);
226
246
  if (existingHashes.has(doc.nodeId + ':' + hash)) {
227
247
  skipped++;
@@ -232,31 +252,180 @@ export async function buildNlEmbeddings(db, onProgress) {
232
252
  if (toEmbed.length === 0) {
233
253
  return { embedded: 0, skipped, durationMs: Date.now() - t0 };
234
254
  }
235
- // Clear existing NL embeddings and rebuild
236
- db.prepare('DELETE FROM nl_embeddings').run();
237
- // Embed in batches and write to DB
238
- const BATCH = 100;
239
- const insertStmt = db.prepare('INSERT INTO nl_embeddings (nodeId, embedding, textHash, source, text) VALUES (?, ?, ?, ?, ?)');
255
+ // Clear existing embeddings and rebuild
256
+ db.prepare('DELETE FROM embeddings').run();
257
+ try {
258
+ db.prepare('DELETE FROM nl_embeddings').run();
259
+ }
260
+ catch { /* table may not exist */ }
261
+ // Parallel multi-process embedding — same architecture as tsgo
262
+ // Each worker loads its own bge-small model, embeds independently.
263
+ const os = await import('os');
264
+ const { fork } = await import('child_process');
265
+ const { fileURLToPath } = await import('url');
266
+ const pathMod = await import('path');
267
+ const cpuCount = os.cpus().length;
268
+ const maxByCore = Math.max(1, Math.floor(cpuCount * 0.75));
269
+ const maxByWorkload = Math.max(1, Math.floor(toEmbed.length / 50));
270
+ const workerCount = Math.min(maxByCore, maxByWorkload, 8); // cap at 8 for memory
271
+ // Find worker script path
272
+ const thisDir = pathMod.dirname(fileURLToPath(import.meta.url));
273
+ const workerScript = pathMod.join(thisDir, 'nl-embed-worker.js');
274
+ // Split work across workers
275
+ const ITEMS_PER_BATCH = 50;
276
+ let nextIdx = 0;
240
277
  let embedded = 0;
241
- db.exec('BEGIN');
278
+ const getNextBatch = () => {
279
+ if (nextIdx >= toEmbed.length)
280
+ return null;
281
+ const batch = toEmbed.slice(nextIdx, nextIdx + ITEMS_PER_BATCH);
282
+ nextIdx += ITEMS_PER_BATCH;
283
+ return batch;
284
+ };
285
+ // Prepare DB statements
286
+ const insertStmt = db.prepare('INSERT OR REPLACE INTO embeddings (nodeId, embedding, textHash) VALUES (?, ?, ?)');
287
+ let nlInsertStmt = null;
242
288
  try {
243
- for (let i = 0; i < toEmbed.length; i += BATCH) {
244
- const batch = toEmbed.slice(i, i + BATCH);
245
- const vecs = await nlEmbedBatch(batch.map(d => d.text));
246
- for (let j = 0; j < batch.length; j++) {
247
- const doc = batch[j];
248
- const vec = vecs[j];
249
- const blob = Buffer.from(new Float32Array(vec).buffer);
250
- insertStmt.run(doc.nodeId, blob, doc.hash, doc.source, doc.text);
251
- embedded++;
289
+ nlInsertStmt = db.prepare('INSERT INTO nl_embeddings (nodeId, embedding, textHash, source, text) VALUES (?, ?, ?, ?, ?)');
290
+ }
291
+ catch { /* nl_embeddings table may not exist */ }
292
+ // Track doc metadata for nl_embeddings text lookup
293
+ const docMap = new Map();
294
+ for (const doc of toEmbed)
295
+ docMap.set(doc.nodeId, { source: doc.source, text: doc.text, hash: doc.hash });
296
+ if (workerCount <= 1) {
297
+ // Single process — use in-process embedding (small workloads)
298
+ await initNlEmbedder();
299
+ db.exec('BEGIN');
300
+ try {
301
+ for (let i = 0; i < toEmbed.length; i += ITEMS_PER_BATCH) {
302
+ const batch = toEmbed.slice(i, i + ITEMS_PER_BATCH);
303
+ const vecs = await nlEmbedBatch(batch.map(d => d.text));
304
+ for (let j = 0; j < batch.length; j++) {
305
+ const doc = batch[j];
306
+ const vec = vecs[j];
307
+ const blob = Buffer.from(new Float32Array(vec).buffer);
308
+ insertStmt.run(doc.nodeId, blob, doc.hash);
309
+ if (nlInsertStmt) {
310
+ try {
311
+ nlInsertStmt.run(doc.nodeId, blob, doc.hash, doc.source, doc.text);
312
+ }
313
+ catch { }
314
+ }
315
+ embedded++;
316
+ }
317
+ onProgress?.(Math.min(i + ITEMS_PER_BATCH, toEmbed.length), toEmbed.length);
252
318
  }
253
- onProgress?.(Math.min(i + BATCH, toEmbed.length), toEmbed.length);
319
+ db.exec('COMMIT');
320
+ }
321
+ catch (err) {
322
+ db.exec('ROLLBACK');
323
+ throw err;
254
324
  }
255
- db.exec('COMMIT');
256
325
  }
257
- catch (err) {
258
- db.exec('ROLLBACK');
259
- throw err;
326
+ else {
327
+ // Multi-process: spawn N workers, dynamic dispatch
328
+ const workers = [];
329
+ const workerReady = [];
330
+ for (let i = 0; i < workerCount; i++) {
331
+ const worker = fork(workerScript, [], { stdio: ['pipe', 'pipe', 'pipe', 'ipc'] });
332
+ workers.push(worker);
333
+ workerReady.push(new Promise((resolve) => {
334
+ const handler = (msg) => {
335
+ if (msg?.type === 'ready') {
336
+ worker.removeListener('message', handler);
337
+ resolve();
338
+ }
339
+ };
340
+ worker.on('message', handler);
341
+ // Timeout: if worker doesn't ready in 30s, skip it
342
+ setTimeout(() => resolve(), 30000);
343
+ }));
344
+ }
345
+ // Wait for all workers to load model
346
+ await Promise.all(workerReady);
347
+ const activeWorkers = workers.filter(w => w.connected);
348
+ if (activeWorkers.length === 0) {
349
+ // Fallback to single process
350
+ await initNlEmbedder();
351
+ db.exec('BEGIN');
352
+ try {
353
+ for (let i = 0; i < toEmbed.length; i += ITEMS_PER_BATCH) {
354
+ const batch = toEmbed.slice(i, i + ITEMS_PER_BATCH);
355
+ const vecs = await nlEmbedBatch(batch.map(d => d.text));
356
+ for (let j = 0; j < batch.length; j++) {
357
+ const doc = batch[j];
358
+ const blob = Buffer.from(new Float32Array(vecs[j]).buffer);
359
+ insertStmt.run(doc.nodeId, blob, doc.hash);
360
+ embedded++;
361
+ }
362
+ onProgress?.(Math.min(i + ITEMS_PER_BATCH, toEmbed.length), toEmbed.length);
363
+ }
364
+ db.exec('COMMIT');
365
+ }
366
+ catch (err) {
367
+ db.exec('ROLLBACK');
368
+ throw err;
369
+ }
370
+ }
371
+ else {
372
+ // Dynamic dispatch: each worker requests next batch when done
373
+ db.exec('BEGIN');
374
+ let batchId = 0;
375
+ const runWorker = (worker) => {
376
+ return new Promise((resolve) => {
377
+ const sendNext = () => {
378
+ const batch = getNextBatch();
379
+ if (!batch) {
380
+ worker.send({ type: 'exit' });
381
+ resolve();
382
+ return;
383
+ }
384
+ worker.send({
385
+ type: 'embed',
386
+ batchId: batchId++,
387
+ items: batch.map(d => ({ nodeId: d.nodeId, text: d.text })),
388
+ });
389
+ };
390
+ worker.on('message', (msg) => {
391
+ if (msg?.type === 'results') {
392
+ // Write results to DB
393
+ for (const r of msg.results) {
394
+ const blob = Buffer.from(new Float32Array(r.vec).buffer);
395
+ const meta = docMap.get(r.nodeId);
396
+ insertStmt.run(r.nodeId, blob, meta?.hash ?? '');
397
+ if (nlInsertStmt && meta) {
398
+ try {
399
+ nlInsertStmt.run(r.nodeId, blob, meta.hash, meta.source, meta.text);
400
+ }
401
+ catch { }
402
+ }
403
+ embedded++;
404
+ }
405
+ onProgress?.(embedded, toEmbed.length);
406
+ sendNext(); // request next batch
407
+ }
408
+ });
409
+ worker.on('exit', () => resolve());
410
+ sendNext(); // start first batch
411
+ });
412
+ };
413
+ try {
414
+ await Promise.all(activeWorkers.map(w => runWorker(w)));
415
+ db.exec('COMMIT');
416
+ }
417
+ catch (err) {
418
+ db.exec('ROLLBACK');
419
+ throw err;
420
+ }
421
+ }
422
+ // Cleanup workers
423
+ for (const w of workers) {
424
+ try {
425
+ w.kill();
426
+ }
427
+ catch { }
428
+ }
260
429
  }
261
430
  return { embedded, skipped, durationMs: Date.now() - t0 };
262
431
  }
@@ -492,46 +492,38 @@ export async function refreshEmbeddings(db, dirtyFiles, hasEmbeddings) {
492
492
  }
493
493
  if (newNodes.length === 0)
494
494
  return;
495
- // Step 3: Enrich with graph context same as the full analyze pipeline
496
- // Lazy import to avoid circular dependency at module load time
497
- const { fetchGraphContext, enrichTextWithGraphContext } = await import('../embeddings/embedding-pipeline.js');
498
- const { generateEmbeddingText } = await import('../embeddings/text-generator.js');
499
- const { initEmbedder, embedBatch, embeddingToArray } = await import('../embeddings/embedder.js');
500
- const graphContext = fetchGraphContext(db, newNodes);
501
- // Step 4: Generate enriched text + hash for skip detection
495
+ // Step 3: Extract NL text and embed with bge-small (same model as full analyze)
496
+ const { extractNlTexts, initNlEmbedder, nlEmbed } = await import('../embeddings/nl-embedder.js');
502
497
  const { createHash } = await import('crypto');
503
498
  const { getEmbeddingHashes } = await import('../db/adapter.js');
504
499
  const existingHashes = getEmbeddingHashes(db);
500
+ await initNlEmbedder();
505
501
  const toEmbed = [];
506
502
  for (const node of newNodes) {
507
- let text = generateEmbeddingText(node);
508
- const ctx = graphContext.get(node.id);
509
- if (ctx) {
510
- text = enrichTextWithGraphContext(text, ctx);
511
- }
512
- const hash = createHash('md5').update(text).digest('hex');
513
- // Skip if hash unchanged (content + graph context identical)
503
+ const nlDocs = extractNlTexts({
504
+ id: node.id, name: node.name, label: node.label,
505
+ filePath: node.filePath, content: node.content || '',
506
+ startLine: node.startLine ?? null, description: node.description || '',
507
+ });
508
+ // Pick best doc (prefer comment over name)
509
+ const best = nlDocs.find(d => d.source === 'comment') || nlDocs[0];
510
+ if (!best)
511
+ continue;
512
+ const hash = createHash('md5').update(best.text).digest('hex');
514
513
  if (existingHashes.get(node.id) === hash)
515
514
  continue;
516
- toEmbed.push({ node, text, hash });
515
+ toEmbed.push({ nodeId: node.id, text: best.text, hash, source: best.source });
517
516
  }
518
517
  if (toEmbed.length === 0) {
519
518
  console.error(`Code Mapper: All ${newNodes.length} node(s) unchanged (hash skip)`);
520
519
  return;
521
520
  }
522
521
  console.error(`Code Mapper: Embedding ${toEmbed.length}/${newNodes.length} node(s) (${newNodes.length - toEmbed.length} unchanged)`);
523
- // Step 5: Ensure embedder is ready
524
- await initEmbedder();
525
- // Step 6: Batch embed only changed nodes
526
- const embeddings = await embedBatch(toEmbed.map(e => e.text));
527
- // Step 7: Insert with hashes
522
+ // Step 4: Embed and insert
528
523
  const items = [];
529
- for (let i = 0; i < toEmbed.length; i++) {
530
- const entry = toEmbed[i];
531
- const emb = embeddings[i];
532
- if (entry?.node && emb) {
533
- items.push({ nodeId: toNodeId(entry.node.id), embedding: embeddingToArray(emb), textHash: entry.hash });
534
- }
524
+ for (const entry of toEmbed) {
525
+ const vec = await nlEmbed(entry.text);
526
+ items.push({ nodeId: toNodeId(entry.nodeId), embedding: vec, textHash: entry.hash });
535
527
  }
536
528
  insertEmbeddingsBatch(db, items);
537
529
  console.error(`Code Mapper: Embedded ${items.length} node(s) incrementally`);