@zuvia-software-solutions/code-mapper 2.3.1 → 2.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
1
1
  // code-mapper/src/cli/analyze.ts
2
2
  /** @file analyze.ts @description Indexes a repository, builds the knowledge graph, and stores it in .code-mapper/ */
3
3
  import path from 'path';
4
+ import os from 'os';
4
5
  import { execFileSync } from 'child_process';
5
6
  import v8 from 'v8';
6
7
  import cliProgress from 'cli-progress';
@@ -93,7 +94,8 @@ export const analyzeCommand = async (inputPath, options) => {
93
94
  }
94
95
  // Single progress bar for the entire pipeline
95
96
  const bar = new cliProgress.SingleBar({
96
- format: ' {bar} {percentage}% | {phase}',
97
+ // \x1b[K at end clears to EOL so shorter redraws don't leave trailing characters
98
+ format: ' {bar} {percentage}% | {phase} | {resources}\x1b[K',
97
99
  barCompleteChar: '\u2588',
98
100
  barIncompleteChar: '\u2591',
99
101
  hideCursor: true,
@@ -131,6 +133,37 @@ export const analyzeCommand = async (inputPath, options) => {
131
133
  console.log = barLog;
132
134
  console.warn = barLog;
133
135
  console.error = barLog;
136
+ const t0Global = Date.now();
137
+ const cpuStart = process.cpuUsage();
138
+ let peakRssMB = 0;
139
+ // Phase timing tracker — records wall time and RSS for each phase
140
+ const phaseTimes = [];
141
+ let currentPhaseName = 'init';
142
+ let currentPhaseStart = Date.now();
143
+ const recordPhase = (nextPhase) => {
144
+ const now = Date.now();
145
+ const elapsed = now - currentPhaseStart;
146
+ if (elapsed > 0) {
147
+ phaseTimes.push({
148
+ name: currentPhaseName,
149
+ ms: elapsed,
150
+ rssMB: Math.round(process.memoryUsage.rss() / (1024 * 1024)),
151
+ });
152
+ }
153
+ currentPhaseName = nextPhase;
154
+ currentPhaseStart = now;
155
+ };
156
+ // Live resource stats for the progress bar
157
+ const cpuCount = os.cpus().length;
158
+ const getResourceStats = () => {
159
+ const rssMB = Math.round(process.memoryUsage.rss() / (1024 * 1024));
160
+ if (rssMB > peakRssMB)
161
+ peakRssMB = rssMB;
162
+ const cpuDelta = process.cpuUsage(cpuStart);
163
+ const wallMs = Date.now() - t0Global || 1;
164
+ const cpuPct = Math.round(((cpuDelta.user + cpuDelta.system) / 1e3) / wallMs * 100);
165
+ return `${rssMB}MB | CPU ${cpuPct}%`;
166
+ };
134
167
  // Track elapsed time per phase — both updateBar and the interval use
135
168
  // the same format so they don't flicker against each other
136
169
  let lastPhaseLabel = 'Initializing...';
@@ -143,17 +176,16 @@ export const analyzeCommand = async (inputPath, options) => {
143
176
  }
144
177
  const elapsed = Math.round((Date.now() - phaseStart) / 1000);
145
178
  const display = elapsed >= 3 ? `${phaseLabel} (${elapsed}s)` : phaseLabel;
146
- bar.update(value, { phase: display });
179
+ bar.update(value, { phase: display, resources: getResourceStats() });
147
180
  };
148
181
  // Tick elapsed seconds for phases with infrequent progress callbacks
149
182
  // (e.g. CSV streaming, FTS indexing) — uses the same display format as updateBar
150
183
  const elapsedTimer = setInterval(() => {
151
184
  const elapsed = Math.round((Date.now() - phaseStart) / 1000);
152
185
  if (elapsed >= 3) {
153
- bar.update({ phase: `${lastPhaseLabel} (${elapsed}s)` });
186
+ bar.update({ phase: `${lastPhaseLabel} (${elapsed}s)`, resources: getResourceStats() });
154
187
  }
155
188
  }, 1000);
156
- const t0Global = Date.now();
157
189
  // Cache embeddings from existing index before rebuild
158
190
  let cachedEmbeddingNodeIds = new Set();
159
191
  let cachedEmbeddings = [];
@@ -180,15 +212,24 @@ export const analyzeCommand = async (inputPath, options) => {
180
212
  }
181
213
  }
182
214
  // Phase 1: Full Pipeline (0-60%)
215
+ let lastPipelinePhase = '';
183
216
  const pipelineResult = await runPipelineFromRepo(repoPath, (progress) => {
184
- const phaseLabel = PHASE_LABELS[progress.phase] || progress.phase;
217
+ if (progress.phase !== lastPipelinePhase) {
218
+ recordPhase(progress.phase);
219
+ lastPipelinePhase = progress.phase;
220
+ }
221
+ let phaseLabel = PHASE_LABELS[progress.phase] || progress.phase;
222
+ if (progress.stats && progress.stats.totalFiles > 0 &&
223
+ (progress.phase === 'parsing' || progress.phase === 'extracting')) {
224
+ phaseLabel += ` (${progress.stats.filesProcessed.toLocaleString()}/${progress.stats.totalFiles.toLocaleString()})`;
225
+ }
185
226
  const scaled = Math.round(progress.percent * 0.6);
186
227
  updateBar(scaled, phaseLabel);
187
228
  }, options?.tsgo === false ? { tsgo: false } : {});
188
229
  // Phase 2: SQLite (60-85%)
230
+ recordPhase('sqlite');
189
231
  updateBar(60, 'Loading into database...');
190
232
  // Reset the database (delete and recreate)
191
- const t0Db = Date.now();
192
233
  let db = resetDb(dbPath);
193
234
  let dbMsgCount = 0;
194
235
  const dbResult = loadGraphToDb(db, pipelineResult.graph, pipelineResult.repoPath, (msg) => {
@@ -196,20 +237,21 @@ export const analyzeCommand = async (inputPath, options) => {
196
237
  const progress = Math.min(84, 60 + Math.round((dbMsgCount / (dbMsgCount + 10)) * 24));
197
238
  updateBar(progress, msg);
198
239
  });
199
- const dbTime = ((Date.now() - t0Db) / 1000).toFixed(1);
200
240
  const dbWarnings = dbResult.warnings;
201
241
  // Phase 2.5: HTTP route stitching (post-DB-load, needs content field)
242
+ recordPhase('routes');
202
243
  stitchRoutes(db);
203
244
  // Phase 2.6: Populate searchText for BM25 concept matching
204
245
  // Uses first comment + callers + module — must run after edges are loaded
246
+ recordPhase('search-text');
205
247
  updateBar(84, 'Building search index...');
206
248
  populateSearchText(db);
207
249
  // Phase 3: FTS (85-90%)
208
250
  // FTS5 is auto-created by schema triggers — no manual index creation needed
251
+ recordPhase('fts');
209
252
  updateBar(85, 'Search indexes ready');
210
- const t0Fts = Date.now();
211
- const ftsTime = ((Date.now() - t0Fts) / 1000).toFixed(1);
212
253
  // Phase 3.5: Re-insert cached embeddings
254
+ recordPhase('restore-embeddings');
213
255
  if (cachedEmbeddings.length > 0) {
214
256
  updateBar(88, `Restoring ${cachedEmbeddings.length} cached embeddings...`);
215
257
  const EMBED_BATCH = 200;
@@ -226,15 +268,9 @@ export const analyzeCommand = async (inputPath, options) => {
226
268
  }
227
269
  // Phase 4: Embeddings (90-98%)
228
270
  const stats = getStats(db);
229
- let embeddingTime = '0.0';
230
- let embeddingSkipped = true;
231
- let embeddingSkipReason = 'off (use --no-embeddings to skip)';
232
271
  if (options?.embeddings) {
233
- embeddingSkipped = false;
234
- }
235
- if (!embeddingSkipped) {
272
+ recordPhase('embeddings');
236
273
  updateBar(90, 'Generating embeddings...');
237
- const t0Emb = Date.now();
238
274
  // Close DB so Python can write to it
239
275
  closeDb(dbPath);
240
276
  // Run Python embedder in batch mode — reads from SQLite, embeds, writes back.
@@ -266,17 +302,24 @@ export const analyzeCommand = async (inputPath, options) => {
266
302
  continue;
267
303
  try {
268
304
  const msg = JSON.parse(line);
269
- if (msg.phase === 'loaded') {
305
+ if (msg.phase === 'downloading' || msg.phase === 'converting') {
306
+ updateBar(90, msg.message);
307
+ }
308
+ else if (msg.phase === 'loaded') {
270
309
  updateBar(91, `Model loaded (${msg.load_ms}ms)`);
271
310
  }
272
311
  else if (msg.phase === 'queried') {
273
- updateBar(92, `Found ${msg.nodes} embeddable nodes`);
312
+ updateBar(92, `Found ${msg.nodes} embeddable nodes${msg.skipped_tests ? ` (${msg.skipped_tests} test files skipped)` : ''}`);
274
313
  }
275
314
  else if (msg.phase === 'prepared') {
276
315
  updateBar(93, `${msg.to_embed} to embed, ${msg.skipped} cached`);
277
316
  }
317
+ else if (msg.phase === 'embedding') {
318
+ const scaled = 93 + Math.round((msg.progress / 100) * 4);
319
+ updateBar(scaled, `Embedding... ${msg.progress}% (${msg.embedded} written)`);
320
+ }
278
321
  else if (msg.phase === 'embedded') {
279
- updateBar(96, `Embedded ${msg.count} nodes (${(msg.ms / 1000).toFixed(1)}s)`);
322
+ updateBar(97, `Embedded ${msg.count} nodes (${(msg.ms / 1000).toFixed(1)}s)`);
280
323
  }
281
324
  else if (msg.phase === 'done') {
282
325
  updateBar(98, `Embeddings complete (${msg.embedded} new, ${msg.skipped} cached)`);
@@ -288,9 +331,9 @@ export const analyzeCommand = async (inputPath, options) => {
288
331
  });
289
332
  // Reopen DB after Python is done
290
333
  db = openDb(dbPath);
291
- embeddingTime = ((Date.now() - t0Emb) / 1000).toFixed(1);
292
334
  }
293
335
  // Phase 5: Finalize (98-100%)
336
+ recordPhase('finalize');
294
337
  updateBar(98, 'Saving metadata...');
295
338
  // Count embeddings in the index (cached + newly generated) for metadata
296
339
  const embeddingCount = countEmbeddings(db);
@@ -331,19 +374,26 @@ export const analyzeCommand = async (inputPath, options) => {
331
374
  ...(processCount !== undefined ? { processes: processCount } : {}),
332
375
  });
333
376
  closeDb(dbPath);
377
+ recordPhase('done'); // close the last phase
334
378
  const totalTime = ((Date.now() - t0Global) / 1000).toFixed(1);
335
379
  clearInterval(elapsedTimer);
336
380
  process.removeListener('SIGINT', sigintHandler);
337
381
  console.log = origLog;
338
382
  console.warn = origWarn;
339
383
  console.error = origError;
340
- bar.update(100, { phase: 'Done' });
384
+ bar.update(100, { phase: 'Done', resources: '' });
341
385
  bar.stop();
386
+ // Clear any leftover characters from the progress bar line
387
+ process.stdout.write('\x1b[2K');
342
388
  // Summary
343
389
  const embeddingsCached = cachedEmbeddings.length > 0;
344
390
  console.log(`\n Repository indexed successfully (${totalTime}s)${embeddingsCached ? ` [${cachedEmbeddings.length} embeddings cached]` : ''}\n`);
345
391
  console.log(` ${stats.nodes.toLocaleString()} nodes | ${stats.edges.toLocaleString()} edges | ${pipelineResult.communityResult?.stats.totalCommunities || 0} clusters | ${pipelineResult.processResult?.stats.totalProcesses || 0} flows`);
346
- console.log(` SQLite ${dbTime}s | FTS ${ftsTime}s | Embeddings ${embeddingSkipped ? embeddingSkipReason : embeddingTime + 's'}`);
392
+ // Resource usage
393
+ const cpuEnd = process.cpuUsage(cpuStart);
394
+ const wallMs = Date.now() - t0Global || 1;
395
+ const cpuPct = Math.round(((cpuEnd.user + cpuEnd.system) / 1e3) / wallMs * 100);
396
+ console.log(` Memory: peak ${peakRssMB}MB RSS | CPU: ${cpuPct}% (${cpuCount} cores)`);
347
397
  console.log(` tsgo: ${pipelineResult.tsgoEnabled ? 'enabled (compiler-verified call resolution)' : 'disabled — install @typescript/native-preview for higher accuracy'}`);
348
398
  console.log(` ${repoPath}`);
349
399
  if (aiContext.files.length > 0) {
@@ -353,6 +403,39 @@ export const analyzeCommand = async (inputPath, options) => {
353
403
  if (dbWarnings.length > 0) {
354
404
  console.log(` Note: ${dbWarnings.length} warnings during graph load`);
355
405
  }
406
+ // Detailed performance breakdown
407
+ const totalMs = phaseTimes.reduce((s, p) => s + p.ms, 0) || 1;
408
+ const PHASE_DISPLAY_NAMES = {
409
+ init: 'Init',
410
+ extracting: 'Scanning files',
411
+ structure: 'Building structure',
412
+ parsing: 'Parsing & resolving',
413
+ imports: 'Resolving imports',
414
+ calls: 'Tracing calls',
415
+ heritage: 'Extracting inheritance',
416
+ communities: 'Detecting communities',
417
+ processes: 'Detecting processes',
418
+ enriching: 'Enriching clusters',
419
+ complete: 'Pipeline complete',
420
+ sqlite: 'SQLite load',
421
+ routes: 'Route stitching',
422
+ 'search-text': 'Search text',
423
+ fts: 'FTS indexing',
424
+ 'restore-embeddings': 'Restore embeddings',
425
+ embeddings: 'Embeddings (MLX)',
426
+ finalize: 'Finalize & context',
427
+ done: 'Done',
428
+ };
429
+ console.log('\n Phase breakdown:');
430
+ for (const phase of phaseTimes) {
431
+ const sec = (phase.ms / 1000).toFixed(1);
432
+ const pct = Math.round((phase.ms / totalMs) * 100);
433
+ const name = PHASE_DISPLAY_NAMES[phase.name] || phase.name;
434
+ const bar = pct >= 2 ? ' ' + '█'.repeat(Math.max(1, Math.round(pct / 3))) : '';
435
+ console.log(` ${name.padEnd(22)} ${sec.padStart(6)}s ${String(pct).padStart(3)}% ${phase.rssMB}MB${bar}`);
436
+ }
437
+ console.log(` ${'─'.repeat(50)}`);
438
+ console.log(` ${'Total'.padEnd(22)} ${totalTime.padStart(6)}s 100% ${peakRssMB}MB peak`);
356
439
  try {
357
440
  await fs.access(getGlobalRegistryPath());
358
441
  }
package/dist/cli/index.js CHANGED
File without changes
@@ -4,7 +4,10 @@ import fs from 'fs/promises';
4
4
  import path from 'path';
5
5
  import { glob } from 'glob';
6
6
  import { createIgnoreFilter } from '../../config/ignore-service.js';
7
- const READ_CONCURRENCY = 32;
7
+ // Stat is metadata-only (no I/O), can be highly concurrent
8
+ const STAT_CONCURRENCY = 256;
9
+ // File reads move actual data, keep bounded to avoid fd exhaustion
10
+ const READ_CONCURRENCY = 64;
8
11
  /** Scan repository: stat files to get paths + sizes, no content loaded (~10MB for 100K files) */
9
12
  export const walkRepositoryPaths = async (repoPath, onProgress) => {
10
13
  const ignoreFilter = await createIgnoreFilter(repoPath);
@@ -16,8 +19,8 @@ export const walkRepositoryPaths = async (repoPath, onProgress) => {
16
19
  });
17
20
  const entries = [];
18
21
  let processed = 0;
19
- for (let start = 0; start < filtered.length; start += READ_CONCURRENCY) {
20
- const batch = filtered.slice(start, start + READ_CONCURRENCY);
22
+ for (let start = 0; start < filtered.length; start += STAT_CONCURRENCY) {
23
+ const batch = filtered.slice(start, start + STAT_CONCURRENCY);
21
24
  const results = await Promise.allSettled(batch.map(async (relativePath) => {
22
25
  const fullPath = path.join(repoPath, relativePath);
23
26
  const stat = await fs.stat(fullPath);
@@ -23,10 +23,14 @@ const processParsingWithWorkers = async (graph, files, symbolTable, _astCache, w
23
23
  if (parseableFiles.length === 0)
24
24
  return { imports: [], calls: [], heritage: [], routes: [], constructorBindings: [] };
25
25
  const total = files.length;
26
- // Dispatch to worker pool
27
- const chunkResults = await workerPool.dispatch(parseableFiles, (filesProcessed) => {
26
+ // Dispatch to worker pool with size-balanced distribution
27
+ const { results: chunkResults, failures } = await workerPool.dispatch(parseableFiles, (filesProcessed) => {
28
28
  onFileProgress?.(Math.min(filesProcessed, total), total, 'Parsing...');
29
- });
29
+ }, (item) => item.content.length);
30
+ // Log worker failures (don't throw — partial results are still valuable)
31
+ for (const failure of failures) {
32
+ console.error(` Worker failure (partial results preserved): ${failure.message}`);
33
+ }
30
34
  // Merge worker results into graph and symbol table
31
35
  const allImports = [];
32
36
  const allCalls = [];
@@ -21,7 +21,10 @@ import { fileURLToPath, pathToFileURL } from 'node:url';
21
21
  import { memoryGuard } from '../../lib/memory-guard.js';
22
22
  import { toNodeId, toEdgeId } from '../db/schema.js';
23
23
  import { getTsgoService } from '../semantic/tsgo-service.js';
24
- const isDev = process.env['NODE_ENV'] === 'development';
24
+ const verbose = (...args) => {
25
+ if (process.env['CODE_MAPPER_VERBOSE'])
26
+ console.error(...args);
27
+ };
25
28
  // Default chunk budget — used when memory is plentiful.
26
29
  // Under memory pressure, adaptiveBatchSize() shrinks this automatically.
27
30
  const DEFAULT_CHUNK_BYTE_BUDGET = 50 * 1024 * 1024;
@@ -122,9 +125,9 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
122
125
  if (currentChunk.length > 0)
123
126
  chunks.push(currentChunk);
124
127
  const numChunks = chunks.length;
125
- if (isDev) {
128
+ {
126
129
  const totalMB = parseableScanned.reduce((s, f) => s + f.size, 0) / (1024 * 1024);
127
- console.log(`📂 Scan: ${totalFiles} paths, ${totalParseable} parseable (${totalMB.toFixed(0)}MB), ${numChunks} chunks @ ${Math.round(chunkBudget / (1024 * 1024))}MB budget (${memoryGuard.summary()})`);
130
+ verbose(`[parse] ${totalFiles} paths, ${totalParseable} parseable (${totalMB.toFixed(0)}MB), ${numChunks} chunks @ ${Math.round(chunkBudget / (1024 * 1024))}MB budget (${memoryGuard.summary()})`);
128
131
  }
129
132
  onProgress({
130
133
  phase: 'parsing',
@@ -148,8 +151,7 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
148
151
  workerPool = createWorkerPool(workerUrl);
149
152
  }
150
153
  catch (err) {
151
- if (isDev)
152
- console.warn('Worker pool creation failed, using sequential fallback:', err.message);
154
+ console.error('[parse] worker pool creation failed, using sequential fallback:', err.message);
153
155
  }
154
156
  let filesParsedSoFar = 0;
155
157
  // AST cache sized for one chunk (used by sequential fallback for import/call/heritage)
@@ -171,12 +173,17 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
171
173
  const chunkPaths = chunks[chunkIdx];
172
174
  if (!chunkPaths)
173
175
  continue;
176
+ const chunkStart = Date.now();
174
177
  // Read content for this chunk
175
178
  const chunkContents = await readFileContents(repoPath, chunkPaths);
176
179
  const chunkFiles = chunkPaths
177
180
  .filter(p => chunkContents.has(p))
178
181
  .map(p => ({ path: p, content: chunkContents.get(p) }));
182
+ const readMs = Date.now() - chunkStart;
183
+ const chunkMB = chunkFiles.reduce((s, f) => s + f.content.length, 0) / (1024 * 1024);
184
+ verbose(`[parse] chunk ${chunkIdx + 1}/${numChunks}: ${chunkFiles.length} files (${chunkMB.toFixed(1)}MB), read ${readMs}ms`);
179
185
  // Parse chunk (workers or sequential fallback)
186
+ const parseStart = Date.now();
180
187
  const chunkWorkerData = await processParsing(graph, chunkFiles, symbolTable, astCache, (current, _total, filePath) => {
181
188
  const globalCurrent = filesParsedSoFar + current;
182
189
  const parsingProgress = 20 + ((globalCurrent / totalParseable) * 62);
@@ -188,6 +195,8 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
188
195
  stats: { filesProcessed: globalCurrent, totalFiles: totalParseable, nodesCreated: graph.nodeCount },
189
196
  });
190
197
  }, workerPool);
198
+ const parseMs = Date.now() - parseStart;
199
+ verbose(`[parse] chunk ${chunkIdx + 1}/${numChunks}: parsed ${parseMs}ms (${memoryGuard.summary()})`);
191
200
  const chunkBasePercent = 20 + ((filesParsedSoFar / totalParseable) * 62);
192
201
  if (chunkWorkerData) {
193
202
  // Resolve imports per-chunk (file-level, doesn't need full symbol table)
@@ -235,14 +244,14 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
235
244
  sequentialChunkPaths.push(chunkPaths);
236
245
  }
237
246
  filesParsedSoFar += chunkFiles.length;
247
+ const totalChunkMs = Date.now() - chunkStart;
248
+ verbose(`[parse] chunk ${chunkIdx + 1}/${numChunks}: total ${totalChunkMs}ms, ${filesParsedSoFar}/${totalParseable} files done`);
238
249
  // Clear AST cache between chunks to free memory; chunk locals go out of scope for GC
239
250
  astCache.clear();
240
251
  // Attempt GC between chunks if under memory pressure
241
252
  if (memoryGuard.isUnderPressure()) {
242
253
  memoryGuard.tryGC();
243
- if (isDev) {
244
- console.log(`⚠️ Memory pressure after chunk ${chunkIdx + 1}: ${memoryGuard.summary()}`);
245
- }
254
+ verbose(`[parse] memory pressure after chunk ${chunkIdx + 1}: ${memoryGuard.summary()}`);
246
255
  }
247
256
  }
248
257
  }
@@ -301,12 +310,11 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
301
310
  tsgoService?.stop();
302
311
  }
303
312
  }
304
- // Log resolution cache stats in dev mode
305
- if (isDev) {
313
+ {
306
314
  const rcStats = ctx.getStats();
307
315
  const total = rcStats.cacheHits + rcStats.cacheMisses;
308
316
  const hitRate = total > 0 ? ((rcStats.cacheHits / total) * 100).toFixed(1) : '0';
309
- console.log(`🔍 Resolution cache: ${rcStats.cacheHits} hits, ${rcStats.cacheMisses} misses (${hitRate}% hit rate)`);
317
+ verbose(`[resolve] cache: ${rcStats.cacheHits} hits, ${rcStats.cacheMisses} misses (${hitRate}% hit rate)`);
310
318
  }
311
319
  // Free import resolution context (~94MB+ for large repos)
312
320
  allPathObjects.length = 0;
@@ -318,13 +326,13 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
318
326
  createDependsOnEdges(graph, ctx),
319
327
  createProvidesEdges(graph, ctx),
320
328
  ]);
321
- if (isDev && (diEdgeCount > 0 || providesEdgeCount > 0)) {
322
- console.log(`💉 DI: ${diEdgeCount} DEPENDS_ON edges, ${providesEdgeCount} PROVIDES edges`);
329
+ if (diEdgeCount > 0 || providesEdgeCount > 0) {
330
+ verbose(`[resolve] DI: ${diEdgeCount} DEPENDS_ON, ${providesEdgeCount} PROVIDES edges`);
323
331
  }
324
332
  // Phase 4.5a2: Interface dispatch — connect callers of interfaces to implementations
325
333
  const ifaceEdges = await resolveInterfaceDispatches(graph, ctx);
326
- if (isDev && ifaceEdges > 0) {
327
- console.log(`🔌 Interface dispatch: ${ifaceEdges} implementation edges`);
334
+ if (ifaceEdges > 0) {
335
+ verbose(`[resolve] interface dispatch: ${ifaceEdges} implementation edges`);
328
336
  }
329
337
  // Phase 4.5b: Method Resolution Order
330
338
  onProgress({
@@ -334,8 +342,8 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
334
342
  stats: { filesProcessed: totalFiles, totalFiles, nodesCreated: graph.nodeCount },
335
343
  });
336
344
  const mroResult = computeMRO(graph);
337
- if (isDev && mroResult.entries.length > 0) {
338
- console.log(`🔀 MRO: ${mroResult.entries.length} classes analyzed, ${mroResult.ambiguityCount} ambiguities found, ${mroResult.overrideEdges} OVERRIDES edges`);
345
+ if (mroResult.entries.length > 0) {
346
+ verbose(`[resolve] MRO: ${mroResult.entries.length} classes, ${mroResult.ambiguityCount} ambiguities, ${mroResult.overrideEdges} OVERRIDES edges`);
339
347
  }
340
348
  // Phase 5: Communities
341
349
  onProgress({
@@ -353,9 +361,7 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
353
361
  stats: { filesProcessed: totalFiles, totalFiles, nodesCreated: graph.nodeCount },
354
362
  });
355
363
  });
356
- if (isDev) {
357
- console.log(`🏘️ Community detection: ${communityResult.stats.totalCommunities} communities found (modularity: ${communityResult.stats.modularity.toFixed(3)})`);
358
- }
364
+ verbose(`[community] ${communityResult.stats.totalCommunities} communities (modularity: ${communityResult.stats.modularity.toFixed(3)})`);
359
365
  communityResult.communities.forEach(comm => {
360
366
  graph.addNode({
361
367
  id: toNodeId(comm.id),
@@ -399,9 +405,7 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
399
405
  stats: { filesProcessed: totalFiles, totalFiles, nodesCreated: graph.nodeCount },
400
406
  });
401
407
  }, { maxProcesses: dynamicMaxProcesses, minSteps: 3 });
402
- if (isDev) {
403
- console.log(`🔄 Process detection: ${processResult.stats.totalProcesses} processes found (${processResult.stats.crossCommunityCount} cross-community)`);
404
- }
408
+ verbose(`[process] ${processResult.stats.totalProcesses} processes (${processResult.stats.crossCommunityCount} cross-community)`);
405
409
  processResult.processes.forEach(proc => {
406
410
  graph.addNode({
407
411
  id: toNodeId(proc.id),
@@ -174,7 +174,7 @@ const processBatch = (files, onProgress) => {
174
174
  }
175
175
  let totalProcessed = 0;
176
176
  let lastReported = 0;
177
- const PROGRESS_INTERVAL = 100; // report every 100 files
177
+ const PROGRESS_INTERVAL = 25; // report every 25 files — resets the sub-batch timer
178
178
  const onFileProcessed = onProgress ? () => {
179
179
  totalProcessed++;
180
180
  if (totalProcessed - lastReported >= PROGRESS_INTERVAL) {
@@ -1,11 +1,16 @@
1
1
  /** @file Generic worker thread pool with sub-batch streaming for bounded memory usage */
2
2
  export interface WorkerPool {
3
3
  /**
4
- * Dispatch items across workers with sub-batch streaming
5
- * @param items - Items to process (split into chunks, one per worker)
4
+ * Dispatch items across workers with sub-batch streaming.
5
+ * Uses Promise.allSettled so one worker failure doesn't discard other workers' results.
6
+ * @param items - Items to process (split across workers using size-balanced round-robin)
6
7
  * @param onProgress - Optional callback for progress reporting
8
+ * @param getItemSize - Optional function to extract item size for balanced dispatch
7
9
  */
8
- dispatch<TInput, TResult>(items: TInput[], onProgress?: (filesProcessed: number) => void): Promise<TResult[]>;
10
+ dispatch<TInput, TResult>(items: TInput[], onProgress?: (filesProcessed: number) => void, getItemSize?: (item: TInput) => number): Promise<{
11
+ results: TResult[];
12
+ failures: Error[];
13
+ }>;
9
14
  /** Terminate all workers (must be called when done) */
10
15
  terminate(): Promise<void>;
11
16
  /** Number of workers in the pool */
@@ -5,9 +5,40 @@ import os from 'node:os';
5
5
  import fs from 'node:fs';
6
6
  import { fileURLToPath } from 'node:url';
7
7
  // Max files per postMessage to keep structured-clone memory bounded
8
- const SUB_BATCH_SIZE = 1500;
9
- // Per sub-batch timeout — large codebases with big files need more time
10
- const SUB_BATCH_TIMEOUT_MS = 120_000;
8
+ const SUB_BATCH_SIZE = 500;
9
+ // Base sub-batch timeout — extended proportionally to file count
10
+ const BASE_TIMEOUT_MS = 120_000;
11
+ // Per-file timeout extension (200ms per file in the sub-batch)
12
+ const PER_FILE_TIMEOUT_MS = 200;
13
+ /** Compute proportional timeout: max(base, fileCount * perFile) */
14
+ const computeTimeout = (fileCount) => Math.max(BASE_TIMEOUT_MS, fileCount * PER_FILE_TIMEOUT_MS);
15
+ /**
16
+ * Distribute items across N buckets using size-balanced round-robin (LPT heuristic).
17
+ * Items are sorted by size descending, then assigned to the bucket with the smallest total.
18
+ * This minimizes the makespan of the heaviest bucket.
19
+ */
20
+ const sizeBalancedDistribute = (items, bucketCount, getSize) => {
21
+ if (bucketCount <= 0)
22
+ return [];
23
+ if (items.length === 0)
24
+ return Array.from({ length: bucketCount }, () => []);
25
+ // Sort indices by size descending
26
+ const indices = items.map((_, i) => i);
27
+ indices.sort((a, b) => getSize(items[b]) - getSize(items[a]));
28
+ const buckets = Array.from({ length: bucketCount }, () => []);
29
+ const bucketSizes = new Array(bucketCount).fill(0);
30
+ for (const idx of indices) {
31
+ // Find the bucket with the smallest total size
32
+ let minBucket = 0;
33
+ for (let b = 1; b < bucketCount; b++) {
34
+ if (bucketSizes[b] < bucketSizes[minBucket])
35
+ minBucket = b;
36
+ }
37
+ buckets[minBucket].push(items[idx]);
38
+ bucketSizes[minBucket] += getSize(items[idx]);
39
+ }
40
+ return buckets;
41
+ };
11
42
  /** Create a pool of worker threads */
12
43
  export const createWorkerPool = (workerUrl, poolSize) => {
13
44
  // Validate worker script exists before spawning to prevent MODULE_NOT_FOUND crashes
@@ -20,13 +51,20 @@ export const createWorkerPool = (workerUrl, poolSize) => {
20
51
  for (let i = 0; i < size; i++) {
21
52
  workers.push(new Worker(workerUrl));
22
53
  }
23
- const dispatch = (items, onProgress) => {
54
+ const dispatch = (items, onProgress, getItemSize) => {
24
55
  if (items.length === 0)
25
- return Promise.resolve([]);
26
- const chunkSize = Math.ceil(items.length / size);
27
- const chunks = [];
28
- for (let i = 0; i < items.length; i += chunkSize) {
29
- chunks.push(items.slice(i, i + chunkSize));
56
+ return Promise.resolve({ results: [], failures: [] });
57
+ // Size-balanced dispatch when size function provided, otherwise equal split
58
+ let chunks;
59
+ if (getItemSize) {
60
+ chunks = sizeBalancedDistribute(items, Math.min(size, items.length), getItemSize);
61
+ }
62
+ else {
63
+ const chunkSize = Math.ceil(items.length / size);
64
+ chunks = [];
65
+ for (let i = 0; i < items.length; i += chunkSize) {
66
+ chunks.push(items.slice(i, i + chunkSize));
67
+ }
30
68
  }
31
69
  const workerProgress = new Array(chunks.length).fill(0);
32
70
  const promises = chunks.map((chunk, i) => {
@@ -37,6 +75,7 @@ export const createWorkerPool = (workerUrl, poolSize) => {
37
75
  return new Promise((resolve, reject) => {
38
76
  let settled = false;
39
77
  let subBatchTimer = null;
78
+ let currentSubBatchSize = 0;
40
79
  const cleanup = () => {
41
80
  if (subBatchTimer)
42
81
  clearTimeout(subBatchTimer);
@@ -47,13 +86,14 @@ export const createWorkerPool = (workerUrl, poolSize) => {
47
86
  const resetSubBatchTimer = () => {
48
87
  if (subBatchTimer)
49
88
  clearTimeout(subBatchTimer);
89
+ const timeout = computeTimeout(currentSubBatchSize);
50
90
  subBatchTimer = setTimeout(() => {
51
91
  if (!settled) {
52
92
  settled = true;
53
93
  cleanup();
54
- reject(new Error(`Worker ${i} sub-batch timed out after ${SUB_BATCH_TIMEOUT_MS / 1000}s (chunk: ${chunk.length} items).`));
94
+ reject(new Error(`Worker ${i} sub-batch timed out after ${timeout / 1000}s (chunk: ${chunk.length} items, sub-batch: ${currentSubBatchSize} items).`));
55
95
  }
56
- }, SUB_BATCH_TIMEOUT_MS);
96
+ }, timeout);
57
97
  };
58
98
  let subBatchIdx = 0;
59
99
  const sendNextSubBatch = () => {
@@ -63,6 +103,7 @@ export const createWorkerPool = (workerUrl, poolSize) => {
63
103
  return;
64
104
  }
65
105
  const subBatch = chunk.slice(start, start + SUB_BATCH_SIZE);
106
+ currentSubBatchSize = subBatch.length;
66
107
  subBatchIdx++;
67
108
  resetSubBatchTimer();
68
109
  worker.postMessage({ type: 'sub-batch', files: subBatch });
@@ -71,6 +112,8 @@ export const createWorkerPool = (workerUrl, poolSize) => {
71
112
  if (settled)
72
113
  return;
73
114
  if (msg && msg.type === 'progress') {
115
+ // BUG FIX: Reset timer on progress — worker is alive and making progress
116
+ resetSubBatchTimer();
74
117
  workerProgress[i] = msg.filesProcessed;
75
118
  if (onProgress) {
76
119
  const total = workerProgress.reduce((a, b) => a + b, 0);
@@ -116,7 +159,20 @@ export const createWorkerPool = (workerUrl, poolSize) => {
116
159
  sendNextSubBatch();
117
160
  });
118
161
  });
119
- return Promise.all(promises);
162
+ // Use allSettled so one worker failure doesn't discard other workers' results
163
+ return Promise.allSettled(promises).then(outcomes => {
164
+ const results = [];
165
+ const failures = [];
166
+ for (const outcome of outcomes) {
167
+ if (outcome.status === 'fulfilled') {
168
+ results.push(outcome.value);
169
+ }
170
+ else {
171
+ failures.push(outcome.reason instanceof Error ? outcome.reason : new Error(String(outcome.reason)));
172
+ }
173
+ }
174
+ return { results, failures };
175
+ });
120
176
  };
121
177
  const terminate = async () => {
122
178
  await Promise.all(workers.map(w => w.terminate()));
@@ -415,26 +415,81 @@ def batch_mode(db_path, dims=256, max_tokens=2048):
415
415
  unique_texts = [v["text"] for v in unique_by_hash.values()]
416
416
  deduped = len(to_embed) - len(unique_texts)
417
417
 
418
- # Embed only unique texts
418
+ # Embed unique texts in streaming fashion — process each batch, write to DB
419
+ # immediately, free GPU memory. Keeps peak memory at ONE batch instead of ALL.
419
420
  t0_embed = time.time()
420
- embeddings = embed_tiered(model, tokenizer, unique_texts, "retrieval.passage", dims, max_tokens)
421
- embed_ms = int((time.time() - t0_embed) * 1000)
421
+ unique_entries = list(unique_by_hash.values())
422
422
 
423
- print(json.dumps({"phase": "embedded", "count": len(unique_texts), "deduped": deduped, "ms": embed_ms}), flush=True)
423
+ # Tokenize + sort (same as embed_tiered but we handle the loop here)
424
+ is_code_model = "jina-code" in MODEL_DIR
425
+ if is_code_model:
426
+ prefix_map = {"retrieval.query": "Find the most relevant code snippet given the following query:\n", "retrieval.passage": "Candidate code snippet:\n"}
427
+ else:
428
+ prefix_map = {"retrieval.query": "Query: ", "retrieval.passage": "Document: "}
429
+ prefix = prefix_map.get("retrieval.passage", "")
430
+ prefixed = [prefix + e["text"] for e in unique_entries]
431
+ encodings = tokenizer.encode_batch(prefixed)
432
+ indexed = sorted(range(len(unique_entries)), key=lambda i: len(encodings[i].ids))
424
433
 
425
- # Write to database — copy embedding to all nodes sharing the same hash
426
- t0_write = time.time()
434
+ embedded_count = 0
427
435
  db.execute("BEGIN")
428
- for i, (text_hash, entry) in enumerate(unique_by_hash.items()):
429
- emb = embeddings[i]
430
- if emb is None:
431
- continue
432
- blob = float_list_to_blob(emb)
433
- for nid, th in entry["node_ids"]:
434
- db.execute("INSERT OR REPLACE INTO embeddings (nodeId, embedding, textHash) VALUES (?, ?, ?)",
435
- (nid, blob, th))
436
+
437
+ i = 0
438
+ while i < len(indexed):
439
+ peek_idx = indexed[min(i + 1, len(indexed) - 1)]
440
+ tok_count = min(len(encodings[peek_idx].ids), max_tokens)
441
+ batch_size = get_batch_size_for_tokens(tok_count)
442
+
443
+ batch_indices = []
444
+ batch_encs = []
445
+ while len(batch_encs) < batch_size and i < len(indexed):
446
+ orig_idx = indexed[i]
447
+ batch_indices.append(orig_idx)
448
+ batch_encs.append(encodings[orig_idx])
449
+ i += 1
450
+
451
+ max_len = min(max_tokens, max(len(e.ids) for e in batch_encs))
452
+ input_ids = []
453
+ attention_mask = []
454
+ for enc in batch_encs:
455
+ ids = enc.ids[:max_len]
456
+ mask = enc.attention_mask[:max_len]
457
+ pad = max_len - len(ids)
458
+ if pad > 0:
459
+ ids = ids + [0] * pad
460
+ mask = mask + [0] * pad
461
+ input_ids.append(ids)
462
+ attention_mask.append(mask)
463
+
464
+ # Forward pass
465
+ embs = model(mx.array(input_ids), mx.array(attention_mask))
466
+ if dims and dims < embs.shape[1]:
467
+ embs = embs[:, :dims]
468
+ norms = mx.linalg.norm(embs, axis=1, keepdims=True)
469
+ embs = embs / norms
470
+ mx.eval(embs)
471
+
472
+ # Convert to Python + write to DB immediately
473
+ emb_list = embs.tolist()
474
+ del embs # free MLX GPU memory
475
+
476
+ for j, orig_idx in enumerate(batch_indices):
477
+ entry = unique_entries[orig_idx]
478
+ blob = float_list_to_blob(emb_list[j])
479
+ for nid, th in entry["node_ids"]:
480
+ db.execute("INSERT OR REPLACE INTO embeddings (nodeId, embedding, textHash) VALUES (?, ?, ?)",
481
+ (nid, blob, th))
482
+ embedded_count += len(entry["node_ids"])
483
+
484
+ # Progress
485
+ pct = i * 100 // len(indexed)
486
+ print(json.dumps({"phase": "embedding", "progress": pct, "embedded": embedded_count}), flush=True)
487
+
436
488
  db.execute("COMMIT")
437
- write_ms = int((time.time() - t0_write) * 1000)
489
+ embed_ms = int((time.time() - t0_embed) * 1000)
490
+ write_ms = 0 # included in embed_ms now
491
+
492
+ print(json.dumps({"phase": "embedded", "count": len(unique_entries), "deduped": deduped, "ms": embed_ms}), flush=True)
438
493
 
439
494
  total_ms = int((time.time() - t0_total) * 1000)
440
495
  print(json.dumps({
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@zuvia-software-solutions/code-mapper",
3
- "version": "2.3.1",
3
+ "version": "2.3.3",
4
4
  "description": "Graph-powered code intelligence for AI agents. Index any codebase, query via MCP or CLI.",
5
5
  "author": "Abhigyan Patwari",
6
6
  "license": "PolyForm-Noncommercial-1.0.0",