npm - @zuvia-software-solutions/code-mapper - Versions diffs - 2.3.1 → 2.3.3 - Mend

@zuvia-software-solutions/code-mapper 2.3.1 → 2.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

package/dist/cli/analyze.js +105 -22
package/dist/cli/index.js +0 -0
package/dist/core/ingestion/filesystem-walker.js +6 -3
package/dist/core/ingestion/parsing-processor.js +7 -3
package/dist/core/ingestion/pipeline.js +27 -23
package/dist/core/ingestion/workers/parse-worker.js +1 -1
package/dist/core/ingestion/workers/worker-pool.d.ts +8 -3
package/dist/core/ingestion/workers/worker-pool.js +68 -12
package/models/mlx-embedder.py +70 -15
package/package.json +1 -1

package/dist/cli/analyze.js CHANGED

@@ -1,6 +1,7 @@
 // code-mapper/src/cli/analyze.ts
 /** @file analyze.ts @description Indexes a repository, builds the knowledge graph, and stores it in .code-mapper/ */
 import path from 'path';
+import os from 'os';
 import { execFileSync } from 'child_process';
 import v8 from 'v8';
 import cliProgress from 'cli-progress';
@@ -93,7 +94,8 @@ export const analyzeCommand = async (inputPath, options) => {
     }
     // Single progress bar for the entire pipeline
     const bar = new cliProgress.SingleBar({
-        format: '  {bar} {percentage}% | {phase}',
+        // \x1b[K at end clears to EOL so shorter redraws don't leave trailing characters
+        format: '  {bar} {percentage}% | {phase} | {resources}\x1b[K',
         barCompleteChar: '\u2588',
         barIncompleteChar: '\u2591',
         hideCursor: true,
@@ -131,6 +133,37 @@ export const analyzeCommand = async (inputPath, options) => {
     console.log = barLog;
     console.warn = barLog;
     console.error = barLog;
+    const t0Global = Date.now();
+    const cpuStart = process.cpuUsage();
+    let peakRssMB = 0;
+    // Phase timing tracker — records wall time and RSS for each phase
+    const phaseTimes = [];
+    let currentPhaseName = 'init';
+    let currentPhaseStart = Date.now();
+    const recordPhase = (nextPhase) => {
+        const now = Date.now();
+        const elapsed = now - currentPhaseStart;
+        if (elapsed > 0) {
+            phaseTimes.push({
+                name: currentPhaseName,
+                ms: elapsed,
+                rssMB: Math.round(process.memoryUsage.rss() / (1024 * 1024)),
+            });
+        }
+        currentPhaseName = nextPhase;
+        currentPhaseStart = now;
+    };
+    // Live resource stats for the progress bar
+    const cpuCount = os.cpus().length;
+    const getResourceStats = () => {
+        const rssMB = Math.round(process.memoryUsage.rss() / (1024 * 1024));
+        if (rssMB > peakRssMB)
+            peakRssMB = rssMB;
+        const cpuDelta = process.cpuUsage(cpuStart);
+        const wallMs = Date.now() - t0Global || 1;
+        const cpuPct = Math.round(((cpuDelta.user + cpuDelta.system) / 1e3) / wallMs * 100);
+        return `${rssMB}MB | CPU ${cpuPct}%`;
+    };
     // Track elapsed time per phase — both updateBar and the interval use
     // the same format so they don't flicker against each other
     let lastPhaseLabel = 'Initializing...';
@@ -143,17 +176,16 @@ export const analyzeCommand = async (inputPath, options) => {
         }
         const elapsed = Math.round((Date.now() - phaseStart) / 1000);
         const display = elapsed >= 3 ? `${phaseLabel} (${elapsed}s)` : phaseLabel;
-        bar.update(value, { phase: display });
+        bar.update(value, { phase: display, resources: getResourceStats() });
     };
     // Tick elapsed seconds for phases with infrequent progress callbacks
     // (e.g. CSV streaming, FTS indexing) — uses the same display format as updateBar
     const elapsedTimer = setInterval(() => {
         const elapsed = Math.round((Date.now() - phaseStart) / 1000);
         if (elapsed >= 3) {
-            bar.update({ phase: `${lastPhaseLabel} (${elapsed}s)` });
+            bar.update({ phase: `${lastPhaseLabel} (${elapsed}s)`, resources: getResourceStats() });
         }
     }, 1000);
-    const t0Global = Date.now();
     // Cache embeddings from existing index before rebuild
     let cachedEmbeddingNodeIds = new Set();
     let cachedEmbeddings = [];
@@ -180,15 +212,24 @@ export const analyzeCommand = async (inputPath, options) => {
         }
     }
     // Phase 1: Full Pipeline (0-60%)
+    let lastPipelinePhase = '';
     const pipelineResult = await runPipelineFromRepo(repoPath, (progress) => {
-        const phaseLabel = PHASE_LABELS[progress.phase] || progress.phase;
+        if (progress.phase !== lastPipelinePhase) {
+            recordPhase(progress.phase);
+            lastPipelinePhase = progress.phase;
+        }
+        let phaseLabel = PHASE_LABELS[progress.phase] || progress.phase;
+        if (progress.stats && progress.stats.totalFiles > 0 &&
+            (progress.phase === 'parsing' || progress.phase === 'extracting')) {
+            phaseLabel += ` (${progress.stats.filesProcessed.toLocaleString()}/${progress.stats.totalFiles.toLocaleString()})`;
+        }
         const scaled = Math.round(progress.percent * 0.6);
         updateBar(scaled, phaseLabel);
     }, options?.tsgo === false ? { tsgo: false } : {});
     // Phase 2: SQLite (60-85%)
+    recordPhase('sqlite');
     updateBar(60, 'Loading into database...');
     // Reset the database (delete and recreate)
-    const t0Db = Date.now();
     let db = resetDb(dbPath);
     let dbMsgCount = 0;
     const dbResult = loadGraphToDb(db, pipelineResult.graph, pipelineResult.repoPath, (msg) => {
@@ -196,20 +237,21 @@ export const analyzeCommand = async (inputPath, options) => {
         const progress = Math.min(84, 60 + Math.round((dbMsgCount / (dbMsgCount + 10)) * 24));
         updateBar(progress, msg);
     });
-    const dbTime = ((Date.now() - t0Db) / 1000).toFixed(1);
     const dbWarnings = dbResult.warnings;
     // Phase 2.5: HTTP route stitching (post-DB-load, needs content field)
+    recordPhase('routes');
     stitchRoutes(db);
     // Phase 2.6: Populate searchText for BM25 concept matching
     // Uses first comment + callers + module — must run after edges are loaded
+    recordPhase('search-text');
     updateBar(84, 'Building search index...');
     populateSearchText(db);
     // Phase 3: FTS (85-90%)
     // FTS5 is auto-created by schema triggers — no manual index creation needed
+    recordPhase('fts');
     updateBar(85, 'Search indexes ready');
-    const t0Fts = Date.now();
-    const ftsTime = ((Date.now() - t0Fts) / 1000).toFixed(1);
     // Phase 3.5: Re-insert cached embeddings
+    recordPhase('restore-embeddings');
     if (cachedEmbeddings.length > 0) {
         updateBar(88, `Restoring ${cachedEmbeddings.length} cached embeddings...`);
         const EMBED_BATCH = 200;
@@ -226,15 +268,9 @@ export const analyzeCommand = async (inputPath, options) => {
     }
     // Phase 4: Embeddings (90-98%)
     const stats = getStats(db);
-    let embeddingTime = '0.0';
-    let embeddingSkipped = true;
-    let embeddingSkipReason = 'off (use --no-embeddings to skip)';
     if (options?.embeddings) {
-        embeddingSkipped = false;
-    }
-    if (!embeddingSkipped) {
+        recordPhase('embeddings');
         updateBar(90, 'Generating embeddings...');
-        const t0Emb = Date.now();
         // Close DB so Python can write to it
         closeDb(dbPath);
         // Run Python embedder in batch mode — reads from SQLite, embeds, writes back.
@@ -266,17 +302,24 @@ export const analyzeCommand = async (inputPath, options) => {
                         continue;
                     try {
                         const msg = JSON.parse(line);
-                        if (msg.phase === 'loaded') {
+                        if (msg.phase === 'downloading' || msg.phase === 'converting') {
+                            updateBar(90, msg.message);
+                        }
+                        else if (msg.phase === 'loaded') {
                             updateBar(91, `Model loaded (${msg.load_ms}ms)`);
                         }
                         else if (msg.phase === 'queried') {
-                            updateBar(92, `Found ${msg.nodes} embeddable nodes`);
+                            updateBar(92, `Found ${msg.nodes} embeddable nodes${msg.skipped_tests ? ` (${msg.skipped_tests} test files skipped)` : ''}`);
                         }
                         else if (msg.phase === 'prepared') {
                             updateBar(93, `${msg.to_embed} to embed, ${msg.skipped} cached`);
                         }
+                        else if (msg.phase === 'embedding') {
+                            const scaled = 93 + Math.round((msg.progress / 100) * 4);
+                            updateBar(scaled, `Embedding... ${msg.progress}% (${msg.embedded} written)`);
+                        }
                         else if (msg.phase === 'embedded') {
-                            updateBar(96, `Embedded ${msg.count} nodes (${(msg.ms / 1000).toFixed(1)}s)`);
+                            updateBar(97, `Embedded ${msg.count} nodes (${(msg.ms / 1000).toFixed(1)}s)`);
                         }
                         else if (msg.phase === 'done') {
                             updateBar(98, `Embeddings complete (${msg.embedded} new, ${msg.skipped} cached)`);
@@ -288,9 +331,9 @@ export const analyzeCommand = async (inputPath, options) => {
         });
         // Reopen DB after Python is done
         db = openDb(dbPath);
-        embeddingTime = ((Date.now() - t0Emb) / 1000).toFixed(1);
     }
     // Phase 5: Finalize (98-100%)
+    recordPhase('finalize');
     updateBar(98, 'Saving metadata...');
     // Count embeddings in the index (cached + newly generated) for metadata
     const embeddingCount = countEmbeddings(db);
@@ -331,19 +374,26 @@ export const analyzeCommand = async (inputPath, options) => {
         ...(processCount !== undefined ? { processes: processCount } : {}),
     });
     closeDb(dbPath);
+    recordPhase('done'); // close the last phase
     const totalTime = ((Date.now() - t0Global) / 1000).toFixed(1);
     clearInterval(elapsedTimer);
     process.removeListener('SIGINT', sigintHandler);
     console.log = origLog;
     console.warn = origWarn;
     console.error = origError;
-    bar.update(100, { phase: 'Done' });
+    bar.update(100, { phase: 'Done', resources: '' });
     bar.stop();
+    // Clear any leftover characters from the progress bar line
+    process.stdout.write('\x1b[2K');
     // Summary
     const embeddingsCached = cachedEmbeddings.length > 0;
     console.log(`\n  Repository indexed successfully (${totalTime}s)${embeddingsCached ? ` [${cachedEmbeddings.length} embeddings cached]` : ''}\n`);
     console.log(`  ${stats.nodes.toLocaleString()} nodes | ${stats.edges.toLocaleString()} edges | ${pipelineResult.communityResult?.stats.totalCommunities || 0} clusters | ${pipelineResult.processResult?.stats.totalProcesses || 0} flows`);
-    console.log(`  SQLite ${dbTime}s | FTS ${ftsTime}s | Embeddings ${embeddingSkipped ? embeddingSkipReason : embeddingTime + 's'}`);
+    // Resource usage
+    const cpuEnd = process.cpuUsage(cpuStart);
+    const wallMs = Date.now() - t0Global || 1;
+    const cpuPct = Math.round(((cpuEnd.user + cpuEnd.system) / 1e3) / wallMs * 100);
+    console.log(`  Memory: peak ${peakRssMB}MB RSS | CPU: ${cpuPct}% (${cpuCount} cores)`);
     console.log(`  tsgo: ${pipelineResult.tsgoEnabled ? 'enabled (compiler-verified call resolution)' : 'disabled — install @typescript/native-preview for higher accuracy'}`);
     console.log(`  ${repoPath}`);
     if (aiContext.files.length > 0) {
@@ -353,6 +403,39 @@ export const analyzeCommand = async (inputPath, options) => {
     if (dbWarnings.length > 0) {
         console.log(`  Note: ${dbWarnings.length} warnings during graph load`);
     }
+    // Detailed performance breakdown
+    const totalMs = phaseTimes.reduce((s, p) => s + p.ms, 0) || 1;
+    const PHASE_DISPLAY_NAMES = {
+        init: 'Init',
+        extracting: 'Scanning files',
+        structure: 'Building structure',
+        parsing: 'Parsing & resolving',
+        imports: 'Resolving imports',
+        calls: 'Tracing calls',
+        heritage: 'Extracting inheritance',
+        communities: 'Detecting communities',
+        processes: 'Detecting processes',
+        enriching: 'Enriching clusters',
+        complete: 'Pipeline complete',
+        sqlite: 'SQLite load',
+        routes: 'Route stitching',
+        'search-text': 'Search text',
+        fts: 'FTS indexing',
+        'restore-embeddings': 'Restore embeddings',
+        embeddings: 'Embeddings (MLX)',
+        finalize: 'Finalize & context',
+        done: 'Done',
+    };
+    console.log('\n  Phase breakdown:');
+    for (const phase of phaseTimes) {
+        const sec = (phase.ms / 1000).toFixed(1);
+        const pct = Math.round((phase.ms / totalMs) * 100);
+        const name = PHASE_DISPLAY_NAMES[phase.name] || phase.name;
+        const bar = pct >= 2 ? ' ' + '█'.repeat(Math.max(1, Math.round(pct / 3))) : '';
+        console.log(`    ${name.padEnd(22)} ${sec.padStart(6)}s  ${String(pct).padStart(3)}%  ${phase.rssMB}MB${bar}`);
+    }
+    console.log(`    ${'─'.repeat(50)}`);
+    console.log(`    ${'Total'.padEnd(22)} ${totalTime.padStart(6)}s  100%  ${peakRssMB}MB peak`);
     try {
         await fs.access(getGlobalRegistryPath());
     }

package/dist/cli/index.js CHANGED

File without changes

package/dist/core/ingestion/filesystem-walker.js CHANGED

@@ -4,7 +4,10 @@ import fs from 'fs/promises';
 import path from 'path';
 import { glob } from 'glob';
 import { createIgnoreFilter } from '../../config/ignore-service.js';
-const READ_CONCURRENCY = 32;
+// Stat is metadata-only (no I/O), can be highly concurrent
+const STAT_CONCURRENCY = 256;
+// File reads move actual data, keep bounded to avoid fd exhaustion
+const READ_CONCURRENCY = 64;
 /** Scan repository: stat files to get paths + sizes, no content loaded (~10MB for 100K files) */
 export const walkRepositoryPaths = async (repoPath, onProgress) => {
     const ignoreFilter = await createIgnoreFilter(repoPath);
@@ -16,8 +19,8 @@ export const walkRepositoryPaths = async (repoPath, onProgress) => {
     });
     const entries = [];
     let processed = 0;
-    for (let start = 0; start < filtered.length; start += READ_CONCURRENCY) {
-        const batch = filtered.slice(start, start + READ_CONCURRENCY);
+    for (let start = 0; start < filtered.length; start += STAT_CONCURRENCY) {
+        const batch = filtered.slice(start, start + STAT_CONCURRENCY);
         const results = await Promise.allSettled(batch.map(async (relativePath) => {
             const fullPath = path.join(repoPath, relativePath);
             const stat = await fs.stat(fullPath);

package/dist/core/ingestion/parsing-processor.js CHANGED

@@ -23,10 +23,14 @@ const processParsingWithWorkers = async (graph, files, symbolTable, _astCache, w
     if (parseableFiles.length === 0)
         return { imports: [], calls: [], heritage: [], routes: [], constructorBindings: [] };
     const total = files.length;
-    // Dispatch to worker pool
-    const chunkResults = await workerPool.dispatch(parseableFiles, (filesProcessed) => {
+    // Dispatch to worker pool with size-balanced distribution
+    const { results: chunkResults, failures } = await workerPool.dispatch(parseableFiles, (filesProcessed) => {
         onFileProgress?.(Math.min(filesProcessed, total), total, 'Parsing...');
-    });
+    }, (item) => item.content.length);
+    // Log worker failures (don't throw — partial results are still valuable)
+    for (const failure of failures) {
+        console.error(`  Worker failure (partial results preserved): ${failure.message}`);
+    }
     // Merge worker results into graph and symbol table
     const allImports = [];
     const allCalls = [];

package/dist/core/ingestion/pipeline.js CHANGED

@@ -21,7 +21,10 @@ import { fileURLToPath, pathToFileURL } from 'node:url';
 import { memoryGuard } from '../../lib/memory-guard.js';
 import { toNodeId, toEdgeId } from '../db/schema.js';
 import { getTsgoService } from '../semantic/tsgo-service.js';
-const isDev = process.env['NODE_ENV'] === 'development';
+const verbose = (...args) => {
+    if (process.env['CODE_MAPPER_VERBOSE'])
+        console.error(...args);
+};
 // Default chunk budget — used when memory is plentiful.
 // Under memory pressure, adaptiveBatchSize() shrinks this automatically.
 const DEFAULT_CHUNK_BYTE_BUDGET = 50 * 1024 * 1024;
@@ -122,9 +125,9 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
         if (currentChunk.length > 0)
             chunks.push(currentChunk);
         const numChunks = chunks.length;
-        if (isDev) {
+        {
             const totalMB = parseableScanned.reduce((s, f) => s + f.size, 0) / (1024 * 1024);
-            console.log(`📂 Scan: ${totalFiles} paths, ${totalParseable} parseable (${totalMB.toFixed(0)}MB), ${numChunks} chunks @ ${Math.round(chunkBudget / (1024 * 1024))}MB budget (${memoryGuard.summary()})`);
+            verbose(`[parse] ${totalFiles} paths, ${totalParseable} parseable (${totalMB.toFixed(0)}MB), ${numChunks} chunks @ ${Math.round(chunkBudget / (1024 * 1024))}MB budget (${memoryGuard.summary()})`);
         }
         onProgress({
             phase: 'parsing',
@@ -148,8 +151,7 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
             workerPool = createWorkerPool(workerUrl);
         }
         catch (err) {
-            if (isDev)
-                console.warn('Worker pool creation failed, using sequential fallback:', err.message);
+            console.error('[parse] worker pool creation failed, using sequential fallback:', err.message);
         }
         let filesParsedSoFar = 0;
         // AST cache sized for one chunk (used by sequential fallback for import/call/heritage)
@@ -171,12 +173,17 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
                 const chunkPaths = chunks[chunkIdx];
                 if (!chunkPaths)
                     continue;
+                const chunkStart = Date.now();
                 // Read content for this chunk
                 const chunkContents = await readFileContents(repoPath, chunkPaths);
                 const chunkFiles = chunkPaths
                     .filter(p => chunkContents.has(p))
                     .map(p => ({ path: p, content: chunkContents.get(p) }));
+                const readMs = Date.now() - chunkStart;
+                const chunkMB = chunkFiles.reduce((s, f) => s + f.content.length, 0) / (1024 * 1024);
+                verbose(`[parse] chunk ${chunkIdx + 1}/${numChunks}: ${chunkFiles.length} files (${chunkMB.toFixed(1)}MB), read ${readMs}ms`);
                 // Parse chunk (workers or sequential fallback)
+                const parseStart = Date.now();
                 const chunkWorkerData = await processParsing(graph, chunkFiles, symbolTable, astCache, (current, _total, filePath) => {
                     const globalCurrent = filesParsedSoFar + current;
                     const parsingProgress = 20 + ((globalCurrent / totalParseable) * 62);
@@ -188,6 +195,8 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
                         stats: { filesProcessed: globalCurrent, totalFiles: totalParseable, nodesCreated: graph.nodeCount },
                     });
                 }, workerPool);
+                const parseMs = Date.now() - parseStart;
+                verbose(`[parse] chunk ${chunkIdx + 1}/${numChunks}: parsed ${parseMs}ms (${memoryGuard.summary()})`);
                 const chunkBasePercent = 20 + ((filesParsedSoFar / totalParseable) * 62);
                 if (chunkWorkerData) {
                     // Resolve imports per-chunk (file-level, doesn't need full symbol table)
@@ -235,14 +244,14 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
                     sequentialChunkPaths.push(chunkPaths);
                 }
                 filesParsedSoFar += chunkFiles.length;
+                const totalChunkMs = Date.now() - chunkStart;
+                verbose(`[parse] chunk ${chunkIdx + 1}/${numChunks}: total ${totalChunkMs}ms, ${filesParsedSoFar}/${totalParseable} files done`);
                 // Clear AST cache between chunks to free memory; chunk locals go out of scope for GC
                 astCache.clear();
                 // Attempt GC between chunks if under memory pressure
                 if (memoryGuard.isUnderPressure()) {
                     memoryGuard.tryGC();
-                    if (isDev) {
-                        console.log(`⚠️ Memory pressure after chunk ${chunkIdx + 1}: ${memoryGuard.summary()}`);
-                    }
+                    verbose(`[parse] memory pressure after chunk ${chunkIdx + 1}: ${memoryGuard.summary()}`);
                 }
             }
         }
@@ -301,12 +310,11 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
                 tsgoService?.stop();
             }
         }
-        // Log resolution cache stats in dev mode
-        if (isDev) {
+        {
             const rcStats = ctx.getStats();
             const total = rcStats.cacheHits + rcStats.cacheMisses;
             const hitRate = total > 0 ? ((rcStats.cacheHits / total) * 100).toFixed(1) : '0';
-            console.log(`🔍 Resolution cache: ${rcStats.cacheHits} hits, ${rcStats.cacheMisses} misses (${hitRate}% hit rate)`);
+            verbose(`[resolve] cache: ${rcStats.cacheHits} hits, ${rcStats.cacheMisses} misses (${hitRate}% hit rate)`);
         }
         // Free import resolution context (~94MB+ for large repos)
         allPathObjects.length = 0;
@@ -318,13 +326,13 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
             createDependsOnEdges(graph, ctx),
             createProvidesEdges(graph, ctx),
         ]);
-        if (isDev && (diEdgeCount > 0 || providesEdgeCount > 0)) {
-            console.log(`💉 DI: ${diEdgeCount} DEPENDS_ON edges, ${providesEdgeCount} PROVIDES edges`);
+        if (diEdgeCount > 0 || providesEdgeCount > 0) {
+            verbose(`[resolve] DI: ${diEdgeCount} DEPENDS_ON, ${providesEdgeCount} PROVIDES edges`);
         }
         // Phase 4.5a2: Interface dispatch — connect callers of interfaces to implementations
         const ifaceEdges = await resolveInterfaceDispatches(graph, ctx);
-        if (isDev && ifaceEdges > 0) {
-            console.log(`🔌 Interface dispatch: ${ifaceEdges} implementation edges`);
+        if (ifaceEdges > 0) {
+            verbose(`[resolve] interface dispatch: ${ifaceEdges} implementation edges`);
         }
         // Phase 4.5b: Method Resolution Order
         onProgress({
@@ -334,8 +342,8 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
             stats: { filesProcessed: totalFiles, totalFiles, nodesCreated: graph.nodeCount },
         });
         const mroResult = computeMRO(graph);
-        if (isDev && mroResult.entries.length > 0) {
-            console.log(`🔀 MRO: ${mroResult.entries.length} classes analyzed, ${mroResult.ambiguityCount} ambiguities found, ${mroResult.overrideEdges} OVERRIDES edges`);
+        if (mroResult.entries.length > 0) {
+            verbose(`[resolve] MRO: ${mroResult.entries.length} classes, ${mroResult.ambiguityCount} ambiguities, ${mroResult.overrideEdges} OVERRIDES edges`);
         }
         // Phase 5: Communities
         onProgress({
@@ -353,9 +361,7 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
                 stats: { filesProcessed: totalFiles, totalFiles, nodesCreated: graph.nodeCount },
             });
         });
-        if (isDev) {
-            console.log(`🏘️ Community detection: ${communityResult.stats.totalCommunities} communities found (modularity: ${communityResult.stats.modularity.toFixed(3)})`);
-        }
+        verbose(`[community] ${communityResult.stats.totalCommunities} communities (modularity: ${communityResult.stats.modularity.toFixed(3)})`);
         communityResult.communities.forEach(comm => {
             graph.addNode({
                 id: toNodeId(comm.id),
@@ -399,9 +405,7 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
                 stats: { filesProcessed: totalFiles, totalFiles, nodesCreated: graph.nodeCount },
             });
         }, { maxProcesses: dynamicMaxProcesses, minSteps: 3 });
-        if (isDev) {
-            console.log(`🔄 Process detection: ${processResult.stats.totalProcesses} processes found (${processResult.stats.crossCommunityCount} cross-community)`);
-        }
+        verbose(`[process] ${processResult.stats.totalProcesses} processes (${processResult.stats.crossCommunityCount} cross-community)`);
         processResult.processes.forEach(proc => {
             graph.addNode({
                 id: toNodeId(proc.id),

package/dist/core/ingestion/workers/parse-worker.js CHANGED

@@ -174,7 +174,7 @@ const processBatch = (files, onProgress) => {
     }
     let totalProcessed = 0;
     let lastReported = 0;
-    const PROGRESS_INTERVAL = 100; // report every 100 files
+    const PROGRESS_INTERVAL = 25; // report every 25 files — resets the sub-batch timer
     const onFileProcessed = onProgress ? () => {
         totalProcessed++;
         if (totalProcessed - lastReported >= PROGRESS_INTERVAL) {

package/dist/core/ingestion/workers/worker-pool.d.ts CHANGED

@@ -1,11 +1,16 @@
 /** @file Generic worker thread pool with sub-batch streaming for bounded memory usage */
 export interface WorkerPool {
     /**
-     * Dispatch items across workers with sub-batch streaming
-     * @param items - Items to process (split into chunks, one per worker)
+     * Dispatch items across workers with sub-batch streaming.
+     * Uses Promise.allSettled so one worker failure doesn't discard other workers' results.
+     * @param items - Items to process (split across workers using size-balanced round-robin)
      * @param onProgress - Optional callback for progress reporting
+     * @param getItemSize - Optional function to extract item size for balanced dispatch
      */
-    dispatch<TInput, TResult>(items: TInput[], onProgress?: (filesProcessed: number) => void): Promise<TResult[]>;
+    dispatch<TInput, TResult>(items: TInput[], onProgress?: (filesProcessed: number) => void, getItemSize?: (item: TInput) => number): Promise<{
+        results: TResult[];
+        failures: Error[];
+    }>;
     /** Terminate all workers (must be called when done) */
     terminate(): Promise<void>;
     /** Number of workers in the pool */

package/dist/core/ingestion/workers/worker-pool.js CHANGED

@@ -5,9 +5,40 @@ import os from 'node:os';
 import fs from 'node:fs';
 import { fileURLToPath } from 'node:url';
 // Max files per postMessage to keep structured-clone memory bounded
-const SUB_BATCH_SIZE = 1500;
-// Per sub-batch timeout — large codebases with big files need more time
-const SUB_BATCH_TIMEOUT_MS = 120_000;
+const SUB_BATCH_SIZE = 500;
+// Base sub-batch timeout — extended proportionally to file count
+const BASE_TIMEOUT_MS = 120_000;
+// Per-file timeout extension (200ms per file in the sub-batch)
+const PER_FILE_TIMEOUT_MS = 200;
+/** Compute proportional timeout: max(base, fileCount * perFile) */
+const computeTimeout = (fileCount) => Math.max(BASE_TIMEOUT_MS, fileCount * PER_FILE_TIMEOUT_MS);
+/**
+ * Distribute items across N buckets using size-balanced round-robin (LPT heuristic).
+ * Items are sorted by size descending, then assigned to the bucket with the smallest total.
+ * This minimizes the makespan of the heaviest bucket.
+ */
+const sizeBalancedDistribute = (items, bucketCount, getSize) => {
+    if (bucketCount <= 0)
+        return [];
+    if (items.length === 0)
+        return Array.from({ length: bucketCount }, () => []);
+    // Sort indices by size descending
+    const indices = items.map((_, i) => i);
+    indices.sort((a, b) => getSize(items[b]) - getSize(items[a]));
+    const buckets = Array.from({ length: bucketCount }, () => []);
+    const bucketSizes = new Array(bucketCount).fill(0);
+    for (const idx of indices) {
+        // Find the bucket with the smallest total size
+        let minBucket = 0;
+        for (let b = 1; b < bucketCount; b++) {
+            if (bucketSizes[b] < bucketSizes[minBucket])
+                minBucket = b;
+        }
+        buckets[minBucket].push(items[idx]);
+        bucketSizes[minBucket] += getSize(items[idx]);
+    }
+    return buckets;
+};
 /** Create a pool of worker threads */
 export const createWorkerPool = (workerUrl, poolSize) => {
     // Validate worker script exists before spawning to prevent MODULE_NOT_FOUND crashes
@@ -20,13 +51,20 @@ export const createWorkerPool = (workerUrl, poolSize) => {
     for (let i = 0; i < size; i++) {
         workers.push(new Worker(workerUrl));
     }
-    const dispatch = (items, onProgress) => {
+    const dispatch = (items, onProgress, getItemSize) => {
         if (items.length === 0)
-            return Promise.resolve([]);
-        const chunkSize = Math.ceil(items.length / size);
-        const chunks = [];
-        for (let i = 0; i < items.length; i += chunkSize) {
-            chunks.push(items.slice(i, i + chunkSize));
+            return Promise.resolve({ results: [], failures: [] });
+        // Size-balanced dispatch when size function provided, otherwise equal split
+        let chunks;
+        if (getItemSize) {
+            chunks = sizeBalancedDistribute(items, Math.min(size, items.length), getItemSize);
+        }
+        else {
+            const chunkSize = Math.ceil(items.length / size);
+            chunks = [];
+            for (let i = 0; i < items.length; i += chunkSize) {
+                chunks.push(items.slice(i, i + chunkSize));
+            }
         }
         const workerProgress = new Array(chunks.length).fill(0);
         const promises = chunks.map((chunk, i) => {
@@ -37,6 +75,7 @@ export const createWorkerPool = (workerUrl, poolSize) => {
             return new Promise((resolve, reject) => {
                 let settled = false;
                 let subBatchTimer = null;
+                let currentSubBatchSize = 0;
                 const cleanup = () => {
                     if (subBatchTimer)
                         clearTimeout(subBatchTimer);
@@ -47,13 +86,14 @@ export const createWorkerPool = (workerUrl, poolSize) => {
                 const resetSubBatchTimer = () => {
                     if (subBatchTimer)
                         clearTimeout(subBatchTimer);
+                    const timeout = computeTimeout(currentSubBatchSize);
                     subBatchTimer = setTimeout(() => {
                         if (!settled) {
                             settled = true;
                             cleanup();
-                            reject(new Error(`Worker ${i} sub-batch timed out after ${SUB_BATCH_TIMEOUT_MS / 1000}s (chunk: ${chunk.length} items).`));
+                            reject(new Error(`Worker ${i} sub-batch timed out after ${timeout / 1000}s (chunk: ${chunk.length} items, sub-batch: ${currentSubBatchSize} items).`));
                         }
-                    }, SUB_BATCH_TIMEOUT_MS);
+                    }, timeout);
                 };
                 let subBatchIdx = 0;
                 const sendNextSubBatch = () => {
@@ -63,6 +103,7 @@ export const createWorkerPool = (workerUrl, poolSize) => {
                         return;
                     }
                     const subBatch = chunk.slice(start, start + SUB_BATCH_SIZE);
+                    currentSubBatchSize = subBatch.length;
                     subBatchIdx++;
                     resetSubBatchTimer();
                     worker.postMessage({ type: 'sub-batch', files: subBatch });
@@ -71,6 +112,8 @@ export const createWorkerPool = (workerUrl, poolSize) => {
                     if (settled)
                         return;
                     if (msg && msg.type === 'progress') {
+                        // BUG FIX: Reset timer on progress — worker is alive and making progress
+                        resetSubBatchTimer();
                         workerProgress[i] = msg.filesProcessed;
                         if (onProgress) {
                             const total = workerProgress.reduce((a, b) => a + b, 0);
@@ -116,7 +159,20 @@ export const createWorkerPool = (workerUrl, poolSize) => {
                 sendNextSubBatch();
             });
         });
-        return Promise.all(promises);
+        // Use allSettled so one worker failure doesn't discard other workers' results
+        return Promise.allSettled(promises).then(outcomes => {
+            const results = [];
+            const failures = [];
+            for (const outcome of outcomes) {
+                if (outcome.status === 'fulfilled') {
+                    results.push(outcome.value);
+                }
+                else {
+                    failures.push(outcome.reason instanceof Error ? outcome.reason : new Error(String(outcome.reason)));
+                }
+            }
+            return { results, failures };
+        });
     };
     const terminate = async () => {
         await Promise.all(workers.map(w => w.terminate()));

package/models/mlx-embedder.py CHANGED Viewed

@@ -415,26 +415,81 @@ def batch_mode(db_path, dims=256, max_tokens=2048):
     unique_texts = [v["text"] for v in unique_by_hash.values()]
     deduped = len(to_embed) - len(unique_texts)
-    # Embed only unique texts
+    # Embed unique texts in streaming fashion — process each batch, write to DB
+    # immediately, free GPU memory. Keeps peak memory at ONE batch instead of ALL.
     t0_embed = time.time()
-    embeddings = embed_tiered(model, tokenizer, unique_texts, "retrieval.passage", dims, max_tokens)
-    embed_ms = int((time.time() - t0_embed) * 1000)
+    unique_entries = list(unique_by_hash.values())
-    print(json.dumps({"phase": "embedded", "count": len(unique_texts), "deduped": deduped, "ms": embed_ms}), flush=True)
+    # Tokenize + sort (same as embed_tiered but we handle the loop here)
+    is_code_model = "jina-code" in MODEL_DIR
+    if is_code_model:
+        prefix_map = {"retrieval.query": "Find the most relevant code snippet given the following query:\n", "retrieval.passage": "Candidate code snippet:\n"}
+    else:
+        prefix_map = {"retrieval.query": "Query: ", "retrieval.passage": "Document: "}
+    prefix = prefix_map.get("retrieval.passage", "")
+    prefixed = [prefix + e["text"] for e in unique_entries]
+    encodings = tokenizer.encode_batch(prefixed)
+    indexed = sorted(range(len(unique_entries)), key=lambda i: len(encodings[i].ids))
-    # Write to database — copy embedding to all nodes sharing the same hash
-    t0_write = time.time()
+    embedded_count = 0
     db.execute("BEGIN")
-    for i, (text_hash, entry) in enumerate(unique_by_hash.items()):
-        emb = embeddings[i]
-        if emb is None:
-            continue
-        blob = float_list_to_blob(emb)
-        for nid, th in entry["node_ids"]:
-            db.execute("INSERT OR REPLACE INTO embeddings (nodeId, embedding, textHash) VALUES (?, ?, ?)",
-                       (nid, blob, th))
+    i = 0
+    while i < len(indexed):
+        peek_idx = indexed[min(i + 1, len(indexed) - 1)]
+        tok_count = min(len(encodings[peek_idx].ids), max_tokens)
+        batch_size = get_batch_size_for_tokens(tok_count)
+        batch_indices = []
+        batch_encs = []
+        while len(batch_encs) < batch_size and i < len(indexed):
+            orig_idx = indexed[i]
+            batch_indices.append(orig_idx)
+            batch_encs.append(encodings[orig_idx])
+            i += 1
+        max_len = min(max_tokens, max(len(e.ids) for e in batch_encs))
+        input_ids = []
+        attention_mask = []
+        for enc in batch_encs:
+            ids = enc.ids[:max_len]
+            mask = enc.attention_mask[:max_len]
+            pad = max_len - len(ids)
+            if pad > 0:
+                ids = ids + [0] * pad
+                mask = mask + [0] * pad
+            input_ids.append(ids)
+            attention_mask.append(mask)
+        # Forward pass
+        embs = model(mx.array(input_ids), mx.array(attention_mask))
+        if dims and dims < embs.shape[1]:
+            embs = embs[:, :dims]
+        norms = mx.linalg.norm(embs, axis=1, keepdims=True)
+        embs = embs / norms
+        mx.eval(embs)
+        # Convert to Python + write to DB immediately
+        emb_list = embs.tolist()
+        del embs  # free MLX GPU memory
+        for j, orig_idx in enumerate(batch_indices):
+            entry = unique_entries[orig_idx]
+            blob = float_list_to_blob(emb_list[j])
+            for nid, th in entry["node_ids"]:
+                db.execute("INSERT OR REPLACE INTO embeddings (nodeId, embedding, textHash) VALUES (?, ?, ?)",
+                           (nid, blob, th))
+            embedded_count += len(entry["node_ids"])
+        # Progress
+        pct = i * 100 // len(indexed)
+        print(json.dumps({"phase": "embedding", "progress": pct, "embedded": embedded_count}), flush=True)
     db.execute("COMMIT")
-    write_ms = int((time.time() - t0_write) * 1000)
+    embed_ms = int((time.time() - t0_embed) * 1000)
+    write_ms = 0  # included in embed_ms now
+    print(json.dumps({"phase": "embedded", "count": len(unique_entries), "deduped": deduped, "ms": embed_ms}), flush=True)
     total_ms = int((time.time() - t0_total) * 1000)
     print(json.dumps({

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@zuvia-software-solutions/code-mapper",
-  "version": "2.3.1",
+  "version": "2.3.3",
   "description": "Graph-powered code intelligence for AI agents. Index any codebase, query via MCP or CLI.",
   "author": "Abhigyan Patwari",
   "license": "PolyForm-Noncommercial-1.0.0",