npm - gitnexus - Versions diffs - 1.2.9 → 1.3.0 - Mend

gitnexus 1.2.9 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25)

package/dist/cli/analyze.js +69 -28
package/dist/cli/index.js +20 -0
package/dist/core/graph/graph.js +5 -0
package/dist/core/graph/types.d.ts +12 -1
package/dist/core/ingestion/call-processor.js +52 -32
package/dist/core/ingestion/community-processor.js +75 -40
package/dist/core/ingestion/filesystem-walker.d.ts +23 -0
package/dist/core/ingestion/filesystem-walker.js +38 -3
package/dist/core/ingestion/import-processor.d.ts +11 -3
package/dist/core/ingestion/import-processor.js +27 -11
package/dist/core/ingestion/parsing-processor.js +2 -4
package/dist/core/ingestion/pipeline.js +142 -135
package/dist/core/ingestion/process-processor.js +12 -11
package/dist/core/ingestion/workers/parse-worker.js +67 -6
package/dist/core/ingestion/workers/worker-pool.d.ts +3 -9
package/dist/core/ingestion/workers/worker-pool.js +39 -18
package/dist/core/kuzu/csv-generator.d.ts +15 -8
package/dist/core/kuzu/csv-generator.js +258 -196
package/dist/core/kuzu/kuzu-adapter.d.ts +1 -4
package/dist/core/kuzu/kuzu-adapter.js +75 -63
package/dist/core/kuzu/schema.d.ts +1 -1
package/dist/core/kuzu/schema.js +10 -0
package/dist/types/pipeline.d.ts +6 -2
package/dist/types/pipeline.js +6 -4
package/package.json +1 -1

package/dist/cli/analyze.js CHANGED

@@ -4,6 +4,8 @@
  * Indexes a repository and stores the knowledge graph in .gitnexus/
  */
 import path from 'path';
+import { execFileSync } from 'child_process';
+import v8 from 'v8';
 import cliProgress from 'cli-progress';
 import { runPipelineFromRepo } from '../core/ingestion/pipeline.js';
 import { initKuzu, loadGraphToKuzu, getKuzuStats, executeQuery, executeWithReusedStatement, closeKuzu, createFTSIndex, loadCachedEmbeddings } from '../core/kuzu/kuzu-adapter.js';
@@ -14,6 +16,27 @@ import { getCurrentCommit, isGitRepo, getGitRoot } from '../storage/git.js';
 import { generateAIContextFiles } from './ai-context.js';
 import fs from 'fs/promises';
 import { registerClaudeHook } from './claude-hooks.js';
+const HEAP_MB = 8192;
+const HEAP_FLAG = `--max-old-space-size=${HEAP_MB}`;
+/** Re-exec the process with an 8GB heap if we're currently below that. */
+function ensureHeap() {
+    const nodeOpts = process.env.NODE_OPTIONS || '';
+    if (nodeOpts.includes('--max-old-space-size'))
+        return false;
+    const v8Heap = v8.getHeapStatistics().heap_size_limit;
+    if (v8Heap >= HEAP_MB * 1024 * 1024 * 0.9)
+        return false;
+    try {
+        execFileSync(process.execPath, [HEAP_FLAG, ...process.argv.slice(1)], {
+            stdio: 'inherit',
+            env: { ...process.env, NODE_OPTIONS: `${nodeOpts} ${HEAP_FLAG}`.trim() },
+        });
+    }
+    catch (e) {
+        process.exitCode = e.status ?? 1;
+    }
+    return true;
+}
 /** Threshold: auto-skip embeddings for repos with more nodes than this */
 const EMBEDDING_NODE_LIMIT = 50_000;
 const PHASE_LABELS = {
@@ -32,6 +55,8 @@ const PHASE_LABELS = {
     done: 'Done',
 };
 export const analyzeCommand = async (inputPath, options) => {
+    if (ensureHeap())
+        return;
     console.log('\n  GitNexus Analyzer\n');
     let repoPath;
     if (inputPath) {
@@ -70,18 +95,47 @@ export const analyzeCommand = async (inputPath, options) => {
         stopOnComplete: false,
     }, cliProgress.Presets.shades_grey);
     bar.start(100, 0, { phase: 'Initializing...' });
+    // Graceful SIGINT handling — clean up resources and exit
+    let aborted = false;
+    const sigintHandler = () => {
+        if (aborted)
+            process.exit(1); // Second Ctrl-C: force exit
+        aborted = true;
+        bar.stop();
+        console.log('\n  Interrupted — cleaning up...');
+        closeKuzu().catch(() => { }).finally(() => process.exit(130));
+    };
+    process.on('SIGINT', sigintHandler);
     // Route all console output through bar.log() so the bar doesn't stamp itself
     // multiple times when other code writes to stdout/stderr mid-render.
     const origLog = console.log.bind(console);
     const origWarn = console.warn.bind(console);
     const origError = console.error.bind(console);
-    const barLog = (...args) => origLog(args.map(a => (typeof a === 'string' ? a : String(a))).join(' '));
+    const barLog = (...args) => {
+        // Clear the bar line, print the message, then let the next bar.update redraw
+        process.stdout.write('\x1b[2K\r');
+        origLog(args.map(a => (typeof a === 'string' ? a : String(a))).join(' '));
+    };
     console.log = barLog;
     console.warn = barLog;
     console.error = barLog;
-    // Show elapsed seconds for phases that run longer than 3s
+    // Track elapsed time per phase — both updateBar and the interval use the
+    // same format so they don't flicker against each other.
     let lastPhaseLabel = 'Initializing...';
     let phaseStart = Date.now();
+    /** Update bar with phase label + elapsed seconds (shown after 3s). */
+    const updateBar = (value, phaseLabel) => {
+        if (phaseLabel !== lastPhaseLabel) {
+            lastPhaseLabel = phaseLabel;
+            phaseStart = Date.now();
+        }
+        const elapsed = Math.round((Date.now() - phaseStart) / 1000);
+        const display = elapsed >= 3 ? `${phaseLabel} (${elapsed}s)` : phaseLabel;
+        bar.update(value, { phase: display });
+    };
+    // Tick elapsed seconds for phases with infrequent progress callbacks
+    // (e.g. CSV streaming, FTS indexing). Uses the same display format as
+    // updateBar so there's no flickering.
     const elapsedTimer = setInterval(() => {
         const elapsed = Math.round((Date.now() - phaseStart) / 1000);
         if (elapsed >= 3) {
@@ -94,7 +148,7 @@ export const analyzeCommand = async (inputPath, options) => {
     let cachedEmbeddings = [];
     if (options?.embeddings && existingMeta && !options?.force) {
         try {
-            bar.update(0, { phase: 'Caching embeddings...' });
+            updateBar(0, 'Caching embeddings...');
             await initKuzu(kuzuPath);
             const cached = await loadCachedEmbeddings();
             cachedEmbeddingNodeIds = cached.embeddingNodeIds;
@@ -112,16 +166,10 @@ export const analyzeCommand = async (inputPath, options) => {
     const pipelineResult = await runPipelineFromRepo(repoPath, (progress) => {
         const phaseLabel = PHASE_LABELS[progress.phase] || progress.phase;
         const scaled = Math.round(progress.percent * 0.6);
-        if (phaseLabel !== lastPhaseLabel) {
-            lastPhaseLabel = phaseLabel;
-            phaseStart = Date.now();
-        }
-        bar.update(scaled, { phase: phaseLabel });
+        updateBar(scaled, phaseLabel);
     });
     // ── Phase 2: KuzuDB (60–85%) ──────────────────────────────────────
-    lastPhaseLabel = 'Loading into KuzuDB...';
-    phaseStart = Date.now();
-    bar.update(60, { phase: lastPhaseLabel });
+    updateBar(60, 'Loading into KuzuDB...');
     await closeKuzu();
     const kuzuFiles = [kuzuPath, `${kuzuPath}.wal`, `${kuzuPath}.lock`];
     for (const f of kuzuFiles) {
@@ -133,17 +181,15 @@ export const analyzeCommand = async (inputPath, options) => {
     const t0Kuzu = Date.now();
     await initKuzu(kuzuPath);
     let kuzuMsgCount = 0;
-    const kuzuResult = await loadGraphToKuzu(pipelineResult.graph, pipelineResult.fileContents, storagePath, (msg) => {
+    const kuzuResult = await loadGraphToKuzu(pipelineResult.graph, pipelineResult.repoPath, storagePath, (msg) => {
         kuzuMsgCount++;
         const progress = Math.min(84, 60 + Math.round((kuzuMsgCount / (kuzuMsgCount + 10)) * 24));
-        bar.update(progress, { phase: msg });
+        updateBar(progress, msg);
     });
     const kuzuTime = ((Date.now() - t0Kuzu) / 1000).toFixed(1);
     const kuzuWarnings = kuzuResult.warnings;
     // ── Phase 3: FTS (85–90%) ─────────────────────────────────────────
-    lastPhaseLabel = 'Creating search indexes...';
-    phaseStart = Date.now();
-    bar.update(85, { phase: lastPhaseLabel });
+    updateBar(85, 'Creating search indexes...');
     const t0Fts = Date.now();
     try {
         await createFTSIndex('File', 'file_fts', ['name', 'content']);
@@ -158,7 +204,7 @@ export const analyzeCommand = async (inputPath, options) => {
     const ftsTime = ((Date.now() - t0Fts) / 1000).toFixed(1);
     // ── Phase 3.5: Re-insert cached embeddings ────────────────────────
     if (cachedEmbeddings.length > 0) {
-        bar.update(88, { phase: `Restoring ${cachedEmbeddings.length} cached embeddings...` });
+        updateBar(88, `Restoring ${cachedEmbeddings.length} cached embeddings...`);
         const EMBED_BATCH = 200;
         for (let i = 0; i < cachedEmbeddings.length; i += EMBED_BATCH) {
             const batch = cachedEmbeddings.slice(i, i + EMBED_BATCH);
@@ -183,29 +229,23 @@ export const analyzeCommand = async (inputPath, options) => {
         }
     }
     if (!embeddingSkipped) {
-        lastPhaseLabel = 'Loading embedding model...';
-        phaseStart = Date.now();
-        bar.update(90, { phase: lastPhaseLabel });
+        updateBar(90, 'Loading embedding model...');
         const t0Emb = Date.now();
         await runEmbeddingPipeline(executeQuery, executeWithReusedStatement, (progress) => {
             const scaled = 90 + Math.round((progress.percent / 100) * 8);
             const label = progress.phase === 'loading-model' ? 'Loading embedding model...' : `Embedding ${progress.nodesProcessed || 0}/${progress.totalNodes || '?'}`;
-            if (label !== lastPhaseLabel) {
-                lastPhaseLabel = label;
-                phaseStart = Date.now();
-            }
-            bar.update(scaled, { phase: label });
+            updateBar(scaled, label);
         }, {}, cachedEmbeddingNodeIds.size > 0 ? cachedEmbeddingNodeIds : undefined);
         embeddingTime = ((Date.now() - t0Emb) / 1000).toFixed(1);
     }
     // ── Phase 5: Finalize (98–100%) ───────────────────────────────────
-    bar.update(98, { phase: 'Saving metadata...' });
+    updateBar(98, 'Saving metadata...');
     const meta = {
         repoPath,
         lastCommit: currentCommit,
         indexedAt: new Date().toISOString(),
         stats: {
-            files: pipelineResult.fileContents.size,
+            files: pipelineResult.totalFileCount,
             nodes: stats.nodes,
             edges: stats.edges,
             communities: pipelineResult.communityResult?.stats.totalCommunities,
@@ -227,7 +267,7 @@ export const analyzeCommand = async (inputPath, options) => {
         aggregatedClusterCount = Array.from(groups.values()).filter(count => count >= 5).length;
     }
     const aiContext = await generateAIContextFiles(repoPath, storagePath, projectName, {
-        files: pipelineResult.fileContents.size,
+        files: pipelineResult.totalFileCount,
         nodes: stats.nodes,
         edges: stats.edges,
         communities: pipelineResult.communityResult?.stats.totalCommunities,
@@ -240,6 +280,7 @@ export const analyzeCommand = async (inputPath, options) => {
     // Since the process exits immediately after, Node.js reclaims everything.
     const totalTime = ((Date.now() - t0Global) / 1000).toFixed(1);
     clearInterval(elapsedTimer);
+    process.removeListener('SIGINT', sigintHandler);
     console.log = origLog;
     console.warn = origWarn;
     console.error = origError;

package/dist/cli/index.js CHANGED

@@ -1,4 +1,24 @@
 #!/usr/bin/env node
+// Raise Node heap limit for large repos (e.g. Linux kernel).
+// Must run before any heavy allocation. If already set by the user, respect it.
+if (!process.env.NODE_OPTIONS?.includes('--max-old-space-size')) {
+    const execArgv = process.execArgv.join(' ');
+    if (!execArgv.includes('--max-old-space-size')) {
+        // Re-spawn with a larger heap (8 GB)
+        const { execFileSync } = await import('node:child_process');
+        try {
+            execFileSync(process.execPath, ['--max-old-space-size=8192', ...process.argv.slice(1)], {
+                stdio: 'inherit',
+                env: { ...process.env, NODE_OPTIONS: `${process.env.NODE_OPTIONS || ''} --max-old-space-size=8192`.trim() },
+            });
+            process.exit(0);
+        }
+        catch (e) {
+            // If the child exited with an error code, propagate it
+            process.exit(e.status ?? 1);
+        }
+    }
+}
 import { Command } from 'commander';
 import { analyzeCommand } from './analyze.js';
 import { serveCommand } from './serve.js';

package/dist/core/graph/graph.js CHANGED

@@ -46,6 +46,11 @@ export const createKnowledgeGraph = () => {
         get relationships() {
             return Array.from(relationshipMap.values());
         },
+        iterNodes: () => nodeMap.values(),
+        iterRelationships: () => relationshipMap.values(),
+        forEachNode(fn) { nodeMap.forEach(fn); },
+        forEachRelationship(fn) { relationshipMap.forEach(fn); },
+        getNode: (id) => nodeMap.get(id),
         // O(1) count getters - avoid creating arrays just for length
         get nodeCount() {
             return nodeMap.size;

package/dist/core/graph/types.d.ts CHANGED

@@ -1,4 +1,4 @@
-export type NodeLabel = 'Project' | 'Package' | 'Module' | 'Folder' | 'File' | 'Class' | 'Function' | 'Method' | 'Variable' | 'Interface' | 'Enum' | 'Decorator' | 'Import' | 'Type' | 'CodeElement' | 'Community' | 'Process';
+export type NodeLabel = 'Project' | 'Package' | 'Module' | 'Folder' | 'File' | 'Class' | 'Function' | 'Method' | 'Variable' | 'Interface' | 'Enum' | 'Decorator' | 'Import' | 'Type' | 'CodeElement' | 'Community' | 'Process' | 'Struct' | 'Macro' | 'Typedef' | 'Union' | 'Namespace' | 'Trait' | 'Impl' | 'TypeAlias' | 'Const' | 'Static' | 'Property' | 'Record' | 'Delegate' | 'Annotation' | 'Constructor' | 'Template';
 export type NodeProperties = {
     name: string;
     filePath: string;
@@ -39,8 +39,19 @@ export interface GraphRelationship {
     step?: number;
 }
 export interface KnowledgeGraph {
+    /** Returns a full array copy — prefer iterNodes() for iteration */
     nodes: GraphNode[];
+    /** Returns a full array copy — prefer iterRelationships() for iteration */
     relationships: GraphRelationship[];
+    /** Zero-copy iterator over nodes */
+    iterNodes: () => IterableIterator<GraphNode>;
+    /** Zero-copy iterator over relationships */
+    iterRelationships: () => IterableIterator<GraphRelationship>;
+    /** Zero-copy forEach — avoids iterator protocol overhead in hot loops */
+    forEachNode: (fn: (node: GraphNode) => void) => void;
+    forEachRelationship: (fn: (rel: GraphRelationship) => void) => void;
+    /** Lookup a single node by id — O(1) */
+    getNode: (id: string) => GraphNode | undefined;
     nodeCount: number;
     relationshipCount: number;
     addNode: (node: GraphNode) => void;

package/dist/core/ingestion/call-processor.js CHANGED

@@ -232,38 +232,58 @@ const resolveCallTarget = (calledName, currentFile, symbolTable, importMap) => {
  * Filter out common built-in functions and noise
  * that shouldn't be tracked as calls
  */
-const isBuiltInOrNoise = (name) => {
-    const builtIns = new Set([
-        // JavaScript/TypeScript built-ins
-        'console', 'log', 'warn', 'error', 'info', 'debug',
-        'setTimeout', 'setInterval', 'clearTimeout', 'clearInterval',
-        'parseInt', 'parseFloat', 'isNaN', 'isFinite',
-        'encodeURI', 'decodeURI', 'encodeURIComponent', 'decodeURIComponent',
-        'JSON', 'parse', 'stringify',
-        'Object', 'Array', 'String', 'Number', 'Boolean', 'Symbol', 'BigInt',
-        'Map', 'Set', 'WeakMap', 'WeakSet',
-        'Promise', 'resolve', 'reject', 'then', 'catch', 'finally',
-        'Math', 'Date', 'RegExp', 'Error',
-        'require', 'import', 'export',
-        'fetch', 'Response', 'Request',
-        // React hooks and common functions
-        'useState', 'useEffect', 'useCallback', 'useMemo', 'useRef', 'useContext',
-        'useReducer', 'useLayoutEffect', 'useImperativeHandle', 'useDebugValue',
-        'createElement', 'createContext', 'createRef', 'forwardRef', 'memo', 'lazy',
-        // Common array/object methods
-        'map', 'filter', 'reduce', 'forEach', 'find', 'findIndex', 'some', 'every',
-        'includes', 'indexOf', 'slice', 'splice', 'concat', 'join', 'split',
-        'push', 'pop', 'shift', 'unshift', 'sort', 'reverse',
-        'keys', 'values', 'entries', 'assign', 'freeze', 'seal',
-        'hasOwnProperty', 'toString', 'valueOf',
-        // Python built-ins
-        'print', 'len', 'range', 'str', 'int', 'float', 'list', 'dict', 'set', 'tuple',
-        'open', 'read', 'write', 'close', 'append', 'extend', 'update',
-        'super', 'type', 'isinstance', 'issubclass', 'getattr', 'setattr', 'hasattr',
-        'enumerate', 'zip', 'sorted', 'reversed', 'min', 'max', 'sum', 'abs',
-    ]);
-    return builtIns.has(name);
-};
+/** Pre-built set (module-level singleton) to avoid re-creating per call */
+const BUILT_IN_NAMES = new Set([
+    // JavaScript/TypeScript built-ins
+    'console', 'log', 'warn', 'error', 'info', 'debug',
+    'setTimeout', 'setInterval', 'clearTimeout', 'clearInterval',
+    'parseInt', 'parseFloat', 'isNaN', 'isFinite',
+    'encodeURI', 'decodeURI', 'encodeURIComponent', 'decodeURIComponent',
+    'JSON', 'parse', 'stringify',
+    'Object', 'Array', 'String', 'Number', 'Boolean', 'Symbol', 'BigInt',
+    'Map', 'Set', 'WeakMap', 'WeakSet',
+    'Promise', 'resolve', 'reject', 'then', 'catch', 'finally',
+    'Math', 'Date', 'RegExp', 'Error',
+    'require', 'import', 'export',
+    'fetch', 'Response', 'Request',
+    // React hooks and common functions
+    'useState', 'useEffect', 'useCallback', 'useMemo', 'useRef', 'useContext',
+    'useReducer', 'useLayoutEffect', 'useImperativeHandle', 'useDebugValue',
+    'createElement', 'createContext', 'createRef', 'forwardRef', 'memo', 'lazy',
+    // Common array/object methods
+    'map', 'filter', 'reduce', 'forEach', 'find', 'findIndex', 'some', 'every',
+    'includes', 'indexOf', 'slice', 'splice', 'concat', 'join', 'split',
+    'push', 'pop', 'shift', 'unshift', 'sort', 'reverse',
+    'keys', 'values', 'entries', 'assign', 'freeze', 'seal',
+    'hasOwnProperty', 'toString', 'valueOf',
+    // Python built-ins
+    'print', 'len', 'range', 'str', 'int', 'float', 'list', 'dict', 'set', 'tuple',
+    'open', 'read', 'write', 'close', 'append', 'extend', 'update',
+    'super', 'type', 'isinstance', 'issubclass', 'getattr', 'setattr', 'hasattr',
+    'enumerate', 'zip', 'sorted', 'reversed', 'min', 'max', 'sum', 'abs',
+    // C/C++ standard library and common kernel helpers
+    'printf', 'fprintf', 'sprintf', 'snprintf', 'vprintf', 'vfprintf', 'vsprintf', 'vsnprintf',
+    'scanf', 'fscanf', 'sscanf',
+    'malloc', 'calloc', 'realloc', 'free', 'memcpy', 'memmove', 'memset', 'memcmp',
+    'strlen', 'strcpy', 'strncpy', 'strcat', 'strncat', 'strcmp', 'strncmp', 'strstr', 'strchr', 'strrchr',
+    'atoi', 'atol', 'atof', 'strtol', 'strtoul', 'strtoll', 'strtoull', 'strtod',
+    'sizeof', 'offsetof', 'typeof',
+    'assert', 'abort', 'exit', '_exit',
+    'fopen', 'fclose', 'fread', 'fwrite', 'fseek', 'ftell', 'rewind', 'fflush', 'fgets', 'fputs',
+    // Linux kernel common macros/helpers (not real call targets)
+    'likely', 'unlikely', 'BUG', 'BUG_ON', 'WARN', 'WARN_ON', 'WARN_ONCE',
+    'IS_ERR', 'PTR_ERR', 'ERR_PTR', 'IS_ERR_OR_NULL',
+    'ARRAY_SIZE', 'container_of', 'list_for_each_entry', 'list_for_each_entry_safe',
+    'min', 'max', 'clamp', 'abs', 'swap',
+    'pr_info', 'pr_warn', 'pr_err', 'pr_debug', 'pr_notice', 'pr_crit', 'pr_emerg',
+    'printk', 'dev_info', 'dev_warn', 'dev_err', 'dev_dbg',
+    'GFP_KERNEL', 'GFP_ATOMIC',
+    'spin_lock', 'spin_unlock', 'spin_lock_irqsave', 'spin_unlock_irqrestore',
+    'mutex_lock', 'mutex_unlock', 'mutex_init',
+    'kfree', 'kmalloc', 'kzalloc', 'kcalloc', 'krealloc', 'kvmalloc', 'kvfree',
+    'get', 'put',
+]);
+const isBuiltInOrNoise = (name) => BUILT_IN_NAMES.has(name);
 /**
  * Fast path: resolve pre-extracted call sites from workers.
  * No AST parsing — workers already extracted calledName + sourceId.

package/dist/core/ingestion/community-processor.js CHANGED

@@ -51,23 +51,51 @@ export const getCommunityColor = (communityIndex) => {
  */
 export const processCommunities = async (knowledgeGraph, onProgress) => {
     onProgress?.('Building graph for community detection...', 0);
-    // Step 1: Build a graphology graph from the knowledge graph
-    // We only include symbol nodes (Function, Class, Method) and CALLS edges
-    const graph = buildGraphologyGraph(knowledgeGraph);
+    // Pre-check total symbol count to determine large-graph mode before building
+    let symbolCount = 0;
+    knowledgeGraph.forEachNode(node => {
+        if (node.label === 'Function' || node.label === 'Class' || node.label === 'Method' || node.label === 'Interface') {
+            symbolCount++;
+        }
+    });
+    const isLarge = symbolCount > 10_000;
+    const graph = buildGraphologyGraph(knowledgeGraph, isLarge);
     if (graph.order === 0) {
-        // No nodes to cluster
         return {
             communities: [],
             memberships: [],
             stats: { totalCommunities: 0, modularity: 0, nodesProcessed: 0 }
         };
     }
-    onProgress?.(`Running Leiden algorithm on ${graph.order} nodes...`, 30);
-    // Step 2: Run Leiden algorithm for community detection
-    const details = leiden.detailed(graph, {
-        resolution: 1.0, // Default resolution, can be tuned
-        randomWalk: true,
-    });
+    const nodeCount = graph.order;
+    const edgeCount = graph.size;
+    onProgress?.(`Running Leiden on ${nodeCount} nodes, ${edgeCount} edges${isLarge ? ` (filtered from ${symbolCount} symbols)` : ''}...`, 30);
+    // Large graphs: higher resolution + capped iterations (matching Python leidenalg default of 2).
+    // The first 2 iterations capture ~95%+ of modularity; additional iterations have diminishing returns.
+    // Timeout: abort after 60s for pathological graph structures.
+    const LEIDEN_TIMEOUT_MS = 60_000;
+    let details;
+    try {
+        details = await Promise.race([
+            Promise.resolve(leiden.detailed(graph, {
+                resolution: isLarge ? 2.0 : 1.0,
+                maxIterations: isLarge ? 3 : 0,
+            })),
+            new Promise((_, reject) => setTimeout(() => reject(new Error('Leiden timeout')), LEIDEN_TIMEOUT_MS)),
+        ]);
+    }
+    catch (e) {
+        if (e.message === 'Leiden timeout') {
+            onProgress?.('Community detection timed out, using fallback...', 60);
+            // Fallback: assign all nodes to community 0
+            const communities = {};
+            graph.forEachNode((node) => { communities[node] = 0; });
+            details = { communities, count: 1, modularity: 0 };
+        }
+        else {
+            throw e;
+        }
+    }
     onProgress?.(`Found ${details.count} communities...`, 60);
     // Step 3: Create community nodes with heuristic labels
     const communityNodes = createCommunityNodes(details.communities, details.count, graph, knowledgeGraph);
@@ -95,41 +123,48 @@ export const processCommunities = async (knowledgeGraph, onProgress) => {
 // HELPER: Build graphology graph from knowledge graph
 // ============================================================================
 /**
- * Build a graphology graph containing only symbol nodes and CALLS edges
- * This is what the Leiden algorithm will cluster
+ * Build a graphology graph containing only symbol nodes and clustering edges.
+ * For large graphs (>10K symbols), filter out low-confidence fuzzy-global edges
+ * and degree-1 nodes that add noise and massively increase Leiden runtime.
  */
-const buildGraphologyGraph = (knowledgeGraph) => {
-    // Use undirected graph for Leiden - it looks at edge density, not direction
+const MIN_CONFIDENCE_LARGE = 0.5;
+const buildGraphologyGraph = (knowledgeGraph, isLarge) => {
     const graph = new Graph({ type: 'undirected', allowSelfLoops: false });
-    // Symbol types that should be clustered
     const symbolTypes = new Set(['Function', 'Class', 'Method', 'Interface']);
-    // First pass: collect which nodes participate in clustering edges
     const clusteringRelTypes = new Set(['CALLS', 'EXTENDS', 'IMPLEMENTS']);
     const connectedNodes = new Set();
-    knowledgeGraph.relationships.forEach(rel => {
-        if (clusteringRelTypes.has(rel.type) && rel.sourceId !== rel.targetId) {
-            connectedNodes.add(rel.sourceId);
-            connectedNodes.add(rel.targetId);
-        }
+    const nodeDegree = new Map();
+    knowledgeGraph.forEachRelationship(rel => {
+        if (!clusteringRelTypes.has(rel.type) || rel.sourceId === rel.targetId)
+            return;
+        if (isLarge && rel.confidence < MIN_CONFIDENCE_LARGE)
+            return;
+        connectedNodes.add(rel.sourceId);
+        connectedNodes.add(rel.targetId);
+        nodeDegree.set(rel.sourceId, (nodeDegree.get(rel.sourceId) || 0) + 1);
+        nodeDegree.set(rel.targetId, (nodeDegree.get(rel.targetId) || 0) + 1);
     });
-    // Only add nodes that have at least one clustering edge
-    // Isolated nodes would just become singletons (skipped anyway)
-    knowledgeGraph.nodes.forEach(node => {
-        if (symbolTypes.has(node.label) && connectedNodes.has(node.id)) {
-            graph.addNode(node.id, {
-                name: node.properties.name,
-                filePath: node.properties.filePath,
-                type: node.label,
-            });
-        }
+    knowledgeGraph.forEachNode(node => {
+        if (!symbolTypes.has(node.label) || !connectedNodes.has(node.id))
+            return;
+        // For large graphs, skip degree-1 nodes — they just become singletons or
+        // get absorbed into their single neighbor's community, but cost iteration time.
+        if (isLarge && (nodeDegree.get(node.id) || 0) < 2)
+            return;
+        graph.addNode(node.id, {
+            name: node.properties.name,
+            filePath: node.properties.filePath,
+            type: node.label,
+        });
     });
-    // Add edges
-    knowledgeGraph.relationships.forEach(rel => {
-        if (clusteringRelTypes.has(rel.type)) {
-            if (graph.hasNode(rel.sourceId) && graph.hasNode(rel.targetId) && rel.sourceId !== rel.targetId) {
-                if (!graph.hasEdge(rel.sourceId, rel.targetId)) {
-                    graph.addEdge(rel.sourceId, rel.targetId);
-                }
+    knowledgeGraph.forEachRelationship(rel => {
+        if (!clusteringRelTypes.has(rel.type))
+            return;
+        if (isLarge && rel.confidence < MIN_CONFIDENCE_LARGE)
+            return;
+        if (graph.hasNode(rel.sourceId) && graph.hasNode(rel.targetId) && rel.sourceId !== rel.targetId) {
+            if (!graph.hasEdge(rel.sourceId, rel.targetId)) {
+                graph.addEdge(rel.sourceId, rel.targetId);
             }
         }
     });
@@ -152,11 +187,11 @@ const createCommunityNodes = (communities, communityCount, graph, knowledgeGraph
     });
     // Build node lookup for file paths
     const nodePathMap = new Map();
-    knowledgeGraph.nodes.forEach(node => {
+    for (const node of knowledgeGraph.iterNodes()) {
         if (node.properties.filePath) {
             nodePathMap.set(node.id, node.properties.filePath);
         }
-    });
+    }
     // Create community nodes - SKIP SINGLETONS (isolated nodes)
     const communityNodes = [];
     communityMembers.forEach((memberIds, commNum) => {

package/dist/core/ingestion/filesystem-walker.d.ts CHANGED

@@ -2,4 +2,27 @@ export interface FileEntry {
     path: string;
     content: string;
 }
+/** Lightweight entry — path + size from stat, no content in memory */
+export interface ScannedFile {
+    path: string;
+    size: number;
+}
+/** Path-only reference (for type signatures) */
+export interface FilePath {
+    path: string;
+}
+/**
+ * Phase 1: Scan repository — stat files to get paths + sizes, no content loaded.
+ * Memory: ~10MB for 100K files vs ~1GB+ with content.
+ */
+export declare const walkRepositoryPaths: (repoPath: string, onProgress?: (current: number, total: number, filePath: string) => void) => Promise<ScannedFile[]>;
+/**
+ * Phase 2: Read file contents for a specific set of relative paths.
+ * Returns a Map for O(1) lookup. Silently skips files that fail to read.
+ */
+export declare const readFileContents: (repoPath: string, relativePaths: string[]) => Promise<Map<string, string>>;
+/**
+ * Legacy API — scans and reads everything into memory.
+ * Used by sequential fallback path only.
+ */
 export declare const walkRepository: (repoPath: string, onProgress?: (current: number, total: number, filePath: string) => void) => Promise<FileEntry[]>;

package/dist/core/ingestion/filesystem-walker.js CHANGED

@@ -5,7 +5,11 @@ import { shouldIgnorePath } from '../../config/ignore-service.js';
 const READ_CONCURRENCY = 32;
 /** Skip files larger than 512KB — they're usually generated/vendored and crash tree-sitter */
 const MAX_FILE_SIZE = 512 * 1024;
-export const walkRepository = async (repoPath, onProgress) => {
+/**
+ * Phase 1: Scan repository — stat files to get paths + sizes, no content loaded.
+ * Memory: ~10MB for 100K files vs ~1GB+ with content.
+ */
+export const walkRepositoryPaths = async (repoPath, onProgress) => {
     const files = await glob('**/*', {
         cwd: repoPath,
         nodir: true,
@@ -24,8 +28,7 @@ export const walkRepository = async (repoPath, onProgress) => {
                 skippedLarge++;
                 return null;
             }
-            const content = await fs.readFile(fullPath, 'utf-8');
-            return { path: relativePath.replace(/\\/g, '/'), content };
+            return { path: relativePath.replace(/\\/g, '/'), size: stat.size };
         }));
         for (const result of results) {
             processed++;
@@ -43,3 +46,35 @@ export const walkRepository = async (repoPath, onProgress) => {
     }
     return entries;
 };
+/**
+ * Phase 2: Read file contents for a specific set of relative paths.
+ * Returns a Map for O(1) lookup. Silently skips files that fail to read.
+ */
+export const readFileContents = async (repoPath, relativePaths) => {
+    const contents = new Map();
+    for (let start = 0; start < relativePaths.length; start += READ_CONCURRENCY) {
+        const batch = relativePaths.slice(start, start + READ_CONCURRENCY);
+        const results = await Promise.allSettled(batch.map(async (relativePath) => {
+            const fullPath = path.join(repoPath, relativePath);
+            const content = await fs.readFile(fullPath, 'utf-8');
+            return { path: relativePath, content };
+        }));
+        for (const result of results) {
+            if (result.status === 'fulfilled') {
+                contents.set(result.value.path, result.value.content);
+            }
+        }
+    }
+    return contents;
+};
+/**
+ * Legacy API — scans and reads everything into memory.
+ * Used by sequential fallback path only.
+ */
+export const walkRepository = async (repoPath, onProgress) => {
+    const scanned = await walkRepositoryPaths(repoPath, onProgress);
+    const contents = await readFileContents(repoPath, scanned.map(f => f.path));
+    return scanned
+        .filter(f => contents.has(f.path))
+        .map(f => ({ path: f.path, content: contents.get(f.path) }));
+};

package/dist/core/ingestion/import-processor.d.ts CHANGED

@@ -3,6 +3,15 @@ import { ASTCache } from './ast-cache.js';
 import type { ExtractedImport } from './workers/parse-worker.js';
 export type ImportMap = Map<string, Set<string>>;
 export declare const createImportMap: () => ImportMap;
+/** Pre-built lookup structures for import resolution. Build once, reuse across chunks. */
+export interface ImportResolutionContext {
+    allFilePaths: Set<string>;
+    allFileList: string[];
+    normalizedFileList: string[];
+    suffixIndex: SuffixIndex;
+    resolveCache: Map<string, string | null>;
+}
+export declare function buildImportResolutionContext(allPaths: string[]): ImportResolutionContext;
 /**
  * Build a suffix index for O(1) endsWith lookups.
  * Maps every possible path suffix to its original file path.
@@ -23,8 +32,7 @@ export interface SuffixIndex {
 export declare const processImports: (graph: KnowledgeGraph, files: {
     path: string;
     content: string;
-}[], astCache: ASTCache, importMap: ImportMap, onProgress?: (current: number, total: number) => void, repoRoot?: string) => Promise<void>;
+}[], astCache: ASTCache, importMap: ImportMap, onProgress?: (current: number, total: number) => void, repoRoot?: string, allPaths?: string[]) => Promise<void>;
 export declare const processImportsFromExtracted: (graph: KnowledgeGraph, files: {
     path: string;
-    content: string;
-}[], extractedImports: ExtractedImport[], importMap: ImportMap, onProgress?: (current: number, total: number) => void, repoRoot?: string) => Promise<void>;
+}[], extractedImports: ExtractedImport[], importMap: ImportMap, onProgress?: (current: number, total: number) => void, repoRoot?: string, prebuiltCtx?: ImportResolutionContext) => Promise<void>;