npm - gitnexus - Versions diffs - 1.2.9 → 1.3.0 - Mend

gitnexus 1.2.9 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25)

package/dist/cli/analyze.js +69 -28
package/dist/cli/index.js +20 -0
package/dist/core/graph/graph.js +5 -0
package/dist/core/graph/types.d.ts +12 -1
package/dist/core/ingestion/call-processor.js +52 -32
package/dist/core/ingestion/community-processor.js +75 -40
package/dist/core/ingestion/filesystem-walker.d.ts +23 -0
package/dist/core/ingestion/filesystem-walker.js +38 -3
package/dist/core/ingestion/import-processor.d.ts +11 -3
package/dist/core/ingestion/import-processor.js +27 -11
package/dist/core/ingestion/parsing-processor.js +2 -4
package/dist/core/ingestion/pipeline.js +142 -135
package/dist/core/ingestion/process-processor.js +12 -11
package/dist/core/ingestion/workers/parse-worker.js +67 -6
package/dist/core/ingestion/workers/worker-pool.d.ts +3 -9
package/dist/core/ingestion/workers/worker-pool.js +39 -18
package/dist/core/kuzu/csv-generator.d.ts +15 -8
package/dist/core/kuzu/csv-generator.js +258 -196
package/dist/core/kuzu/kuzu-adapter.d.ts +1 -4
package/dist/core/kuzu/kuzu-adapter.js +75 -63
package/dist/core/kuzu/schema.d.ts +1 -1
package/dist/core/kuzu/schema.js +10 -0
package/dist/types/pipeline.d.ts +6 -2
package/dist/types/pipeline.js +6 -4
package/package.json +1 -1

package/dist/core/ingestion/import-processor.js CHANGED Viewed

@@ -8,6 +8,16 @@ import { getLanguageFromFilename, yieldToEventLoop } from './utils.js';
 import { SupportedLanguages } from '../../config/supported-languages.js';
 const isDev = process.env.NODE_ENV === 'development';
 export const createImportMap = () => new Map();
+/** Max entries in the resolve cache. Once the cap is reached, the oldest 20% of entries are
+ *  evicted to bound memory. 100K entries ≈ 15MB — covers the most common import patterns. */
+const RESOLVE_CACHE_CAP = 100_000;
+export function buildImportResolutionContext(allPaths) {
+    const allFileList = allPaths;
+    const normalizedFileList = allFileList.map(p => p.replace(/\\/g, '/'));
+    const allFilePaths = new Set(allFileList);
+    const suffixIndex = buildSuffixIndex(normalizedFileList, allFileList);
+    return { allFilePaths, allFileList, normalizedFileList, suffixIndex, resolveCache: new Map() };
+}
 /**
  * Parse tsconfig.json to extract path aliases.
  * Tries tsconfig.json, tsconfig.app.json, tsconfig.base.json in order.
@@ -196,6 +206,16 @@ const resolveImportPath = (currentFile, importPath, allFiles, allFileList, norma
     if (resolveCache.has(cacheKey))
         return resolveCache.get(cacheKey) ?? null;
     const cache = (result) => {
+        // Evict oldest 20% when cap is reached instead of clearing all
+        if (resolveCache.size >= RESOLVE_CACHE_CAP) {
+            const evictCount = Math.floor(RESOLVE_CACHE_CAP * 0.2);
+            const iter = resolveCache.keys();
+            for (let i = 0; i < evictCount; i++) {
+                const key = iter.next().value;
+                if (key !== undefined)
+                    resolveCache.delete(key);
+            }
+        }
         resolveCache.set(cacheKey, result);
         return result;
     };
@@ -429,12 +449,12 @@ function resolveGoPackage(importPath, goModule, normalizedFileList, allFileList)
 // ============================================================================
 // MAIN IMPORT PROCESSOR
 // ============================================================================
-export const processImports = async (graph, files, astCache, importMap, onProgress, repoRoot) => {
-    // Create a Set of all file paths for fast lookup during resolution
-    const allFilePaths = new Set(files.map(f => f.path));
+export const processImports = async (graph, files, astCache, importMap, onProgress, repoRoot, allPaths) => {
+    // Use allPaths (full repo) when available for cross-chunk resolution, else fall back to chunk files
+    const allFileList = allPaths ?? files.map(f => f.path);
+    const allFilePaths = new Set(allFileList);
     const parser = await loadParser();
     const resolveCache = new Map();
-    const allFileList = files.map(f => f.path);
     // Pre-compute normalized file list once (forward slashes)
     const normalizedFileList = allFileList.map(p => p.replace(/\\/g, '/'));
     // Build suffix index for O(1) lookups
@@ -573,13 +593,9 @@ export const processImports = async (graph, files, astCache, importMap, onProgre
 // ============================================================================
 // FAST PATH: Resolve pre-extracted imports (no parsing needed)
 // ============================================================================
-export const processImportsFromExtracted = async (graph, files, extractedImports, importMap, onProgress, repoRoot) => {
-    const allFilePaths = new Set(files.map(f => f.path));
-    const resolveCache = new Map();
-    const allFileList = files.map(f => f.path);
-    const normalizedFileList = allFileList.map(p => p.replace(/\\/g, '/'));
-    // Build suffix index for O(1) lookups
-    const index = buildSuffixIndex(normalizedFileList, allFileList);
+export const processImportsFromExtracted = async (graph, files, extractedImports, importMap, onProgress, repoRoot, prebuiltCtx) => {
+    const ctx = prebuiltCtx ?? buildImportResolutionContext(files.map(f => f.path));
+    const { allFilePaths, allFileList, normalizedFileList, suffixIndex: index, resolveCache } = ctx;
     let totalImportsFound = 0;
     let totalImportsResolved = 0;
     const effectiveRoot = repoRoot || '';

package/dist/core/ingestion/parsing-processor.js CHANGED Viewed

@@ -106,15 +106,13 @@ const processParsingWithWorkers = async (graph, files, symbolTable, astCache, wo
     const parseableFiles = [];
     for (const file of files) {
         const lang = getLanguageFromFilename(file.path);
-        if (lang) {
+        if (lang)
             parseableFiles.push({ path: file.path, content: file.content });
-        }
     }
     if (parseableFiles.length === 0)
         return { imports: [], calls: [], heritage: [] };
     const total = files.length;
-    // Dispatch to worker pool — pool handles splitting into chunks
-    // Workers send progress messages during parsing so the bar updates smoothly
+    // Dispatch to worker pool — pool handles splitting into chunks and sub-batching
     const chunkResults = await workerPool.dispatch(parseableFiles, (filesProcessed) => {
         onFileProgress?.(Math.min(filesProcessed, total), total, 'Parsing...');
     });

package/dist/core/ingestion/pipeline.js CHANGED Viewed

@@ -1,34 +1,41 @@
 import { createKnowledgeGraph } from '../graph/graph.js';
 import { processStructure } from './structure-processor.js';
 import { processParsing } from './parsing-processor.js';
-import { processImports, processImportsFromExtracted, createImportMap } from './import-processor.js';
+import { processImports, processImportsFromExtracted, createImportMap, buildImportResolutionContext } from './import-processor.js';
 import { processCalls, processCallsFromExtracted } from './call-processor.js';
 import { processHeritage, processHeritageFromExtracted } from './heritage-processor.js';
 import { processCommunities } from './community-processor.js';
 import { processProcesses } from './process-processor.js';
 import { createSymbolTable } from './symbol-table.js';
 import { createASTCache } from './ast-cache.js';
-import { walkRepository } from './filesystem-walker.js';
+import { walkRepositoryPaths, readFileContents } from './filesystem-walker.js';
+import { getLanguageFromFilename } from './utils.js';
 import { createWorkerPool } from './workers/worker-pool.js';
 const isDev = process.env.NODE_ENV === 'development';
+/** Max bytes of source content to load per parse chunk. Each chunk's source +
+ *  parsed ASTs + extracted records + worker serialization overhead all live in
+ *  memory simultaneously, so this must be conservative. 20MB source ≈ 200-400MB
+ *  peak working memory per chunk after parse expansion. */
+const CHUNK_BYTE_BUDGET = 20 * 1024 * 1024; // 20MB
+/** Max AST trees to keep in LRU cache */
+const AST_CACHE_CAP = 50;
 export const runPipelineFromRepo = async (repoPath, onProgress) => {
     const graph = createKnowledgeGraph();
-    const fileContents = new Map();
     const symbolTable = createSymbolTable();
-    // AST cache sized after file scan — start with a placeholder, resize after we know file count
-    let astCache = createASTCache(50);
+    let astCache = createASTCache(AST_CACHE_CAP);
     const importMap = createImportMap();
     const cleanup = () => {
         astCache.clear();
         symbolTable.clear();
     };
     try {
+        // ── Phase 1: Scan paths only (no content read) ─────────────────────
         onProgress({
             phase: 'extracting',
             percent: 0,
             message: 'Scanning repository...',
         });
-        const files = await walkRepository(repoPath, (current, total, filePath) => {
+        const scannedFiles = await walkRepositoryPaths(repoPath, (current, total, filePath) => {
             const scanProgress = Math.round((current / total) * 15);
             onProgress({
                 phase: 'extracting',
@@ -38,167 +45,165 @@ export const runPipelineFromRepo = async (repoPath, onProgress) => {
                 stats: { filesProcessed: current, totalFiles: total, nodesCreated: graph.nodeCount },
             });
         });
-        files.forEach(f => fileContents.set(f.path, f.content));
-        // Resize AST cache to fit all files — avoids re-parsing in import/call/heritage phases
-        astCache = createASTCache(files.length);
+        const totalFiles = scannedFiles.length;
         onProgress({
             phase: 'extracting',
             percent: 15,
             message: 'Repository scanned successfully',
-            stats: { filesProcessed: files.length, totalFiles: files.length, nodesCreated: graph.nodeCount },
+            stats: { filesProcessed: totalFiles, totalFiles, nodesCreated: graph.nodeCount },
         });
+        // ── Phase 2: Structure (paths only — no content needed) ────────────
         onProgress({
             phase: 'structure',
             percent: 15,
             message: 'Analyzing project structure...',
-            stats: { filesProcessed: 0, totalFiles: files.length, nodesCreated: graph.nodeCount },
+            stats: { filesProcessed: 0, totalFiles, nodesCreated: graph.nodeCount },
         });
-        const filePaths = files.map(f => f.path);
-        processStructure(graph, filePaths);
+        const allPaths = scannedFiles.map(f => f.path);
+        processStructure(graph, allPaths);
         onProgress({
             phase: 'structure',
-            percent: 30,
+            percent: 20,
             message: 'Project structure analyzed',
-            stats: { filesProcessed: files.length, totalFiles: files.length, nodesCreated: graph.nodeCount },
+            stats: { filesProcessed: totalFiles, totalFiles, nodesCreated: graph.nodeCount },
         });
+        // ── Phase 3+4: Chunked read + parse ────────────────────────────────
+        // Group parseable files into byte-budget chunks so only ~20MB of source
+        // is in memory at a time. Each chunk is: read → parse → extract → free.
+        const parseableScanned = scannedFiles.filter(f => getLanguageFromFilename(f.path));
+        const totalParseable = parseableScanned.length;
+        // Build byte-budget chunks
+        const chunks = [];
+        let currentChunk = [];
+        let currentBytes = 0;
+        for (const file of parseableScanned) {
+            if (currentChunk.length > 0 && currentBytes + file.size > CHUNK_BYTE_BUDGET) {
+                chunks.push(currentChunk);
+                currentChunk = [];
+                currentBytes = 0;
+            }
+            currentChunk.push(file.path);
+            currentBytes += file.size;
+        }
+        if (currentChunk.length > 0)
+            chunks.push(currentChunk);
+        const numChunks = chunks.length;
+        if (isDev) {
+            const totalMB = parseableScanned.reduce((s, f) => s + f.size, 0) / (1024 * 1024);
+            console.log(`📂 Scan: ${totalFiles} paths, ${totalParseable} parseable (${totalMB.toFixed(0)}MB), ${numChunks} chunks @ ${CHUNK_BYTE_BUDGET / (1024 * 1024)}MB budget`);
+        }
         onProgress({
             phase: 'parsing',
-            percent: 30,
-            message: 'Parsing code definitions...',
-            stats: { filesProcessed: 0, totalFiles: files.length, nodesCreated: graph.nodeCount },
+            percent: 20,
+            message: `Parsing ${totalParseable} files in ${numChunks} chunk${numChunks !== 1 ? 's' : ''}...`,
+            stats: { filesProcessed: 0, totalFiles: totalParseable, nodesCreated: graph.nodeCount },
         });
-        // Create worker pool for parallel parsing, with graceful fallback
+        // Create worker pool once, reuse across chunks
         let workerPool;
         try {
             const workerUrl = new URL('./workers/parse-worker.js', import.meta.url);
             workerPool = createWorkerPool(workerUrl);
         }
         catch (err) {
-            // Worker pool creation failed (e.g., single core) — sequential fallback
+            // Worker pool creation failed — sequential fallback
         }
-        let workerData = null;
+        let filesParsedSoFar = 0;
+        // AST cache sized for one chunk (sequential fallback uses it for import/call/heritage)
+        const maxChunkFiles = chunks.reduce((max, c) => Math.max(max, c.length), 0);
+        astCache = createASTCache(maxChunkFiles);
+        // Build import resolution context once — suffix index, file lists, resolve cache.
+        // Reused across all chunks to avoid rebuilding O(files × path_depth) structures.
+        const importCtx = buildImportResolutionContext(allPaths);
+        const allPathObjects = allPaths.map(p => ({ path: p }));
+        // Single-pass: parse + resolve imports/calls/heritage per chunk.
+        // Calls/heritage use the symbol table built so far (symbols from earlier chunks
+        // are already registered). This trades ~5% cross-chunk resolution accuracy for
+        // 200-400MB less memory — critical for Linux-kernel-scale repos.
+        const sequentialChunkPaths = [];
         try {
-            workerData = await processParsing(graph, files, symbolTable, astCache, (current, total, filePath) => {
-                const parsingProgress = 30 + ((current / total) * 40);
-                onProgress({
-                    phase: 'parsing',
-                    percent: Math.round(parsingProgress),
-                    message: 'Parsing code definitions...',
-                    detail: filePath,
-                    stats: { filesProcessed: current, totalFiles: total, nodesCreated: graph.nodeCount },
-                });
-            }, workerPool);
+            for (let chunkIdx = 0; chunkIdx < numChunks; chunkIdx++) {
+                const chunkPaths = chunks[chunkIdx];
+                // Read content for this chunk only
+                const chunkContents = await readFileContents(repoPath, chunkPaths);
+                const chunkFiles = chunkPaths
+                    .filter(p => chunkContents.has(p))
+                    .map(p => ({ path: p, content: chunkContents.get(p) }));
+                // Parse this chunk (workers or sequential fallback)
+                const chunkWorkerData = await processParsing(graph, chunkFiles, symbolTable, astCache, (current, _total, filePath) => {
+                    const globalCurrent = filesParsedSoFar + current;
+                    const parsingProgress = 20 + ((globalCurrent / totalParseable) * 62);
+                    onProgress({
+                        phase: 'parsing',
+                        percent: Math.round(parsingProgress),
+                        message: `Parsing chunk ${chunkIdx + 1}/${numChunks}...`,
+                        detail: filePath,
+                        stats: { filesProcessed: globalCurrent, totalFiles: totalParseable, nodesCreated: graph.nodeCount },
+                    });
+                }, workerPool);
+                if (chunkWorkerData) {
+                    // Imports
+                    await processImportsFromExtracted(graph, allPathObjects, chunkWorkerData.imports, importMap, undefined, repoPath, importCtx);
+                    // Calls — resolve immediately, then free the array
+                    if (chunkWorkerData.calls.length > 0) {
+                        await processCallsFromExtracted(graph, chunkWorkerData.calls, symbolTable, importMap);
+                    }
+                    // Heritage — resolve immediately, then free
+                    if (chunkWorkerData.heritage.length > 0) {
+                        await processHeritageFromExtracted(graph, chunkWorkerData.heritage, symbolTable);
+                    }
+                }
+                else {
+                    await processImports(graph, chunkFiles, astCache, importMap, undefined, repoPath, allPaths);
+                    sequentialChunkPaths.push(chunkPaths);
+                }
+                filesParsedSoFar += chunkFiles.length;
+                // Clear AST cache between chunks to free memory
+                astCache.clear();
+                // chunkContents + chunkFiles + chunkWorkerData go out of scope → GC reclaims
+            }
         }
         finally {
             await workerPool?.terminate();
         }
-        onProgress({
-            phase: 'imports',
-            percent: 70,
-            message: 'Resolving imports...',
-            stats: { filesProcessed: 0, totalFiles: files.length, nodesCreated: graph.nodeCount },
-        });
-        if (workerData) {
-            // Fast path: imports already extracted by workers, just resolve paths
-            await processImportsFromExtracted(graph, files, workerData.imports, importMap, (current, total) => {
-                const importProgress = 70 + ((current / total) * 12);
-                onProgress({
-                    phase: 'imports',
-                    percent: Math.round(importProgress),
-                    message: 'Resolving imports...',
-                    stats: { filesProcessed: current, totalFiles: total, nodesCreated: graph.nodeCount },
-                });
-            }, repoPath);
-        }
-        else {
-            // Fallback: full parse + resolve (sequential path)
-            await processImports(graph, files, astCache, importMap, (current, total) => {
-                const importProgress = 70 + ((current / total) * 12);
-                onProgress({
-                    phase: 'imports',
-                    percent: Math.round(importProgress),
-                    message: 'Resolving imports...',
-                    stats: { filesProcessed: current, totalFiles: total, nodesCreated: graph.nodeCount },
-                });
-            }, repoPath);
+        // Sequential fallback chunks: re-read source for call/heritage resolution
+        for (const chunkPaths of sequentialChunkPaths) {
+            const chunkContents = await readFileContents(repoPath, chunkPaths);
+            const chunkFiles = chunkPaths
+                .filter(p => chunkContents.has(p))
+                .map(p => ({ path: p, content: chunkContents.get(p) }));
+            astCache = createASTCache(chunkFiles.length);
+            await processCalls(graph, chunkFiles, astCache, symbolTable, importMap);
+            await processHeritage(graph, chunkFiles, astCache, symbolTable);
+            astCache.clear();
         }
+        // Free import resolution context — suffix index + resolve cache no longer needed
+        // (allPathObjects and importCtx hold ~94MB+ for large repos)
+        allPathObjects.length = 0;
+        importCtx.resolveCache.clear();
+        importCtx.suffixIndex = null;
+        importCtx.normalizedFileList = null;
         if (isDev) {
-            const importsCount = graph.relationships.filter(r => r.type === 'IMPORTS').length;
-            console.log(`📊 Pipeline: After import phase, graph has ${importsCount} IMPORTS relationships (total: ${graph.relationshipCount})`);
-        }
-        onProgress({
-            phase: 'calls',
-            percent: 82,
-            message: 'Tracing function calls...',
-            stats: { filesProcessed: 0, totalFiles: files.length, nodesCreated: graph.nodeCount },
-        });
-        if (workerData) {
-            // Fast path: calls already extracted by workers, just resolve targets
-            await processCallsFromExtracted(graph, workerData.calls, symbolTable, importMap, (current, total) => {
-                const callProgress = 82 + ((current / total) * 10);
-                onProgress({
-                    phase: 'calls',
-                    percent: Math.round(callProgress),
-                    message: 'Tracing function calls...',
-                    stats: { filesProcessed: current, totalFiles: total, nodesCreated: graph.nodeCount },
-                });
-            });
-        }
-        else {
-            // Fallback: full parse + resolve (sequential path)
-            await processCalls(graph, files, astCache, symbolTable, importMap, (current, total) => {
-                const callProgress = 82 + ((current / total) * 10);
-                onProgress({
-                    phase: 'calls',
-                    percent: Math.round(callProgress),
-                    message: 'Tracing function calls...',
-                    stats: { filesProcessed: current, totalFiles: total, nodesCreated: graph.nodeCount },
-                });
-            });
-        }
-        onProgress({
-            phase: 'heritage',
-            percent: 92,
-            message: 'Extracting class inheritance...',
-            stats: { filesProcessed: 0, totalFiles: files.length, nodesCreated: graph.nodeCount },
-        });
-        if (workerData) {
-            // Fast path: heritage already extracted by workers, just resolve symbols
-            await processHeritageFromExtracted(graph, workerData.heritage, symbolTable, (current, total) => {
-                const heritageProgress = 88 + ((current / total) * 4);
-                onProgress({
-                    phase: 'heritage',
-                    percent: Math.round(heritageProgress),
-                    message: 'Extracting class inheritance...',
-                    stats: { filesProcessed: current, totalFiles: total, nodesCreated: graph.nodeCount },
-                });
-            });
-        }
-        else {
-            // Fallback: full parse + resolve (sequential path)
-            await processHeritage(graph, files, astCache, symbolTable, (current, total) => {
-                const heritageProgress = 88 + ((current / total) * 4);
-                onProgress({
-                    phase: 'heritage',
-                    percent: Math.round(heritageProgress),
-                    message: 'Extracting class inheritance...',
-                    stats: { filesProcessed: current, totalFiles: total, nodesCreated: graph.nodeCount },
-                });
-            });
+            let importsCount = 0;
+            for (const r of graph.iterRelationships()) {
+                if (r.type === 'IMPORTS')
+                    importsCount++;
+            }
+            console.log(`📊 Pipeline: graph has ${importsCount} IMPORTS, ${graph.relationshipCount} total relationships`);
         }
+        // ── Phase 5: Communities ───────────────────────────────────────────
         onProgress({
             phase: 'communities',
-            percent: 92,
+            percent: 82,
             message: 'Detecting code communities...',
-            stats: { filesProcessed: files.length, totalFiles: files.length, nodesCreated: graph.nodeCount },
+            stats: { filesProcessed: totalFiles, totalFiles, nodesCreated: graph.nodeCount },
         });
         const communityResult = await processCommunities(graph, (message, progress) => {
-            const communityProgress = 92 + (progress * 0.06);
+            const communityProgress = 82 + (progress * 0.10);
             onProgress({
                 phase: 'communities',
                 percent: Math.round(communityProgress),
                 message,
-                stats: { filesProcessed: files.length, totalFiles: files.length, nodesCreated: graph.nodeCount },
+                stats: { filesProcessed: totalFiles, totalFiles, nodesCreated: graph.nodeCount },
             });
         });
         if (isDev) {
@@ -227,22 +232,24 @@ export const runPipelineFromRepo = async (repoPath, onProgress) => {
                 reason: 'leiden-algorithm',
             });
         });
+        // ── Phase 6: Processes ─────────────────────────────────────────────
         onProgress({
             phase: 'processes',
-            percent: 98,
+            percent: 94,
             message: 'Detecting execution flows...',
-            stats: { filesProcessed: files.length, totalFiles: files.length, nodesCreated: graph.nodeCount },
+            stats: { filesProcessed: totalFiles, totalFiles, nodesCreated: graph.nodeCount },
         });
-        // Dynamic process cap based on codebase size
-        const symbolCount = graph.nodes.filter(n => n.label !== 'File').length;
+        let symbolCount = 0;
+        graph.forEachNode(n => { if (n.label !== 'File')
+            symbolCount++; });
         const dynamicMaxProcesses = Math.max(20, Math.min(300, Math.round(symbolCount / 10)));
         const processResult = await processProcesses(graph, communityResult.memberships, (message, progress) => {
-            const processProgress = 98 + (progress * 0.01);
+            const processProgress = 94 + (progress * 0.05);
             onProgress({
                 phase: 'processes',
                 percent: Math.round(processProgress),
                 message,
-                stats: { filesProcessed: files.length, totalFiles: files.length, nodesCreated: graph.nodeCount },
+                stats: { filesProcessed: totalFiles, totalFiles, nodesCreated: graph.nodeCount },
             });
         }, { maxProcesses: dynamicMaxProcesses, minSteps: 3 });
         if (isDev) {
@@ -280,13 +287,13 @@ export const runPipelineFromRepo = async (repoPath, onProgress) => {
             percent: 100,
             message: `Graph complete! ${communityResult.stats.totalCommunities} communities, ${processResult.stats.totalProcesses} processes detected.`,
             stats: {
-                filesProcessed: files.length,
-                totalFiles: files.length,
+                filesProcessed: totalFiles,
+                totalFiles,
                 nodesCreated: graph.nodeCount
             },
         });
         astCache.clear();
-        return { graph, fileContents, communityResult, processResult };
+        return { graph, repoPath, totalFileCount: totalFiles, communityResult, processResult };
     }
     catch (error) {
         cleanup();

package/dist/core/ingestion/process-processor.js CHANGED Viewed

@@ -34,7 +34,8 @@ export const processProcesses = async (knowledgeGraph, memberships, onProgress,
     const callsEdges = buildCallsGraph(knowledgeGraph);
     const reverseCallsEdges = buildReverseCallsGraph(knowledgeGraph);
     const nodeMap = new Map();
-    knowledgeGraph.nodes.forEach(n => nodeMap.set(n.id, n));
+    for (const n of knowledgeGraph.iterNodes())
+        nodeMap.set(n.id, n);
     // Step 1: Find entry points (functions that call others but have few callers)
     const entryPoints = findEntryPoints(knowledgeGraph, reverseCallsEdges, callsEdges);
     onProgress?.(`Found ${entryPoints.length} entry points, tracing flows...`, 20);
@@ -129,26 +130,26 @@ export const processProcesses = async (knowledgeGraph, memberships, onProgress,
 const MIN_TRACE_CONFIDENCE = 0.5;
 const buildCallsGraph = (graph) => {
     const adj = new Map();
-    graph.relationships.forEach(rel => {
+    for (const rel of graph.iterRelationships()) {
         if (rel.type === 'CALLS' && rel.confidence >= MIN_TRACE_CONFIDENCE) {
             if (!adj.has(rel.sourceId)) {
                 adj.set(rel.sourceId, []);
             }
             adj.get(rel.sourceId).push(rel.targetId);
         }
-    });
+    }
     return adj;
 };
 const buildReverseCallsGraph = (graph) => {
     const adj = new Map();
-    graph.relationships.forEach(rel => {
+    for (const rel of graph.iterRelationships()) {
         if (rel.type === 'CALLS' && rel.confidence >= MIN_TRACE_CONFIDENCE) {
             if (!adj.has(rel.targetId)) {
                 adj.set(rel.targetId, []);
             }
             adj.get(rel.targetId).push(rel.sourceId);
         }
-    });
+    }
     return adj;
 };
 /**
@@ -164,32 +165,32 @@ const buildReverseCallsGraph = (graph) => {
 const findEntryPoints = (graph, reverseCallsEdges, callsEdges) => {
     const symbolTypes = new Set(['Function', 'Method']);
     const entryPointCandidates = [];
-    graph.nodes.forEach(node => {
+    for (const node of graph.iterNodes()) {
         if (!symbolTypes.has(node.label))
-            return;
+            continue;
         const filePath = node.properties.filePath || '';
         // Skip test files entirely
         if (isTestFile(filePath))
-            return;
+            continue;
         const callers = reverseCallsEdges.get(node.id) || [];
         const callees = callsEdges.get(node.id) || [];
         // Must have at least 1 outgoing call to trace forward
         if (callees.length === 0)
-            return;
+            continue;
         // Calculate entry point score using new scoring system
         const { score, reasons } = calculateEntryPointScore(node.properties.name, node.properties.language || 'javascript', node.properties.isExported ?? false, callers.length, callees.length, filePath // Pass filePath for framework detection
         );
         if (score > 0) {
             entryPointCandidates.push({ id: node.id, score, reasons });
         }
-    });
+    }
     // Sort by score descending and return top candidates
     const sorted = entryPointCandidates.sort((a, b) => b.score - a.score);
     // DEBUG: Log top candidates with new scoring details
     if (sorted.length > 0 && isDev) {
         console.log(`[Process] Top 10 entry point candidates (new scoring):`);
         sorted.slice(0, 10).forEach((c, i) => {
-            const node = graph.nodes.find(n => n.id === c.id);
+            const node = graph.getNode(c.id);
             const exported = node?.properties.isExported ? '✓' : '✗';
             const shortPath = node?.properties.filePath?.split('/').slice(-2).join('/') || '';
             console.log(`  ${i + 1}. ${node?.properties.name} [exported:${exported}] (${shortPath})`);

package/dist/core/ingestion/workers/parse-worker.js CHANGED Viewed

@@ -171,6 +171,7 @@ const findEnclosingFunctionId = (node, filePath) => {
     return null;
 };
 const BUILT_INS = new Set([
+    // JavaScript/TypeScript
     'console', 'log', 'warn', 'error', 'info', 'debug',
     'setTimeout', 'setInterval', 'clearTimeout', 'clearInterval',
     'parseInt', 'parseFloat', 'isNaN', 'isFinite',
@@ -189,10 +190,32 @@ const BUILT_INS = new Set([
     'push', 'pop', 'shift', 'unshift', 'sort', 'reverse',
     'keys', 'values', 'entries', 'assign', 'freeze', 'seal',
     'hasOwnProperty', 'toString', 'valueOf',
+    // Python
     'print', 'len', 'range', 'str', 'int', 'float', 'list', 'dict', 'set', 'tuple',
     'open', 'read', 'write', 'close', 'append', 'extend', 'update',
     'super', 'type', 'isinstance', 'issubclass', 'getattr', 'setattr', 'hasattr',
     'enumerate', 'zip', 'sorted', 'reversed', 'min', 'max', 'sum', 'abs',
+    // C/C++ standard library
+    'printf', 'fprintf', 'sprintf', 'snprintf', 'vprintf', 'vfprintf', 'vsprintf', 'vsnprintf',
+    'scanf', 'fscanf', 'sscanf',
+    'malloc', 'calloc', 'realloc', 'free', 'memcpy', 'memmove', 'memset', 'memcmp',
+    'strlen', 'strcpy', 'strncpy', 'strcat', 'strncat', 'strcmp', 'strncmp', 'strstr', 'strchr', 'strrchr',
+    'atoi', 'atol', 'atof', 'strtol', 'strtoul', 'strtoll', 'strtoull', 'strtod',
+    'sizeof', 'offsetof', 'typeof',
+    'assert', 'abort', 'exit', '_exit',
+    'fopen', 'fclose', 'fread', 'fwrite', 'fseek', 'ftell', 'rewind', 'fflush', 'fgets', 'fputs',
+    // Linux kernel common macros/helpers (not real call targets)
+    'likely', 'unlikely', 'BUG', 'BUG_ON', 'WARN', 'WARN_ON', 'WARN_ONCE',
+    'IS_ERR', 'PTR_ERR', 'ERR_PTR', 'IS_ERR_OR_NULL',
+    'ARRAY_SIZE', 'container_of', 'list_for_each_entry', 'list_for_each_entry_safe',
+    'min', 'max', 'clamp', 'abs', 'swap',
+    'pr_info', 'pr_warn', 'pr_err', 'pr_debug', 'pr_notice', 'pr_crit', 'pr_emerg',
+    'printk', 'dev_info', 'dev_warn', 'dev_err', 'dev_dbg',
+    'GFP_KERNEL', 'GFP_ATOMIC',
+    'spin_lock', 'spin_unlock', 'spin_lock_irqsave', 'spin_unlock_irqrestore',
+    'mutex_lock', 'mutex_unlock', 'mutex_init',
+    'kfree', 'kmalloc', 'kzalloc', 'kcalloc', 'krealloc', 'kvmalloc', 'kvfree',
+    'get', 'put',
 ]);
 // ============================================================================
 // Label detection from capture map
@@ -444,14 +467,52 @@ const processFileGroup = (files, language, queryString, result, onFileProcessed)
     }
 };
 // ============================================================================
-// Worker message handler
+// Worker message handler — supports sub-batch streaming
 // ============================================================================
-parentPort.on('message', (files) => {
+/** Accumulated result across sub-batches */
+let accumulated = {
+    nodes: [], relationships: [], symbols: [],
+    imports: [], calls: [], heritage: [], fileCount: 0,
+};
+let cumulativeProcessed = 0;
+const mergeResult = (target, src) => {
+    target.nodes.push(...src.nodes);
+    target.relationships.push(...src.relationships);
+    target.symbols.push(...src.symbols);
+    target.imports.push(...src.imports);
+    target.calls.push(...src.calls);
+    target.heritage.push(...src.heritage);
+    target.fileCount += src.fileCount;
+};
+parentPort.on('message', (msg) => {
     try {
-        const result = processBatch(files, (filesProcessed) => {
-            parentPort.postMessage({ type: 'progress', filesProcessed });
-        });
-        parentPort.postMessage({ type: 'result', data: result });
+        // Sub-batch mode: { type: 'sub-batch', files: [...] }
+        if (msg && msg.type === 'sub-batch') {
+            const result = processBatch(msg.files, (filesProcessed) => {
+                parentPort.postMessage({ type: 'progress', filesProcessed: cumulativeProcessed + filesProcessed });
+            });
+            cumulativeProcessed += result.fileCount;
+            mergeResult(accumulated, result);
+            // Signal ready for next sub-batch
+            parentPort.postMessage({ type: 'sub-batch-done' });
+            return;
+        }
+        // Flush: send accumulated results
+        if (msg && msg.type === 'flush') {
+            parentPort.postMessage({ type: 'result', data: accumulated });
+            // Reset for potential reuse
+            accumulated = { nodes: [], relationships: [], symbols: [], imports: [], calls: [], heritage: [], fileCount: 0 };
+            cumulativeProcessed = 0;
+            return;
+        }
+        // Legacy single-message mode (backward compat): array of files
+        if (Array.isArray(msg)) {
+            const result = processBatch(msg, (filesProcessed) => {
+                parentPort.postMessage({ type: 'progress', filesProcessed });
+            });
+            parentPort.postMessage({ type: 'result', data: result });
+            return;
+        }
     }
     catch (err) {
         const message = err instanceof Error ? err.message : String(err);