npm - gitnexus - Versions diffs - 1.2.6 → 1.2.8 - Mend

gitnexus 1.2.6 → 1.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

package/README.md +9 -10
package/dist/cli/analyze.d.ts +1 -1
package/dist/cli/analyze.js +59 -15
package/dist/cli/eval-server.js +1 -1
package/dist/cli/index.js +1 -1
package/dist/cli/mcp.js +1 -1
package/dist/core/augmentation/engine.js +20 -20
package/dist/core/embeddings/embedder.js +7 -0
package/dist/core/embeddings/embedding-pipeline.js +26 -26
package/dist/core/ingestion/cluster-enricher.js +16 -16
package/dist/core/ingestion/filesystem-walker.js +17 -3
package/dist/core/ingestion/parsing-processor.js +4 -1
package/dist/core/ingestion/workers/parse-worker.js +13 -4
package/dist/core/ingestion/workers/worker-pool.js +43 -9
package/dist/core/kuzu/kuzu-adapter.js +9 -9
package/dist/core/search/hybrid-search.js +3 -3
package/dist/core/wiki/graph-queries.js +52 -52
package/dist/core/wiki/prompts.js +82 -82
package/dist/mcp/local/local-backend.d.ts +18 -3
package/dist/mcp/local/local-backend.js +57 -13
package/dist/mcp/resources.js +4 -4
package/hooks/claude/gitnexus-hook.cjs +135 -135
package/hooks/claude/pre-tool-use.sh +78 -78
package/hooks/claude/session-start.sh +42 -42
package/package.json +1 -1
package/vendor/leiden/index.cjs +355 -355
package/vendor/leiden/utils.cjs +392 -392

package/README.md CHANGED Viewed

@@ -79,17 +79,16 @@ Add to `~/.config/opencode/config.json`:
 }
 ```
-## What It Does
+## How It Works
-GitNexus indexes your codebase through 7 phases:
+GitNexus builds a complete knowledge graph of your codebase through a multi-phase indexing pipeline:
-1. **Structure** — File/folder tree
-2. **Parse** — AST extraction via Tree-sitter (9 languages)
-3. **Imports** — Resolve import paths (including TS path aliases, Rust modules, Java wildcards, Go packages)
-4. **Calls** — Function call resolution with confidence scoring (0.3-0.9)
-5. **Heritage** — Class extends/implements chains
-6. **Communities** — Leiden algorithm clusters related code into functional groups
-7. **Processes** — Entry point detection and execution flow tracing
+1. **Structure** — Walks the file tree and maps folder/file relationships
+2. **Parsing** — Extracts functions, classes, methods, and interfaces using Tree-sitter ASTs
+3. **Resolution** — Resolves imports and function calls across files with language-aware logic
+4. **Clustering** — Groups related symbols into functional communities
+5. **Processes** — Traces execution flows from entry points through call chains
+6. **Search** — Builds hybrid search indexes for fast retrieval
 The result is a **KuzuDB graph database** stored locally in `.gitnexus/` with full-text search and semantic embeddings.
@@ -147,7 +146,7 @@ gitnexus wiki --model <model>    # Wiki with custom LLM model (default: gpt-4o-m
 ## Multi-Repo Support
-GitNexus supports indexing multiple repositories. Each `gitnexus analyze` registers the repo in a global registry (`~/.gitnexus/registry.json`). The MCP server serves all indexed repos automatically with lazy KuzuDB connections (max 5 concurrent, evicted after 5 minutes idle).
+GitNexus supports indexing multiple repositories. Each `gitnexus analyze` registers the repo in a global registry (`~/.gitnexus/registry.json`). The MCP server serves all indexed repos automatically.
 ## Supported Languages

package/dist/cli/analyze.d.ts CHANGED Viewed

@@ -5,6 +5,6 @@
  */
 export interface AnalyzeOptions {
     force?: boolean;
-    skipEmbeddings?: boolean;
+    embeddings?: boolean;
 }
 export declare const analyzeCommand: (inputPath?: string, options?: AnalyzeOptions) => Promise<void>;

package/dist/cli/analyze.js CHANGED Viewed

@@ -8,7 +8,7 @@ import cliProgress from 'cli-progress';
 import { runPipelineFromRepo } from '../core/ingestion/pipeline.js';
 import { initKuzu, loadGraphToKuzu, getKuzuStats, executeQuery, executeWithReusedStatement, closeKuzu, createFTSIndex, loadCachedEmbeddings } from '../core/kuzu/kuzu-adapter.js';
 import { runEmbeddingPipeline } from '../core/embeddings/embedding-pipeline.js';
-import { disposeEmbedder } from '../core/embeddings/embedder.js';
+// disposeEmbedder intentionally not called — ONNX Runtime segfaults on cleanup (see #38)
 import { getStoragePaths, saveMeta, loadMeta, addToGitignore, registerRepo, getGlobalRegistryPath } from '../storage/repo-manager.js';
 import { getCurrentCommit, isGitRepo, getGitRoot } from '../storage/git.js';
 import { generateAIContextFiles } from './ai-context.js';
@@ -70,11 +70,29 @@ export const analyzeCommand = async (inputPath, options) => {
         stopOnComplete: false,
     }, cliProgress.Presets.shades_grey);
     bar.start(100, 0, { phase: 'Initializing...' });
+    // Route all console output through bar.log() so the bar doesn't stamp itself
+    // multiple times when other code writes to stdout/stderr mid-render.
+    const origLog = console.log.bind(console);
+    const origWarn = console.warn.bind(console);
+    const origError = console.error.bind(console);
+    const barLog = (...args) => bar.log(args.map(a => (typeof a === 'string' ? a : String(a))).join(' '));
+    console.log = barLog;
+    console.warn = barLog;
+    console.error = barLog;
+    // Show elapsed seconds for phases that run longer than 3s
+    let lastPhaseLabel = 'Initializing...';
+    let phaseStart = Date.now();
+    const elapsedTimer = setInterval(() => {
+        const elapsed = Math.round((Date.now() - phaseStart) / 1000);
+        if (elapsed >= 3) {
+            bar.update({ phase: `${lastPhaseLabel} (${elapsed}s)` });
+        }
+    }, 1000);
     const t0Global = Date.now();
     // ── Cache embeddings from existing index before rebuild ────────────
     let cachedEmbeddingNodeIds = new Set();
     let cachedEmbeddings = [];
-    if (existingMeta && !options?.force) {
+    if (options?.embeddings && existingMeta && !options?.force) {
         try {
             bar.update(0, { phase: 'Caching embeddings...' });
             await initKuzu(kuzuPath);
@@ -94,10 +112,16 @@ export const analyzeCommand = async (inputPath, options) => {
     const pipelineResult = await runPipelineFromRepo(repoPath, (progress) => {
         const phaseLabel = PHASE_LABELS[progress.phase] || progress.phase;
         const scaled = Math.round(progress.percent * 0.6);
+        if (phaseLabel !== lastPhaseLabel) {
+            lastPhaseLabel = phaseLabel;
+            phaseStart = Date.now();
+        }
         bar.update(scaled, { phase: phaseLabel });
     });
     // ── Phase 2: KuzuDB (60–85%) ──────────────────────────────────────
-    bar.update(60, { phase: 'Loading into KuzuDB...' });
+    lastPhaseLabel = 'Loading into KuzuDB...';
+    phaseStart = Date.now();
+    bar.update(60, { phase: lastPhaseLabel });
     await closeKuzu();
     const kuzuFiles = [kuzuPath, `${kuzuPath}.wal`, `${kuzuPath}.lock`];
     for (const f of kuzuFiles) {
@@ -117,7 +141,9 @@ export const analyzeCommand = async (inputPath, options) => {
     const kuzuTime = ((Date.now() - t0Kuzu) / 1000).toFixed(1);
     const kuzuWarnings = kuzuResult.warnings;
     // ── Phase 3: FTS (85–90%) ─────────────────────────────────────────
-    bar.update(85, { phase: 'Creating search indexes...' });
+    lastPhaseLabel = 'Creating search indexes...';
+    phaseStart = Date.now();
+    bar.update(85, { phase: lastPhaseLabel });
     const t0Fts = Date.now();
     try {
         await createFTSIndex('File', 'file_fts', ['name', 'content']);
@@ -146,22 +172,28 @@ export const analyzeCommand = async (inputPath, options) => {
     // ── Phase 4: Embeddings (90–98%) ──────────────────────────────────
     const stats = await getKuzuStats();
     let embeddingTime = '0.0';
-    let embeddingSkipped = false;
-    let embeddingSkipReason = '';
-    if (options?.skipEmbeddings) {
-        embeddingSkipped = true;
-        embeddingSkipReason = 'skipped (--skip-embeddings)';
-    }
-    else if (stats.nodes > EMBEDDING_NODE_LIMIT) {
-        embeddingSkipped = true;
-        embeddingSkipReason = `skipped (${stats.nodes.toLocaleString()} nodes > ${EMBEDDING_NODE_LIMIT.toLocaleString()} limit)`;
+    let embeddingSkipped = true;
+    let embeddingSkipReason = 'off (use --embeddings to enable)';
+    if (options?.embeddings) {
+        if (stats.nodes > EMBEDDING_NODE_LIMIT) {
+            embeddingSkipReason = `skipped (${stats.nodes.toLocaleString()} nodes > ${EMBEDDING_NODE_LIMIT.toLocaleString()} limit)`;
+        }
+        else {
+            embeddingSkipped = false;
+        }
     }
     if (!embeddingSkipped) {
-        bar.update(90, { phase: 'Loading embedding model...' });
+        lastPhaseLabel = 'Loading embedding model...';
+        phaseStart = Date.now();
+        bar.update(90, { phase: lastPhaseLabel });
         const t0Emb = Date.now();
         await runEmbeddingPipeline(executeQuery, executeWithReusedStatement, (progress) => {
             const scaled = 90 + Math.round((progress.percent / 100) * 8);
             const label = progress.phase === 'loading-model' ? 'Loading embedding model...' : `Embedding ${progress.nodesProcessed || 0}/${progress.totalNodes || '?'}`;
+            if (label !== lastPhaseLabel) {
+                lastPhaseLabel = label;
+                phaseStart = Date.now();
+            }
             bar.update(scaled, { phase: label });
         }, {}, cachedEmbeddingNodeIds.size > 0 ? cachedEmbeddingNodeIds : undefined);
         embeddingTime = ((Date.now() - t0Emb) / 1000).toFixed(1);
@@ -203,8 +235,14 @@ export const analyzeCommand = async (inputPath, options) => {
         processes: pipelineResult.processResult?.stats.totalProcesses,
     });
     await closeKuzu();
-    await disposeEmbedder();
+    // Note: we intentionally do NOT call disposeEmbedder() here.
+    // ONNX Runtime's native cleanup segfaults on macOS and some Linux configs.
+    // Since the process exits immediately after, Node.js reclaims everything.
     const totalTime = ((Date.now() - t0Global) / 1000).toFixed(1);
+    clearInterval(elapsedTimer);
+    console.log = origLog;
+    console.warn = origWarn;
+    console.error = origError;
     bar.update(100, { phase: 'Done' });
     bar.stop();
     // ── Summary ───────────────────────────────────────────────────────
@@ -233,4 +271,10 @@ export const analyzeCommand = async (inputPath, options) => {
         console.log('\n  Tip: Run `gitnexus setup` to configure MCP for your editor.');
     }
     console.log('');
+    // ONNX Runtime registers native atexit hooks that segfault during process
+    // shutdown on macOS (#38) and some Linux configs (#40). Force-exit to
+    // bypass them when embeddings were loaded.
+    if (!embeddingSkipped) {
+        process.exit(0);
+    }
 };

package/dist/cli/eval-server.js CHANGED Viewed

@@ -261,7 +261,7 @@ export async function evalServerCommand(options) {
         console.error('GitNexus eval-server: No indexed repositories found. Run: gitnexus analyze');
         process.exit(1);
     }
-    const repos = backend.listRepos();
+    const repos = await backend.listRepos();
     console.error(`GitNexus eval-server: ${repos.length} repo(s) loaded: ${repos.map(r => r.name).join(', ')}`);
     let idleTimer = null;
     function resetIdleTimer() {

package/dist/cli/index.js CHANGED Viewed

@@ -24,7 +24,7 @@ program
     .command('analyze [path]')
     .description('Index a repository (full analysis)')
     .option('-f, --force', 'Force full re-index even if up to date')
-    .option('--skip-embeddings', 'Skip embedding generation (faster)')
+    .option('--embeddings', 'Enable embedding generation for semantic search (off by default)')
     .action(analyzeCommand);
 program
     .command('serve')

package/dist/cli/mcp.js CHANGED Viewed

@@ -38,7 +38,7 @@ export const mcpCommand = async () => {
         console.error('GitNexus: Failed to initialize backend from registry.');
         process.exit(1);
     }
-    const repoNames = backend.listRepos().map(r => r.name);
+    const repoNames = (await backend.listRepos()).map(r => r.name);
     console.error(`GitNexus: MCP server starting with ${repoNames.length} repo(s): ${repoNames.join(', ')}`);
     // Start MCP server (serves all repos)
     await startMCPServer(backend);

package/dist/core/augmentation/engine.js CHANGED Viewed

@@ -98,11 +98,11 @@ export async function augment(pattern, cwd) {
         for (const result of bm25Results.slice(0, 5)) {
             const escaped = result.filePath.replace(/'/g, "''");
             try {
-                const symbols = await executeQuery(repoId, `
-          MATCH (n) WHERE n.filePath = '${escaped}'
-          AND n.name CONTAINS '${pattern.replace(/'/g, "''").split(/\s+/)[0]}'
-          RETURN n.id AS id, n.name AS name, labels(n)[0] AS type, n.filePath AS filePath
-          LIMIT 3
+                const symbols = await executeQuery(repoId, `
+          MATCH (n) WHERE n.filePath = '${escaped}'
+          AND n.name CONTAINS '${pattern.replace(/'/g, "''").split(/\s+/)[0]}'
+          RETURN n.id AS id, n.name AS name, labels(n)[0] AS type, n.filePath AS filePath
+          LIMIT 3
         `);
                 for (const sym of symbols) {
                     symbolMatches.push({
@@ -130,10 +130,10 @@ export async function augment(pattern, cwd) {
             // Callers
             let callers = [];
             try {
-                const rows = await executeQuery(repoId, `
-          MATCH (caller)-[:CodeRelation {type: 'CALLS'}]->(n {id: '${escaped}'})
-          RETURN caller.name AS name
-          LIMIT 3
+                const rows = await executeQuery(repoId, `
+          MATCH (caller)-[:CodeRelation {type: 'CALLS'}]->(n {id: '${escaped}'})
+          RETURN caller.name AS name
+          LIMIT 3
         `);
                 callers = rows.map((r) => r.name || r[0]).filter(Boolean);
             }
@@ -141,10 +141,10 @@ export async function augment(pattern, cwd) {
             // Callees
             let callees = [];
             try {
-                const rows = await executeQuery(repoId, `
-          MATCH (n {id: '${escaped}'})-[:CodeRelation {type: 'CALLS'}]->(callee)
-          RETURN callee.name AS name
-          LIMIT 3
+                const rows = await executeQuery(repoId, `
+          MATCH (n {id: '${escaped}'})-[:CodeRelation {type: 'CALLS'}]->(callee)
+          RETURN callee.name AS name
+          LIMIT 3
         `);
                 callees = rows.map((r) => r.name || r[0]).filter(Boolean);
             }
@@ -152,9 +152,9 @@ export async function augment(pattern, cwd) {
             // Processes
             let processes = [];
             try {
-                const rows = await executeQuery(repoId, `
-          MATCH (n {id: '${escaped}'})-[r:CodeRelation {type: 'STEP_IN_PROCESS'}]->(p:Process)
-          RETURN p.heuristicLabel AS label, r.step AS step, p.stepCount AS stepCount
+                const rows = await executeQuery(repoId, `
+          MATCH (n {id: '${escaped}'})-[r:CodeRelation {type: 'STEP_IN_PROCESS'}]->(p:Process)
+          RETURN p.heuristicLabel AS label, r.step AS step, p.stepCount AS stepCount
         `);
                 processes = rows.map((r) => {
                     const label = r.label || r[0];
@@ -167,10 +167,10 @@ export async function augment(pattern, cwd) {
             // Cluster cohesion (internal ranking signal)
             let cohesion = 0;
             try {
-                const rows = await executeQuery(repoId, `
-          MATCH (n {id: '${escaped}'})-[:CodeRelation {type: 'MEMBER_OF'}]->(c:Community)
-          RETURN c.cohesion AS cohesion
-          LIMIT 1
+                const rows = await executeQuery(repoId, `
+          MATCH (n {id: '${escaped}'})-[:CodeRelation {type: 'MEMBER_OF'}]->(c:Community)
+          RETURN c.cohesion AS cohesion
+          LIMIT 1
         `);
                 if (rows.length > 0) {
                     cohesion = (rows[0].cohesion ?? rows[0][0]) || 0;

package/dist/core/embeddings/embedder.js CHANGED Viewed

@@ -6,6 +6,12 @@
  *
  * Uses snowflake-arctic-embed-xs by default (22M params, 384 dims, ~90MB)
  */
+// Suppress ONNX Runtime native warnings (e.g. VerifyEachNodeIsAssignedToAnEp)
+// Must be set BEFORE onnxruntime-node is imported by transformers.js
+// Level 3 = Error only (skips Warning/Info)
+if (!process.env.ORT_LOG_LEVEL) {
+    process.env.ORT_LOG_LEVEL = '3';
+}
 import { pipeline, env } from '@huggingface/transformers';
 import { DEFAULT_EMBEDDING_CONFIG } from './types.js';
 // Module-level state for singleton pattern
@@ -83,6 +89,7 @@ export const initEmbedder = async (onProgress, config = {}, forceDevice) => {
                         device: device,
                         dtype: 'fp32',
                         progress_callback: progressCallback,
+                        session_options: { logSeverityLevel: 3 },
                     });
                     currentDevice = device;
                     if (isDev) {

package/dist/core/embeddings/embedding-pipeline.js CHANGED Viewed

@@ -24,19 +24,19 @@ const queryEmbeddableNodes = async (executeQuery) => {
             let query;
             if (label === 'File') {
                 // File nodes don't have startLine/endLine
-                query = `
-          MATCH (n:File)
-          RETURN n.id AS id, n.name AS name, 'File' AS label,
-                 n.filePath AS filePath, n.content AS content
+                query = `
+          MATCH (n:File)
+          RETURN n.id AS id, n.name AS name, 'File' AS label,
+                 n.filePath AS filePath, n.content AS content
         `;
             }
             else {
                 // Code elements have startLine/endLine
-                query = `
-          MATCH (n:${label})
-          RETURN n.id AS id, n.name AS name, '${label}' AS label,
-                 n.filePath AS filePath, n.content AS content,
-                 n.startLine AS startLine, n.endLine AS endLine
+                query = `
+          MATCH (n:${label})
+          RETURN n.id AS id, n.name AS name, '${label}' AS label,
+                 n.filePath AS filePath, n.content AS content,
+                 n.startLine AS startLine, n.endLine AS endLine
         `;
             }
             const rows = await executeQuery(query);
@@ -77,8 +77,8 @@ const batchInsertEmbeddings = async (executeWithReusedStatement, updates) => {
  * Now indexes the separate CodeEmbedding table
  */
 const createVectorIndex = async (executeQuery) => {
-    const cypher = `
-    CALL CREATE_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx', 'embedding', metric := 'cosine')
+    const cypher = `
+    CALL CREATE_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx', 'embedding', metric := 'cosine')
   `;
     try {
         await executeQuery(cypher);
@@ -240,14 +240,14 @@ export const semanticSearch = async (executeQuery, query, k = 10, maxDistance =
     const queryVec = embeddingToArray(queryEmbedding);
     const queryVecStr = `[${queryVec.join(',')}]`;
     // Query the vector index on CodeEmbedding to get nodeIds and distances
-    const vectorQuery = `
-    CALL QUERY_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx',
-      CAST(${queryVecStr} AS FLOAT[384]), ${k})
-    YIELD node AS emb, distance
-    WITH emb, distance
-    WHERE distance < ${maxDistance}
-    RETURN emb.nodeId AS nodeId, distance
-    ORDER BY distance
+    const vectorQuery = `
+    CALL QUERY_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx',
+      CAST(${queryVecStr} AS FLOAT[384]), ${k})
+    YIELD node AS emb, distance
+    WITH emb, distance
+    WHERE distance < ${maxDistance}
+    RETURN emb.nodeId AS nodeId, distance
+    ORDER BY distance
   `;
     const embResults = await executeQuery(vectorQuery);
     if (embResults.length === 0) {
@@ -266,16 +266,16 @@ export const semanticSearch = async (executeQuery, query, k = 10, maxDistance =
         try {
             let nodeQuery;
             if (label === 'File') {
-                nodeQuery = `
-          MATCH (n:File {id: '${nodeId.replace(/'/g, "''")}'})
-          RETURN n.name AS name, n.filePath AS filePath
+                nodeQuery = `
+          MATCH (n:File {id: '${nodeId.replace(/'/g, "''")}'})
+          RETURN n.name AS name, n.filePath AS filePath
         `;
             }
             else {
-                nodeQuery = `
-          MATCH (n:${label} {id: '${nodeId.replace(/'/g, "''")}'})
-          RETURN n.name AS name, n.filePath AS filePath,
-                 n.startLine AS startLine, n.endLine AS endLine
+                nodeQuery = `
+          MATCH (n:${label} {id: '${nodeId.replace(/'/g, "''")}'})
+          RETURN n.name AS name, n.filePath AS filePath,
+                 n.startLine AS startLine, n.endLine AS endLine
         `;
             }
             const nodeRows = await executeQuery(nodeQuery);

package/dist/core/ingestion/cluster-enricher.js CHANGED Viewed

@@ -13,12 +13,12 @@ const buildEnrichmentPrompt = (members, heuristicLabel) => {
     const memberList = limitedMembers
         .map(m => `${m.name} (${m.type})`)
         .join(', ');
-    return `Analyze this code cluster and provide a semantic name and short description.
-Heuristic: "${heuristicLabel}"
-Members: ${memberList}${members.length > 20 ? ` (+${members.length - 20} more)` : ''}
-Reply with JSON only:
+    return `Analyze this code cluster and provide a semantic name and short description.
+Heuristic: "${heuristicLabel}"
+Members: ${memberList}${members.length > 20 ? ` (+${members.length - 20} more)` : ''}
+Reply with JSON only:
 {"name": "2-4 word semantic name", "description": "One sentence describing purpose"}`;
 };
 // ============================================================================
@@ -115,18 +115,18 @@ export const enrichClustersBatch = async (communities, memberMap, llmClient, bat
             const memberList = limitedMembers
                 .map(m => `${m.name} (${m.type})`)
                 .join(', ');
-            return `Cluster ${idx + 1} (id: ${community.id}):
-Heuristic: "${community.heuristicLabel}"
+            return `Cluster ${idx + 1} (id: ${community.id}):
+Heuristic: "${community.heuristicLabel}"
 Members: ${memberList}`;
         }).join('\n\n');
-        const prompt = `Analyze these code clusters and generate semantic names, keywords, and descriptions.
-${batchPrompt}
-Output JSON array:
-[
-  {"id": "comm_X", "name": "...", "keywords": [...], "description": "..."},
-  ...
+        const prompt = `Analyze these code clusters and generate semantic names, keywords, and descriptions.
+${batchPrompt}
+Output JSON array:
+[
+  {"id": "comm_X", "name": "...", "keywords": [...], "description": "..."},
+  ...
 ]`;
         try {
             const response = await llmClient.generate(prompt);

package/dist/core/ingestion/filesystem-walker.js CHANGED Viewed

@@ -3,6 +3,8 @@ import path from 'path';
 import { glob } from 'glob';
 import { shouldIgnorePath } from '../../config/ignore-service.js';
 const READ_CONCURRENCY = 32;
+/** Skip files larger than 512KB — they're usually generated/vendored and crash tree-sitter */
+const MAX_FILE_SIZE = 512 * 1024;
 export const walkRepository = async (repoPath, onProgress) => {
     const files = await glob('**/*', {
         cwd: repoPath,
@@ -12,13 +14,22 @@ export const walkRepository = async (repoPath, onProgress) => {
     const filtered = files.filter(file => !shouldIgnorePath(file));
     const entries = [];
     let processed = 0;
+    let skippedLarge = 0;
     for (let start = 0; start < filtered.length; start += READ_CONCURRENCY) {
         const batch = filtered.slice(start, start + READ_CONCURRENCY);
-        const results = await Promise.allSettled(batch.map(relativePath => fs.readFile(path.join(repoPath, relativePath), 'utf-8')
-            .then(content => ({ path: relativePath.replace(/\\/g, '/'), content }))));
+        const results = await Promise.allSettled(batch.map(async (relativePath) => {
+            const fullPath = path.join(repoPath, relativePath);
+            const stat = await fs.stat(fullPath);
+            if (stat.size > MAX_FILE_SIZE) {
+                skippedLarge++;
+                return null;
+            }
+            const content = await fs.readFile(fullPath, 'utf-8');
+            return { path: relativePath.replace(/\\/g, '/'), content };
+        }));
         for (const result of results) {
             processed++;
-            if (result.status === 'fulfilled') {
+            if (result.status === 'fulfilled' && result.value !== null) {
                 entries.push(result.value);
                 onProgress?.(processed, filtered.length, result.value.path);
             }
@@ -27,5 +38,8 @@ export const walkRepository = async (repoPath, onProgress) => {
             }
         }
     }
+    if (skippedLarge > 0) {
+        console.warn(`  Skipped ${skippedLarge} files larger than ${MAX_FILE_SIZE / 1024}KB`);
+    }
     return entries;
 };

package/dist/core/ingestion/parsing-processor.js CHANGED Viewed

@@ -158,6 +158,9 @@ const processParsingSequential = async (graph, files, symbolTable, astCache, onF
         const language = getLanguageFromFilename(file.path);
         if (!language)
             continue;
+        // Skip very large files — they can crash tree-sitter or cause OOM
+        if (file.content.length > 512 * 1024)
+            continue;
         await loadLanguage(language, file.path);
         let tree;
         try {
@@ -281,7 +284,7 @@ export const processParsing = async (graph, files, symbolTable, astCache, onFile
             return await processParsingWithWorkers(graph, files, symbolTable, astCache, workerPool, onFileProgress);
         }
         catch (err) {
-            console.warn('Worker pool parsing failed, falling back to sequential:', err);
+            console.warn('Worker pool parsing failed, falling back to sequential:', err instanceof Error ? err.message : err);
         }
     }
     // Fallback: sequential parsing (no pre-extracted data)

package/dist/core/ingestion/workers/parse-worker.js CHANGED Viewed

@@ -328,6 +328,9 @@ const processFileGroup = (files, language, queryString, result, onFileProcessed)
         return;
     }
     for (const file of files) {
+        // Skip very large files — they can crash tree-sitter or cause OOM
+        if (file.content.length > 512 * 1024)
+            continue;
         let tree;
         try {
             tree = parser.parse(file.content, undefined, { bufferSize: 1024 * 256 });
@@ -444,8 +447,14 @@ const processFileGroup = (files, language, queryString, result, onFileProcessed)
 // Worker message handler
 // ============================================================================
 parentPort.on('message', (files) => {
-    const result = processBatch(files, (filesProcessed) => {
-        parentPort.postMessage({ type: 'progress', filesProcessed });
-    });
-    parentPort.postMessage({ type: 'result', data: result });
+    try {
+        const result = processBatch(files, (filesProcessed) => {
+            parentPort.postMessage({ type: 'progress', filesProcessed });
+        });
+        parentPort.postMessage({ type: 'result', data: result });
+    }
+    catch (err) {
+        const message = err instanceof Error ? err.message : String(err);
+        parentPort.postMessage({ type: 'error', error: message });
+    }
 });

package/dist/core/ingestion/workers/worker-pool.js CHANGED Viewed

@@ -27,31 +27,65 @@ export const createWorkerPool = (workerUrl, poolSize) => {
         const promises = chunks.map((chunk, i) => {
             const worker = workers[i];
             return new Promise((resolve, reject) => {
+                let settled = false;
+                const cleanup = () => {
+                    clearTimeout(timer);
+                    worker.removeListener('message', handler);
+                    worker.removeListener('error', errorHandler);
+                    worker.removeListener('exit', exitHandler);
+                };
+                const timer = setTimeout(() => {
+                    if (!settled) {
+                        settled = true;
+                        cleanup();
+                        reject(new Error(`Worker ${i} timed out after 5 minutes (chunk: ${chunk.length} items). Worker may have crashed or is processing too much data.`));
+                    }
+                }, 5 * 60 * 1000);
                 const handler = (msg) => {
+                    if (settled)
+                        return;
                     if (msg && msg.type === 'progress') {
-                        // Intermediate progress from worker
                         workerProgress[i] = msg.filesProcessed;
                         if (onProgress) {
                             const total = workerProgress.reduce((a, b) => a + b, 0);
                             onProgress(total);
                         }
                     }
+                    else if (msg && msg.type === 'error') {
+                        // Error reported by worker via postMessage
+                        settled = true;
+                        cleanup();
+                        reject(new Error(`Worker ${i} error: ${msg.error}`));
+                    }
                     else if (msg && msg.type === 'result') {
-                        // Final result
-                        worker.removeListener('message', handler);
+                        settled = true;
+                        cleanup();
                         resolve(msg.data);
                     }
                     else {
-                        // Legacy: treat any non-typed message as result (backward compat)
-                        worker.removeListener('message', handler);
+                        // Legacy: treat any non-typed message as result
+                        settled = true;
+                        cleanup();
                         resolve(msg);
                     }
                 };
+                const errorHandler = (err) => {
+                    if (!settled) {
+                        settled = true;
+                        cleanup();
+                        reject(err);
+                    }
+                };
+                const exitHandler = (code) => {
+                    if (!settled) {
+                        settled = true;
+                        cleanup();
+                        reject(new Error(`Worker ${i} exited unexpectedly with code ${code}. This usually indicates an out-of-memory crash or native addon failure.`));
+                    }
+                };
                 worker.on('message', handler);
-                worker.once('error', (err) => {
-                    worker.removeListener('message', handler);
-                    reject(err);
-                });
+                worker.once('error', errorHandler);
+                worker.once('exit', exitHandler);
                 worker.postMessage(chunk);
             });
         });

package/dist/core/kuzu/kuzu-adapter.js CHANGED Viewed

@@ -242,10 +242,10 @@ const fallbackRelationshipInserts = async (validRelLines, validTables, getNodeLa
                 continue;
             const confidence = parseFloat(confidenceStr) || 1.0;
             const step = parseInt(stepStr) || 0;
-            await conn.query(`
-        MATCH (a:${escapeLabel(fromLabel)} {id: '${fromId.replace(/'/g, "''")}' }),
-              (b:${escapeLabel(toLabel)} {id: '${toId.replace(/'/g, "''")}' })
-        CREATE (a)-[:${REL_TABLE_NAME} {type: '${relType}', confidence: ${confidence}, reason: '${reason.replace(/'/g, "''")}', step: ${step}}]->(b)
+            await conn.query(`
+        MATCH (a:${escapeLabel(fromLabel)} {id: '${fromId.replace(/'/g, "''")}' }),
+              (b:${escapeLabel(toLabel)} {id: '${toId.replace(/'/g, "''")}' })
+        CREATE (a)-[:${REL_TABLE_NAME} {type: '${relType}', confidence: ${confidence}, reason: '${reason.replace(/'/g, "''")}', step: ${step}}]->(b)
       `);
         }
         catch {
@@ -636,11 +636,11 @@ export const queryFTS = async (tableName, indexName, query, limit = 20, conjunct
     }
     // Escape single quotes in query
     const escapedQuery = query.replace(/'/g, "''");
-    const cypher = `
-    CALL QUERY_FTS_INDEX('${tableName}', '${indexName}', '${escapedQuery}', conjunctive := ${conjunctive})
-    RETURN node, score
-    ORDER BY score DESC
-    LIMIT ${limit}
+    const cypher = `
+    CALL QUERY_FTS_INDEX('${tableName}', '${indexName}', '${escapedQuery}', conjunctive := ${conjunctive})
+    RETURN node, score
+    ORDER BY score DESC
+    LIMIT ${limit}
   `;
     try {
         const queryResult = await conn.query(cypher);