gitnexus 1.2.6 → 1.2.8

This diff shows the changes between two publicly available versions of this package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
package/README.md CHANGED
@@ -79,17 +79,16 @@ Add to `~/.config/opencode/config.json`:
79
79
  }
80
80
  ```
81
81
 
82
- ## What It Does
82
+ ## How It Works
83
83
 
84
- GitNexus indexes your codebase through 7 phases:
84
+ GitNexus builds a complete knowledge graph of your codebase through a multi-phase indexing pipeline:
85
85
 
86
- 1. **Structure** — File/folder tree
87
- 2. **Parse** — AST extraction via Tree-sitter (9 languages)
88
- 3. **Imports** — Resolve import paths (including TS path aliases, Rust modules, Java wildcards, Go packages)
89
- 4. **Calls** — Function call resolution with confidence scoring (0.3-0.9)
90
- 5. **Heritage** — Class extends/implements chains
91
- 6. **Communities** — Leiden algorithm clusters related code into functional groups
92
- 7. **Processes** — Entry point detection and execution flow tracing
86
+ 1. **Structure** — Walks the file tree and maps folder/file relationships
87
+ 2. **Parsing** — Extracts functions, classes, methods, and interfaces using Tree-sitter ASTs
88
+ 3. **Resolution** — Resolves imports and function calls across files with language-aware logic
89
+ 4. **Clustering** — Groups related symbols into functional communities
90
+ 5. **Processes** — Traces execution flows from entry points through call chains
91
+ 6. **Search** — Builds hybrid search indexes for fast retrieval
93
92
 
94
93
  The result is a **KuzuDB graph database** stored locally in `.gitnexus/` with full-text search and semantic embeddings.
95
94
 
@@ -147,7 +146,7 @@ gitnexus wiki --model <model> # Wiki with custom LLM model (default: gpt-4o-m
147
146
 
148
147
  ## Multi-Repo Support
149
148
 
150
- GitNexus supports indexing multiple repositories. Each `gitnexus analyze` registers the repo in a global registry (`~/.gitnexus/registry.json`). The MCP server serves all indexed repos automatically with lazy KuzuDB connections (max 5 concurrent, evicted after 5 minutes idle).
149
+ GitNexus supports indexing multiple repositories. Each `gitnexus analyze` registers the repo in a global registry (`~/.gitnexus/registry.json`). The MCP server serves all indexed repos automatically.
151
150
 
152
151
  ## Supported Languages
153
152
 
@@ -5,6 +5,6 @@
5
5
  */
6
6
  export interface AnalyzeOptions {
7
7
  force?: boolean;
8
- skipEmbeddings?: boolean;
8
+ embeddings?: boolean;
9
9
  }
10
10
  export declare const analyzeCommand: (inputPath?: string, options?: AnalyzeOptions) => Promise<void>;
@@ -8,7 +8,7 @@ import cliProgress from 'cli-progress';
8
8
  import { runPipelineFromRepo } from '../core/ingestion/pipeline.js';
9
9
  import { initKuzu, loadGraphToKuzu, getKuzuStats, executeQuery, executeWithReusedStatement, closeKuzu, createFTSIndex, loadCachedEmbeddings } from '../core/kuzu/kuzu-adapter.js';
10
10
  import { runEmbeddingPipeline } from '../core/embeddings/embedding-pipeline.js';
11
- import { disposeEmbedder } from '../core/embeddings/embedder.js';
11
+ // disposeEmbedder intentionally not called — ONNX Runtime segfaults on cleanup (see #38)
12
12
  import { getStoragePaths, saveMeta, loadMeta, addToGitignore, registerRepo, getGlobalRegistryPath } from '../storage/repo-manager.js';
13
13
  import { getCurrentCommit, isGitRepo, getGitRoot } from '../storage/git.js';
14
14
  import { generateAIContextFiles } from './ai-context.js';
@@ -70,11 +70,29 @@ export const analyzeCommand = async (inputPath, options) => {
70
70
  stopOnComplete: false,
71
71
  }, cliProgress.Presets.shades_grey);
72
72
  bar.start(100, 0, { phase: 'Initializing...' });
73
+ // Route all console output through bar.log() so the bar doesn't stamp itself
74
+ // multiple times when other code writes to stdout/stderr mid-render.
75
+ const origLog = console.log.bind(console);
76
+ const origWarn = console.warn.bind(console);
77
+ const origError = console.error.bind(console);
78
+ const barLog = (...args) => bar.log(args.map(a => (typeof a === 'string' ? a : String(a))).join(' '));
79
+ console.log = barLog;
80
+ console.warn = barLog;
81
+ console.error = barLog;
82
+ // Show elapsed seconds for phases that run longer than 3s
83
+ let lastPhaseLabel = 'Initializing...';
84
+ let phaseStart = Date.now();
85
+ const elapsedTimer = setInterval(() => {
86
+ const elapsed = Math.round((Date.now() - phaseStart) / 1000);
87
+ if (elapsed >= 3) {
88
+ bar.update({ phase: `${lastPhaseLabel} (${elapsed}s)` });
89
+ }
90
+ }, 1000);
73
91
  const t0Global = Date.now();
74
92
  // ── Cache embeddings from existing index before rebuild ────────────
75
93
  let cachedEmbeddingNodeIds = new Set();
76
94
  let cachedEmbeddings = [];
77
- if (existingMeta && !options?.force) {
95
+ if (options?.embeddings && existingMeta && !options?.force) {
78
96
  try {
79
97
  bar.update(0, { phase: 'Caching embeddings...' });
80
98
  await initKuzu(kuzuPath);
@@ -94,10 +112,16 @@ export const analyzeCommand = async (inputPath, options) => {
94
112
  const pipelineResult = await runPipelineFromRepo(repoPath, (progress) => {
95
113
  const phaseLabel = PHASE_LABELS[progress.phase] || progress.phase;
96
114
  const scaled = Math.round(progress.percent * 0.6);
115
+ if (phaseLabel !== lastPhaseLabel) {
116
+ lastPhaseLabel = phaseLabel;
117
+ phaseStart = Date.now();
118
+ }
97
119
  bar.update(scaled, { phase: phaseLabel });
98
120
  });
99
121
  // ── Phase 2: KuzuDB (60–85%) ──────────────────────────────────────
100
- bar.update(60, { phase: 'Loading into KuzuDB...' });
122
+ lastPhaseLabel = 'Loading into KuzuDB...';
123
+ phaseStart = Date.now();
124
+ bar.update(60, { phase: lastPhaseLabel });
101
125
  await closeKuzu();
102
126
  const kuzuFiles = [kuzuPath, `${kuzuPath}.wal`, `${kuzuPath}.lock`];
103
127
  for (const f of kuzuFiles) {
@@ -117,7 +141,9 @@ export const analyzeCommand = async (inputPath, options) => {
117
141
  const kuzuTime = ((Date.now() - t0Kuzu) / 1000).toFixed(1);
118
142
  const kuzuWarnings = kuzuResult.warnings;
119
143
  // ── Phase 3: FTS (85–90%) ─────────────────────────────────────────
120
- bar.update(85, { phase: 'Creating search indexes...' });
144
+ lastPhaseLabel = 'Creating search indexes...';
145
+ phaseStart = Date.now();
146
+ bar.update(85, { phase: lastPhaseLabel });
121
147
  const t0Fts = Date.now();
122
148
  try {
123
149
  await createFTSIndex('File', 'file_fts', ['name', 'content']);
@@ -146,22 +172,28 @@ export const analyzeCommand = async (inputPath, options) => {
146
172
  // ── Phase 4: Embeddings (90–98%) ──────────────────────────────────
147
173
  const stats = await getKuzuStats();
148
174
  let embeddingTime = '0.0';
149
- let embeddingSkipped = false;
150
- let embeddingSkipReason = '';
151
- if (options?.skipEmbeddings) {
152
- embeddingSkipped = true;
153
- embeddingSkipReason = 'skipped (--skip-embeddings)';
154
- }
155
- else if (stats.nodes > EMBEDDING_NODE_LIMIT) {
156
- embeddingSkipped = true;
157
- embeddingSkipReason = `skipped (${stats.nodes.toLocaleString()} nodes > ${EMBEDDING_NODE_LIMIT.toLocaleString()} limit)`;
175
+ let embeddingSkipped = true;
176
+ let embeddingSkipReason = 'off (use --embeddings to enable)';
177
+ if (options?.embeddings) {
178
+ if (stats.nodes > EMBEDDING_NODE_LIMIT) {
179
+ embeddingSkipReason = `skipped (${stats.nodes.toLocaleString()} nodes > ${EMBEDDING_NODE_LIMIT.toLocaleString()} limit)`;
180
+ }
181
+ else {
182
+ embeddingSkipped = false;
183
+ }
158
184
  }
159
185
  if (!embeddingSkipped) {
160
- bar.update(90, { phase: 'Loading embedding model...' });
186
+ lastPhaseLabel = 'Loading embedding model...';
187
+ phaseStart = Date.now();
188
+ bar.update(90, { phase: lastPhaseLabel });
161
189
  const t0Emb = Date.now();
162
190
  await runEmbeddingPipeline(executeQuery, executeWithReusedStatement, (progress) => {
163
191
  const scaled = 90 + Math.round((progress.percent / 100) * 8);
164
192
  const label = progress.phase === 'loading-model' ? 'Loading embedding model...' : `Embedding ${progress.nodesProcessed || 0}/${progress.totalNodes || '?'}`;
193
+ if (label !== lastPhaseLabel) {
194
+ lastPhaseLabel = label;
195
+ phaseStart = Date.now();
196
+ }
165
197
  bar.update(scaled, { phase: label });
166
198
  }, {}, cachedEmbeddingNodeIds.size > 0 ? cachedEmbeddingNodeIds : undefined);
167
199
  embeddingTime = ((Date.now() - t0Emb) / 1000).toFixed(1);
@@ -203,8 +235,14 @@ export const analyzeCommand = async (inputPath, options) => {
203
235
  processes: pipelineResult.processResult?.stats.totalProcesses,
204
236
  });
205
237
  await closeKuzu();
206
- await disposeEmbedder();
238
+ // Note: we intentionally do NOT call disposeEmbedder() here.
239
+ // ONNX Runtime's native cleanup segfaults on macOS and some Linux configs.
240
+ // Since the process exits immediately after, Node.js reclaims everything.
207
241
  const totalTime = ((Date.now() - t0Global) / 1000).toFixed(1);
242
+ clearInterval(elapsedTimer);
243
+ console.log = origLog;
244
+ console.warn = origWarn;
245
+ console.error = origError;
208
246
  bar.update(100, { phase: 'Done' });
209
247
  bar.stop();
210
248
  // ── Summary ───────────────────────────────────────────────────────
@@ -233,4 +271,10 @@ export const analyzeCommand = async (inputPath, options) => {
233
271
  console.log('\n Tip: Run `gitnexus setup` to configure MCP for your editor.');
234
272
  }
235
273
  console.log('');
274
+ // ONNX Runtime registers native atexit hooks that segfault during process
275
+ // shutdown on macOS (#38) and some Linux configs (#40). Force-exit to
276
+ // bypass them when embeddings were loaded.
277
+ if (!embeddingSkipped) {
278
+ process.exit(0);
279
+ }
236
280
  };
@@ -261,7 +261,7 @@ export async function evalServerCommand(options) {
261
261
  console.error('GitNexus eval-server: No indexed repositories found. Run: gitnexus analyze');
262
262
  process.exit(1);
263
263
  }
264
- const repos = backend.listRepos();
264
+ const repos = await backend.listRepos();
265
265
  console.error(`GitNexus eval-server: ${repos.length} repo(s) loaded: ${repos.map(r => r.name).join(', ')}`);
266
266
  let idleTimer = null;
267
267
  function resetIdleTimer() {
package/dist/cli/index.js CHANGED
@@ -24,7 +24,7 @@ program
24
24
  .command('analyze [path]')
25
25
  .description('Index a repository (full analysis)')
26
26
  .option('-f, --force', 'Force full re-index even if up to date')
27
- .option('--skip-embeddings', 'Skip embedding generation (faster)')
27
+ .option('--embeddings', 'Enable embedding generation for semantic search (off by default)')
28
28
  .action(analyzeCommand);
29
29
  program
30
30
  .command('serve')
package/dist/cli/mcp.js CHANGED
@@ -38,7 +38,7 @@ export const mcpCommand = async () => {
38
38
  console.error('GitNexus: Failed to initialize backend from registry.');
39
39
  process.exit(1);
40
40
  }
41
- const repoNames = backend.listRepos().map(r => r.name);
41
+ const repoNames = (await backend.listRepos()).map(r => r.name);
42
42
  console.error(`GitNexus: MCP server starting with ${repoNames.length} repo(s): ${repoNames.join(', ')}`);
43
43
  // Start MCP server (serves all repos)
44
44
  await startMCPServer(backend);
@@ -98,11 +98,11 @@ export async function augment(pattern, cwd) {
98
98
  for (const result of bm25Results.slice(0, 5)) {
99
99
  const escaped = result.filePath.replace(/'/g, "''");
100
100
  try {
101
- const symbols = await executeQuery(repoId, `
102
- MATCH (n) WHERE n.filePath = '${escaped}'
103
- AND n.name CONTAINS '${pattern.replace(/'/g, "''").split(/\s+/)[0]}'
104
- RETURN n.id AS id, n.name AS name, labels(n)[0] AS type, n.filePath AS filePath
105
- LIMIT 3
101
+ const symbols = await executeQuery(repoId, `
102
+ MATCH (n) WHERE n.filePath = '${escaped}'
103
+ AND n.name CONTAINS '${pattern.replace(/'/g, "''").split(/\s+/)[0]}'
104
+ RETURN n.id AS id, n.name AS name, labels(n)[0] AS type, n.filePath AS filePath
105
+ LIMIT 3
106
106
  `);
107
107
  for (const sym of symbols) {
108
108
  symbolMatches.push({
@@ -130,10 +130,10 @@ export async function augment(pattern, cwd) {
130
130
  // Callers
131
131
  let callers = [];
132
132
  try {
133
- const rows = await executeQuery(repoId, `
134
- MATCH (caller)-[:CodeRelation {type: 'CALLS'}]->(n {id: '${escaped}'})
135
- RETURN caller.name AS name
136
- LIMIT 3
133
+ const rows = await executeQuery(repoId, `
134
+ MATCH (caller)-[:CodeRelation {type: 'CALLS'}]->(n {id: '${escaped}'})
135
+ RETURN caller.name AS name
136
+ LIMIT 3
137
137
  `);
138
138
  callers = rows.map((r) => r.name || r[0]).filter(Boolean);
139
139
  }
@@ -141,10 +141,10 @@ export async function augment(pattern, cwd) {
141
141
  // Callees
142
142
  let callees = [];
143
143
  try {
144
- const rows = await executeQuery(repoId, `
145
- MATCH (n {id: '${escaped}'})-[:CodeRelation {type: 'CALLS'}]->(callee)
146
- RETURN callee.name AS name
147
- LIMIT 3
144
+ const rows = await executeQuery(repoId, `
145
+ MATCH (n {id: '${escaped}'})-[:CodeRelation {type: 'CALLS'}]->(callee)
146
+ RETURN callee.name AS name
147
+ LIMIT 3
148
148
  `);
149
149
  callees = rows.map((r) => r.name || r[0]).filter(Boolean);
150
150
  }
@@ -152,9 +152,9 @@ export async function augment(pattern, cwd) {
152
152
  // Processes
153
153
  let processes = [];
154
154
  try {
155
- const rows = await executeQuery(repoId, `
156
- MATCH (n {id: '${escaped}'})-[r:CodeRelation {type: 'STEP_IN_PROCESS'}]->(p:Process)
157
- RETURN p.heuristicLabel AS label, r.step AS step, p.stepCount AS stepCount
155
+ const rows = await executeQuery(repoId, `
156
+ MATCH (n {id: '${escaped}'})-[r:CodeRelation {type: 'STEP_IN_PROCESS'}]->(p:Process)
157
+ RETURN p.heuristicLabel AS label, r.step AS step, p.stepCount AS stepCount
158
158
  `);
159
159
  processes = rows.map((r) => {
160
160
  const label = r.label || r[0];
@@ -167,10 +167,10 @@ export async function augment(pattern, cwd) {
167
167
  // Cluster cohesion (internal ranking signal)
168
168
  let cohesion = 0;
169
169
  try {
170
- const rows = await executeQuery(repoId, `
171
- MATCH (n {id: '${escaped}'})-[:CodeRelation {type: 'MEMBER_OF'}]->(c:Community)
172
- RETURN c.cohesion AS cohesion
173
- LIMIT 1
170
+ const rows = await executeQuery(repoId, `
171
+ MATCH (n {id: '${escaped}'})-[:CodeRelation {type: 'MEMBER_OF'}]->(c:Community)
172
+ RETURN c.cohesion AS cohesion
173
+ LIMIT 1
174
174
  `);
175
175
  if (rows.length > 0) {
176
176
  cohesion = (rows[0].cohesion ?? rows[0][0]) || 0;
@@ -6,6 +6,12 @@
6
6
  *
7
7
  * Uses snowflake-arctic-embed-xs by default (22M params, 384 dims, ~90MB)
8
8
  */
9
+ // Suppress ONNX Runtime native warnings (e.g. VerifyEachNodeIsAssignedToAnEp)
10
+ // Must be set BEFORE onnxruntime-node is imported by transformers.js
11
+ // Level 3 = Error only (skips Warning/Info)
12
+ if (!process.env.ORT_LOG_LEVEL) {
13
+ process.env.ORT_LOG_LEVEL = '3';
14
+ }
9
15
  import { pipeline, env } from '@huggingface/transformers';
10
16
  import { DEFAULT_EMBEDDING_CONFIG } from './types.js';
11
17
  // Module-level state for singleton pattern
@@ -83,6 +89,7 @@ export const initEmbedder = async (onProgress, config = {}, forceDevice) => {
83
89
  device: device,
84
90
  dtype: 'fp32',
85
91
  progress_callback: progressCallback,
92
+ session_options: { logSeverityLevel: 3 },
86
93
  });
87
94
  currentDevice = device;
88
95
  if (isDev) {
@@ -24,19 +24,19 @@ const queryEmbeddableNodes = async (executeQuery) => {
24
24
  let query;
25
25
  if (label === 'File') {
26
26
  // File nodes don't have startLine/endLine
27
- query = `
28
- MATCH (n:File)
29
- RETURN n.id AS id, n.name AS name, 'File' AS label,
30
- n.filePath AS filePath, n.content AS content
27
+ query = `
28
+ MATCH (n:File)
29
+ RETURN n.id AS id, n.name AS name, 'File' AS label,
30
+ n.filePath AS filePath, n.content AS content
31
31
  `;
32
32
  }
33
33
  else {
34
34
  // Code elements have startLine/endLine
35
- query = `
36
- MATCH (n:${label})
37
- RETURN n.id AS id, n.name AS name, '${label}' AS label,
38
- n.filePath AS filePath, n.content AS content,
39
- n.startLine AS startLine, n.endLine AS endLine
35
+ query = `
36
+ MATCH (n:${label})
37
+ RETURN n.id AS id, n.name AS name, '${label}' AS label,
38
+ n.filePath AS filePath, n.content AS content,
39
+ n.startLine AS startLine, n.endLine AS endLine
40
40
  `;
41
41
  }
42
42
  const rows = await executeQuery(query);
@@ -77,8 +77,8 @@ const batchInsertEmbeddings = async (executeWithReusedStatement, updates) => {
77
77
  * Now indexes the separate CodeEmbedding table
78
78
  */
79
79
  const createVectorIndex = async (executeQuery) => {
80
- const cypher = `
81
- CALL CREATE_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx', 'embedding', metric := 'cosine')
80
+ const cypher = `
81
+ CALL CREATE_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx', 'embedding', metric := 'cosine')
82
82
  `;
83
83
  try {
84
84
  await executeQuery(cypher);
@@ -240,14 +240,14 @@ export const semanticSearch = async (executeQuery, query, k = 10, maxDistance =
240
240
  const queryVec = embeddingToArray(queryEmbedding);
241
241
  const queryVecStr = `[${queryVec.join(',')}]`;
242
242
  // Query the vector index on CodeEmbedding to get nodeIds and distances
243
- const vectorQuery = `
244
- CALL QUERY_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx',
245
- CAST(${queryVecStr} AS FLOAT[384]), ${k})
246
- YIELD node AS emb, distance
247
- WITH emb, distance
248
- WHERE distance < ${maxDistance}
249
- RETURN emb.nodeId AS nodeId, distance
250
- ORDER BY distance
243
+ const vectorQuery = `
244
+ CALL QUERY_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx',
245
+ CAST(${queryVecStr} AS FLOAT[384]), ${k})
246
+ YIELD node AS emb, distance
247
+ WITH emb, distance
248
+ WHERE distance < ${maxDistance}
249
+ RETURN emb.nodeId AS nodeId, distance
250
+ ORDER BY distance
251
251
  `;
252
252
  const embResults = await executeQuery(vectorQuery);
253
253
  if (embResults.length === 0) {
@@ -266,16 +266,16 @@ export const semanticSearch = async (executeQuery, query, k = 10, maxDistance =
266
266
  try {
267
267
  let nodeQuery;
268
268
  if (label === 'File') {
269
- nodeQuery = `
270
- MATCH (n:File {id: '${nodeId.replace(/'/g, "''")}'})
271
- RETURN n.name AS name, n.filePath AS filePath
269
+ nodeQuery = `
270
+ MATCH (n:File {id: '${nodeId.replace(/'/g, "''")}'})
271
+ RETURN n.name AS name, n.filePath AS filePath
272
272
  `;
273
273
  }
274
274
  else {
275
- nodeQuery = `
276
- MATCH (n:${label} {id: '${nodeId.replace(/'/g, "''")}'})
277
- RETURN n.name AS name, n.filePath AS filePath,
278
- n.startLine AS startLine, n.endLine AS endLine
275
+ nodeQuery = `
276
+ MATCH (n:${label} {id: '${nodeId.replace(/'/g, "''")}'})
277
+ RETURN n.name AS name, n.filePath AS filePath,
278
+ n.startLine AS startLine, n.endLine AS endLine
279
279
  `;
280
280
  }
281
281
  const nodeRows = await executeQuery(nodeQuery);
@@ -13,12 +13,12 @@ const buildEnrichmentPrompt = (members, heuristicLabel) => {
13
13
  const memberList = limitedMembers
14
14
  .map(m => `${m.name} (${m.type})`)
15
15
  .join(', ');
16
- return `Analyze this code cluster and provide a semantic name and short description.
17
-
18
- Heuristic: "${heuristicLabel}"
19
- Members: ${memberList}${members.length > 20 ? ` (+${members.length - 20} more)` : ''}
20
-
21
- Reply with JSON only:
16
+ return `Analyze this code cluster and provide a semantic name and short description.
17
+
18
+ Heuristic: "${heuristicLabel}"
19
+ Members: ${memberList}${members.length > 20 ? ` (+${members.length - 20} more)` : ''}
20
+
21
+ Reply with JSON only:
22
22
  {"name": "2-4 word semantic name", "description": "One sentence describing purpose"}`;
23
23
  };
24
24
  // ============================================================================
@@ -115,18 +115,18 @@ export const enrichClustersBatch = async (communities, memberMap, llmClient, bat
115
115
  const memberList = limitedMembers
116
116
  .map(m => `${m.name} (${m.type})`)
117
117
  .join(', ');
118
- return `Cluster ${idx + 1} (id: ${community.id}):
119
- Heuristic: "${community.heuristicLabel}"
118
+ return `Cluster ${idx + 1} (id: ${community.id}):
119
+ Heuristic: "${community.heuristicLabel}"
120
120
  Members: ${memberList}`;
121
121
  }).join('\n\n');
122
- const prompt = `Analyze these code clusters and generate semantic names, keywords, and descriptions.
123
-
124
- ${batchPrompt}
125
-
126
- Output JSON array:
127
- [
128
- {"id": "comm_X", "name": "...", "keywords": [...], "description": "..."},
129
- ...
122
+ const prompt = `Analyze these code clusters and generate semantic names, keywords, and descriptions.
123
+
124
+ ${batchPrompt}
125
+
126
+ Output JSON array:
127
+ [
128
+ {"id": "comm_X", "name": "...", "keywords": [...], "description": "..."},
129
+ ...
130
130
  ]`;
131
131
  try {
132
132
  const response = await llmClient.generate(prompt);
@@ -3,6 +3,8 @@ import path from 'path';
3
3
  import { glob } from 'glob';
4
4
  import { shouldIgnorePath } from '../../config/ignore-service.js';
5
5
  const READ_CONCURRENCY = 32;
6
+ /** Skip files larger than 512KB — they're usually generated/vendored and crash tree-sitter */
7
+ const MAX_FILE_SIZE = 512 * 1024;
6
8
  export const walkRepository = async (repoPath, onProgress) => {
7
9
  const files = await glob('**/*', {
8
10
  cwd: repoPath,
@@ -12,13 +14,22 @@ export const walkRepository = async (repoPath, onProgress) => {
12
14
  const filtered = files.filter(file => !shouldIgnorePath(file));
13
15
  const entries = [];
14
16
  let processed = 0;
17
+ let skippedLarge = 0;
15
18
  for (let start = 0; start < filtered.length; start += READ_CONCURRENCY) {
16
19
  const batch = filtered.slice(start, start + READ_CONCURRENCY);
17
- const results = await Promise.allSettled(batch.map(relativePath => fs.readFile(path.join(repoPath, relativePath), 'utf-8')
18
- .then(content => ({ path: relativePath.replace(/\\/g, '/'), content }))));
20
+ const results = await Promise.allSettled(batch.map(async (relativePath) => {
21
+ const fullPath = path.join(repoPath, relativePath);
22
+ const stat = await fs.stat(fullPath);
23
+ if (stat.size > MAX_FILE_SIZE) {
24
+ skippedLarge++;
25
+ return null;
26
+ }
27
+ const content = await fs.readFile(fullPath, 'utf-8');
28
+ return { path: relativePath.replace(/\\/g, '/'), content };
29
+ }));
19
30
  for (const result of results) {
20
31
  processed++;
21
- if (result.status === 'fulfilled') {
32
+ if (result.status === 'fulfilled' && result.value !== null) {
22
33
  entries.push(result.value);
23
34
  onProgress?.(processed, filtered.length, result.value.path);
24
35
  }
@@ -27,5 +38,8 @@ export const walkRepository = async (repoPath, onProgress) => {
27
38
  }
28
39
  }
29
40
  }
41
+ if (skippedLarge > 0) {
42
+ console.warn(` Skipped ${skippedLarge} files larger than ${MAX_FILE_SIZE / 1024}KB`);
43
+ }
30
44
  return entries;
31
45
  };
@@ -158,6 +158,9 @@ const processParsingSequential = async (graph, files, symbolTable, astCache, onF
158
158
  const language = getLanguageFromFilename(file.path);
159
159
  if (!language)
160
160
  continue;
161
+ // Skip very large files — they can crash tree-sitter or cause OOM
162
+ if (file.content.length > 512 * 1024)
163
+ continue;
161
164
  await loadLanguage(language, file.path);
162
165
  let tree;
163
166
  try {
@@ -281,7 +284,7 @@ export const processParsing = async (graph, files, symbolTable, astCache, onFile
281
284
  return await processParsingWithWorkers(graph, files, symbolTable, astCache, workerPool, onFileProgress);
282
285
  }
283
286
  catch (err) {
284
- console.warn('Worker pool parsing failed, falling back to sequential:', err);
287
+ console.warn('Worker pool parsing failed, falling back to sequential:', err instanceof Error ? err.message : err);
285
288
  }
286
289
  }
287
290
  // Fallback: sequential parsing (no pre-extracted data)
@@ -328,6 +328,9 @@ const processFileGroup = (files, language, queryString, result, onFileProcessed)
328
328
  return;
329
329
  }
330
330
  for (const file of files) {
331
+ // Skip very large files — they can crash tree-sitter or cause OOM
332
+ if (file.content.length > 512 * 1024)
333
+ continue;
331
334
  let tree;
332
335
  try {
333
336
  tree = parser.parse(file.content, undefined, { bufferSize: 1024 * 256 });
@@ -444,8 +447,14 @@ const processFileGroup = (files, language, queryString, result, onFileProcessed)
444
447
  // Worker message handler
445
448
  // ============================================================================
446
449
  parentPort.on('message', (files) => {
447
- const result = processBatch(files, (filesProcessed) => {
448
- parentPort.postMessage({ type: 'progress', filesProcessed });
449
- });
450
- parentPort.postMessage({ type: 'result', data: result });
450
+ try {
451
+ const result = processBatch(files, (filesProcessed) => {
452
+ parentPort.postMessage({ type: 'progress', filesProcessed });
453
+ });
454
+ parentPort.postMessage({ type: 'result', data: result });
455
+ }
456
+ catch (err) {
457
+ const message = err instanceof Error ? err.message : String(err);
458
+ parentPort.postMessage({ type: 'error', error: message });
459
+ }
451
460
  });
@@ -27,31 +27,65 @@ export const createWorkerPool = (workerUrl, poolSize) => {
27
27
  const promises = chunks.map((chunk, i) => {
28
28
  const worker = workers[i];
29
29
  return new Promise((resolve, reject) => {
30
+ let settled = false;
31
+ const cleanup = () => {
32
+ clearTimeout(timer);
33
+ worker.removeListener('message', handler);
34
+ worker.removeListener('error', errorHandler);
35
+ worker.removeListener('exit', exitHandler);
36
+ };
37
+ const timer = setTimeout(() => {
38
+ if (!settled) {
39
+ settled = true;
40
+ cleanup();
41
+ reject(new Error(`Worker ${i} timed out after 5 minutes (chunk: ${chunk.length} items). Worker may have crashed or is processing too much data.`));
42
+ }
43
+ }, 5 * 60 * 1000);
30
44
  const handler = (msg) => {
45
+ if (settled)
46
+ return;
31
47
  if (msg && msg.type === 'progress') {
32
- // Intermediate progress from worker
33
48
  workerProgress[i] = msg.filesProcessed;
34
49
  if (onProgress) {
35
50
  const total = workerProgress.reduce((a, b) => a + b, 0);
36
51
  onProgress(total);
37
52
  }
38
53
  }
54
+ else if (msg && msg.type === 'error') {
55
+ // Error reported by worker via postMessage
56
+ settled = true;
57
+ cleanup();
58
+ reject(new Error(`Worker ${i} error: ${msg.error}`));
59
+ }
39
60
  else if (msg && msg.type === 'result') {
40
- // Final result
41
- worker.removeListener('message', handler);
61
+ settled = true;
62
+ cleanup();
42
63
  resolve(msg.data);
43
64
  }
44
65
  else {
45
- // Legacy: treat any non-typed message as result (backward compat)
46
- worker.removeListener('message', handler);
66
+ // Legacy: treat any non-typed message as result
67
+ settled = true;
68
+ cleanup();
47
69
  resolve(msg);
48
70
  }
49
71
  };
72
+ const errorHandler = (err) => {
73
+ if (!settled) {
74
+ settled = true;
75
+ cleanup();
76
+ reject(err);
77
+ }
78
+ };
79
+ const exitHandler = (code) => {
80
+ if (!settled) {
81
+ settled = true;
82
+ cleanup();
83
+ reject(new Error(`Worker ${i} exited unexpectedly with code ${code}. This usually indicates an out-of-memory crash or native addon failure.`));
84
+ }
85
+ };
50
86
  worker.on('message', handler);
51
- worker.once('error', (err) => {
52
- worker.removeListener('message', handler);
53
- reject(err);
54
- });
87
+ worker.once('error', errorHandler);
88
+ worker.once('exit', exitHandler);
55
89
  worker.postMessage(chunk);
56
90
  });
57
91
  });
@@ -242,10 +242,10 @@ const fallbackRelationshipInserts = async (validRelLines, validTables, getNodeLa
242
242
  continue;
243
243
  const confidence = parseFloat(confidenceStr) || 1.0;
244
244
  const step = parseInt(stepStr) || 0;
245
- await conn.query(`
246
- MATCH (a:${escapeLabel(fromLabel)} {id: '${fromId.replace(/'/g, "''")}' }),
247
- (b:${escapeLabel(toLabel)} {id: '${toId.replace(/'/g, "''")}' })
248
- CREATE (a)-[:${REL_TABLE_NAME} {type: '${relType}', confidence: ${confidence}, reason: '${reason.replace(/'/g, "''")}', step: ${step}}]->(b)
245
+ await conn.query(`
246
+ MATCH (a:${escapeLabel(fromLabel)} {id: '${fromId.replace(/'/g, "''")}' }),
247
+ (b:${escapeLabel(toLabel)} {id: '${toId.replace(/'/g, "''")}' })
248
+ CREATE (a)-[:${REL_TABLE_NAME} {type: '${relType}', confidence: ${confidence}, reason: '${reason.replace(/'/g, "''")}', step: ${step}}]->(b)
249
249
  `);
250
250
  }
251
251
  catch {
@@ -636,11 +636,11 @@ export const queryFTS = async (tableName, indexName, query, limit = 20, conjunct
636
636
  }
637
637
  // Escape single quotes in query
638
638
  const escapedQuery = query.replace(/'/g, "''");
639
- const cypher = `
640
- CALL QUERY_FTS_INDEX('${tableName}', '${indexName}', '${escapedQuery}', conjunctive := ${conjunctive})
641
- RETURN node, score
642
- ORDER BY score DESC
643
- LIMIT ${limit}
639
+ const cypher = `
640
+ CALL QUERY_FTS_INDEX('${tableName}', '${indexName}', '${escapedQuery}', conjunctive := ${conjunctive})
641
+ RETURN node, score
642
+ ORDER BY score DESC
643
+ LIMIT ${limit}
644
644
  `;
645
645
  try {
646
646
  const queryResult = await conn.query(cypher);