gitnexus 1.2.9 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,6 +4,8 @@
4
4
  * Indexes a repository and stores the knowledge graph in .gitnexus/
5
5
  */
6
6
  import path from 'path';
7
+ import { execFileSync } from 'child_process';
8
+ import v8 from 'v8';
7
9
  import cliProgress from 'cli-progress';
8
10
  import { runPipelineFromRepo } from '../core/ingestion/pipeline.js';
9
11
  import { initKuzu, loadGraphToKuzu, getKuzuStats, executeQuery, executeWithReusedStatement, closeKuzu, createFTSIndex, loadCachedEmbeddings } from '../core/kuzu/kuzu-adapter.js';
@@ -14,6 +16,27 @@ import { getCurrentCommit, isGitRepo, getGitRoot } from '../storage/git.js';
14
16
  import { generateAIContextFiles } from './ai-context.js';
15
17
  import fs from 'fs/promises';
16
18
  import { registerClaudeHook } from './claude-hooks.js';
19
/** Target V8 old-space heap size, in megabytes. */
const HEAP_MB = 8192;
/** Node flag that requests the target heap size. */
const HEAP_FLAG = `--max-old-space-size=${HEAP_MB}`;
/**
 * Re-run the current CLI invocation in a child Node process with an 8 GB heap
 * when this process is running with a smaller limit.
 *
 * @returns `true` when a child process ran the command on our behalf (a failing
 *   exit code has been copied to `process.exitCode`, so the caller should
 *   return immediately); `false` when the caller should continue in this
 *   process.
 */
function ensureHeap() {
    const heapOption = '--max-old-space-size';
    const nodeOpts = process.env.NODE_OPTIONS || '';
    // Respect an explicit heap size, whether it was given through NODE_OPTIONS
    // or as a flag on the node command line (process.execArgv).
    if (nodeOpts.includes(heapOption))
        return false;
    if (process.execArgv.some((arg) => arg.includes(heapOption)))
        return false;
    // Already at (or within 10% of) the target: nothing to gain from re-exec.
    const v8Heap = v8.getHeapStatistics().heap_size_limit;
    if (v8Heap >= HEAP_MB * 1024 * 1024 * 0.9)
        return false;
    try {
        execFileSync(process.execPath, [HEAP_FLAG, ...process.argv.slice(1)], {
            stdio: 'inherit',
            // The flag is also exported via NODE_OPTIONS so the child's own
            // check above short-circuits instead of re-spawning again.
            env: { ...process.env, NODE_OPTIONS: `${nodeOpts} ${HEAP_FLAG}`.trim() },
        });
    }
    catch (e) {
        if (typeof e?.status === 'number') {
            // The child ran and exited non-zero: propagate its exit code.
            process.exitCode = e.status;
            return true;
        }
        if (e?.signal) {
            // The child ran but was terminated by a signal.
            process.exitCode = 1;
            return true;
        }
        // Neither an exit status nor a signal: presumably the child could not be
        // started at all. Fall back to running in this process with the
        // default heap rather than exiting without doing any work.
        return false;
    }
    return true;
}
17
40
  /** Threshold: auto-skip embeddings for repos with more nodes than this */
18
41
  const EMBEDDING_NODE_LIMIT = 50_000;
19
42
  const PHASE_LABELS = {
@@ -32,6 +55,8 @@ const PHASE_LABELS = {
32
55
  done: 'Done',
33
56
  };
34
57
  export const analyzeCommand = async (inputPath, options) => {
58
+ if (ensureHeap())
59
+ return;
35
60
  console.log('\n GitNexus Analyzer\n');
36
61
  let repoPath;
37
62
  if (inputPath) {
@@ -70,18 +95,47 @@ export const analyzeCommand = async (inputPath, options) => {
70
95
  stopOnComplete: false,
71
96
  }, cliProgress.Presets.shades_grey);
72
97
  bar.start(100, 0, { phase: 'Initializing...' });
98
+ // Graceful SIGINT handling — clean up resources and exit
99
+ let aborted = false;
100
+ const sigintHandler = () => {
101
+ if (aborted)
102
+ process.exit(1); // Second Ctrl-C: force exit
103
+ aborted = true;
104
+ bar.stop();
105
+ console.log('\n Interrupted — cleaning up...');
106
+ closeKuzu().catch(() => { }).finally(() => process.exit(130));
107
+ };
108
+ process.on('SIGINT', sigintHandler);
73
109
  // Route all console output through bar.log() so the bar doesn't stamp itself
74
110
  // multiple times when other code writes to stdout/stderr mid-render.
75
111
  const origLog = console.log.bind(console);
76
112
  const origWarn = console.warn.bind(console);
77
113
  const origError = console.error.bind(console);
78
- const barLog = (...args) => origLog(args.map(a => (typeof a === 'string' ? a : String(a))).join(' '));
114
+ const barLog = (...args) => {
115
+ // Clear the bar line, print the message, then let the next bar.update redraw
116
+ process.stdout.write('\x1b[2K\r');
117
+ origLog(args.map(a => (typeof a === 'string' ? a : String(a))).join(' '));
118
+ };
79
119
  console.log = barLog;
80
120
  console.warn = barLog;
81
121
  console.error = barLog;
82
- // Show elapsed seconds for phases that run longer than 3s
122
+ // Track elapsed time per phase — both updateBar and the interval use the
123
+ // same format so they don't flicker against each other.
83
124
  let lastPhaseLabel = 'Initializing...';
84
125
  let phaseStart = Date.now();
126
+ /** Update bar with phase label + elapsed seconds (shown after 3s). */
127
+ const updateBar = (value, phaseLabel) => {
128
+ if (phaseLabel !== lastPhaseLabel) {
129
+ lastPhaseLabel = phaseLabel;
130
+ phaseStart = Date.now();
131
+ }
132
+ const elapsed = Math.round((Date.now() - phaseStart) / 1000);
133
+ const display = elapsed >= 3 ? `${phaseLabel} (${elapsed}s)` : phaseLabel;
134
+ bar.update(value, { phase: display });
135
+ };
136
+ // Tick elapsed seconds for phases with infrequent progress callbacks
137
+ // (e.g. CSV streaming, FTS indexing). Uses the same display format as
138
+ // updateBar so there's no flickering.
85
139
  const elapsedTimer = setInterval(() => {
86
140
  const elapsed = Math.round((Date.now() - phaseStart) / 1000);
87
141
  if (elapsed >= 3) {
@@ -94,7 +148,7 @@ export const analyzeCommand = async (inputPath, options) => {
94
148
  let cachedEmbeddings = [];
95
149
  if (options?.embeddings && existingMeta && !options?.force) {
96
150
  try {
97
- bar.update(0, { phase: 'Caching embeddings...' });
151
+ updateBar(0, 'Caching embeddings...');
98
152
  await initKuzu(kuzuPath);
99
153
  const cached = await loadCachedEmbeddings();
100
154
  cachedEmbeddingNodeIds = cached.embeddingNodeIds;
@@ -112,16 +166,10 @@ export const analyzeCommand = async (inputPath, options) => {
112
166
  const pipelineResult = await runPipelineFromRepo(repoPath, (progress) => {
113
167
  const phaseLabel = PHASE_LABELS[progress.phase] || progress.phase;
114
168
  const scaled = Math.round(progress.percent * 0.6);
115
- if (phaseLabel !== lastPhaseLabel) {
116
- lastPhaseLabel = phaseLabel;
117
- phaseStart = Date.now();
118
- }
119
- bar.update(scaled, { phase: phaseLabel });
169
+ updateBar(scaled, phaseLabel);
120
170
  });
121
171
  // ── Phase 2: KuzuDB (60–85%) ──────────────────────────────────────
122
- lastPhaseLabel = 'Loading into KuzuDB...';
123
- phaseStart = Date.now();
124
- bar.update(60, { phase: lastPhaseLabel });
172
+ updateBar(60, 'Loading into KuzuDB...');
125
173
  await closeKuzu();
126
174
  const kuzuFiles = [kuzuPath, `${kuzuPath}.wal`, `${kuzuPath}.lock`];
127
175
  for (const f of kuzuFiles) {
@@ -133,17 +181,15 @@ export const analyzeCommand = async (inputPath, options) => {
133
181
  const t0Kuzu = Date.now();
134
182
  await initKuzu(kuzuPath);
135
183
  let kuzuMsgCount = 0;
136
- const kuzuResult = await loadGraphToKuzu(pipelineResult.graph, pipelineResult.fileContents, storagePath, (msg) => {
184
+ const kuzuResult = await loadGraphToKuzu(pipelineResult.graph, pipelineResult.repoPath, storagePath, (msg) => {
137
185
  kuzuMsgCount++;
138
186
  const progress = Math.min(84, 60 + Math.round((kuzuMsgCount / (kuzuMsgCount + 10)) * 24));
139
- bar.update(progress, { phase: msg });
187
+ updateBar(progress, msg);
140
188
  });
141
189
  const kuzuTime = ((Date.now() - t0Kuzu) / 1000).toFixed(1);
142
190
  const kuzuWarnings = kuzuResult.warnings;
143
191
  // ── Phase 3: FTS (85–90%) ─────────────────────────────────────────
144
- lastPhaseLabel = 'Creating search indexes...';
145
- phaseStart = Date.now();
146
- bar.update(85, { phase: lastPhaseLabel });
192
+ updateBar(85, 'Creating search indexes...');
147
193
  const t0Fts = Date.now();
148
194
  try {
149
195
  await createFTSIndex('File', 'file_fts', ['name', 'content']);
@@ -158,7 +204,7 @@ export const analyzeCommand = async (inputPath, options) => {
158
204
  const ftsTime = ((Date.now() - t0Fts) / 1000).toFixed(1);
159
205
  // ── Phase 3.5: Re-insert cached embeddings ────────────────────────
160
206
  if (cachedEmbeddings.length > 0) {
161
- bar.update(88, { phase: `Restoring ${cachedEmbeddings.length} cached embeddings...` });
207
+ updateBar(88, `Restoring ${cachedEmbeddings.length} cached embeddings...`);
162
208
  const EMBED_BATCH = 200;
163
209
  for (let i = 0; i < cachedEmbeddings.length; i += EMBED_BATCH) {
164
210
  const batch = cachedEmbeddings.slice(i, i + EMBED_BATCH);
@@ -183,29 +229,23 @@ export const analyzeCommand = async (inputPath, options) => {
183
229
  }
184
230
  }
185
231
  if (!embeddingSkipped) {
186
- lastPhaseLabel = 'Loading embedding model...';
187
- phaseStart = Date.now();
188
- bar.update(90, { phase: lastPhaseLabel });
232
+ updateBar(90, 'Loading embedding model...');
189
233
  const t0Emb = Date.now();
190
234
  await runEmbeddingPipeline(executeQuery, executeWithReusedStatement, (progress) => {
191
235
  const scaled = 90 + Math.round((progress.percent / 100) * 8);
192
236
  const label = progress.phase === 'loading-model' ? 'Loading embedding model...' : `Embedding ${progress.nodesProcessed || 0}/${progress.totalNodes || '?'}`;
193
- if (label !== lastPhaseLabel) {
194
- lastPhaseLabel = label;
195
- phaseStart = Date.now();
196
- }
197
- bar.update(scaled, { phase: label });
237
+ updateBar(scaled, label);
198
238
  }, {}, cachedEmbeddingNodeIds.size > 0 ? cachedEmbeddingNodeIds : undefined);
199
239
  embeddingTime = ((Date.now() - t0Emb) / 1000).toFixed(1);
200
240
  }
201
241
  // ── Phase 5: Finalize (98–100%) ───────────────────────────────────
202
- bar.update(98, { phase: 'Saving metadata...' });
242
+ updateBar(98, 'Saving metadata...');
203
243
  const meta = {
204
244
  repoPath,
205
245
  lastCommit: currentCommit,
206
246
  indexedAt: new Date().toISOString(),
207
247
  stats: {
208
- files: pipelineResult.fileContents.size,
248
+ files: pipelineResult.totalFileCount,
209
249
  nodes: stats.nodes,
210
250
  edges: stats.edges,
211
251
  communities: pipelineResult.communityResult?.stats.totalCommunities,
@@ -227,7 +267,7 @@ export const analyzeCommand = async (inputPath, options) => {
227
267
  aggregatedClusterCount = Array.from(groups.values()).filter(count => count >= 5).length;
228
268
  }
229
269
  const aiContext = await generateAIContextFiles(repoPath, storagePath, projectName, {
230
- files: pipelineResult.fileContents.size,
270
+ files: pipelineResult.totalFileCount,
231
271
  nodes: stats.nodes,
232
272
  edges: stats.edges,
233
273
  communities: pipelineResult.communityResult?.stats.totalCommunities,
@@ -240,6 +280,7 @@ export const analyzeCommand = async (inputPath, options) => {
240
280
  // Since the process exits immediately after, Node.js reclaims everything.
241
281
  const totalTime = ((Date.now() - t0Global) / 1000).toFixed(1);
242
282
  clearInterval(elapsedTimer);
283
+ process.removeListener('SIGINT', sigintHandler);
243
284
  console.log = origLog;
244
285
  console.warn = origWarn;
245
286
  console.error = origError;
package/dist/cli/index.js CHANGED
@@ -1,4 +1,24 @@
1
1
  #!/usr/bin/env node
2
// Raise the Node heap limit for large repos (e.g. the Linux kernel) by
// re-spawning this CLI with --max-old-space-size=8192.
// An explicit setting from the user — in NODE_OPTIONS or on the node command
// line — is respected and no re-spawn happens.
if (!process.env.NODE_OPTIONS?.includes('--max-old-space-size')
    && !process.execArgv.join(' ').includes('--max-old-space-size')) {
    const { execFileSync } = await import('node:child_process');
    // Exit code to finish with once the child has run; stays undefined when the
    // child could not be started, in which case we continue in this process.
    let childExitCode;
    try {
        execFileSync(process.execPath, ['--max-old-space-size=8192', ...process.argv.slice(1)], {
            stdio: 'inherit',
            // Exported via NODE_OPTIONS as well so the child skips this block.
            env: { ...process.env, NODE_OPTIONS: `${process.env.NODE_OPTIONS || ''} --max-old-space-size=8192`.trim() },
        });
        childExitCode = 0;
    }
    catch (e) {
        if (typeof e?.status === 'number') {
            // The child ran and exited with an error code: propagate it.
            childExitCode = e.status;
        }
        else if (e?.signal) {
            // The child ran but was terminated by a signal.
            childExitCode = 1;
        }
        // Otherwise there is neither a status nor a signal — presumably the
        // spawn itself failed. Fall through and run with the default heap.
    }
    if (childExitCode !== undefined) {
        process.exit(childExitCode);
    }
}
2
22
  import { Command } from 'commander';
3
23
  import { analyzeCommand } from './analyze.js';
4
24
  import { serveCommand } from './serve.js';
@@ -46,6 +46,11 @@ export const createKnowledgeGraph = () => {
46
46
  get relationships() {
47
47
  return Array.from(relationshipMap.values());
48
48
  },
49
+ iterNodes: () => nodeMap.values(),
50
+ iterRelationships: () => relationshipMap.values(),
51
+ forEachNode(fn) { nodeMap.forEach(fn); },
52
+ forEachRelationship(fn) { relationshipMap.forEach(fn); },
53
+ getNode: (id) => nodeMap.get(id),
49
54
  // O(1) count getters - avoid creating arrays just for length
50
55
  get nodeCount() {
51
56
  return nodeMap.size;
@@ -1,4 +1,4 @@
1
- export type NodeLabel = 'Project' | 'Package' | 'Module' | 'Folder' | 'File' | 'Class' | 'Function' | 'Method' | 'Variable' | 'Interface' | 'Enum' | 'Decorator' | 'Import' | 'Type' | 'CodeElement' | 'Community' | 'Process';
1
+ export type NodeLabel = 'Project' | 'Package' | 'Module' | 'Folder' | 'File' | 'Class' | 'Function' | 'Method' | 'Variable' | 'Interface' | 'Enum' | 'Decorator' | 'Import' | 'Type' | 'CodeElement' | 'Community' | 'Process' | 'Struct' | 'Macro' | 'Typedef' | 'Union' | 'Namespace' | 'Trait' | 'Impl' | 'TypeAlias' | 'Const' | 'Static' | 'Property' | 'Record' | 'Delegate' | 'Annotation' | 'Constructor' | 'Template';
2
2
  export type NodeProperties = {
3
3
  name: string;
4
4
  filePath: string;
@@ -39,8 +39,19 @@ export interface GraphRelationship {
39
39
  step?: number;
40
40
  }
41
41
  export interface KnowledgeGraph {
42
+ /** Returns a full array copy — prefer iterNodes() for iteration */
42
43
  nodes: GraphNode[];
44
+ /** Returns a full array copy — prefer iterRelationships() for iteration */
43
45
  relationships: GraphRelationship[];
46
+ /** Zero-copy iterator over nodes */
47
+ iterNodes: () => IterableIterator<GraphNode>;
48
+ /** Zero-copy iterator over relationships */
49
+ iterRelationships: () => IterableIterator<GraphRelationship>;
50
+ /** Zero-copy forEach — avoids iterator protocol overhead in hot loops */
51
+ forEachNode: (fn: (node: GraphNode) => void) => void;
52
+ forEachRelationship: (fn: (rel: GraphRelationship) => void) => void;
53
+ /** Lookup a single node by id — O(1) */
54
+ getNode: (id: string) => GraphNode | undefined;
44
55
  nodeCount: number;
45
56
  relationshipCount: number;
46
57
  addNode: (node: GraphNode) => void;
@@ -232,38 +232,58 @@ const resolveCallTarget = (calledName, currentFile, symbolTable, importMap) => {
232
232
  * Filter out common built-in functions and noise
233
233
  * that shouldn't be tracked as calls
234
234
  */
235
- const isBuiltInOrNoise = (name) => {
236
- const builtIns = new Set([
237
- // JavaScript/TypeScript built-ins
238
- 'console', 'log', 'warn', 'error', 'info', 'debug',
239
- 'setTimeout', 'setInterval', 'clearTimeout', 'clearInterval',
240
- 'parseInt', 'parseFloat', 'isNaN', 'isFinite',
241
- 'encodeURI', 'decodeURI', 'encodeURIComponent', 'decodeURIComponent',
242
- 'JSON', 'parse', 'stringify',
243
- 'Object', 'Array', 'String', 'Number', 'Boolean', 'Symbol', 'BigInt',
244
- 'Map', 'Set', 'WeakMap', 'WeakSet',
245
- 'Promise', 'resolve', 'reject', 'then', 'catch', 'finally',
246
- 'Math', 'Date', 'RegExp', 'Error',
247
- 'require', 'import', 'export',
248
- 'fetch', 'Response', 'Request',
249
- // React hooks and common functions
250
- 'useState', 'useEffect', 'useCallback', 'useMemo', 'useRef', 'useContext',
251
- 'useReducer', 'useLayoutEffect', 'useImperativeHandle', 'useDebugValue',
252
- 'createElement', 'createContext', 'createRef', 'forwardRef', 'memo', 'lazy',
253
- // Common array/object methods
254
- 'map', 'filter', 'reduce', 'forEach', 'find', 'findIndex', 'some', 'every',
255
- 'includes', 'indexOf', 'slice', 'splice', 'concat', 'join', 'split',
256
- 'push', 'pop', 'shift', 'unshift', 'sort', 'reverse',
257
- 'keys', 'values', 'entries', 'assign', 'freeze', 'seal',
258
- 'hasOwnProperty', 'toString', 'valueOf',
259
- // Python built-ins
260
- 'print', 'len', 'range', 'str', 'int', 'float', 'list', 'dict', 'set', 'tuple',
261
- 'open', 'read', 'write', 'close', 'append', 'extend', 'update',
262
- 'super', 'type', 'isinstance', 'issubclass', 'getattr', 'setattr', 'hasattr',
263
- 'enumerate', 'zip', 'sorted', 'reversed', 'min', 'max', 'sum', 'abs',
264
- ]);
265
- return builtIns.has(name);
266
- };
235
/**
 * Names that are treated as built-ins or noise rather than real call targets.
 * Built once at module load so the lookup below does not rebuild the set on
 * every call. Each name appears exactly once; matching is case-sensitive.
 */
const BUILT_IN_NAMES = new Set([
    // JavaScript/TypeScript built-ins
    'console', 'log', 'warn', 'error', 'info', 'debug',
    'setTimeout', 'setInterval', 'clearTimeout', 'clearInterval',
    'parseInt', 'parseFloat', 'isNaN', 'isFinite',
    'encodeURI', 'decodeURI', 'encodeURIComponent', 'decodeURIComponent',
    'JSON', 'parse', 'stringify',
    'Object', 'Array', 'String', 'Number', 'Boolean', 'Symbol', 'BigInt',
    'Map', 'Set', 'WeakMap', 'WeakSet',
    'Promise', 'resolve', 'reject', 'then', 'catch', 'finally',
    'Math', 'Date', 'RegExp', 'Error',
    'require', 'import', 'export',
    'fetch', 'Response', 'Request',
    // React hooks and common functions
    'useState', 'useEffect', 'useCallback', 'useMemo', 'useRef', 'useContext',
    'useReducer', 'useLayoutEffect', 'useImperativeHandle', 'useDebugValue',
    'createElement', 'createContext', 'createRef', 'forwardRef', 'memo', 'lazy',
    // Common array/object methods
    'map', 'filter', 'reduce', 'forEach', 'find', 'findIndex', 'some', 'every',
    'includes', 'indexOf', 'slice', 'splice', 'concat', 'join', 'split',
    'push', 'pop', 'shift', 'unshift', 'sort', 'reverse',
    'keys', 'values', 'entries', 'assign', 'freeze', 'seal',
    'hasOwnProperty', 'toString', 'valueOf',
    // Python built-ins ('min', 'max', 'abs' also cover the kernel helpers of the same name)
    'print', 'len', 'range', 'str', 'int', 'float', 'list', 'dict', 'set', 'tuple',
    'open', 'read', 'write', 'close', 'append', 'extend', 'update',
    'super', 'type', 'isinstance', 'issubclass', 'getattr', 'setattr', 'hasattr',
    'enumerate', 'zip', 'sorted', 'reversed', 'min', 'max', 'sum', 'abs',
    // C/C++ standard library and common kernel helpers
    'printf', 'fprintf', 'sprintf', 'snprintf', 'vprintf', 'vfprintf', 'vsprintf', 'vsnprintf',
    'scanf', 'fscanf', 'sscanf',
    'malloc', 'calloc', 'realloc', 'free', 'memcpy', 'memmove', 'memset', 'memcmp',
    'strlen', 'strcpy', 'strncpy', 'strcat', 'strncat', 'strcmp', 'strncmp', 'strstr', 'strchr', 'strrchr',
    'atoi', 'atol', 'atof', 'strtol', 'strtoul', 'strtoll', 'strtoull', 'strtod',
    'sizeof', 'offsetof', 'typeof',
    'assert', 'abort', 'exit', '_exit',
    'fopen', 'fclose', 'fread', 'fwrite', 'fseek', 'ftell', 'rewind', 'fflush', 'fgets', 'fputs',
    // Linux kernel common macros/helpers (not real call targets)
    'likely', 'unlikely', 'BUG', 'BUG_ON', 'WARN', 'WARN_ON', 'WARN_ONCE',
    'IS_ERR', 'PTR_ERR', 'ERR_PTR', 'IS_ERR_OR_NULL',
    'ARRAY_SIZE', 'container_of', 'list_for_each_entry', 'list_for_each_entry_safe',
    'clamp', 'swap',
    'pr_info', 'pr_warn', 'pr_err', 'pr_debug', 'pr_notice', 'pr_crit', 'pr_emerg',
    'printk', 'dev_info', 'dev_warn', 'dev_err', 'dev_dbg',
    'GFP_KERNEL', 'GFP_ATOMIC',
    'spin_lock', 'spin_unlock', 'spin_lock_irqsave', 'spin_unlock_irqrestore',
    'mutex_lock', 'mutex_unlock', 'mutex_init',
    'kfree', 'kmalloc', 'kzalloc', 'kcalloc', 'krealloc', 'kvmalloc', 'kvfree',
    'get', 'put',
]);
/**
 * Report whether a called name is a common built-in or noise identifier that
 * should not be tracked as a call.
 *
 * @param name - The called identifier, exactly as extracted.
 * @returns `true` when the name is in the built-in/noise list.
 */
const isBuiltInOrNoise = (name) => BUILT_IN_NAMES.has(name);
267
287
  /**
268
288
  * Fast path: resolve pre-extracted call sites from workers.
269
289
  * No AST parsing — workers already extracted calledName + sourceId.
@@ -51,23 +51,51 @@ export const getCommunityColor = (communityIndex) => {
51
51
  */
52
52
  export const processCommunities = async (knowledgeGraph, onProgress) => {
53
53
  onProgress?.('Building graph for community detection...', 0);
54
- // Step 1: Build a graphology graph from the knowledge graph
55
- // We only include symbol nodes (Function, Class, Method) and CALLS edges
56
- const graph = buildGraphologyGraph(knowledgeGraph);
54
+ // Pre-check total symbol count to determine large-graph mode before building
55
+ let symbolCount = 0;
56
+ knowledgeGraph.forEachNode(node => {
57
+ if (node.label === 'Function' || node.label === 'Class' || node.label === 'Method' || node.label === 'Interface') {
58
+ symbolCount++;
59
+ }
60
+ });
61
+ const isLarge = symbolCount > 10_000;
62
+ const graph = buildGraphologyGraph(knowledgeGraph, isLarge);
57
63
  if (graph.order === 0) {
58
- // No nodes to cluster
59
64
  return {
60
65
  communities: [],
61
66
  memberships: [],
62
67
  stats: { totalCommunities: 0, modularity: 0, nodesProcessed: 0 }
63
68
  };
64
69
  }
65
- onProgress?.(`Running Leiden algorithm on ${graph.order} nodes...`, 30);
66
- // Step 2: Run Leiden algorithm for community detection
67
- const details = leiden.detailed(graph, {
68
- resolution: 1.0, // Default resolution, can be tuned
69
- randomWalk: true,
70
- });
70
+ const nodeCount = graph.order;
71
+ const edgeCount = graph.size;
72
+ onProgress?.(`Running Leiden on ${nodeCount} nodes, ${edgeCount} edges${isLarge ? ` (filtered from ${symbolCount} symbols)` : ''}...`, 30);
73
+ // Large graphs: higher resolution + iterations capped at 3 (Python leidenalg defaults to 2).
74
+ // The first 2 iterations capture ~95%+ of modularity; additional iterations have diminishing returns.
75
+ // Timeout: abort after 60s for pathological graph structures.
76
+ const LEIDEN_TIMEOUT_MS = 60_000;
77
+ let details;
78
+ try {
79
+ details = await Promise.race([
80
+ Promise.resolve(leiden.detailed(graph, {
81
+ resolution: isLarge ? 2.0 : 1.0,
82
+ maxIterations: isLarge ? 3 : 0,
83
+ })),
84
+ new Promise((_, reject) => setTimeout(() => reject(new Error('Leiden timeout')), LEIDEN_TIMEOUT_MS)),
85
+ ]);
86
+ }
87
+ catch (e) {
88
+ if (e.message === 'Leiden timeout') {
89
+ onProgress?.('Community detection timed out, using fallback...', 60);
90
+ // Fallback: assign all nodes to community 0
91
+ const communities = {};
92
+ graph.forEachNode((node) => { communities[node] = 0; });
93
+ details = { communities, count: 1, modularity: 0 };
94
+ }
95
+ else {
96
+ throw e;
97
+ }
98
+ }
71
99
  onProgress?.(`Found ${details.count} communities...`, 60);
72
100
  // Step 3: Create community nodes with heuristic labels
73
101
  const communityNodes = createCommunityNodes(details.communities, details.count, graph, knowledgeGraph);
@@ -95,41 +123,48 @@ export const processCommunities = async (knowledgeGraph, onProgress) => {
95
123
  // HELPER: Build graphology graph from knowledge graph
96
124
  // ============================================================================
97
125
  /**
98
- * Build a graphology graph containing only symbol nodes and CALLS edges
99
- * This is what the Leiden algorithm will cluster
126
+ * Build a graphology graph containing only symbol nodes and clustering edges.
127
+ * For large graphs (>10K symbols), filter out low-confidence fuzzy-global edges
128
+ * and degree-1 nodes that add noise and massively increase Leiden runtime.
100
129
  */
101
- const buildGraphologyGraph = (knowledgeGraph) => {
102
- // Use undirected graph for Leiden - it looks at edge density, not direction
130
+ const MIN_CONFIDENCE_LARGE = 0.5;
131
+ const buildGraphologyGraph = (knowledgeGraph, isLarge) => {
103
132
  const graph = new Graph({ type: 'undirected', allowSelfLoops: false });
104
- // Symbol types that should be clustered
105
133
  const symbolTypes = new Set(['Function', 'Class', 'Method', 'Interface']);
106
- // First pass: collect which nodes participate in clustering edges
107
134
  const clusteringRelTypes = new Set(['CALLS', 'EXTENDS', 'IMPLEMENTS']);
108
135
  const connectedNodes = new Set();
109
- knowledgeGraph.relationships.forEach(rel => {
110
- if (clusteringRelTypes.has(rel.type) && rel.sourceId !== rel.targetId) {
111
- connectedNodes.add(rel.sourceId);
112
- connectedNodes.add(rel.targetId);
113
- }
136
+ const nodeDegree = new Map();
137
+ knowledgeGraph.forEachRelationship(rel => {
138
+ if (!clusteringRelTypes.has(rel.type) || rel.sourceId === rel.targetId)
139
+ return;
140
+ if (isLarge && rel.confidence < MIN_CONFIDENCE_LARGE)
141
+ return;
142
+ connectedNodes.add(rel.sourceId);
143
+ connectedNodes.add(rel.targetId);
144
+ nodeDegree.set(rel.sourceId, (nodeDegree.get(rel.sourceId) || 0) + 1);
145
+ nodeDegree.set(rel.targetId, (nodeDegree.get(rel.targetId) || 0) + 1);
114
146
  });
115
- // Only add nodes that have at least one clustering edge
116
- // Isolated nodes would just become singletons (skipped anyway)
117
- knowledgeGraph.nodes.forEach(node => {
118
- if (symbolTypes.has(node.label) && connectedNodes.has(node.id)) {
119
- graph.addNode(node.id, {
120
- name: node.properties.name,
121
- filePath: node.properties.filePath,
122
- type: node.label,
123
- });
124
- }
147
+ knowledgeGraph.forEachNode(node => {
148
+ if (!symbolTypes.has(node.label) || !connectedNodes.has(node.id))
149
+ return;
150
+ // For large graphs, skip degree-1 nodes — they just become singletons or
151
+ // get absorbed into their single neighbor's community, but cost iteration time.
152
+ if (isLarge && (nodeDegree.get(node.id) || 0) < 2)
153
+ return;
154
+ graph.addNode(node.id, {
155
+ name: node.properties.name,
156
+ filePath: node.properties.filePath,
157
+ type: node.label,
158
+ });
125
159
  });
126
- // Add edges
127
- knowledgeGraph.relationships.forEach(rel => {
128
- if (clusteringRelTypes.has(rel.type)) {
129
- if (graph.hasNode(rel.sourceId) && graph.hasNode(rel.targetId) && rel.sourceId !== rel.targetId) {
130
- if (!graph.hasEdge(rel.sourceId, rel.targetId)) {
131
- graph.addEdge(rel.sourceId, rel.targetId);
132
- }
160
+ knowledgeGraph.forEachRelationship(rel => {
161
+ if (!clusteringRelTypes.has(rel.type))
162
+ return;
163
+ if (isLarge && rel.confidence < MIN_CONFIDENCE_LARGE)
164
+ return;
165
+ if (graph.hasNode(rel.sourceId) && graph.hasNode(rel.targetId) && rel.sourceId !== rel.targetId) {
166
+ if (!graph.hasEdge(rel.sourceId, rel.targetId)) {
167
+ graph.addEdge(rel.sourceId, rel.targetId);
133
168
  }
134
169
  }
135
170
  });
@@ -152,11 +187,11 @@ const createCommunityNodes = (communities, communityCount, graph, knowledgeGraph
152
187
  });
153
188
  // Build node lookup for file paths
154
189
  const nodePathMap = new Map();
155
- knowledgeGraph.nodes.forEach(node => {
190
+ for (const node of knowledgeGraph.iterNodes()) {
156
191
  if (node.properties.filePath) {
157
192
  nodePathMap.set(node.id, node.properties.filePath);
158
193
  }
159
- });
194
+ }
160
195
  // Create community nodes - SKIP SINGLETONS (isolated nodes)
161
196
  const communityNodes = [];
162
197
  communityMembers.forEach((memberIds, commNum) => {
@@ -2,4 +2,27 @@ export interface FileEntry {
2
2
  path: string;
3
3
  content: string;
4
4
  }
5
+ /** Lightweight entry — path + size from stat, no content in memory */
6
+ export interface ScannedFile {
7
+ path: string;
8
+ size: number;
9
+ }
10
+ /** Path-only reference (for type signatures) */
11
+ export interface FilePath {
12
+ path: string;
13
+ }
14
+ /**
15
+ * Phase 1: Scan repository — stat files to get paths + sizes, no content loaded.
16
+ * Memory: ~10MB for 100K files vs ~1GB+ with content.
17
+ */
18
+ export declare const walkRepositoryPaths: (repoPath: string, onProgress?: (current: number, total: number, filePath: string) => void) => Promise<ScannedFile[]>;
19
+ /**
20
+ * Phase 2: Read file contents for a specific set of relative paths.
21
+ * Returns a Map for O(1) lookup. Silently skips files that fail to read.
22
+ */
23
+ export declare const readFileContents: (repoPath: string, relativePaths: string[]) => Promise<Map<string, string>>;
24
+ /**
25
+ * Legacy API — scans and reads everything into memory.
26
+ * Used by sequential fallback path only.
27
+ */
5
28
  export declare const walkRepository: (repoPath: string, onProgress?: (current: number, total: number, filePath: string) => void) => Promise<FileEntry[]>;
@@ -5,7 +5,11 @@ import { shouldIgnorePath } from '../../config/ignore-service.js';
5
5
  const READ_CONCURRENCY = 32;
6
6
  /** Skip files larger than 512KB — they're usually generated/vendored and crash tree-sitter */
7
7
  const MAX_FILE_SIZE = 512 * 1024;
8
- export const walkRepository = async (repoPath, onProgress) => {
8
+ /**
9
+ * Phase 1: Scan repository — stat files to get paths + sizes, no content loaded.
10
+ * Memory: ~10MB for 100K files vs ~1GB+ with content.
11
+ */
12
+ export const walkRepositoryPaths = async (repoPath, onProgress) => {
9
13
  const files = await glob('**/*', {
10
14
  cwd: repoPath,
11
15
  nodir: true,
@@ -24,8 +28,7 @@ export const walkRepository = async (repoPath, onProgress) => {
24
28
  skippedLarge++;
25
29
  return null;
26
30
  }
27
- const content = await fs.readFile(fullPath, 'utf-8');
28
- return { path: relativePath.replace(/\\/g, '/'), content };
31
+ return { path: relativePath.replace(/\\/g, '/'), size: stat.size };
29
32
  }));
30
33
  for (const result of results) {
31
34
  processed++;
@@ -43,3 +46,35 @@ export const walkRepository = async (repoPath, onProgress) => {
43
46
  }
44
47
  return entries;
45
48
  };
49
/**
 * Phase 2: load the text of the given repository-relative paths.
 *
 * Files are read as UTF-8 in slices of READ_CONCURRENCY at a time. A file
 * that cannot be read is left out of the result without raising.
 *
 * @param repoPath - Directory that the relative paths are joined onto.
 * @param relativePaths - Paths to read, relative to `repoPath`.
 * @returns Map from each relative path (as given) to its file content.
 */
export const readFileContents = async (repoPath, relativePaths) => {
    const contentByPath = new Map();
    let offset = 0;
    while (offset < relativePaths.length) {
        const slice = relativePaths.slice(offset, offset + READ_CONCURRENCY);
        offset += READ_CONCURRENCY;
        // The async wrapper turns any failure for a single file, including a
        // failure to build its path, into a rejection for that entry only.
        const outcomes = await Promise.allSettled(slice.map(async (relPath) => fs.readFile(path.join(repoPath, relPath), 'utf-8')));
        outcomes.forEach((outcome, position) => {
            if (outcome.status !== 'fulfilled')
                return;
            contentByPath.set(slice[position], outcome.value);
        });
    }
    return contentByPath;
};
70
/**
 * Legacy API: scan the repository, then read every scanned file into memory.
 * Used by the sequential fallback path only.
 *
 * @param repoPath - Repository directory to scan.
 * @param onProgress - Optional progress callback, passed through to the scan.
 * @returns One `{ path, content }` entry per scanned file whose content could
 *   be read, in scan order.
 */
export const walkRepository = async (repoPath, onProgress) => {
    const scannedFiles = await walkRepositoryPaths(repoPath, onProgress);
    const scannedPaths = scannedFiles.map(({ path: filePath }) => filePath);
    const contentByPath = await readFileContents(repoPath, scannedPaths);
    const entries = [];
    for (const filePath of scannedPaths) {
        // Files that failed to read have no entry in the map; drop them.
        if (!contentByPath.has(filePath))
            continue;
        entries.push({ path: filePath, content: contentByPath.get(filePath) });
    }
    return entries;
};
@@ -3,6 +3,15 @@ import { ASTCache } from './ast-cache.js';
3
3
  import type { ExtractedImport } from './workers/parse-worker.js';
4
4
  export type ImportMap = Map<string, Set<string>>;
5
5
  export declare const createImportMap: () => ImportMap;
6
+ /** Pre-built lookup structures for import resolution. Build once, reuse across chunks. */
7
+ export interface ImportResolutionContext {
8
+ allFilePaths: Set<string>;
9
+ allFileList: string[];
10
+ normalizedFileList: string[];
11
+ suffixIndex: SuffixIndex;
12
+ resolveCache: Map<string, string | null>;
13
+ }
14
+ export declare function buildImportResolutionContext(allPaths: string[]): ImportResolutionContext;
6
15
  /**
7
16
  * Build a suffix index for O(1) endsWith lookups.
8
17
  * Maps every possible path suffix to its original file path.
@@ -23,8 +32,7 @@ export interface SuffixIndex {
23
32
  export declare const processImports: (graph: KnowledgeGraph, files: {
24
33
  path: string;
25
34
  content: string;
26
- }[], astCache: ASTCache, importMap: ImportMap, onProgress?: (current: number, total: number) => void, repoRoot?: string) => Promise<void>;
35
+ }[], astCache: ASTCache, importMap: ImportMap, onProgress?: (current: number, total: number) => void, repoRoot?: string, allPaths?: string[]) => Promise<void>;
27
36
  export declare const processImportsFromExtracted: (graph: KnowledgeGraph, files: {
28
37
  path: string;
29
- content: string;
30
- }[], extractedImports: ExtractedImport[], importMap: ImportMap, onProgress?: (current: number, total: number) => void, repoRoot?: string) => Promise<void>;
38
+ }[], extractedImports: ExtractedImport[], importMap: ImportMap, onProgress?: (current: number, total: number) => void, repoRoot?: string, prebuiltCtx?: ImportResolutionContext) => Promise<void>;