gitnexus 1.2.9 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/analyze.js +69 -28
- package/dist/cli/index.js +20 -0
- package/dist/core/graph/graph.js +5 -0
- package/dist/core/graph/types.d.ts +12 -1
- package/dist/core/ingestion/call-processor.js +52 -32
- package/dist/core/ingestion/community-processor.js +75 -40
- package/dist/core/ingestion/filesystem-walker.d.ts +23 -0
- package/dist/core/ingestion/filesystem-walker.js +38 -3
- package/dist/core/ingestion/import-processor.d.ts +11 -3
- package/dist/core/ingestion/import-processor.js +27 -11
- package/dist/core/ingestion/parsing-processor.js +2 -4
- package/dist/core/ingestion/pipeline.js +142 -135
- package/dist/core/ingestion/process-processor.js +12 -11
- package/dist/core/ingestion/workers/parse-worker.js +67 -6
- package/dist/core/ingestion/workers/worker-pool.d.ts +3 -9
- package/dist/core/ingestion/workers/worker-pool.js +39 -18
- package/dist/core/kuzu/csv-generator.d.ts +15 -8
- package/dist/core/kuzu/csv-generator.js +258 -196
- package/dist/core/kuzu/kuzu-adapter.d.ts +1 -4
- package/dist/core/kuzu/kuzu-adapter.js +75 -63
- package/dist/core/kuzu/schema.d.ts +1 -1
- package/dist/core/kuzu/schema.js +10 -0
- package/dist/types/pipeline.d.ts +6 -2
- package/dist/types/pipeline.js +6 -4
- package/package.json +1 -1
package/dist/cli/analyze.js
CHANGED
|
@@ -4,6 +4,8 @@
|
|
|
4
4
|
* Indexes a repository and stores the knowledge graph in .gitnexus/
|
|
5
5
|
*/
|
|
6
6
|
import path from 'path';
|
|
7
|
+
import { execFileSync } from 'child_process';
|
|
8
|
+
import v8 from 'v8';
|
|
7
9
|
import cliProgress from 'cli-progress';
|
|
8
10
|
import { runPipelineFromRepo } from '../core/ingestion/pipeline.js';
|
|
9
11
|
import { initKuzu, loadGraphToKuzu, getKuzuStats, executeQuery, executeWithReusedStatement, closeKuzu, createFTSIndex, loadCachedEmbeddings } from '../core/kuzu/kuzu-adapter.js';
|
|
@@ -14,6 +16,27 @@ import { getCurrentCommit, isGitRepo, getGitRoot } from '../storage/git.js';
|
|
|
14
16
|
import { generateAIContextFiles } from './ai-context.js';
|
|
15
17
|
import fs from 'fs/promises';
|
|
16
18
|
import { registerClaudeHook } from './claude-hooks.js';
|
|
19
|
+
const HEAP_MB = 8192;
|
|
20
|
+
const HEAP_FLAG = `--max-old-space-size=${HEAP_MB}`;
|
|
21
|
+
/** Re-exec the process with an 8GB heap if we're currently below that. */
|
|
22
|
+
function ensureHeap() {
|
|
23
|
+
const nodeOpts = process.env.NODE_OPTIONS || '';
|
|
24
|
+
if (nodeOpts.includes('--max-old-space-size'))
|
|
25
|
+
return false;
|
|
26
|
+
const v8Heap = v8.getHeapStatistics().heap_size_limit;
|
|
27
|
+
if (v8Heap >= HEAP_MB * 1024 * 1024 * 0.9)
|
|
28
|
+
return false;
|
|
29
|
+
try {
|
|
30
|
+
execFileSync(process.execPath, [HEAP_FLAG, ...process.argv.slice(1)], {
|
|
31
|
+
stdio: 'inherit',
|
|
32
|
+
env: { ...process.env, NODE_OPTIONS: `${nodeOpts} ${HEAP_FLAG}`.trim() },
|
|
33
|
+
});
|
|
34
|
+
}
|
|
35
|
+
catch (e) {
|
|
36
|
+
process.exitCode = e.status ?? 1;
|
|
37
|
+
}
|
|
38
|
+
return true;
|
|
39
|
+
}
|
|
17
40
|
/** Threshold: auto-skip embeddings for repos with more nodes than this */
|
|
18
41
|
const EMBEDDING_NODE_LIMIT = 50_000;
|
|
19
42
|
const PHASE_LABELS = {
|
|
@@ -32,6 +55,8 @@ const PHASE_LABELS = {
|
|
|
32
55
|
done: 'Done',
|
|
33
56
|
};
|
|
34
57
|
export const analyzeCommand = async (inputPath, options) => {
|
|
58
|
+
if (ensureHeap())
|
|
59
|
+
return;
|
|
35
60
|
console.log('\n GitNexus Analyzer\n');
|
|
36
61
|
let repoPath;
|
|
37
62
|
if (inputPath) {
|
|
@@ -70,18 +95,47 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
70
95
|
stopOnComplete: false,
|
|
71
96
|
}, cliProgress.Presets.shades_grey);
|
|
72
97
|
bar.start(100, 0, { phase: 'Initializing...' });
|
|
98
|
+
// Graceful SIGINT handling — clean up resources and exit
|
|
99
|
+
let aborted = false;
|
|
100
|
+
const sigintHandler = () => {
|
|
101
|
+
if (aborted)
|
|
102
|
+
process.exit(1); // Second Ctrl-C: force exit
|
|
103
|
+
aborted = true;
|
|
104
|
+
bar.stop();
|
|
105
|
+
console.log('\n Interrupted — cleaning up...');
|
|
106
|
+
closeKuzu().catch(() => { }).finally(() => process.exit(130));
|
|
107
|
+
};
|
|
108
|
+
process.on('SIGINT', sigintHandler);
|
|
73
109
|
// Route all console output through bar.log() so the bar doesn't stamp itself
|
|
74
110
|
// multiple times when other code writes to stdout/stderr mid-render.
|
|
75
111
|
const origLog = console.log.bind(console);
|
|
76
112
|
const origWarn = console.warn.bind(console);
|
|
77
113
|
const origError = console.error.bind(console);
|
|
78
|
-
const barLog = (...args) =>
|
|
114
|
+
const barLog = (...args) => {
|
|
115
|
+
// Clear the bar line, print the message, then let the next bar.update redraw
|
|
116
|
+
process.stdout.write('\x1b[2K\r');
|
|
117
|
+
origLog(args.map(a => (typeof a === 'string' ? a : String(a))).join(' '));
|
|
118
|
+
};
|
|
79
119
|
console.log = barLog;
|
|
80
120
|
console.warn = barLog;
|
|
81
121
|
console.error = barLog;
|
|
82
|
-
//
|
|
122
|
+
// Track elapsed time per phase — both updateBar and the interval use the
|
|
123
|
+
// same format so they don't flicker against each other.
|
|
83
124
|
let lastPhaseLabel = 'Initializing...';
|
|
84
125
|
let phaseStart = Date.now();
|
|
126
|
+
/** Update bar with phase label + elapsed seconds (shown after 3s). */
|
|
127
|
+
const updateBar = (value, phaseLabel) => {
|
|
128
|
+
if (phaseLabel !== lastPhaseLabel) {
|
|
129
|
+
lastPhaseLabel = phaseLabel;
|
|
130
|
+
phaseStart = Date.now();
|
|
131
|
+
}
|
|
132
|
+
const elapsed = Math.round((Date.now() - phaseStart) / 1000);
|
|
133
|
+
const display = elapsed >= 3 ? `${phaseLabel} (${elapsed}s)` : phaseLabel;
|
|
134
|
+
bar.update(value, { phase: display });
|
|
135
|
+
};
|
|
136
|
+
// Tick elapsed seconds for phases with infrequent progress callbacks
|
|
137
|
+
// (e.g. CSV streaming, FTS indexing). Uses the same display format as
|
|
138
|
+
// updateBar so there's no flickering.
|
|
85
139
|
const elapsedTimer = setInterval(() => {
|
|
86
140
|
const elapsed = Math.round((Date.now() - phaseStart) / 1000);
|
|
87
141
|
if (elapsed >= 3) {
|
|
@@ -94,7 +148,7 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
94
148
|
let cachedEmbeddings = [];
|
|
95
149
|
if (options?.embeddings && existingMeta && !options?.force) {
|
|
96
150
|
try {
|
|
97
|
-
|
|
151
|
+
updateBar(0, 'Caching embeddings...');
|
|
98
152
|
await initKuzu(kuzuPath);
|
|
99
153
|
const cached = await loadCachedEmbeddings();
|
|
100
154
|
cachedEmbeddingNodeIds = cached.embeddingNodeIds;
|
|
@@ -112,16 +166,10 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
112
166
|
const pipelineResult = await runPipelineFromRepo(repoPath, (progress) => {
|
|
113
167
|
const phaseLabel = PHASE_LABELS[progress.phase] || progress.phase;
|
|
114
168
|
const scaled = Math.round(progress.percent * 0.6);
|
|
115
|
-
|
|
116
|
-
lastPhaseLabel = phaseLabel;
|
|
117
|
-
phaseStart = Date.now();
|
|
118
|
-
}
|
|
119
|
-
bar.update(scaled, { phase: phaseLabel });
|
|
169
|
+
updateBar(scaled, phaseLabel);
|
|
120
170
|
});
|
|
121
171
|
// ── Phase 2: KuzuDB (60–85%) ──────────────────────────────────────
|
|
122
|
-
|
|
123
|
-
phaseStart = Date.now();
|
|
124
|
-
bar.update(60, { phase: lastPhaseLabel });
|
|
172
|
+
updateBar(60, 'Loading into KuzuDB...');
|
|
125
173
|
await closeKuzu();
|
|
126
174
|
const kuzuFiles = [kuzuPath, `${kuzuPath}.wal`, `${kuzuPath}.lock`];
|
|
127
175
|
for (const f of kuzuFiles) {
|
|
@@ -133,17 +181,15 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
133
181
|
const t0Kuzu = Date.now();
|
|
134
182
|
await initKuzu(kuzuPath);
|
|
135
183
|
let kuzuMsgCount = 0;
|
|
136
|
-
const kuzuResult = await loadGraphToKuzu(pipelineResult.graph, pipelineResult.
|
|
184
|
+
const kuzuResult = await loadGraphToKuzu(pipelineResult.graph, pipelineResult.repoPath, storagePath, (msg) => {
|
|
137
185
|
kuzuMsgCount++;
|
|
138
186
|
const progress = Math.min(84, 60 + Math.round((kuzuMsgCount / (kuzuMsgCount + 10)) * 24));
|
|
139
|
-
|
|
187
|
+
updateBar(progress, msg);
|
|
140
188
|
});
|
|
141
189
|
const kuzuTime = ((Date.now() - t0Kuzu) / 1000).toFixed(1);
|
|
142
190
|
const kuzuWarnings = kuzuResult.warnings;
|
|
143
191
|
// ── Phase 3: FTS (85–90%) ─────────────────────────────────────────
|
|
144
|
-
|
|
145
|
-
phaseStart = Date.now();
|
|
146
|
-
bar.update(85, { phase: lastPhaseLabel });
|
|
192
|
+
updateBar(85, 'Creating search indexes...');
|
|
147
193
|
const t0Fts = Date.now();
|
|
148
194
|
try {
|
|
149
195
|
await createFTSIndex('File', 'file_fts', ['name', 'content']);
|
|
@@ -158,7 +204,7 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
158
204
|
const ftsTime = ((Date.now() - t0Fts) / 1000).toFixed(1);
|
|
159
205
|
// ── Phase 3.5: Re-insert cached embeddings ────────────────────────
|
|
160
206
|
if (cachedEmbeddings.length > 0) {
|
|
161
|
-
|
|
207
|
+
updateBar(88, `Restoring ${cachedEmbeddings.length} cached embeddings...`);
|
|
162
208
|
const EMBED_BATCH = 200;
|
|
163
209
|
for (let i = 0; i < cachedEmbeddings.length; i += EMBED_BATCH) {
|
|
164
210
|
const batch = cachedEmbeddings.slice(i, i + EMBED_BATCH);
|
|
@@ -183,29 +229,23 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
183
229
|
}
|
|
184
230
|
}
|
|
185
231
|
if (!embeddingSkipped) {
|
|
186
|
-
|
|
187
|
-
phaseStart = Date.now();
|
|
188
|
-
bar.update(90, { phase: lastPhaseLabel });
|
|
232
|
+
updateBar(90, 'Loading embedding model...');
|
|
189
233
|
const t0Emb = Date.now();
|
|
190
234
|
await runEmbeddingPipeline(executeQuery, executeWithReusedStatement, (progress) => {
|
|
191
235
|
const scaled = 90 + Math.round((progress.percent / 100) * 8);
|
|
192
236
|
const label = progress.phase === 'loading-model' ? 'Loading embedding model...' : `Embedding ${progress.nodesProcessed || 0}/${progress.totalNodes || '?'}`;
|
|
193
|
-
|
|
194
|
-
lastPhaseLabel = label;
|
|
195
|
-
phaseStart = Date.now();
|
|
196
|
-
}
|
|
197
|
-
bar.update(scaled, { phase: label });
|
|
237
|
+
updateBar(scaled, label);
|
|
198
238
|
}, {}, cachedEmbeddingNodeIds.size > 0 ? cachedEmbeddingNodeIds : undefined);
|
|
199
239
|
embeddingTime = ((Date.now() - t0Emb) / 1000).toFixed(1);
|
|
200
240
|
}
|
|
201
241
|
// ── Phase 5: Finalize (98–100%) ───────────────────────────────────
|
|
202
|
-
|
|
242
|
+
updateBar(98, 'Saving metadata...');
|
|
203
243
|
const meta = {
|
|
204
244
|
repoPath,
|
|
205
245
|
lastCommit: currentCommit,
|
|
206
246
|
indexedAt: new Date().toISOString(),
|
|
207
247
|
stats: {
|
|
208
|
-
files: pipelineResult.
|
|
248
|
+
files: pipelineResult.totalFileCount,
|
|
209
249
|
nodes: stats.nodes,
|
|
210
250
|
edges: stats.edges,
|
|
211
251
|
communities: pipelineResult.communityResult?.stats.totalCommunities,
|
|
@@ -227,7 +267,7 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
227
267
|
aggregatedClusterCount = Array.from(groups.values()).filter(count => count >= 5).length;
|
|
228
268
|
}
|
|
229
269
|
const aiContext = await generateAIContextFiles(repoPath, storagePath, projectName, {
|
|
230
|
-
files: pipelineResult.
|
|
270
|
+
files: pipelineResult.totalFileCount,
|
|
231
271
|
nodes: stats.nodes,
|
|
232
272
|
edges: stats.edges,
|
|
233
273
|
communities: pipelineResult.communityResult?.stats.totalCommunities,
|
|
@@ -240,6 +280,7 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
240
280
|
// Since the process exits immediately after, Node.js reclaims everything.
|
|
241
281
|
const totalTime = ((Date.now() - t0Global) / 1000).toFixed(1);
|
|
242
282
|
clearInterval(elapsedTimer);
|
|
283
|
+
process.removeListener('SIGINT', sigintHandler);
|
|
243
284
|
console.log = origLog;
|
|
244
285
|
console.warn = origWarn;
|
|
245
286
|
console.error = origError;
|
package/dist/cli/index.js
CHANGED
|
@@ -1,4 +1,24 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
+
// Raise Node heap limit for large repos (e.g. Linux kernel).
|
|
3
|
+
// Must run before any heavy allocation. If already set by the user, respect it.
|
|
4
|
+
if (!process.env.NODE_OPTIONS?.includes('--max-old-space-size')) {
|
|
5
|
+
const execArgv = process.execArgv.join(' ');
|
|
6
|
+
if (!execArgv.includes('--max-old-space-size')) {
|
|
7
|
+
// Re-spawn with a larger heap (8 GB)
|
|
8
|
+
const { execFileSync } = await import('node:child_process');
|
|
9
|
+
try {
|
|
10
|
+
execFileSync(process.execPath, ['--max-old-space-size=8192', ...process.argv.slice(1)], {
|
|
11
|
+
stdio: 'inherit',
|
|
12
|
+
env: { ...process.env, NODE_OPTIONS: `${process.env.NODE_OPTIONS || ''} --max-old-space-size=8192`.trim() },
|
|
13
|
+
});
|
|
14
|
+
process.exit(0);
|
|
15
|
+
}
|
|
16
|
+
catch (e) {
|
|
17
|
+
// If the child exited with an error code, propagate it
|
|
18
|
+
process.exit(e.status ?? 1);
|
|
19
|
+
}
|
|
20
|
+
}
|
|
21
|
+
}
|
|
2
22
|
import { Command } from 'commander';
|
|
3
23
|
import { analyzeCommand } from './analyze.js';
|
|
4
24
|
import { serveCommand } from './serve.js';
|
package/dist/core/graph/graph.js
CHANGED
|
@@ -46,6 +46,11 @@ export const createKnowledgeGraph = () => {
|
|
|
46
46
|
get relationships() {
|
|
47
47
|
return Array.from(relationshipMap.values());
|
|
48
48
|
},
|
|
49
|
+
iterNodes: () => nodeMap.values(),
|
|
50
|
+
iterRelationships: () => relationshipMap.values(),
|
|
51
|
+
forEachNode(fn) { nodeMap.forEach(fn); },
|
|
52
|
+
forEachRelationship(fn) { relationshipMap.forEach(fn); },
|
|
53
|
+
getNode: (id) => nodeMap.get(id),
|
|
49
54
|
// O(1) count getters - avoid creating arrays just for length
|
|
50
55
|
get nodeCount() {
|
|
51
56
|
return nodeMap.size;
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
export type NodeLabel = 'Project' | 'Package' | 'Module' | 'Folder' | 'File' | 'Class' | 'Function' | 'Method' | 'Variable' | 'Interface' | 'Enum' | 'Decorator' | 'Import' | 'Type' | 'CodeElement' | 'Community' | 'Process';
|
|
1
|
+
export type NodeLabel = 'Project' | 'Package' | 'Module' | 'Folder' | 'File' | 'Class' | 'Function' | 'Method' | 'Variable' | 'Interface' | 'Enum' | 'Decorator' | 'Import' | 'Type' | 'CodeElement' | 'Community' | 'Process' | 'Struct' | 'Macro' | 'Typedef' | 'Union' | 'Namespace' | 'Trait' | 'Impl' | 'TypeAlias' | 'Const' | 'Static' | 'Property' | 'Record' | 'Delegate' | 'Annotation' | 'Constructor' | 'Template';
|
|
2
2
|
export type NodeProperties = {
|
|
3
3
|
name: string;
|
|
4
4
|
filePath: string;
|
|
@@ -39,8 +39,19 @@ export interface GraphRelationship {
|
|
|
39
39
|
step?: number;
|
|
40
40
|
}
|
|
41
41
|
export interface KnowledgeGraph {
|
|
42
|
+
/** Returns a full array copy — prefer iterNodes() for iteration */
|
|
42
43
|
nodes: GraphNode[];
|
|
44
|
+
/** Returns a full array copy — prefer iterRelationships() for iteration */
|
|
43
45
|
relationships: GraphRelationship[];
|
|
46
|
+
/** Zero-copy iterator over nodes */
|
|
47
|
+
iterNodes: () => IterableIterator<GraphNode>;
|
|
48
|
+
/** Zero-copy iterator over relationships */
|
|
49
|
+
iterRelationships: () => IterableIterator<GraphRelationship>;
|
|
50
|
+
/** Zero-copy forEach — avoids iterator protocol overhead in hot loops */
|
|
51
|
+
forEachNode: (fn: (node: GraphNode) => void) => void;
|
|
52
|
+
forEachRelationship: (fn: (rel: GraphRelationship) => void) => void;
|
|
53
|
+
/** Lookup a single node by id — O(1) */
|
|
54
|
+
getNode: (id: string) => GraphNode | undefined;
|
|
44
55
|
nodeCount: number;
|
|
45
56
|
relationshipCount: number;
|
|
46
57
|
addNode: (node: GraphNode) => void;
|
|
@@ -232,38 +232,58 @@ const resolveCallTarget = (calledName, currentFile, symbolTable, importMap) => {
|
|
|
232
232
|
* Filter out common built-in functions and noise
|
|
233
233
|
* that shouldn't be tracked as calls
|
|
234
234
|
*/
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
235
|
+
/** Pre-built set (module-level singleton) to avoid re-creating per call */
|
|
236
|
+
const BUILT_IN_NAMES = new Set([
|
|
237
|
+
// JavaScript/TypeScript built-ins
|
|
238
|
+
'console', 'log', 'warn', 'error', 'info', 'debug',
|
|
239
|
+
'setTimeout', 'setInterval', 'clearTimeout', 'clearInterval',
|
|
240
|
+
'parseInt', 'parseFloat', 'isNaN', 'isFinite',
|
|
241
|
+
'encodeURI', 'decodeURI', 'encodeURIComponent', 'decodeURIComponent',
|
|
242
|
+
'JSON', 'parse', 'stringify',
|
|
243
|
+
'Object', 'Array', 'String', 'Number', 'Boolean', 'Symbol', 'BigInt',
|
|
244
|
+
'Map', 'Set', 'WeakMap', 'WeakSet',
|
|
245
|
+
'Promise', 'resolve', 'reject', 'then', 'catch', 'finally',
|
|
246
|
+
'Math', 'Date', 'RegExp', 'Error',
|
|
247
|
+
'require', 'import', 'export',
|
|
248
|
+
'fetch', 'Response', 'Request',
|
|
249
|
+
// React hooks and common functions
|
|
250
|
+
'useState', 'useEffect', 'useCallback', 'useMemo', 'useRef', 'useContext',
|
|
251
|
+
'useReducer', 'useLayoutEffect', 'useImperativeHandle', 'useDebugValue',
|
|
252
|
+
'createElement', 'createContext', 'createRef', 'forwardRef', 'memo', 'lazy',
|
|
253
|
+
// Common array/object methods
|
|
254
|
+
'map', 'filter', 'reduce', 'forEach', 'find', 'findIndex', 'some', 'every',
|
|
255
|
+
'includes', 'indexOf', 'slice', 'splice', 'concat', 'join', 'split',
|
|
256
|
+
'push', 'pop', 'shift', 'unshift', 'sort', 'reverse',
|
|
257
|
+
'keys', 'values', 'entries', 'assign', 'freeze', 'seal',
|
|
258
|
+
'hasOwnProperty', 'toString', 'valueOf',
|
|
259
|
+
// Python built-ins
|
|
260
|
+
'print', 'len', 'range', 'str', 'int', 'float', 'list', 'dict', 'set', 'tuple',
|
|
261
|
+
'open', 'read', 'write', 'close', 'append', 'extend', 'update',
|
|
262
|
+
'super', 'type', 'isinstance', 'issubclass', 'getattr', 'setattr', 'hasattr',
|
|
263
|
+
'enumerate', 'zip', 'sorted', 'reversed', 'min', 'max', 'sum', 'abs',
|
|
264
|
+
// C/C++ standard library and common kernel helpers
|
|
265
|
+
'printf', 'fprintf', 'sprintf', 'snprintf', 'vprintf', 'vfprintf', 'vsprintf', 'vsnprintf',
|
|
266
|
+
'scanf', 'fscanf', 'sscanf',
|
|
267
|
+
'malloc', 'calloc', 'realloc', 'free', 'memcpy', 'memmove', 'memset', 'memcmp',
|
|
268
|
+
'strlen', 'strcpy', 'strncpy', 'strcat', 'strncat', 'strcmp', 'strncmp', 'strstr', 'strchr', 'strrchr',
|
|
269
|
+
'atoi', 'atol', 'atof', 'strtol', 'strtoul', 'strtoll', 'strtoull', 'strtod',
|
|
270
|
+
'sizeof', 'offsetof', 'typeof',
|
|
271
|
+
'assert', 'abort', 'exit', '_exit',
|
|
272
|
+
'fopen', 'fclose', 'fread', 'fwrite', 'fseek', 'ftell', 'rewind', 'fflush', 'fgets', 'fputs',
|
|
273
|
+
// Linux kernel common macros/helpers (not real call targets)
|
|
274
|
+
'likely', 'unlikely', 'BUG', 'BUG_ON', 'WARN', 'WARN_ON', 'WARN_ONCE',
|
|
275
|
+
'IS_ERR', 'PTR_ERR', 'ERR_PTR', 'IS_ERR_OR_NULL',
|
|
276
|
+
'ARRAY_SIZE', 'container_of', 'list_for_each_entry', 'list_for_each_entry_safe',
|
|
277
|
+
'min', 'max', 'clamp', 'abs', 'swap',
|
|
278
|
+
'pr_info', 'pr_warn', 'pr_err', 'pr_debug', 'pr_notice', 'pr_crit', 'pr_emerg',
|
|
279
|
+
'printk', 'dev_info', 'dev_warn', 'dev_err', 'dev_dbg',
|
|
280
|
+
'GFP_KERNEL', 'GFP_ATOMIC',
|
|
281
|
+
'spin_lock', 'spin_unlock', 'spin_lock_irqsave', 'spin_unlock_irqrestore',
|
|
282
|
+
'mutex_lock', 'mutex_unlock', 'mutex_init',
|
|
283
|
+
'kfree', 'kmalloc', 'kzalloc', 'kcalloc', 'krealloc', 'kvmalloc', 'kvfree',
|
|
284
|
+
'get', 'put',
|
|
285
|
+
]);
|
|
286
|
+
const isBuiltInOrNoise = (name) => BUILT_IN_NAMES.has(name);
|
|
267
287
|
/**
|
|
268
288
|
* Fast path: resolve pre-extracted call sites from workers.
|
|
269
289
|
* No AST parsing — workers already extracted calledName + sourceId.
|
|
@@ -51,23 +51,51 @@ export const getCommunityColor = (communityIndex) => {
|
|
|
51
51
|
*/
|
|
52
52
|
export const processCommunities = async (knowledgeGraph, onProgress) => {
|
|
53
53
|
onProgress?.('Building graph for community detection...', 0);
|
|
54
|
-
//
|
|
55
|
-
|
|
56
|
-
|
|
54
|
+
// Pre-check total symbol count to determine large-graph mode before building
|
|
55
|
+
let symbolCount = 0;
|
|
56
|
+
knowledgeGraph.forEachNode(node => {
|
|
57
|
+
if (node.label === 'Function' || node.label === 'Class' || node.label === 'Method' || node.label === 'Interface') {
|
|
58
|
+
symbolCount++;
|
|
59
|
+
}
|
|
60
|
+
});
|
|
61
|
+
const isLarge = symbolCount > 10_000;
|
|
62
|
+
const graph = buildGraphologyGraph(knowledgeGraph, isLarge);
|
|
57
63
|
if (graph.order === 0) {
|
|
58
|
-
// No nodes to cluster
|
|
59
64
|
return {
|
|
60
65
|
communities: [],
|
|
61
66
|
memberships: [],
|
|
62
67
|
stats: { totalCommunities: 0, modularity: 0, nodesProcessed: 0 }
|
|
63
68
|
};
|
|
64
69
|
}
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
70
|
+
const nodeCount = graph.order;
|
|
71
|
+
const edgeCount = graph.size;
|
|
72
|
+
onProgress?.(`Running Leiden on ${nodeCount} nodes, ${edgeCount} edges${isLarge ? ` (filtered from ${symbolCount} symbols)` : ''}...`, 30);
|
|
73
|
+
// Large graphs: higher resolution + capped iterations (matching Python leidenalg default of 2).
|
|
74
|
+
// The first 2 iterations capture ~95%+ of modularity; additional iterations have diminishing returns.
|
|
75
|
+
// Timeout: abort after 60s for pathological graph structures.
|
|
76
|
+
const LEIDEN_TIMEOUT_MS = 60_000;
|
|
77
|
+
let details;
|
|
78
|
+
try {
|
|
79
|
+
details = await Promise.race([
|
|
80
|
+
Promise.resolve(leiden.detailed(graph, {
|
|
81
|
+
resolution: isLarge ? 2.0 : 1.0,
|
|
82
|
+
maxIterations: isLarge ? 3 : 0,
|
|
83
|
+
})),
|
|
84
|
+
new Promise((_, reject) => setTimeout(() => reject(new Error('Leiden timeout')), LEIDEN_TIMEOUT_MS)),
|
|
85
|
+
]);
|
|
86
|
+
}
|
|
87
|
+
catch (e) {
|
|
88
|
+
if (e.message === 'Leiden timeout') {
|
|
89
|
+
onProgress?.('Community detection timed out, using fallback...', 60);
|
|
90
|
+
// Fallback: assign all nodes to community 0
|
|
91
|
+
const communities = {};
|
|
92
|
+
graph.forEachNode((node) => { communities[node] = 0; });
|
|
93
|
+
details = { communities, count: 1, modularity: 0 };
|
|
94
|
+
}
|
|
95
|
+
else {
|
|
96
|
+
throw e;
|
|
97
|
+
}
|
|
98
|
+
}
|
|
71
99
|
onProgress?.(`Found ${details.count} communities...`, 60);
|
|
72
100
|
// Step 3: Create community nodes with heuristic labels
|
|
73
101
|
const communityNodes = createCommunityNodes(details.communities, details.count, graph, knowledgeGraph);
|
|
@@ -95,41 +123,48 @@ export const processCommunities = async (knowledgeGraph, onProgress) => {
|
|
|
95
123
|
// HELPER: Build graphology graph from knowledge graph
|
|
96
124
|
// ============================================================================
|
|
97
125
|
/**
|
|
98
|
-
* Build a graphology graph containing only symbol nodes and
|
|
99
|
-
*
|
|
126
|
+
* Build a graphology graph containing only symbol nodes and clustering edges.
|
|
127
|
+
* For large graphs (>10K symbols), filter out low-confidence fuzzy-global edges
|
|
128
|
+
* and degree-1 nodes that add noise and massively increase Leiden runtime.
|
|
100
129
|
*/
|
|
101
|
-
const
|
|
102
|
-
|
|
130
|
+
const MIN_CONFIDENCE_LARGE = 0.5;
|
|
131
|
+
const buildGraphologyGraph = (knowledgeGraph, isLarge) => {
|
|
103
132
|
const graph = new Graph({ type: 'undirected', allowSelfLoops: false });
|
|
104
|
-
// Symbol types that should be clustered
|
|
105
133
|
const symbolTypes = new Set(['Function', 'Class', 'Method', 'Interface']);
|
|
106
|
-
// First pass: collect which nodes participate in clustering edges
|
|
107
134
|
const clusteringRelTypes = new Set(['CALLS', 'EXTENDS', 'IMPLEMENTS']);
|
|
108
135
|
const connectedNodes = new Set();
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
136
|
+
const nodeDegree = new Map();
|
|
137
|
+
knowledgeGraph.forEachRelationship(rel => {
|
|
138
|
+
if (!clusteringRelTypes.has(rel.type) || rel.sourceId === rel.targetId)
|
|
139
|
+
return;
|
|
140
|
+
if (isLarge && rel.confidence < MIN_CONFIDENCE_LARGE)
|
|
141
|
+
return;
|
|
142
|
+
connectedNodes.add(rel.sourceId);
|
|
143
|
+
connectedNodes.add(rel.targetId);
|
|
144
|
+
nodeDegree.set(rel.sourceId, (nodeDegree.get(rel.sourceId) || 0) + 1);
|
|
145
|
+
nodeDegree.set(rel.targetId, (nodeDegree.get(rel.targetId) || 0) + 1);
|
|
114
146
|
});
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
147
|
+
knowledgeGraph.forEachNode(node => {
|
|
148
|
+
if (!symbolTypes.has(node.label) || !connectedNodes.has(node.id))
|
|
149
|
+
return;
|
|
150
|
+
// For large graphs, skip degree-1 nodes — they just become singletons or
|
|
151
|
+
// get absorbed into their single neighbor's community, but cost iteration time.
|
|
152
|
+
if (isLarge && (nodeDegree.get(node.id) || 0) < 2)
|
|
153
|
+
return;
|
|
154
|
+
graph.addNode(node.id, {
|
|
155
|
+
name: node.properties.name,
|
|
156
|
+
filePath: node.properties.filePath,
|
|
157
|
+
type: node.label,
|
|
158
|
+
});
|
|
125
159
|
});
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
160
|
+
knowledgeGraph.forEachRelationship(rel => {
|
|
161
|
+
if (!clusteringRelTypes.has(rel.type))
|
|
162
|
+
return;
|
|
163
|
+
if (isLarge && rel.confidence < MIN_CONFIDENCE_LARGE)
|
|
164
|
+
return;
|
|
165
|
+
if (graph.hasNode(rel.sourceId) && graph.hasNode(rel.targetId) && rel.sourceId !== rel.targetId) {
|
|
166
|
+
if (!graph.hasEdge(rel.sourceId, rel.targetId)) {
|
|
167
|
+
graph.addEdge(rel.sourceId, rel.targetId);
|
|
133
168
|
}
|
|
134
169
|
}
|
|
135
170
|
});
|
|
@@ -152,11 +187,11 @@ const createCommunityNodes = (communities, communityCount, graph, knowledgeGraph
|
|
|
152
187
|
});
|
|
153
188
|
// Build node lookup for file paths
|
|
154
189
|
const nodePathMap = new Map();
|
|
155
|
-
knowledgeGraph.
|
|
190
|
+
for (const node of knowledgeGraph.iterNodes()) {
|
|
156
191
|
if (node.properties.filePath) {
|
|
157
192
|
nodePathMap.set(node.id, node.properties.filePath);
|
|
158
193
|
}
|
|
159
|
-
}
|
|
194
|
+
}
|
|
160
195
|
// Create community nodes - SKIP SINGLETONS (isolated nodes)
|
|
161
196
|
const communityNodes = [];
|
|
162
197
|
communityMembers.forEach((memberIds, commNum) => {
|
|
@@ -2,4 +2,27 @@ export interface FileEntry {
|
|
|
2
2
|
path: string;
|
|
3
3
|
content: string;
|
|
4
4
|
}
|
|
5
|
+
/** Lightweight entry — path + size from stat, no content in memory */
|
|
6
|
+
export interface ScannedFile {
|
|
7
|
+
path: string;
|
|
8
|
+
size: number;
|
|
9
|
+
}
|
|
10
|
+
/** Path-only reference (for type signatures) */
|
|
11
|
+
export interface FilePath {
|
|
12
|
+
path: string;
|
|
13
|
+
}
|
|
14
|
+
/**
|
|
15
|
+
* Phase 1: Scan repository — stat files to get paths + sizes, no content loaded.
|
|
16
|
+
* Memory: ~10MB for 100K files vs ~1GB+ with content.
|
|
17
|
+
*/
|
|
18
|
+
export declare const walkRepositoryPaths: (repoPath: string, onProgress?: (current: number, total: number, filePath: string) => void) => Promise<ScannedFile[]>;
|
|
19
|
+
/**
|
|
20
|
+
* Phase 2: Read file contents for a specific set of relative paths.
|
|
21
|
+
* Returns a Map for O(1) lookup. Silently skips files that fail to read.
|
|
22
|
+
*/
|
|
23
|
+
export declare const readFileContents: (repoPath: string, relativePaths: string[]) => Promise<Map<string, string>>;
|
|
24
|
+
/**
|
|
25
|
+
* Legacy API — scans and reads everything into memory.
|
|
26
|
+
* Used by sequential fallback path only.
|
|
27
|
+
*/
|
|
5
28
|
export declare const walkRepository: (repoPath: string, onProgress?: (current: number, total: number, filePath: string) => void) => Promise<FileEntry[]>;
|
|
@@ -5,7 +5,11 @@ import { shouldIgnorePath } from '../../config/ignore-service.js';
|
|
|
5
5
|
const READ_CONCURRENCY = 32;
|
|
6
6
|
/** Skip files larger than 512KB — they're usually generated/vendored and crash tree-sitter */
|
|
7
7
|
const MAX_FILE_SIZE = 512 * 1024;
|
|
8
|
-
|
|
8
|
+
/**
|
|
9
|
+
* Phase 1: Scan repository — stat files to get paths + sizes, no content loaded.
|
|
10
|
+
* Memory: ~10MB for 100K files vs ~1GB+ with content.
|
|
11
|
+
*/
|
|
12
|
+
export const walkRepositoryPaths = async (repoPath, onProgress) => {
|
|
9
13
|
const files = await glob('**/*', {
|
|
10
14
|
cwd: repoPath,
|
|
11
15
|
nodir: true,
|
|
@@ -24,8 +28,7 @@ export const walkRepository = async (repoPath, onProgress) => {
|
|
|
24
28
|
skippedLarge++;
|
|
25
29
|
return null;
|
|
26
30
|
}
|
|
27
|
-
|
|
28
|
-
return { path: relativePath.replace(/\\/g, '/'), content };
|
|
31
|
+
return { path: relativePath.replace(/\\/g, '/'), size: stat.size };
|
|
29
32
|
}));
|
|
30
33
|
for (const result of results) {
|
|
31
34
|
processed++;
|
|
@@ -43,3 +46,35 @@ export const walkRepository = async (repoPath, onProgress) => {
|
|
|
43
46
|
}
|
|
44
47
|
return entries;
|
|
45
48
|
};
|
|
49
|
+
/**
|
|
50
|
+
* Phase 2: Read file contents for a specific set of relative paths.
|
|
51
|
+
* Returns a Map for O(1) lookup. Silently skips files that fail to read.
|
|
52
|
+
*/
|
|
53
|
+
export const readFileContents = async (repoPath, relativePaths) => {
|
|
54
|
+
const contents = new Map();
|
|
55
|
+
for (let start = 0; start < relativePaths.length; start += READ_CONCURRENCY) {
|
|
56
|
+
const batch = relativePaths.slice(start, start + READ_CONCURRENCY);
|
|
57
|
+
const results = await Promise.allSettled(batch.map(async (relativePath) => {
|
|
58
|
+
const fullPath = path.join(repoPath, relativePath);
|
|
59
|
+
const content = await fs.readFile(fullPath, 'utf-8');
|
|
60
|
+
return { path: relativePath, content };
|
|
61
|
+
}));
|
|
62
|
+
for (const result of results) {
|
|
63
|
+
if (result.status === 'fulfilled') {
|
|
64
|
+
contents.set(result.value.path, result.value.content);
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
return contents;
|
|
69
|
+
};
|
|
70
|
+
/**
|
|
71
|
+
* Legacy API — scans and reads everything into memory.
|
|
72
|
+
* Used by sequential fallback path only.
|
|
73
|
+
*/
|
|
74
|
+
export const walkRepository = async (repoPath, onProgress) => {
|
|
75
|
+
const scanned = await walkRepositoryPaths(repoPath, onProgress);
|
|
76
|
+
const contents = await readFileContents(repoPath, scanned.map(f => f.path));
|
|
77
|
+
return scanned
|
|
78
|
+
.filter(f => contents.has(f.path))
|
|
79
|
+
.map(f => ({ path: f.path, content: contents.get(f.path) }));
|
|
80
|
+
};
|
|
@@ -3,6 +3,15 @@ import { ASTCache } from './ast-cache.js';
|
|
|
3
3
|
import type { ExtractedImport } from './workers/parse-worker.js';
|
|
4
4
|
export type ImportMap = Map<string, Set<string>>;
|
|
5
5
|
export declare const createImportMap: () => ImportMap;
|
|
6
|
+
/** Pre-built lookup structures for import resolution. Build once, reuse across chunks. */
|
|
7
|
+
export interface ImportResolutionContext {
|
|
8
|
+
allFilePaths: Set<string>;
|
|
9
|
+
allFileList: string[];
|
|
10
|
+
normalizedFileList: string[];
|
|
11
|
+
suffixIndex: SuffixIndex;
|
|
12
|
+
resolveCache: Map<string, string | null>;
|
|
13
|
+
}
|
|
14
|
+
export declare function buildImportResolutionContext(allPaths: string[]): ImportResolutionContext;
|
|
6
15
|
/**
|
|
7
16
|
* Build a suffix index for O(1) endsWith lookups.
|
|
8
17
|
* Maps every possible path suffix to its original file path.
|
|
@@ -23,8 +32,7 @@ export interface SuffixIndex {
|
|
|
23
32
|
export declare const processImports: (graph: KnowledgeGraph, files: {
|
|
24
33
|
path: string;
|
|
25
34
|
content: string;
|
|
26
|
-
}[], astCache: ASTCache, importMap: ImportMap, onProgress?: (current: number, total: number) => void, repoRoot?: string) => Promise<void>;
|
|
35
|
+
}[], astCache: ASTCache, importMap: ImportMap, onProgress?: (current: number, total: number) => void, repoRoot?: string, allPaths?: string[]) => Promise<void>;
|
|
27
36
|
export declare const processImportsFromExtracted: (graph: KnowledgeGraph, files: {
|
|
28
37
|
path: string;
|
|
29
|
-
|
|
30
|
-
}[], extractedImports: ExtractedImport[], importMap: ImportMap, onProgress?: (current: number, total: number) => void, repoRoot?: string) => Promise<void>;
|
|
38
|
+
}[], extractedImports: ExtractedImport[], importMap: ImportMap, onProgress?: (current: number, total: number) => void, repoRoot?: string, prebuiltCtx?: ImportResolutionContext) => Promise<void>;
|