gitnexus 1.2.6 → 1.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -10
- package/dist/cli/analyze.d.ts +1 -1
- package/dist/cli/analyze.js +59 -15
- package/dist/cli/eval-server.js +1 -1
- package/dist/cli/index.js +1 -1
- package/dist/cli/mcp.js +1 -1
- package/dist/core/augmentation/engine.js +20 -20
- package/dist/core/embeddings/embedder.js +7 -0
- package/dist/core/embeddings/embedding-pipeline.js +26 -26
- package/dist/core/ingestion/cluster-enricher.js +16 -16
- package/dist/core/ingestion/filesystem-walker.js +17 -3
- package/dist/core/ingestion/parsing-processor.js +4 -1
- package/dist/core/ingestion/workers/parse-worker.js +13 -4
- package/dist/core/ingestion/workers/worker-pool.js +43 -9
- package/dist/core/kuzu/kuzu-adapter.js +9 -9
- package/dist/core/search/hybrid-search.js +3 -3
- package/dist/core/wiki/graph-queries.js +52 -52
- package/dist/core/wiki/prompts.js +82 -82
- package/dist/mcp/local/local-backend.d.ts +18 -3
- package/dist/mcp/local/local-backend.js +57 -13
- package/dist/mcp/resources.js +4 -4
- package/hooks/claude/gitnexus-hook.cjs +135 -135
- package/hooks/claude/pre-tool-use.sh +78 -78
- package/hooks/claude/session-start.sh +42 -42
- package/package.json +1 -1
- package/vendor/leiden/index.cjs +355 -355
- package/vendor/leiden/utils.cjs +392 -392
package/README.md
CHANGED
|
@@ -79,17 +79,16 @@ Add to `~/.config/opencode/config.json`:
|
|
|
79
79
|
}
|
|
80
80
|
```
|
|
81
81
|
|
|
82
|
-
##
|
|
82
|
+
## How It Works
|
|
83
83
|
|
|
84
|
-
GitNexus
|
|
84
|
+
GitNexus builds a complete knowledge graph of your codebase through a multi-phase indexing pipeline:
|
|
85
85
|
|
|
86
|
-
1. **Structure** —
|
|
87
|
-
2. **
|
|
88
|
-
3. **
|
|
89
|
-
4. **
|
|
90
|
-
5. **
|
|
91
|
-
6. **
|
|
92
|
-
7. **Processes** — Entry point detection and execution flow tracing
|
|
86
|
+
1. **Structure** — Walks the file tree and maps folder/file relationships
|
|
87
|
+
2. **Parsing** — Extracts functions, classes, methods, and interfaces using Tree-sitter ASTs
|
|
88
|
+
3. **Resolution** — Resolves imports and function calls across files with language-aware logic
|
|
89
|
+
4. **Clustering** — Groups related symbols into functional communities
|
|
90
|
+
5. **Processes** — Traces execution flows from entry points through call chains
|
|
91
|
+
6. **Search** — Builds hybrid search indexes for fast retrieval
|
|
93
92
|
|
|
94
93
|
The result is a **KuzuDB graph database** stored locally in `.gitnexus/` with full-text search and semantic embeddings.
|
|
95
94
|
|
|
@@ -147,7 +146,7 @@ gitnexus wiki --model <model> # Wiki with custom LLM model (default: gpt-4o-m
|
|
|
147
146
|
|
|
148
147
|
## Multi-Repo Support
|
|
149
148
|
|
|
150
|
-
GitNexus supports indexing multiple repositories. Each `gitnexus analyze` registers the repo in a global registry (`~/.gitnexus/registry.json`). The MCP server serves all indexed repos automatically
|
|
149
|
+
GitNexus supports indexing multiple repositories. Each `gitnexus analyze` registers the repo in a global registry (`~/.gitnexus/registry.json`). The MCP server serves all indexed repos automatically.
|
|
151
150
|
|
|
152
151
|
## Supported Languages
|
|
153
152
|
|
package/dist/cli/analyze.d.ts
CHANGED
package/dist/cli/analyze.js
CHANGED
|
@@ -8,7 +8,7 @@ import cliProgress from 'cli-progress';
|
|
|
8
8
|
import { runPipelineFromRepo } from '../core/ingestion/pipeline.js';
|
|
9
9
|
import { initKuzu, loadGraphToKuzu, getKuzuStats, executeQuery, executeWithReusedStatement, closeKuzu, createFTSIndex, loadCachedEmbeddings } from '../core/kuzu/kuzu-adapter.js';
|
|
10
10
|
import { runEmbeddingPipeline } from '../core/embeddings/embedding-pipeline.js';
|
|
11
|
-
|
|
11
|
+
// disposeEmbedder intentionally not called — ONNX Runtime segfaults on cleanup (see #38)
|
|
12
12
|
import { getStoragePaths, saveMeta, loadMeta, addToGitignore, registerRepo, getGlobalRegistryPath } from '../storage/repo-manager.js';
|
|
13
13
|
import { getCurrentCommit, isGitRepo, getGitRoot } from '../storage/git.js';
|
|
14
14
|
import { generateAIContextFiles } from './ai-context.js';
|
|
@@ -70,11 +70,29 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
70
70
|
stopOnComplete: false,
|
|
71
71
|
}, cliProgress.Presets.shades_grey);
|
|
72
72
|
bar.start(100, 0, { phase: 'Initializing...' });
|
|
73
|
+
// Route all console output through bar.log() so the bar doesn't stamp itself
|
|
74
|
+
// multiple times when other code writes to stdout/stderr mid-render.
|
|
75
|
+
const origLog = console.log.bind(console);
|
|
76
|
+
const origWarn = console.warn.bind(console);
|
|
77
|
+
const origError = console.error.bind(console);
|
|
78
|
+
const barLog = (...args) => bar.log(args.map(a => (typeof a === 'string' ? a : String(a))).join(' '));
|
|
79
|
+
console.log = barLog;
|
|
80
|
+
console.warn = barLog;
|
|
81
|
+
console.error = barLog;
|
|
82
|
+
// Show elapsed seconds for phases that run longer than 3s
|
|
83
|
+
let lastPhaseLabel = 'Initializing...';
|
|
84
|
+
let phaseStart = Date.now();
|
|
85
|
+
const elapsedTimer = setInterval(() => {
|
|
86
|
+
const elapsed = Math.round((Date.now() - phaseStart) / 1000);
|
|
87
|
+
if (elapsed >= 3) {
|
|
88
|
+
bar.update({ phase: `${lastPhaseLabel} (${elapsed}s)` });
|
|
89
|
+
}
|
|
90
|
+
}, 1000);
|
|
73
91
|
const t0Global = Date.now();
|
|
74
92
|
// ── Cache embeddings from existing index before rebuild ────────────
|
|
75
93
|
let cachedEmbeddingNodeIds = new Set();
|
|
76
94
|
let cachedEmbeddings = [];
|
|
77
|
-
if (existingMeta && !options?.force) {
|
|
95
|
+
if (options?.embeddings && existingMeta && !options?.force) {
|
|
78
96
|
try {
|
|
79
97
|
bar.update(0, { phase: 'Caching embeddings...' });
|
|
80
98
|
await initKuzu(kuzuPath);
|
|
@@ -94,10 +112,16 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
94
112
|
const pipelineResult = await runPipelineFromRepo(repoPath, (progress) => {
|
|
95
113
|
const phaseLabel = PHASE_LABELS[progress.phase] || progress.phase;
|
|
96
114
|
const scaled = Math.round(progress.percent * 0.6);
|
|
115
|
+
if (phaseLabel !== lastPhaseLabel) {
|
|
116
|
+
lastPhaseLabel = phaseLabel;
|
|
117
|
+
phaseStart = Date.now();
|
|
118
|
+
}
|
|
97
119
|
bar.update(scaled, { phase: phaseLabel });
|
|
98
120
|
});
|
|
99
121
|
// ── Phase 2: KuzuDB (60–85%) ──────────────────────────────────────
|
|
100
|
-
|
|
122
|
+
lastPhaseLabel = 'Loading into KuzuDB...';
|
|
123
|
+
phaseStart = Date.now();
|
|
124
|
+
bar.update(60, { phase: lastPhaseLabel });
|
|
101
125
|
await closeKuzu();
|
|
102
126
|
const kuzuFiles = [kuzuPath, `${kuzuPath}.wal`, `${kuzuPath}.lock`];
|
|
103
127
|
for (const f of kuzuFiles) {
|
|
@@ -117,7 +141,9 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
117
141
|
const kuzuTime = ((Date.now() - t0Kuzu) / 1000).toFixed(1);
|
|
118
142
|
const kuzuWarnings = kuzuResult.warnings;
|
|
119
143
|
// ── Phase 3: FTS (85–90%) ─────────────────────────────────────────
|
|
120
|
-
|
|
144
|
+
lastPhaseLabel = 'Creating search indexes...';
|
|
145
|
+
phaseStart = Date.now();
|
|
146
|
+
bar.update(85, { phase: lastPhaseLabel });
|
|
121
147
|
const t0Fts = Date.now();
|
|
122
148
|
try {
|
|
123
149
|
await createFTSIndex('File', 'file_fts', ['name', 'content']);
|
|
@@ -146,22 +172,28 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
146
172
|
// ── Phase 4: Embeddings (90–98%) ──────────────────────────────────
|
|
147
173
|
const stats = await getKuzuStats();
|
|
148
174
|
let embeddingTime = '0.0';
|
|
149
|
-
let embeddingSkipped =
|
|
150
|
-
let embeddingSkipReason = '';
|
|
151
|
-
if (options?.
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
175
|
+
let embeddingSkipped = true;
|
|
176
|
+
let embeddingSkipReason = 'off (use --embeddings to enable)';
|
|
177
|
+
if (options?.embeddings) {
|
|
178
|
+
if (stats.nodes > EMBEDDING_NODE_LIMIT) {
|
|
179
|
+
embeddingSkipReason = `skipped (${stats.nodes.toLocaleString()} nodes > ${EMBEDDING_NODE_LIMIT.toLocaleString()} limit)`;
|
|
180
|
+
}
|
|
181
|
+
else {
|
|
182
|
+
embeddingSkipped = false;
|
|
183
|
+
}
|
|
158
184
|
}
|
|
159
185
|
if (!embeddingSkipped) {
|
|
160
|
-
|
|
186
|
+
lastPhaseLabel = 'Loading embedding model...';
|
|
187
|
+
phaseStart = Date.now();
|
|
188
|
+
bar.update(90, { phase: lastPhaseLabel });
|
|
161
189
|
const t0Emb = Date.now();
|
|
162
190
|
await runEmbeddingPipeline(executeQuery, executeWithReusedStatement, (progress) => {
|
|
163
191
|
const scaled = 90 + Math.round((progress.percent / 100) * 8);
|
|
164
192
|
const label = progress.phase === 'loading-model' ? 'Loading embedding model...' : `Embedding ${progress.nodesProcessed || 0}/${progress.totalNodes || '?'}`;
|
|
193
|
+
if (label !== lastPhaseLabel) {
|
|
194
|
+
lastPhaseLabel = label;
|
|
195
|
+
phaseStart = Date.now();
|
|
196
|
+
}
|
|
165
197
|
bar.update(scaled, { phase: label });
|
|
166
198
|
}, {}, cachedEmbeddingNodeIds.size > 0 ? cachedEmbeddingNodeIds : undefined);
|
|
167
199
|
embeddingTime = ((Date.now() - t0Emb) / 1000).toFixed(1);
|
|
@@ -203,8 +235,14 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
203
235
|
processes: pipelineResult.processResult?.stats.totalProcesses,
|
|
204
236
|
});
|
|
205
237
|
await closeKuzu();
|
|
206
|
-
|
|
238
|
+
// Note: we intentionally do NOT call disposeEmbedder() here.
|
|
239
|
+
// ONNX Runtime's native cleanup segfaults on macOS and some Linux configs.
|
|
240
|
+
// Since the process exits immediately after, Node.js reclaims everything.
|
|
207
241
|
const totalTime = ((Date.now() - t0Global) / 1000).toFixed(1);
|
|
242
|
+
clearInterval(elapsedTimer);
|
|
243
|
+
console.log = origLog;
|
|
244
|
+
console.warn = origWarn;
|
|
245
|
+
console.error = origError;
|
|
208
246
|
bar.update(100, { phase: 'Done' });
|
|
209
247
|
bar.stop();
|
|
210
248
|
// ── Summary ───────────────────────────────────────────────────────
|
|
@@ -233,4 +271,10 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
233
271
|
console.log('\n Tip: Run `gitnexus setup` to configure MCP for your editor.');
|
|
234
272
|
}
|
|
235
273
|
console.log('');
|
|
274
|
+
// ONNX Runtime registers native atexit hooks that segfault during process
|
|
275
|
+
// shutdown on macOS (#38) and some Linux configs (#40). Force-exit to
|
|
276
|
+
// bypass them when embeddings were loaded.
|
|
277
|
+
if (!embeddingSkipped) {
|
|
278
|
+
process.exit(0);
|
|
279
|
+
}
|
|
236
280
|
};
|
package/dist/cli/eval-server.js
CHANGED
|
@@ -261,7 +261,7 @@ export async function evalServerCommand(options) {
|
|
|
261
261
|
console.error('GitNexus eval-server: No indexed repositories found. Run: gitnexus analyze');
|
|
262
262
|
process.exit(1);
|
|
263
263
|
}
|
|
264
|
-
const repos = backend.listRepos();
|
|
264
|
+
const repos = await backend.listRepos();
|
|
265
265
|
console.error(`GitNexus eval-server: ${repos.length} repo(s) loaded: ${repos.map(r => r.name).join(', ')}`);
|
|
266
266
|
let idleTimer = null;
|
|
267
267
|
function resetIdleTimer() {
|
package/dist/cli/index.js
CHANGED
|
@@ -24,7 +24,7 @@ program
|
|
|
24
24
|
.command('analyze [path]')
|
|
25
25
|
.description('Index a repository (full analysis)')
|
|
26
26
|
.option('-f, --force', 'Force full re-index even if up to date')
|
|
27
|
-
.option('--
|
|
27
|
+
.option('--embeddings', 'Enable embedding generation for semantic search (off by default)')
|
|
28
28
|
.action(analyzeCommand);
|
|
29
29
|
program
|
|
30
30
|
.command('serve')
|
package/dist/cli/mcp.js
CHANGED
|
@@ -38,7 +38,7 @@ export const mcpCommand = async () => {
|
|
|
38
38
|
console.error('GitNexus: Failed to initialize backend from registry.');
|
|
39
39
|
process.exit(1);
|
|
40
40
|
}
|
|
41
|
-
const repoNames = backend.listRepos().map(r => r.name);
|
|
41
|
+
const repoNames = (await backend.listRepos()).map(r => r.name);
|
|
42
42
|
console.error(`GitNexus: MCP server starting with ${repoNames.length} repo(s): ${repoNames.join(', ')}`);
|
|
43
43
|
// Start MCP server (serves all repos)
|
|
44
44
|
await startMCPServer(backend);
|
package/dist/core/augmentation/engine.js
CHANGED
|
@@ -98,11 +98,11 @@ export async function augment(pattern, cwd) {
|
|
|
98
98
|
for (const result of bm25Results.slice(0, 5)) {
|
|
99
99
|
const escaped = result.filePath.replace(/'/g, "''");
|
|
100
100
|
try {
|
|
101
|
-
const symbols = await executeQuery(repoId, `
|
|
102
|
-
MATCH (n) WHERE n.filePath = '${escaped}'
|
|
103
|
-
AND n.name CONTAINS '${pattern.replace(/'/g, "''").split(/\s+/)[0]}'
|
|
104
|
-
RETURN n.id AS id, n.name AS name, labels(n)[0] AS type, n.filePath AS filePath
|
|
105
|
-
LIMIT 3
|
|
101
|
+
const symbols = await executeQuery(repoId, `
|
|
102
|
+
MATCH (n) WHERE n.filePath = '${escaped}'
|
|
103
|
+
AND n.name CONTAINS '${pattern.replace(/'/g, "''").split(/\s+/)[0]}'
|
|
104
|
+
RETURN n.id AS id, n.name AS name, labels(n)[0] AS type, n.filePath AS filePath
|
|
105
|
+
LIMIT 3
|
|
106
106
|
`);
|
|
107
107
|
for (const sym of symbols) {
|
|
108
108
|
symbolMatches.push({
|
|
@@ -130,10 +130,10 @@ export async function augment(pattern, cwd) {
|
|
|
130
130
|
// Callers
|
|
131
131
|
let callers = [];
|
|
132
132
|
try {
|
|
133
|
-
const rows = await executeQuery(repoId, `
|
|
134
|
-
MATCH (caller)-[:CodeRelation {type: 'CALLS'}]->(n {id: '${escaped}'})
|
|
135
|
-
RETURN caller.name AS name
|
|
136
|
-
LIMIT 3
|
|
133
|
+
const rows = await executeQuery(repoId, `
|
|
134
|
+
MATCH (caller)-[:CodeRelation {type: 'CALLS'}]->(n {id: '${escaped}'})
|
|
135
|
+
RETURN caller.name AS name
|
|
136
|
+
LIMIT 3
|
|
137
137
|
`);
|
|
138
138
|
callers = rows.map((r) => r.name || r[0]).filter(Boolean);
|
|
139
139
|
}
|
|
@@ -141,10 +141,10 @@ export async function augment(pattern, cwd) {
|
|
|
141
141
|
// Callees
|
|
142
142
|
let callees = [];
|
|
143
143
|
try {
|
|
144
|
-
const rows = await executeQuery(repoId, `
|
|
145
|
-
MATCH (n {id: '${escaped}'})-[:CodeRelation {type: 'CALLS'}]->(callee)
|
|
146
|
-
RETURN callee.name AS name
|
|
147
|
-
LIMIT 3
|
|
144
|
+
const rows = await executeQuery(repoId, `
|
|
145
|
+
MATCH (n {id: '${escaped}'})-[:CodeRelation {type: 'CALLS'}]->(callee)
|
|
146
|
+
RETURN callee.name AS name
|
|
147
|
+
LIMIT 3
|
|
148
148
|
`);
|
|
149
149
|
callees = rows.map((r) => r.name || r[0]).filter(Boolean);
|
|
150
150
|
}
|
|
@@ -152,9 +152,9 @@ export async function augment(pattern, cwd) {
|
|
|
152
152
|
// Processes
|
|
153
153
|
let processes = [];
|
|
154
154
|
try {
|
|
155
|
-
const rows = await executeQuery(repoId, `
|
|
156
|
-
MATCH (n {id: '${escaped}'})-[r:CodeRelation {type: 'STEP_IN_PROCESS'}]->(p:Process)
|
|
157
|
-
RETURN p.heuristicLabel AS label, r.step AS step, p.stepCount AS stepCount
|
|
155
|
+
const rows = await executeQuery(repoId, `
|
|
156
|
+
MATCH (n {id: '${escaped}'})-[r:CodeRelation {type: 'STEP_IN_PROCESS'}]->(p:Process)
|
|
157
|
+
RETURN p.heuristicLabel AS label, r.step AS step, p.stepCount AS stepCount
|
|
158
158
|
`);
|
|
159
159
|
processes = rows.map((r) => {
|
|
160
160
|
const label = r.label || r[0];
|
|
@@ -167,10 +167,10 @@ export async function augment(pattern, cwd) {
|
|
|
167
167
|
// Cluster cohesion (internal ranking signal)
|
|
168
168
|
let cohesion = 0;
|
|
169
169
|
try {
|
|
170
|
-
const rows = await executeQuery(repoId, `
|
|
171
|
-
MATCH (n {id: '${escaped}'})-[:CodeRelation {type: 'MEMBER_OF'}]->(c:Community)
|
|
172
|
-
RETURN c.cohesion AS cohesion
|
|
173
|
-
LIMIT 1
|
|
170
|
+
const rows = await executeQuery(repoId, `
|
|
171
|
+
MATCH (n {id: '${escaped}'})-[:CodeRelation {type: 'MEMBER_OF'}]->(c:Community)
|
|
172
|
+
RETURN c.cohesion AS cohesion
|
|
173
|
+
LIMIT 1
|
|
174
174
|
`);
|
|
175
175
|
if (rows.length > 0) {
|
|
176
176
|
cohesion = (rows[0].cohesion ?? rows[0][0]) || 0;
|
package/dist/core/embeddings/embedder.js
CHANGED
|
@@ -6,6 +6,12 @@
|
|
|
6
6
|
*
|
|
7
7
|
* Uses snowflake-arctic-embed-xs by default (22M params, 384 dims, ~90MB)
|
|
8
8
|
*/
|
|
9
|
+
// Suppress ONNX Runtime native warnings (e.g. VerifyEachNodeIsAssignedToAnEp)
|
|
10
|
+
// Must be set BEFORE onnxruntime-node is imported by transformers.js
|
|
11
|
+
// Level 3 = Error only (skips Warning/Info)
|
|
12
|
+
if (!process.env.ORT_LOG_LEVEL) {
|
|
13
|
+
process.env.ORT_LOG_LEVEL = '3';
|
|
14
|
+
}
|
|
9
15
|
import { pipeline, env } from '@huggingface/transformers';
|
|
10
16
|
import { DEFAULT_EMBEDDING_CONFIG } from './types.js';
|
|
11
17
|
// Module-level state for singleton pattern
|
|
@@ -83,6 +89,7 @@ export const initEmbedder = async (onProgress, config = {}, forceDevice) => {
|
|
|
83
89
|
device: device,
|
|
84
90
|
dtype: 'fp32',
|
|
85
91
|
progress_callback: progressCallback,
|
|
92
|
+
session_options: { logSeverityLevel: 3 },
|
|
86
93
|
});
|
|
87
94
|
currentDevice = device;
|
|
88
95
|
if (isDev) {
|
package/dist/core/embeddings/embedding-pipeline.js
CHANGED
|
@@ -24,19 +24,19 @@ const queryEmbeddableNodes = async (executeQuery) => {
|
|
|
24
24
|
let query;
|
|
25
25
|
if (label === 'File') {
|
|
26
26
|
// File nodes don't have startLine/endLine
|
|
27
|
-
query = `
|
|
28
|
-
MATCH (n:File)
|
|
29
|
-
RETURN n.id AS id, n.name AS name, 'File' AS label,
|
|
30
|
-
n.filePath AS filePath, n.content AS content
|
|
27
|
+
query = `
|
|
28
|
+
MATCH (n:File)
|
|
29
|
+
RETURN n.id AS id, n.name AS name, 'File' AS label,
|
|
30
|
+
n.filePath AS filePath, n.content AS content
|
|
31
31
|
`;
|
|
32
32
|
}
|
|
33
33
|
else {
|
|
34
34
|
// Code elements have startLine/endLine
|
|
35
|
-
query = `
|
|
36
|
-
MATCH (n:${label})
|
|
37
|
-
RETURN n.id AS id, n.name AS name, '${label}' AS label,
|
|
38
|
-
n.filePath AS filePath, n.content AS content,
|
|
39
|
-
n.startLine AS startLine, n.endLine AS endLine
|
|
35
|
+
query = `
|
|
36
|
+
MATCH (n:${label})
|
|
37
|
+
RETURN n.id AS id, n.name AS name, '${label}' AS label,
|
|
38
|
+
n.filePath AS filePath, n.content AS content,
|
|
39
|
+
n.startLine AS startLine, n.endLine AS endLine
|
|
40
40
|
`;
|
|
41
41
|
}
|
|
42
42
|
const rows = await executeQuery(query);
|
|
@@ -77,8 +77,8 @@ const batchInsertEmbeddings = async (executeWithReusedStatement, updates) => {
|
|
|
77
77
|
* Now indexes the separate CodeEmbedding table
|
|
78
78
|
*/
|
|
79
79
|
const createVectorIndex = async (executeQuery) => {
|
|
80
|
-
const cypher = `
|
|
81
|
-
CALL CREATE_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx', 'embedding', metric := 'cosine')
|
|
80
|
+
const cypher = `
|
|
81
|
+
CALL CREATE_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx', 'embedding', metric := 'cosine')
|
|
82
82
|
`;
|
|
83
83
|
try {
|
|
84
84
|
await executeQuery(cypher);
|
|
@@ -240,14 +240,14 @@ export const semanticSearch = async (executeQuery, query, k = 10, maxDistance =
|
|
|
240
240
|
const queryVec = embeddingToArray(queryEmbedding);
|
|
241
241
|
const queryVecStr = `[${queryVec.join(',')}]`;
|
|
242
242
|
// Query the vector index on CodeEmbedding to get nodeIds and distances
|
|
243
|
-
const vectorQuery = `
|
|
244
|
-
CALL QUERY_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx',
|
|
245
|
-
CAST(${queryVecStr} AS FLOAT[384]), ${k})
|
|
246
|
-
YIELD node AS emb, distance
|
|
247
|
-
WITH emb, distance
|
|
248
|
-
WHERE distance < ${maxDistance}
|
|
249
|
-
RETURN emb.nodeId AS nodeId, distance
|
|
250
|
-
ORDER BY distance
|
|
243
|
+
const vectorQuery = `
|
|
244
|
+
CALL QUERY_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx',
|
|
245
|
+
CAST(${queryVecStr} AS FLOAT[384]), ${k})
|
|
246
|
+
YIELD node AS emb, distance
|
|
247
|
+
WITH emb, distance
|
|
248
|
+
WHERE distance < ${maxDistance}
|
|
249
|
+
RETURN emb.nodeId AS nodeId, distance
|
|
250
|
+
ORDER BY distance
|
|
251
251
|
`;
|
|
252
252
|
const embResults = await executeQuery(vectorQuery);
|
|
253
253
|
if (embResults.length === 0) {
|
|
@@ -266,16 +266,16 @@ export const semanticSearch = async (executeQuery, query, k = 10, maxDistance =
|
|
|
266
266
|
try {
|
|
267
267
|
let nodeQuery;
|
|
268
268
|
if (label === 'File') {
|
|
269
|
-
nodeQuery = `
|
|
270
|
-
MATCH (n:File {id: '${nodeId.replace(/'/g, "''")}'})
|
|
271
|
-
RETURN n.name AS name, n.filePath AS filePath
|
|
269
|
+
nodeQuery = `
|
|
270
|
+
MATCH (n:File {id: '${nodeId.replace(/'/g, "''")}'})
|
|
271
|
+
RETURN n.name AS name, n.filePath AS filePath
|
|
272
272
|
`;
|
|
273
273
|
}
|
|
274
274
|
else {
|
|
275
|
-
nodeQuery = `
|
|
276
|
-
MATCH (n:${label} {id: '${nodeId.replace(/'/g, "''")}'})
|
|
277
|
-
RETURN n.name AS name, n.filePath AS filePath,
|
|
278
|
-
n.startLine AS startLine, n.endLine AS endLine
|
|
275
|
+
nodeQuery = `
|
|
276
|
+
MATCH (n:${label} {id: '${nodeId.replace(/'/g, "''")}'})
|
|
277
|
+
RETURN n.name AS name, n.filePath AS filePath,
|
|
278
|
+
n.startLine AS startLine, n.endLine AS endLine
|
|
279
279
|
`;
|
|
280
280
|
}
|
|
281
281
|
const nodeRows = await executeQuery(nodeQuery);
|
package/dist/core/ingestion/cluster-enricher.js
CHANGED
|
@@ -13,12 +13,12 @@ const buildEnrichmentPrompt = (members, heuristicLabel) => {
|
|
|
13
13
|
const memberList = limitedMembers
|
|
14
14
|
.map(m => `${m.name} (${m.type})`)
|
|
15
15
|
.join(', ');
|
|
16
|
-
return `Analyze this code cluster and provide a semantic name and short description.
|
|
17
|
-
|
|
18
|
-
Heuristic: "${heuristicLabel}"
|
|
19
|
-
Members: ${memberList}${members.length > 20 ? ` (+${members.length - 20} more)` : ''}
|
|
20
|
-
|
|
21
|
-
Reply with JSON only:
|
|
16
|
+
return `Analyze this code cluster and provide a semantic name and short description.
|
|
17
|
+
|
|
18
|
+
Heuristic: "${heuristicLabel}"
|
|
19
|
+
Members: ${memberList}${members.length > 20 ? ` (+${members.length - 20} more)` : ''}
|
|
20
|
+
|
|
21
|
+
Reply with JSON only:
|
|
22
22
|
{"name": "2-4 word semantic name", "description": "One sentence describing purpose"}`;
|
|
23
23
|
};
|
|
24
24
|
// ============================================================================
|
|
@@ -115,18 +115,18 @@ export const enrichClustersBatch = async (communities, memberMap, llmClient, bat
|
|
|
115
115
|
const memberList = limitedMembers
|
|
116
116
|
.map(m => `${m.name} (${m.type})`)
|
|
117
117
|
.join(', ');
|
|
118
|
-
return `Cluster ${idx + 1} (id: ${community.id}):
|
|
119
|
-
Heuristic: "${community.heuristicLabel}"
|
|
118
|
+
return `Cluster ${idx + 1} (id: ${community.id}):
|
|
119
|
+
Heuristic: "${community.heuristicLabel}"
|
|
120
120
|
Members: ${memberList}`;
|
|
121
121
|
}).join('\n\n');
|
|
122
|
-
const prompt = `Analyze these code clusters and generate semantic names, keywords, and descriptions.
|
|
123
|
-
|
|
124
|
-
${batchPrompt}
|
|
125
|
-
|
|
126
|
-
Output JSON array:
|
|
127
|
-
[
|
|
128
|
-
{"id": "comm_X", "name": "...", "keywords": [...], "description": "..."},
|
|
129
|
-
...
|
|
122
|
+
const prompt = `Analyze these code clusters and generate semantic names, keywords, and descriptions.
|
|
123
|
+
|
|
124
|
+
${batchPrompt}
|
|
125
|
+
|
|
126
|
+
Output JSON array:
|
|
127
|
+
[
|
|
128
|
+
{"id": "comm_X", "name": "...", "keywords": [...], "description": "..."},
|
|
129
|
+
...
|
|
130
130
|
]`;
|
|
131
131
|
try {
|
|
132
132
|
const response = await llmClient.generate(prompt);
|
package/dist/core/ingestion/filesystem-walker.js
CHANGED
|
@@ -3,6 +3,8 @@ import path from 'path';
|
|
|
3
3
|
import { glob } from 'glob';
|
|
4
4
|
import { shouldIgnorePath } from '../../config/ignore-service.js';
|
|
5
5
|
const READ_CONCURRENCY = 32;
|
|
6
|
+
/** Skip files larger than 512KB — they're usually generated/vendored and crash tree-sitter */
|
|
7
|
+
const MAX_FILE_SIZE = 512 * 1024;
|
|
6
8
|
export const walkRepository = async (repoPath, onProgress) => {
|
|
7
9
|
const files = await glob('**/*', {
|
|
8
10
|
cwd: repoPath,
|
|
@@ -12,13 +14,22 @@ export const walkRepository = async (repoPath, onProgress) => {
|
|
|
12
14
|
const filtered = files.filter(file => !shouldIgnorePath(file));
|
|
13
15
|
const entries = [];
|
|
14
16
|
let processed = 0;
|
|
17
|
+
let skippedLarge = 0;
|
|
15
18
|
for (let start = 0; start < filtered.length; start += READ_CONCURRENCY) {
|
|
16
19
|
const batch = filtered.slice(start, start + READ_CONCURRENCY);
|
|
17
|
-
const results = await Promise.allSettled(batch.map(
|
|
18
|
-
|
|
20
|
+
const results = await Promise.allSettled(batch.map(async (relativePath) => {
|
|
21
|
+
const fullPath = path.join(repoPath, relativePath);
|
|
22
|
+
const stat = await fs.stat(fullPath);
|
|
23
|
+
if (stat.size > MAX_FILE_SIZE) {
|
|
24
|
+
skippedLarge++;
|
|
25
|
+
return null;
|
|
26
|
+
}
|
|
27
|
+
const content = await fs.readFile(fullPath, 'utf-8');
|
|
28
|
+
return { path: relativePath.replace(/\\/g, '/'), content };
|
|
29
|
+
}));
|
|
19
30
|
for (const result of results) {
|
|
20
31
|
processed++;
|
|
21
|
-
if (result.status === 'fulfilled') {
|
|
32
|
+
if (result.status === 'fulfilled' && result.value !== null) {
|
|
22
33
|
entries.push(result.value);
|
|
23
34
|
onProgress?.(processed, filtered.length, result.value.path);
|
|
24
35
|
}
|
|
@@ -27,5 +38,8 @@ export const walkRepository = async (repoPath, onProgress) => {
|
|
|
27
38
|
}
|
|
28
39
|
}
|
|
29
40
|
}
|
|
41
|
+
if (skippedLarge > 0) {
|
|
42
|
+
console.warn(` Skipped ${skippedLarge} files larger than ${MAX_FILE_SIZE / 1024}KB`);
|
|
43
|
+
}
|
|
30
44
|
return entries;
|
|
31
45
|
};
|
package/dist/core/ingestion/parsing-processor.js
CHANGED
|
@@ -158,6 +158,9 @@ const processParsingSequential = async (graph, files, symbolTable, astCache, onF
|
|
|
158
158
|
const language = getLanguageFromFilename(file.path);
|
|
159
159
|
if (!language)
|
|
160
160
|
continue;
|
|
161
|
+
// Skip very large files — they can crash tree-sitter or cause OOM
|
|
162
|
+
if (file.content.length > 512 * 1024)
|
|
163
|
+
continue;
|
|
161
164
|
await loadLanguage(language, file.path);
|
|
162
165
|
let tree;
|
|
163
166
|
try {
|
|
@@ -281,7 +284,7 @@ export const processParsing = async (graph, files, symbolTable, astCache, onFile
|
|
|
281
284
|
return await processParsingWithWorkers(graph, files, symbolTable, astCache, workerPool, onFileProgress);
|
|
282
285
|
}
|
|
283
286
|
catch (err) {
|
|
284
|
-
console.warn('Worker pool parsing failed, falling back to sequential:', err);
|
|
287
|
+
console.warn('Worker pool parsing failed, falling back to sequential:', err instanceof Error ? err.message : err);
|
|
285
288
|
}
|
|
286
289
|
}
|
|
287
290
|
// Fallback: sequential parsing (no pre-extracted data)
|
package/dist/core/ingestion/workers/parse-worker.js
CHANGED
|
@@ -328,6 +328,9 @@ const processFileGroup = (files, language, queryString, result, onFileProcessed)
|
|
|
328
328
|
return;
|
|
329
329
|
}
|
|
330
330
|
for (const file of files) {
|
|
331
|
+
// Skip very large files — they can crash tree-sitter or cause OOM
|
|
332
|
+
if (file.content.length > 512 * 1024)
|
|
333
|
+
continue;
|
|
331
334
|
let tree;
|
|
332
335
|
try {
|
|
333
336
|
tree = parser.parse(file.content, undefined, { bufferSize: 1024 * 256 });
|
|
@@ -444,8 +447,14 @@ const processFileGroup = (files, language, queryString, result, onFileProcessed)
|
|
|
444
447
|
// Worker message handler
|
|
445
448
|
// ============================================================================
|
|
446
449
|
parentPort.on('message', (files) => {
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
450
|
+
try {
|
|
451
|
+
const result = processBatch(files, (filesProcessed) => {
|
|
452
|
+
parentPort.postMessage({ type: 'progress', filesProcessed });
|
|
453
|
+
});
|
|
454
|
+
parentPort.postMessage({ type: 'result', data: result });
|
|
455
|
+
}
|
|
456
|
+
catch (err) {
|
|
457
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
458
|
+
parentPort.postMessage({ type: 'error', error: message });
|
|
459
|
+
}
|
|
451
460
|
});
|
package/dist/core/ingestion/workers/worker-pool.js
CHANGED
|
@@ -27,31 +27,65 @@ export const createWorkerPool = (workerUrl, poolSize) => {
|
|
|
27
27
|
const promises = chunks.map((chunk, i) => {
|
|
28
28
|
const worker = workers[i];
|
|
29
29
|
return new Promise((resolve, reject) => {
|
|
30
|
+
let settled = false;
|
|
31
|
+
const cleanup = () => {
|
|
32
|
+
clearTimeout(timer);
|
|
33
|
+
worker.removeListener('message', handler);
|
|
34
|
+
worker.removeListener('error', errorHandler);
|
|
35
|
+
worker.removeListener('exit', exitHandler);
|
|
36
|
+
};
|
|
37
|
+
const timer = setTimeout(() => {
|
|
38
|
+
if (!settled) {
|
|
39
|
+
settled = true;
|
|
40
|
+
cleanup();
|
|
41
|
+
reject(new Error(`Worker ${i} timed out after 5 minutes (chunk: ${chunk.length} items). Worker may have crashed or is processing too much data.`));
|
|
42
|
+
}
|
|
43
|
+
}, 5 * 60 * 1000);
|
|
30
44
|
const handler = (msg) => {
|
|
45
|
+
if (settled)
|
|
46
|
+
return;
|
|
31
47
|
if (msg && msg.type === 'progress') {
|
|
32
|
-
// Intermediate progress from worker
|
|
33
48
|
workerProgress[i] = msg.filesProcessed;
|
|
34
49
|
if (onProgress) {
|
|
35
50
|
const total = workerProgress.reduce((a, b) => a + b, 0);
|
|
36
51
|
onProgress(total);
|
|
37
52
|
}
|
|
38
53
|
}
|
|
54
|
+
else if (msg && msg.type === 'error') {
|
|
55
|
+
// Error reported by worker via postMessage
|
|
56
|
+
settled = true;
|
|
57
|
+
cleanup();
|
|
58
|
+
reject(new Error(`Worker ${i} error: ${msg.error}`));
|
|
59
|
+
}
|
|
39
60
|
else if (msg && msg.type === 'result') {
|
|
40
|
-
|
|
41
|
-
|
|
61
|
+
settled = true;
|
|
62
|
+
cleanup();
|
|
42
63
|
resolve(msg.data);
|
|
43
64
|
}
|
|
44
65
|
else {
|
|
45
|
-
// Legacy: treat any non-typed message as result
|
|
46
|
-
|
|
66
|
+
// Legacy: treat any non-typed message as result
|
|
67
|
+
settled = true;
|
|
68
|
+
cleanup();
|
|
47
69
|
resolve(msg);
|
|
48
70
|
}
|
|
49
71
|
};
|
|
72
|
+
const errorHandler = (err) => {
|
|
73
|
+
if (!settled) {
|
|
74
|
+
settled = true;
|
|
75
|
+
cleanup();
|
|
76
|
+
reject(err);
|
|
77
|
+
}
|
|
78
|
+
};
|
|
79
|
+
const exitHandler = (code) => {
|
|
80
|
+
if (!settled) {
|
|
81
|
+
settled = true;
|
|
82
|
+
cleanup();
|
|
83
|
+
reject(new Error(`Worker ${i} exited unexpectedly with code ${code}. This usually indicates an out-of-memory crash or native addon failure.`));
|
|
84
|
+
}
|
|
85
|
+
};
|
|
50
86
|
worker.on('message', handler);
|
|
51
|
-
worker.once('error',
|
|
52
|
-
|
|
53
|
-
reject(err);
|
|
54
|
-
});
|
|
87
|
+
worker.once('error', errorHandler);
|
|
88
|
+
worker.once('exit', exitHandler);
|
|
55
89
|
worker.postMessage(chunk);
|
|
56
90
|
});
|
|
57
91
|
});
|
package/dist/core/kuzu/kuzu-adapter.js
CHANGED
|
@@ -242,10 +242,10 @@ const fallbackRelationshipInserts = async (validRelLines, validTables, getNodeLa
|
|
|
242
242
|
continue;
|
|
243
243
|
const confidence = parseFloat(confidenceStr) || 1.0;
|
|
244
244
|
const step = parseInt(stepStr) || 0;
|
|
245
|
-
await conn.query(`
|
|
246
|
-
MATCH (a:${escapeLabel(fromLabel)} {id: '${fromId.replace(/'/g, "''")}' }),
|
|
247
|
-
(b:${escapeLabel(toLabel)} {id: '${toId.replace(/'/g, "''")}' })
|
|
248
|
-
CREATE (a)-[:${REL_TABLE_NAME} {type: '${relType}', confidence: ${confidence}, reason: '${reason.replace(/'/g, "''")}', step: ${step}}]->(b)
|
|
245
|
+
await conn.query(`
|
|
246
|
+
MATCH (a:${escapeLabel(fromLabel)} {id: '${fromId.replace(/'/g, "''")}' }),
|
|
247
|
+
(b:${escapeLabel(toLabel)} {id: '${toId.replace(/'/g, "''")}' })
|
|
248
|
+
CREATE (a)-[:${REL_TABLE_NAME} {type: '${relType}', confidence: ${confidence}, reason: '${reason.replace(/'/g, "''")}', step: ${step}}]->(b)
|
|
249
249
|
`);
|
|
250
250
|
}
|
|
251
251
|
catch {
|
|
@@ -636,11 +636,11 @@ export const queryFTS = async (tableName, indexName, query, limit = 20, conjunct
|
|
|
636
636
|
}
|
|
637
637
|
// Escape single quotes in query
|
|
638
638
|
const escapedQuery = query.replace(/'/g, "''");
|
|
639
|
-
const cypher = `
|
|
640
|
-
CALL QUERY_FTS_INDEX('${tableName}', '${indexName}', '${escapedQuery}', conjunctive := ${conjunctive})
|
|
641
|
-
RETURN node, score
|
|
642
|
-
ORDER BY score DESC
|
|
643
|
-
LIMIT ${limit}
|
|
639
|
+
const cypher = `
|
|
640
|
+
CALL QUERY_FTS_INDEX('${tableName}', '${indexName}', '${escapedQuery}', conjunctive := ${conjunctive})
|
|
641
|
+
RETURN node, score
|
|
642
|
+
ORDER BY score DESC
|
|
643
|
+
LIMIT ${limit}
|
|
644
644
|
`;
|
|
645
645
|
try {
|
|
646
646
|
const queryResult = await conn.query(cypher);
|