@zuvia-software-solutions/code-mapper 2.3.1 → 2.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/analyze.js +105 -22
- package/dist/cli/index.js +0 -0
- package/dist/core/ingestion/filesystem-walker.js +6 -3
- package/dist/core/ingestion/parsing-processor.js +7 -3
- package/dist/core/ingestion/pipeline.js +27 -23
- package/dist/core/ingestion/workers/parse-worker.js +1 -1
- package/dist/core/ingestion/workers/worker-pool.d.ts +8 -3
- package/dist/core/ingestion/workers/worker-pool.js +68 -12
- package/models/mlx-embedder.py +70 -15
- package/package.json +1 -1
package/dist/cli/analyze.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
// code-mapper/src/cli/analyze.ts
|
|
2
2
|
/** @file analyze.ts @description Indexes a repository, builds the knowledge graph, and stores it in .code-mapper/ */
|
|
3
3
|
import path from 'path';
|
|
4
|
+
import os from 'os';
|
|
4
5
|
import { execFileSync } from 'child_process';
|
|
5
6
|
import v8 from 'v8';
|
|
6
7
|
import cliProgress from 'cli-progress';
|
|
@@ -93,7 +94,8 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
93
94
|
}
|
|
94
95
|
// Single progress bar for the entire pipeline
|
|
95
96
|
const bar = new cliProgress.SingleBar({
|
|
96
|
-
|
|
97
|
+
// \x1b[K at end clears to EOL so shorter redraws don't leave trailing characters
|
|
98
|
+
format: ' {bar} {percentage}% | {phase} | {resources}\x1b[K',
|
|
97
99
|
barCompleteChar: '\u2588',
|
|
98
100
|
barIncompleteChar: '\u2591',
|
|
99
101
|
hideCursor: true,
|
|
@@ -131,6 +133,37 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
131
133
|
console.log = barLog;
|
|
132
134
|
console.warn = barLog;
|
|
133
135
|
console.error = barLog;
|
|
136
|
+
const t0Global = Date.now();
|
|
137
|
+
const cpuStart = process.cpuUsage();
|
|
138
|
+
let peakRssMB = 0;
|
|
139
|
+
// Phase timing tracker — records wall time and RSS for each phase
|
|
140
|
+
const phaseTimes = [];
|
|
141
|
+
let currentPhaseName = 'init';
|
|
142
|
+
let currentPhaseStart = Date.now();
|
|
143
|
+
/**
 * Close the phase that is currently being timed and start timing `nextPhase`.
 *
 * Appends `{ name, ms, rssMB }` for the finished phase to `phaseTimes`; a phase
 * whose elapsed wall time is 0ms is not recorded. The RSS sample taken here is
 * also folded into `peakRssMB`, so the peak printed in the summary can never be
 * lower than a per-phase figure printed in the breakdown.
 *
 * @param {string} nextPhase - Name of the phase that starts now.
 */
const recordPhase = (nextPhase) => {
    const now = Date.now();
    const elapsed = now - currentPhaseStart;
    if (elapsed > 0) {
        const rssMB = Math.round(process.memoryUsage.rss() / (1024 * 1024));
        // Phase boundaries are often the memory high-water mark; the progress-bar
        // sampler alone can miss them
        if (rssMB > peakRssMB)
            peakRssMB = rssMB;
        phaseTimes.push({ name: currentPhaseName, ms: elapsed, rssMB });
    }
    currentPhaseName = nextPhase;
    currentPhaseStart = now;
};
|
|
156
|
+
// Live resource stats for the progress bar
|
|
157
|
+
const cpuCount = os.cpus().length;
|
|
158
|
+
/**
 * Build the text shown in the progress bar's {resources} slot.
 *
 * Samples the current RSS (raising `peakRssMB` when the sample is a new high)
 * and expresses the CPU time consumed since `cpuStart` as a percentage of the
 * wall time elapsed since `t0Global`.
 *
 * @returns {string} e.g. "512MB | CPU 180%"
 */
const getResourceStats = () => {
    const BYTES_PER_MB = 1024 * 1024;
    const rssMB = Math.round(process.memoryUsage.rss() / BYTES_PER_MB);
    peakRssMB = rssMB > peakRssMB ? rssMB : peakRssMB;
    const { user, system } = process.cpuUsage(cpuStart);
    // `|| 1` keeps the division below away from a zero denominator
    const wallMs = Date.now() - t0Global || 1;
    // cpuUsage reports microseconds; convert to milliseconds before dividing
    const cpuMs = (user + system) / 1e3;
    const cpuPct = Math.round(cpuMs / wallMs * 100);
    return `${rssMB}MB | CPU ${cpuPct}%`;
};
|
|
134
167
|
// Track elapsed time per phase — both updateBar and the interval use
|
|
135
168
|
// the same format so they don't flicker against each other
|
|
136
169
|
let lastPhaseLabel = 'Initializing...';
|
|
@@ -143,17 +176,16 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
143
176
|
}
|
|
144
177
|
const elapsed = Math.round((Date.now() - phaseStart) / 1000);
|
|
145
178
|
const display = elapsed >= 3 ? `${phaseLabel} (${elapsed}s)` : phaseLabel;
|
|
146
|
-
bar.update(value, { phase: display });
|
|
179
|
+
bar.update(value, { phase: display, resources: getResourceStats() });
|
|
147
180
|
};
|
|
148
181
|
// Tick elapsed seconds for phases with infrequent progress callbacks
|
|
149
182
|
// (e.g. CSV streaming, FTS indexing) — uses the same display format as updateBar
|
|
150
183
|
const elapsedTimer = setInterval(() => {
|
|
151
184
|
const elapsed = Math.round((Date.now() - phaseStart) / 1000);
|
|
152
185
|
if (elapsed >= 3) {
|
|
153
|
-
bar.update({ phase: `${lastPhaseLabel} (${elapsed}s)
|
|
186
|
+
bar.update({ phase: `${lastPhaseLabel} (${elapsed}s)`, resources: getResourceStats() });
|
|
154
187
|
}
|
|
155
188
|
}, 1000);
|
|
156
|
-
const t0Global = Date.now();
|
|
157
189
|
// Cache embeddings from existing index before rebuild
|
|
158
190
|
let cachedEmbeddingNodeIds = new Set();
|
|
159
191
|
let cachedEmbeddings = [];
|
|
@@ -180,15 +212,24 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
180
212
|
}
|
|
181
213
|
}
|
|
182
214
|
// Phase 1: Full Pipeline (0-60%)
|
|
215
|
+
let lastPipelinePhase = '';
|
|
183
216
|
const pipelineResult = await runPipelineFromRepo(repoPath, (progress) => {
|
|
184
|
-
|
|
217
|
+
if (progress.phase !== lastPipelinePhase) {
|
|
218
|
+
recordPhase(progress.phase);
|
|
219
|
+
lastPipelinePhase = progress.phase;
|
|
220
|
+
}
|
|
221
|
+
let phaseLabel = PHASE_LABELS[progress.phase] || progress.phase;
|
|
222
|
+
if (progress.stats && progress.stats.totalFiles > 0 &&
|
|
223
|
+
(progress.phase === 'parsing' || progress.phase === 'extracting')) {
|
|
224
|
+
phaseLabel += ` (${progress.stats.filesProcessed.toLocaleString()}/${progress.stats.totalFiles.toLocaleString()})`;
|
|
225
|
+
}
|
|
185
226
|
const scaled = Math.round(progress.percent * 0.6);
|
|
186
227
|
updateBar(scaled, phaseLabel);
|
|
187
228
|
}, options?.tsgo === false ? { tsgo: false } : {});
|
|
188
229
|
// Phase 2: SQLite (60-85%)
|
|
230
|
+
recordPhase('sqlite');
|
|
189
231
|
updateBar(60, 'Loading into database...');
|
|
190
232
|
// Reset the database (delete and recreate)
|
|
191
|
-
const t0Db = Date.now();
|
|
192
233
|
let db = resetDb(dbPath);
|
|
193
234
|
let dbMsgCount = 0;
|
|
194
235
|
const dbResult = loadGraphToDb(db, pipelineResult.graph, pipelineResult.repoPath, (msg) => {
|
|
@@ -196,20 +237,21 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
196
237
|
const progress = Math.min(84, 60 + Math.round((dbMsgCount / (dbMsgCount + 10)) * 24));
|
|
197
238
|
updateBar(progress, msg);
|
|
198
239
|
});
|
|
199
|
-
const dbTime = ((Date.now() - t0Db) / 1000).toFixed(1);
|
|
200
240
|
const dbWarnings = dbResult.warnings;
|
|
201
241
|
// Phase 2.5: HTTP route stitching (post-DB-load, needs content field)
|
|
242
|
+
recordPhase('routes');
|
|
202
243
|
stitchRoutes(db);
|
|
203
244
|
// Phase 2.6: Populate searchText for BM25 concept matching
|
|
204
245
|
// Uses first comment + callers + module — must run after edges are loaded
|
|
246
|
+
recordPhase('search-text');
|
|
205
247
|
updateBar(84, 'Building search index...');
|
|
206
248
|
populateSearchText(db);
|
|
207
249
|
// Phase 3: FTS (85-90%)
|
|
208
250
|
// FTS5 is auto-created by schema triggers — no manual index creation needed
|
|
251
|
+
recordPhase('fts');
|
|
209
252
|
updateBar(85, 'Search indexes ready');
|
|
210
|
-
const t0Fts = Date.now();
|
|
211
|
-
const ftsTime = ((Date.now() - t0Fts) / 1000).toFixed(1);
|
|
212
253
|
// Phase 3.5: Re-insert cached embeddings
|
|
254
|
+
recordPhase('restore-embeddings');
|
|
213
255
|
if (cachedEmbeddings.length > 0) {
|
|
214
256
|
updateBar(88, `Restoring ${cachedEmbeddings.length} cached embeddings...`);
|
|
215
257
|
const EMBED_BATCH = 200;
|
|
@@ -226,15 +268,9 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
226
268
|
}
|
|
227
269
|
// Phase 4: Embeddings (90-98%)
|
|
228
270
|
const stats = getStats(db);
|
|
229
|
-
let embeddingTime = '0.0';
|
|
230
|
-
let embeddingSkipped = true;
|
|
231
|
-
let embeddingSkipReason = 'off (use --no-embeddings to skip)';
|
|
232
271
|
if (options?.embeddings) {
|
|
233
|
-
|
|
234
|
-
}
|
|
235
|
-
if (!embeddingSkipped) {
|
|
272
|
+
recordPhase('embeddings');
|
|
236
273
|
updateBar(90, 'Generating embeddings...');
|
|
237
|
-
const t0Emb = Date.now();
|
|
238
274
|
// Close DB so Python can write to it
|
|
239
275
|
closeDb(dbPath);
|
|
240
276
|
// Run Python embedder in batch mode — reads from SQLite, embeds, writes back.
|
|
@@ -266,17 +302,24 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
266
302
|
continue;
|
|
267
303
|
try {
|
|
268
304
|
const msg = JSON.parse(line);
|
|
269
|
-
if (msg.phase === '
|
|
305
|
+
if (msg.phase === 'downloading' || msg.phase === 'converting') {
|
|
306
|
+
updateBar(90, msg.message);
|
|
307
|
+
}
|
|
308
|
+
else if (msg.phase === 'loaded') {
|
|
270
309
|
updateBar(91, `Model loaded (${msg.load_ms}ms)`);
|
|
271
310
|
}
|
|
272
311
|
else if (msg.phase === 'queried') {
|
|
273
|
-
updateBar(92, `Found ${msg.nodes} embeddable nodes`);
|
|
312
|
+
updateBar(92, `Found ${msg.nodes} embeddable nodes${msg.skipped_tests ? ` (${msg.skipped_tests} test files skipped)` : ''}`);
|
|
274
313
|
}
|
|
275
314
|
else if (msg.phase === 'prepared') {
|
|
276
315
|
updateBar(93, `${msg.to_embed} to embed, ${msg.skipped} cached`);
|
|
277
316
|
}
|
|
317
|
+
else if (msg.phase === 'embedding') {
|
|
318
|
+
const scaled = 93 + Math.round((msg.progress / 100) * 4);
|
|
319
|
+
updateBar(scaled, `Embedding... ${msg.progress}% (${msg.embedded} written)`);
|
|
320
|
+
}
|
|
278
321
|
else if (msg.phase === 'embedded') {
|
|
279
|
-
updateBar(
|
|
322
|
+
updateBar(97, `Embedded ${msg.count} nodes (${(msg.ms / 1000).toFixed(1)}s)`);
|
|
280
323
|
}
|
|
281
324
|
else if (msg.phase === 'done') {
|
|
282
325
|
updateBar(98, `Embeddings complete (${msg.embedded} new, ${msg.skipped} cached)`);
|
|
@@ -288,9 +331,9 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
288
331
|
});
|
|
289
332
|
// Reopen DB after Python is done
|
|
290
333
|
db = openDb(dbPath);
|
|
291
|
-
embeddingTime = ((Date.now() - t0Emb) / 1000).toFixed(1);
|
|
292
334
|
}
|
|
293
335
|
// Phase 5: Finalize (98-100%)
|
|
336
|
+
recordPhase('finalize');
|
|
294
337
|
updateBar(98, 'Saving metadata...');
|
|
295
338
|
// Count embeddings in the index (cached + newly generated) for metadata
|
|
296
339
|
const embeddingCount = countEmbeddings(db);
|
|
@@ -331,19 +374,26 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
331
374
|
...(processCount !== undefined ? { processes: processCount } : {}),
|
|
332
375
|
});
|
|
333
376
|
closeDb(dbPath);
|
|
377
|
+
recordPhase('done'); // close the last phase
|
|
334
378
|
const totalTime = ((Date.now() - t0Global) / 1000).toFixed(1);
|
|
335
379
|
clearInterval(elapsedTimer);
|
|
336
380
|
process.removeListener('SIGINT', sigintHandler);
|
|
337
381
|
console.log = origLog;
|
|
338
382
|
console.warn = origWarn;
|
|
339
383
|
console.error = origError;
|
|
340
|
-
bar.update(100, { phase: 'Done' });
|
|
384
|
+
bar.update(100, { phase: 'Done', resources: '' });
|
|
341
385
|
bar.stop();
|
|
386
|
+
// Clear any leftover characters from the progress bar line
|
|
387
|
+
process.stdout.write('\x1b[2K');
|
|
342
388
|
// Summary
|
|
343
389
|
const embeddingsCached = cachedEmbeddings.length > 0;
|
|
344
390
|
console.log(`\n Repository indexed successfully (${totalTime}s)${embeddingsCached ? ` [${cachedEmbeddings.length} embeddings cached]` : ''}\n`);
|
|
345
391
|
console.log(` ${stats.nodes.toLocaleString()} nodes | ${stats.edges.toLocaleString()} edges | ${pipelineResult.communityResult?.stats.totalCommunities || 0} clusters | ${pipelineResult.processResult?.stats.totalProcesses || 0} flows`);
|
|
346
|
-
|
|
392
|
+
// Resource usage
|
|
393
|
+
const cpuEnd = process.cpuUsage(cpuStart);
|
|
394
|
+
const wallMs = Date.now() - t0Global || 1;
|
|
395
|
+
const cpuPct = Math.round(((cpuEnd.user + cpuEnd.system) / 1e3) / wallMs * 100);
|
|
396
|
+
console.log(` Memory: peak ${peakRssMB}MB RSS | CPU: ${cpuPct}% (${cpuCount} cores)`);
|
|
347
397
|
console.log(` tsgo: ${pipelineResult.tsgoEnabled ? 'enabled (compiler-verified call resolution)' : 'disabled — install @typescript/native-preview for higher accuracy'}`);
|
|
348
398
|
console.log(` ${repoPath}`);
|
|
349
399
|
if (aiContext.files.length > 0) {
|
|
@@ -353,6 +403,39 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
353
403
|
if (dbWarnings.length > 0) {
|
|
354
404
|
console.log(` Note: ${dbWarnings.length} warnings during graph load`);
|
|
355
405
|
}
|
|
406
|
+
// Detailed performance breakdown
|
|
407
|
+
const totalMs = phaseTimes.reduce((s, p) => s + p.ms, 0) || 1;
|
|
408
|
+
const PHASE_DISPLAY_NAMES = {
|
|
409
|
+
init: 'Init',
|
|
410
|
+
extracting: 'Scanning files',
|
|
411
|
+
structure: 'Building structure',
|
|
412
|
+
parsing: 'Parsing & resolving',
|
|
413
|
+
imports: 'Resolving imports',
|
|
414
|
+
calls: 'Tracing calls',
|
|
415
|
+
heritage: 'Extracting inheritance',
|
|
416
|
+
communities: 'Detecting communities',
|
|
417
|
+
processes: 'Detecting processes',
|
|
418
|
+
enriching: 'Enriching clusters',
|
|
419
|
+
complete: 'Pipeline complete',
|
|
420
|
+
sqlite: 'SQLite load',
|
|
421
|
+
routes: 'Route stitching',
|
|
422
|
+
'search-text': 'Search text',
|
|
423
|
+
fts: 'FTS indexing',
|
|
424
|
+
'restore-embeddings': 'Restore embeddings',
|
|
425
|
+
embeddings: 'Embeddings (MLX)',
|
|
426
|
+
finalize: 'Finalize & context',
|
|
427
|
+
done: 'Done',
|
|
428
|
+
};
|
|
429
|
+
console.log('\n Phase breakdown:');
|
|
430
|
+
for (const phase of phaseTimes) {
|
|
431
|
+
const sec = (phase.ms / 1000).toFixed(1);
|
|
432
|
+
const pct = Math.round((phase.ms / totalMs) * 100);
|
|
433
|
+
const name = PHASE_DISPLAY_NAMES[phase.name] || phase.name;
|
|
434
|
+
const bar = pct >= 2 ? ' ' + '█'.repeat(Math.max(1, Math.round(pct / 3))) : '';
|
|
435
|
+
console.log(` ${name.padEnd(22)} ${sec.padStart(6)}s ${String(pct).padStart(3)}% ${phase.rssMB}MB${bar}`);
|
|
436
|
+
}
|
|
437
|
+
console.log(` ${'─'.repeat(50)}`);
|
|
438
|
+
console.log(` ${'Total'.padEnd(22)} ${totalTime.padStart(6)}s 100% ${peakRssMB}MB peak`);
|
|
356
439
|
try {
|
|
357
440
|
await fs.access(getGlobalRegistryPath());
|
|
358
441
|
}
|
package/dist/cli/index.js
CHANGED
|
File without changes
|
|
@@ -4,7 +4,10 @@ import fs from 'fs/promises';
|
|
|
4
4
|
import path from 'path';
|
|
5
5
|
import { glob } from 'glob';
|
|
6
6
|
import { createIgnoreFilter } from '../../config/ignore-service.js';
|
|
7
|
-
|
|
7
|
+
// Stat reads only file metadata (no file contents are loaded), so it can be highly concurrent
|
|
8
|
+
const STAT_CONCURRENCY = 256;
|
|
9
|
+
// File reads move actual data, keep bounded to avoid fd exhaustion
|
|
10
|
+
const READ_CONCURRENCY = 64;
|
|
8
11
|
/** Scan repository: stat files to get paths + sizes, no content loaded (~10MB for 100K files) */
|
|
9
12
|
export const walkRepositoryPaths = async (repoPath, onProgress) => {
|
|
10
13
|
const ignoreFilter = await createIgnoreFilter(repoPath);
|
|
@@ -16,8 +19,8 @@ export const walkRepositoryPaths = async (repoPath, onProgress) => {
|
|
|
16
19
|
});
|
|
17
20
|
const entries = [];
|
|
18
21
|
let processed = 0;
|
|
19
|
-
for (let start = 0; start < filtered.length; start +=
|
|
20
|
-
const batch = filtered.slice(start, start +
|
|
22
|
+
for (let start = 0; start < filtered.length; start += STAT_CONCURRENCY) {
|
|
23
|
+
const batch = filtered.slice(start, start + STAT_CONCURRENCY);
|
|
21
24
|
const results = await Promise.allSettled(batch.map(async (relativePath) => {
|
|
22
25
|
const fullPath = path.join(repoPath, relativePath);
|
|
23
26
|
const stat = await fs.stat(fullPath);
|
|
@@ -23,10 +23,14 @@ const processParsingWithWorkers = async (graph, files, symbolTable, _astCache, w
|
|
|
23
23
|
if (parseableFiles.length === 0)
|
|
24
24
|
return { imports: [], calls: [], heritage: [], routes: [], constructorBindings: [] };
|
|
25
25
|
const total = files.length;
|
|
26
|
-
// Dispatch to worker pool
|
|
27
|
-
const chunkResults = await workerPool.dispatch(parseableFiles, (filesProcessed) => {
|
|
26
|
+
// Dispatch to worker pool with size-balanced distribution
|
|
27
|
+
const { results: chunkResults, failures } = await workerPool.dispatch(parseableFiles, (filesProcessed) => {
|
|
28
28
|
onFileProgress?.(Math.min(filesProcessed, total), total, 'Parsing...');
|
|
29
|
-
});
|
|
29
|
+
}, (item) => item.content.length);
|
|
30
|
+
// Log worker failures (don't throw — partial results are still valuable)
|
|
31
|
+
for (const failure of failures) {
|
|
32
|
+
console.error(` Worker failure (partial results preserved): ${failure.message}`);
|
|
33
|
+
}
|
|
30
34
|
// Merge worker results into graph and symbol table
|
|
31
35
|
const allImports = [];
|
|
32
36
|
const allCalls = [];
|
|
@@ -21,7 +21,10 @@ import { fileURLToPath, pathToFileURL } from 'node:url';
|
|
|
21
21
|
import { memoryGuard } from '../../lib/memory-guard.js';
|
|
22
22
|
import { toNodeId, toEdgeId } from '../db/schema.js';
|
|
23
23
|
import { getTsgoService } from '../semantic/tsgo-service.js';
|
|
24
|
-
const
|
|
24
|
+
/**
 * Diagnostic logger: forwards its arguments to console.error only when the
 * CODE_MAPPER_VERBOSE environment variable is switched on.
 *
 * The variable is read on every call. Unset, empty/whitespace, "0", "false",
 * "no" and "off" (case-insensitive) all mean off, so `CODE_MAPPER_VERBOSE=0`
 * no longer enables logging by accident; any other value means on.
 *
 * @param {...unknown} args - Values passed through to console.error unchanged.
 */
const verbose = (...args) => {
    const flag = (process.env['CODE_MAPPER_VERBOSE'] ?? '').trim().toLowerCase();
    if (flag === '' || flag === '0' || flag === 'false' || flag === 'no' || flag === 'off')
        return;
    console.error(...args);
};
|
|
25
28
|
// Default chunk budget — used when memory is plentiful.
|
|
26
29
|
// Under memory pressure, adaptiveBatchSize() shrinks this automatically.
|
|
27
30
|
const DEFAULT_CHUNK_BYTE_BUDGET = 50 * 1024 * 1024;
|
|
@@ -122,9 +125,9 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
|
|
|
122
125
|
if (currentChunk.length > 0)
|
|
123
126
|
chunks.push(currentChunk);
|
|
124
127
|
const numChunks = chunks.length;
|
|
125
|
-
|
|
128
|
+
{
|
|
126
129
|
const totalMB = parseableScanned.reduce((s, f) => s + f.size, 0) / (1024 * 1024);
|
|
127
|
-
|
|
130
|
+
verbose(`[parse] ${totalFiles} paths, ${totalParseable} parseable (${totalMB.toFixed(0)}MB), ${numChunks} chunks @ ${Math.round(chunkBudget / (1024 * 1024))}MB budget (${memoryGuard.summary()})`);
|
|
128
131
|
}
|
|
129
132
|
onProgress({
|
|
130
133
|
phase: 'parsing',
|
|
@@ -148,8 +151,7 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
|
|
|
148
151
|
workerPool = createWorkerPool(workerUrl);
|
|
149
152
|
}
|
|
150
153
|
catch (err) {
|
|
151
|
-
|
|
152
|
-
console.warn('Worker pool creation failed, using sequential fallback:', err.message);
|
|
154
|
+
console.error('[parse] worker pool creation failed, using sequential fallback:', err.message);
|
|
153
155
|
}
|
|
154
156
|
let filesParsedSoFar = 0;
|
|
155
157
|
// AST cache sized for one chunk (used by sequential fallback for import/call/heritage)
|
|
@@ -171,12 +173,17 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
|
|
|
171
173
|
const chunkPaths = chunks[chunkIdx];
|
|
172
174
|
if (!chunkPaths)
|
|
173
175
|
continue;
|
|
176
|
+
const chunkStart = Date.now();
|
|
174
177
|
// Read content for this chunk
|
|
175
178
|
const chunkContents = await readFileContents(repoPath, chunkPaths);
|
|
176
179
|
const chunkFiles = chunkPaths
|
|
177
180
|
.filter(p => chunkContents.has(p))
|
|
178
181
|
.map(p => ({ path: p, content: chunkContents.get(p) }));
|
|
182
|
+
const readMs = Date.now() - chunkStart;
|
|
183
|
+
const chunkMB = chunkFiles.reduce((s, f) => s + f.content.length, 0) / (1024 * 1024);
|
|
184
|
+
verbose(`[parse] chunk ${chunkIdx + 1}/${numChunks}: ${chunkFiles.length} files (${chunkMB.toFixed(1)}MB), read ${readMs}ms`);
|
|
179
185
|
// Parse chunk (workers or sequential fallback)
|
|
186
|
+
const parseStart = Date.now();
|
|
180
187
|
const chunkWorkerData = await processParsing(graph, chunkFiles, symbolTable, astCache, (current, _total, filePath) => {
|
|
181
188
|
const globalCurrent = filesParsedSoFar + current;
|
|
182
189
|
const parsingProgress = 20 + ((globalCurrent / totalParseable) * 62);
|
|
@@ -188,6 +195,8 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
|
|
|
188
195
|
stats: { filesProcessed: globalCurrent, totalFiles: totalParseable, nodesCreated: graph.nodeCount },
|
|
189
196
|
});
|
|
190
197
|
}, workerPool);
|
|
198
|
+
const parseMs = Date.now() - parseStart;
|
|
199
|
+
verbose(`[parse] chunk ${chunkIdx + 1}/${numChunks}: parsed ${parseMs}ms (${memoryGuard.summary()})`);
|
|
191
200
|
const chunkBasePercent = 20 + ((filesParsedSoFar / totalParseable) * 62);
|
|
192
201
|
if (chunkWorkerData) {
|
|
193
202
|
// Resolve imports per-chunk (file-level, doesn't need full symbol table)
|
|
@@ -235,14 +244,14 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
|
|
|
235
244
|
sequentialChunkPaths.push(chunkPaths);
|
|
236
245
|
}
|
|
237
246
|
filesParsedSoFar += chunkFiles.length;
|
|
247
|
+
const totalChunkMs = Date.now() - chunkStart;
|
|
248
|
+
verbose(`[parse] chunk ${chunkIdx + 1}/${numChunks}: total ${totalChunkMs}ms, ${filesParsedSoFar}/${totalParseable} files done`);
|
|
238
249
|
// Clear AST cache between chunks to free memory; chunk locals go out of scope for GC
|
|
239
250
|
astCache.clear();
|
|
240
251
|
// Attempt GC between chunks if under memory pressure
|
|
241
252
|
if (memoryGuard.isUnderPressure()) {
|
|
242
253
|
memoryGuard.tryGC();
|
|
243
|
-
|
|
244
|
-
console.log(`⚠️ Memory pressure after chunk ${chunkIdx + 1}: ${memoryGuard.summary()}`);
|
|
245
|
-
}
|
|
254
|
+
verbose(`[parse] memory pressure after chunk ${chunkIdx + 1}: ${memoryGuard.summary()}`);
|
|
246
255
|
}
|
|
247
256
|
}
|
|
248
257
|
}
|
|
@@ -301,12 +310,11 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
|
|
|
301
310
|
tsgoService?.stop();
|
|
302
311
|
}
|
|
303
312
|
}
|
|
304
|
-
|
|
305
|
-
if (isDev) {
|
|
313
|
+
{
|
|
306
314
|
const rcStats = ctx.getStats();
|
|
307
315
|
const total = rcStats.cacheHits + rcStats.cacheMisses;
|
|
308
316
|
const hitRate = total > 0 ? ((rcStats.cacheHits / total) * 100).toFixed(1) : '0';
|
|
309
|
-
|
|
317
|
+
verbose(`[resolve] cache: ${rcStats.cacheHits} hits, ${rcStats.cacheMisses} misses (${hitRate}% hit rate)`);
|
|
310
318
|
}
|
|
311
319
|
// Free import resolution context (~94MB+ for large repos)
|
|
312
320
|
allPathObjects.length = 0;
|
|
@@ -318,13 +326,13 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
|
|
|
318
326
|
createDependsOnEdges(graph, ctx),
|
|
319
327
|
createProvidesEdges(graph, ctx),
|
|
320
328
|
]);
|
|
321
|
-
if (
|
|
322
|
-
|
|
329
|
+
if (diEdgeCount > 0 || providesEdgeCount > 0) {
|
|
330
|
+
verbose(`[resolve] DI: ${diEdgeCount} DEPENDS_ON, ${providesEdgeCount} PROVIDES edges`);
|
|
323
331
|
}
|
|
324
332
|
// Phase 4.5a2: Interface dispatch — connect callers of interfaces to implementations
|
|
325
333
|
const ifaceEdges = await resolveInterfaceDispatches(graph, ctx);
|
|
326
|
-
if (
|
|
327
|
-
|
|
334
|
+
if (ifaceEdges > 0) {
|
|
335
|
+
verbose(`[resolve] interface dispatch: ${ifaceEdges} implementation edges`);
|
|
328
336
|
}
|
|
329
337
|
// Phase 4.5b: Method Resolution Order
|
|
330
338
|
onProgress({
|
|
@@ -334,8 +342,8 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
|
|
|
334
342
|
stats: { filesProcessed: totalFiles, totalFiles, nodesCreated: graph.nodeCount },
|
|
335
343
|
});
|
|
336
344
|
const mroResult = computeMRO(graph);
|
|
337
|
-
if (
|
|
338
|
-
|
|
345
|
+
if (mroResult.entries.length > 0) {
|
|
346
|
+
verbose(`[resolve] MRO: ${mroResult.entries.length} classes, ${mroResult.ambiguityCount} ambiguities, ${mroResult.overrideEdges} OVERRIDES edges`);
|
|
339
347
|
}
|
|
340
348
|
// Phase 5: Communities
|
|
341
349
|
onProgress({
|
|
@@ -353,9 +361,7 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
|
|
|
353
361
|
stats: { filesProcessed: totalFiles, totalFiles, nodesCreated: graph.nodeCount },
|
|
354
362
|
});
|
|
355
363
|
});
|
|
356
|
-
|
|
357
|
-
console.log(`🏘️ Community detection: ${communityResult.stats.totalCommunities} communities found (modularity: ${communityResult.stats.modularity.toFixed(3)})`);
|
|
358
|
-
}
|
|
364
|
+
verbose(`[community] ${communityResult.stats.totalCommunities} communities (modularity: ${communityResult.stats.modularity.toFixed(3)})`);
|
|
359
365
|
communityResult.communities.forEach(comm => {
|
|
360
366
|
graph.addNode({
|
|
361
367
|
id: toNodeId(comm.id),
|
|
@@ -399,9 +405,7 @@ export const runPipelineFromRepo = async (repoPath, onProgress, opts) => {
|
|
|
399
405
|
stats: { filesProcessed: totalFiles, totalFiles, nodesCreated: graph.nodeCount },
|
|
400
406
|
});
|
|
401
407
|
}, { maxProcesses: dynamicMaxProcesses, minSteps: 3 });
|
|
402
|
-
|
|
403
|
-
console.log(`🔄 Process detection: ${processResult.stats.totalProcesses} processes found (${processResult.stats.crossCommunityCount} cross-community)`);
|
|
404
|
-
}
|
|
408
|
+
verbose(`[process] ${processResult.stats.totalProcesses} processes (${processResult.stats.crossCommunityCount} cross-community)`);
|
|
405
409
|
processResult.processes.forEach(proc => {
|
|
406
410
|
graph.addNode({
|
|
407
411
|
id: toNodeId(proc.id),
|
|
@@ -174,7 +174,7 @@ const processBatch = (files, onProgress) => {
|
|
|
174
174
|
}
|
|
175
175
|
let totalProcessed = 0;
|
|
176
176
|
let lastReported = 0;
|
|
177
|
-
const PROGRESS_INTERVAL =
|
|
177
|
+
const PROGRESS_INTERVAL = 25; // report every 25 files — resets the sub-batch timer
|
|
178
178
|
const onFileProcessed = onProgress ? () => {
|
|
179
179
|
totalProcessed++;
|
|
180
180
|
if (totalProcessed - lastReported >= PROGRESS_INTERVAL) {
|
|
@@ -1,11 +1,16 @@
|
|
|
1
1
|
/** @file Generic worker thread pool with sub-batch streaming for bounded memory usage */
|
|
2
2
|
export interface WorkerPool {
|
|
3
3
|
/**
|
|
4
|
-
* Dispatch items across workers with sub-batch streaming
|
|
5
|
-
*
|
|
4
|
+
* Dispatch items across workers with sub-batch streaming.
|
|
5
|
+
* Uses Promise.allSettled so one worker failure doesn't discard other workers' results.
|
|
6
|
+
* @param items - Items to process (split across workers using size-balanced round-robin)
|
|
6
7
|
* @param onProgress - Optional callback for progress reporting
|
|
8
|
+
* @param getItemSize - Optional function to extract item size for balanced dispatch
|
|
7
9
|
*/
|
|
8
|
-
dispatch<TInput, TResult>(items: TInput[], onProgress?: (filesProcessed: number) => void): Promise<
|
|
10
|
+
dispatch<TInput, TResult>(items: TInput[], onProgress?: (filesProcessed: number) => void, getItemSize?: (item: TInput) => number): Promise<{
|
|
11
|
+
results: TResult[];
|
|
12
|
+
failures: Error[];
|
|
13
|
+
}>;
|
|
9
14
|
/** Terminate all workers (must be called when done) */
|
|
10
15
|
terminate(): Promise<void>;
|
|
11
16
|
/** Number of workers in the pool */
|
|
@@ -5,9 +5,40 @@ import os from 'node:os';
|
|
|
5
5
|
import fs from 'node:fs';
|
|
6
6
|
import { fileURLToPath } from 'node:url';
|
|
7
7
|
// Max files per postMessage to keep structured-clone memory bounded
|
|
8
|
-
const SUB_BATCH_SIZE =
|
|
9
|
-
//
|
|
10
|
-
const
|
|
8
|
+
const SUB_BATCH_SIZE = 500;
|
|
9
|
+
// Floor for the sub-batch watchdog — a sub-batch is never given less than this
const BASE_TIMEOUT_MS = 120_000;
// Per-file allowance in ms; it only takes effect once
// fileCount * PER_FILE_TIMEOUT_MS exceeds BASE_TIMEOUT_MS (i.e. above 600 files)
const PER_FILE_TIMEOUT_MS = 200;
/**
 * Compute the watchdog timeout for a sub-batch: max(base, fileCount * perFile).
 *
 * A count that is not a finite positive number falls back to the base timeout.
 * Without that guard the result would be NaN, and setTimeout coerces a NaN
 * delay to 1ms, which would fire the timeout almost immediately.
 *
 * @param {number} fileCount - Number of files in the sub-batch being timed.
 * @returns {number} Timeout in milliseconds, never below BASE_TIMEOUT_MS.
 */
const computeTimeout = (fileCount) => {
    if (!Number.isFinite(fileCount) || fileCount <= 0)
        return BASE_TIMEOUT_MS;
    return Math.max(BASE_TIMEOUT_MS, fileCount * PER_FILE_TIMEOUT_MS);
};
|
|
15
|
+
/**
 * Distribute items across `bucketCount` buckets using the greedy LPT
 * (longest-processing-time-first) heuristic: items are visited from largest to
 * smallest and each one goes to the bucket whose running total is currently the
 * smallest (the lowest-index bucket wins ties).
 *
 * `getSize` is called exactly once per item. A size that is not a finite
 * positive number is counted as 0, so a single bad measurement cannot turn the
 * bucket totals into NaN and send every remaining item to the same bucket.
 *
 * @param {Array} items - Items to distribute; the array itself is not mutated.
 * @param {number} bucketCount - Number of buckets to fill.
 * @param {Function} getSize - Maps an item to its numeric weight.
 * @returns {Array<Array>} `bucketCount` arrays (some possibly empty), or `[]`
 *   when `bucketCount <= 0`.
 */
const sizeBalancedDistribute = (items, bucketCount, getSize) => {
    if (bucketCount <= 0)
        return [];
    const buckets = Array.from({ length: bucketCount }, () => []);
    if (items.length === 0)
        return buckets;
    // Measure every item once up front instead of inside the sort comparator
    const sized = items.map((item) => {
        const raw = getSize(item);
        return { item, size: Number.isFinite(raw) && raw > 0 ? raw : 0 };
    });
    // Largest first; Array.prototype.sort is stable, so equal sizes keep input order
    sized.sort((a, b) => b.size - a.size);
    const bucketSizes = buckets.map(() => 0);
    for (const { item, size } of sized) {
        // Find the bucket with the smallest running total
        let minBucket = 0;
        for (let b = 1; b < bucketSizes.length; b++) {
            if (bucketSizes[b] < bucketSizes[minBucket])
                minBucket = b;
        }
        buckets[minBucket].push(item);
        bucketSizes[minBucket] += size;
    }
    return buckets;
};
|
|
11
42
|
/** Create a pool of worker threads */
|
|
12
43
|
export const createWorkerPool = (workerUrl, poolSize) => {
|
|
13
44
|
// Validate worker script exists before spawning to prevent MODULE_NOT_FOUND crashes
|
|
@@ -20,13 +51,20 @@ export const createWorkerPool = (workerUrl, poolSize) => {
|
|
|
20
51
|
for (let i = 0; i < size; i++) {
|
|
21
52
|
workers.push(new Worker(workerUrl));
|
|
22
53
|
}
|
|
23
|
-
const dispatch = (items, onProgress) => {
|
|
54
|
+
const dispatch = (items, onProgress, getItemSize) => {
|
|
24
55
|
if (items.length === 0)
|
|
25
|
-
return Promise.resolve([]);
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
chunks
|
|
56
|
+
return Promise.resolve({ results: [], failures: [] });
|
|
57
|
+
// Size-balanced dispatch when size function provided, otherwise equal split
|
|
58
|
+
let chunks;
|
|
59
|
+
if (getItemSize) {
|
|
60
|
+
chunks = sizeBalancedDistribute(items, Math.min(size, items.length), getItemSize);
|
|
61
|
+
}
|
|
62
|
+
else {
|
|
63
|
+
const chunkSize = Math.ceil(items.length / size);
|
|
64
|
+
chunks = [];
|
|
65
|
+
for (let i = 0; i < items.length; i += chunkSize) {
|
|
66
|
+
chunks.push(items.slice(i, i + chunkSize));
|
|
67
|
+
}
|
|
30
68
|
}
|
|
31
69
|
const workerProgress = new Array(chunks.length).fill(0);
|
|
32
70
|
const promises = chunks.map((chunk, i) => {
|
|
@@ -37,6 +75,7 @@ export const createWorkerPool = (workerUrl, poolSize) => {
|
|
|
37
75
|
return new Promise((resolve, reject) => {
|
|
38
76
|
let settled = false;
|
|
39
77
|
let subBatchTimer = null;
|
|
78
|
+
let currentSubBatchSize = 0;
|
|
40
79
|
const cleanup = () => {
|
|
41
80
|
if (subBatchTimer)
|
|
42
81
|
clearTimeout(subBatchTimer);
|
|
@@ -47,13 +86,14 @@ export const createWorkerPool = (workerUrl, poolSize) => {
|
|
|
47
86
|
const resetSubBatchTimer = () => {
|
|
48
87
|
if (subBatchTimer)
|
|
49
88
|
clearTimeout(subBatchTimer);
|
|
89
|
+
const timeout = computeTimeout(currentSubBatchSize);
|
|
50
90
|
subBatchTimer = setTimeout(() => {
|
|
51
91
|
if (!settled) {
|
|
52
92
|
settled = true;
|
|
53
93
|
cleanup();
|
|
54
|
-
reject(new Error(`Worker ${i} sub-batch timed out after ${
|
|
94
|
+
reject(new Error(`Worker ${i} sub-batch timed out after ${timeout / 1000}s (chunk: ${chunk.length} items, sub-batch: ${currentSubBatchSize} items).`));
|
|
55
95
|
}
|
|
56
|
-
},
|
|
96
|
+
}, timeout);
|
|
57
97
|
};
|
|
58
98
|
let subBatchIdx = 0;
|
|
59
99
|
const sendNextSubBatch = () => {
|
|
@@ -63,6 +103,7 @@ export const createWorkerPool = (workerUrl, poolSize) => {
|
|
|
63
103
|
return;
|
|
64
104
|
}
|
|
65
105
|
const subBatch = chunk.slice(start, start + SUB_BATCH_SIZE);
|
|
106
|
+
currentSubBatchSize = subBatch.length;
|
|
66
107
|
subBatchIdx++;
|
|
67
108
|
resetSubBatchTimer();
|
|
68
109
|
worker.postMessage({ type: 'sub-batch', files: subBatch });
|
|
@@ -71,6 +112,8 @@ export const createWorkerPool = (workerUrl, poolSize) => {
|
|
|
71
112
|
if (settled)
|
|
72
113
|
return;
|
|
73
114
|
if (msg && msg.type === 'progress') {
|
|
115
|
+
// BUG FIX: Reset timer on progress — worker is alive and making progress
|
|
116
|
+
resetSubBatchTimer();
|
|
74
117
|
workerProgress[i] = msg.filesProcessed;
|
|
75
118
|
if (onProgress) {
|
|
76
119
|
const total = workerProgress.reduce((a, b) => a + b, 0);
|
|
@@ -116,7 +159,20 @@ export const createWorkerPool = (workerUrl, poolSize) => {
|
|
|
116
159
|
sendNextSubBatch();
|
|
117
160
|
});
|
|
118
161
|
});
|
|
119
|
-
|
|
162
|
+
// Use allSettled so one worker failure doesn't discard other workers' results
|
|
163
|
+
return Promise.allSettled(promises).then(outcomes => {
|
|
164
|
+
const results = [];
|
|
165
|
+
const failures = [];
|
|
166
|
+
for (const outcome of outcomes) {
|
|
167
|
+
if (outcome.status === 'fulfilled') {
|
|
168
|
+
results.push(outcome.value);
|
|
169
|
+
}
|
|
170
|
+
else {
|
|
171
|
+
failures.push(outcome.reason instanceof Error ? outcome.reason : new Error(String(outcome.reason)));
|
|
172
|
+
}
|
|
173
|
+
}
|
|
174
|
+
return { results, failures };
|
|
175
|
+
});
|
|
120
176
|
};
|
|
121
177
|
const terminate = async () => {
|
|
122
178
|
await Promise.all(workers.map(w => w.terminate()));
|
package/models/mlx-embedder.py
CHANGED
|
@@ -415,26 +415,81 @@ def batch_mode(db_path, dims=256, max_tokens=2048):
|
|
|
415
415
|
unique_texts = [v["text"] for v in unique_by_hash.values()]
|
|
416
416
|
deduped = len(to_embed) - len(unique_texts)
|
|
417
417
|
|
|
418
|
-
# Embed
|
|
418
|
+
# Embed unique texts in streaming fashion — process each batch, write to DB
|
|
419
|
+
# immediately, free GPU memory. Keeps peak memory at ONE batch instead of ALL.
|
|
419
420
|
t0_embed = time.time()
|
|
420
|
-
|
|
421
|
-
embed_ms = int((time.time() - t0_embed) * 1000)
|
|
421
|
+
unique_entries = list(unique_by_hash.values())
|
|
422
422
|
|
|
423
|
-
|
|
423
|
+
# Tokenize + sort (same as embed_tiered but we handle the loop here)
|
|
424
|
+
is_code_model = "jina-code" in MODEL_DIR
|
|
425
|
+
if is_code_model:
|
|
426
|
+
prefix_map = {"retrieval.query": "Find the most relevant code snippet given the following query:\n", "retrieval.passage": "Candidate code snippet:\n"}
|
|
427
|
+
else:
|
|
428
|
+
prefix_map = {"retrieval.query": "Query: ", "retrieval.passage": "Document: "}
|
|
429
|
+
prefix = prefix_map.get("retrieval.passage", "")
|
|
430
|
+
prefixed = [prefix + e["text"] for e in unique_entries]
|
|
431
|
+
encodings = tokenizer.encode_batch(prefixed)
|
|
432
|
+
indexed = sorted(range(len(unique_entries)), key=lambda i: len(encodings[i].ids))
|
|
424
433
|
|
|
425
|
-
|
|
426
|
-
t0_write = time.time()
|
|
434
|
+
embedded_count = 0
|
|
427
435
|
db.execute("BEGIN")
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
+
|
|
437
|
+
i = 0
|
|
438
|
+
while i < len(indexed):
|
|
439
|
+
peek_idx = indexed[min(i + 1, len(indexed) - 1)]
|
|
440
|
+
tok_count = min(len(encodings[peek_idx].ids), max_tokens)
|
|
441
|
+
batch_size = get_batch_size_for_tokens(tok_count)
|
|
442
|
+
|
|
443
|
+
batch_indices = []
|
|
444
|
+
batch_encs = []
|
|
445
|
+
while len(batch_encs) < batch_size and i < len(indexed):
|
|
446
|
+
orig_idx = indexed[i]
|
|
447
|
+
batch_indices.append(orig_idx)
|
|
448
|
+
batch_encs.append(encodings[orig_idx])
|
|
449
|
+
i += 1
|
|
450
|
+
|
|
451
|
+
max_len = min(max_tokens, max(len(e.ids) for e in batch_encs))
|
|
452
|
+
input_ids = []
|
|
453
|
+
attention_mask = []
|
|
454
|
+
for enc in batch_encs:
|
|
455
|
+
ids = enc.ids[:max_len]
|
|
456
|
+
mask = enc.attention_mask[:max_len]
|
|
457
|
+
pad = max_len - len(ids)
|
|
458
|
+
if pad > 0:
|
|
459
|
+
ids = ids + [0] * pad
|
|
460
|
+
mask = mask + [0] * pad
|
|
461
|
+
input_ids.append(ids)
|
|
462
|
+
attention_mask.append(mask)
|
|
463
|
+
|
|
464
|
+
# Forward pass
|
|
465
|
+
embs = model(mx.array(input_ids), mx.array(attention_mask))
|
|
466
|
+
if dims and dims < embs.shape[1]:
|
|
467
|
+
embs = embs[:, :dims]
|
|
468
|
+
norms = mx.linalg.norm(embs, axis=1, keepdims=True)
|
|
469
|
+
embs = embs / norms
|
|
470
|
+
mx.eval(embs)
|
|
471
|
+
|
|
472
|
+
# Convert to Python + write to DB immediately
|
|
473
|
+
emb_list = embs.tolist()
|
|
474
|
+
del embs # free MLX GPU memory
|
|
475
|
+
|
|
476
|
+
for j, orig_idx in enumerate(batch_indices):
|
|
477
|
+
entry = unique_entries[orig_idx]
|
|
478
|
+
blob = float_list_to_blob(emb_list[j])
|
|
479
|
+
for nid, th in entry["node_ids"]:
|
|
480
|
+
db.execute("INSERT OR REPLACE INTO embeddings (nodeId, embedding, textHash) VALUES (?, ?, ?)",
|
|
481
|
+
(nid, blob, th))
|
|
482
|
+
embedded_count += len(entry["node_ids"])
|
|
483
|
+
|
|
484
|
+
# Progress
|
|
485
|
+
pct = i * 100 // len(indexed)
|
|
486
|
+
print(json.dumps({"phase": "embedding", "progress": pct, "embedded": embedded_count}), flush=True)
|
|
487
|
+
|
|
436
488
|
db.execute("COMMIT")
|
|
437
|
-
|
|
489
|
+
embed_ms = int((time.time() - t0_embed) * 1000)
|
|
490
|
+
write_ms = 0 # included in embed_ms now
|
|
491
|
+
|
|
492
|
+
print(json.dumps({"phase": "embedded", "count": len(unique_entries), "deduped": deduped, "ms": embed_ms}), flush=True)
|
|
438
493
|
|
|
439
494
|
total_ms = int((time.time() - t0_total) * 1000)
|
|
440
495
|
print(json.dumps({
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@zuvia-software-solutions/code-mapper",
|
|
3
|
-
"version": "2.3.1",
|
|
3
|
+
"version": "2.3.3",
|
|
4
4
|
"description": "Graph-powered code intelligence for AI agents. Index any codebase, query via MCP or CLI.",
|
|
5
5
|
"author": "Abhigyan Patwari",
|
|
6
6
|
"license": "PolyForm-Noncommercial-1.0.0",
|