code-graph-context 2.12.8 → 2.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/embeddings/embedding-sidecar.js +5 -2
- package/dist/core/embeddings/local-embeddings.service.js +18 -26
- package/dist/mcp/constants.js +2 -2
- package/dist/mcp/handlers/graph-generator.handler.js +40 -12
- package/dist/mcp/handlers/parallel-import.handler.js +3 -17
- package/dist/mcp/handlers/streaming-import.handler.js +4 -39
- package/package.json +1 -1

package/dist/core/embeddings/embedding-sidecar.js
CHANGED

@@ -184,16 +184,19 @@ export class EmbeddingSidecar {
     /**
      * Embed an array of texts. Lazily starts the sidecar if not running.
      */
-    async embed(texts) {
+    async embed(texts, gpuBatchSize) {
         await this.start();
         const controller = new AbortController();
         const timeout = setTimeout(() => controller.abort(), this.config.requestTimeoutMs);
         const startTime = Date.now();
         try {
+            const body = { texts };
+            if (gpuBatchSize)
+                body.batch_size = gpuBatchSize;
             const res = await fetch(`${this.baseUrl}/embed`, {
                 method: 'POST',
                 headers: { 'Content-Type': 'application/json' },
-                body: JSON.stringify({ texts }),
+                body: JSON.stringify(body),
                 signal: controller.signal,
             });
             if (!res.ok) {
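
The new optional gpuBatchSize argument travels to the sidecar as a batch_size field in the POST body, and is omitted when unset so older sidecars keep working. A minimal standalone sketch of the request shape; the response type and error message below are assumptions for illustration, not the package's code:

// Standalone sketch (TypeScript, Node 18+). Response shape is an assumption.
async function embedViaSidecar(
    baseUrl: string,
    texts: string[],
    gpuBatchSize?: number,
): Promise<number[][]> {
    const body: { texts: string[]; batch_size?: number } = { texts };
    if (gpuBatchSize)
        body.batch_size = gpuBatchSize; // omitted -> sidecar picks its default
    const res = await fetch(`${baseUrl}/embed`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(body),
    });
    if (!res.ok)
        throw new Error(`embed request failed with status ${res.status}`);
    const data = (await res.json()) as { embeddings: number[][] };
    return data.embeddings;
}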

package/dist/core/embeddings/local-embeddings.service.js
CHANGED

@@ -19,33 +19,25 @@ export class LocalEmbeddingsService {
         const sidecar = getEmbeddingSidecar();
         return sidecar.embed(texts);
     }
-    async embedTextsInBatches(texts, …
-        …
+    async embedTextsInBatches(texts, _batchSize) {
+        if (texts.length === 0)
+            return [];
+        // GPU batch size controls how many texts the model processes at once (memory-bound).
+        // We send ALL texts in a single HTTP request and let the sidecar handle GPU batching
+        // internally via model.encode(batch_size=N). This eliminates HTTP round-trip overhead.
+        const gpuBatchSize = BATCH_CONFIG.maxBatchSize;
+        const gpuBatches = Math.ceil(texts.length / gpuBatchSize);
+        console.error(`[embedding] Sending ${texts.length} texts in 1 request (gpu_batch_size=${gpuBatchSize}, ~${gpuBatches} GPU batches)`);
+        await debugLog('Batch embedding started', { provider: 'local', textCount: texts.length, gpuBatchSize });
         const sidecar = getEmbeddingSidecar();
-        …
-            batchIndex,
-            totalBatches,
-            batchSize: batch.length,
-        });
-        try {
-            const batchResults = await sidecar.embed(batch);
-            results.push(...batchResults);
-        }
-        catch (error) {
-            const msg = error instanceof Error ? error.message : String(error);
-            console.error(`[embedding] Batch ${batchIndex}/${totalBatches} FAILED (${batch.length} texts, batchSize=${safeBatchSize}): ${msg}`);
-            throw error;
-        }
+        try {
+            const results = await sidecar.embed(texts, gpuBatchSize);
+            return results;
+        }
+        catch (error) {
+            const msg = error instanceof Error ? error.message : String(error);
+            console.error(`[embedding] FAILED (${texts.length} texts, gpuBatchSize=${gpuBatchSize}): ${msg}`);
+            throw error;
+        }
         }
-        return results;
     }
 }
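
The win here is purely transport overhead: the GPU performs the same number of encode batches either way, but the HTTP round trips drop from one per batch to one total. A back-of-envelope sketch with illustrative numbers, assuming the old HTTP batch size equaled the GPU batch size:

// Illustrative arithmetic only; these counts are not measurements from the package.
const textCount = 10_000;
const gpuBatchSize = 256;

const gpuBatches = Math.ceil(textCount / gpuBatchSize); // 40 either way
const oldHttpRequests = gpuBatches;                     // previously: one request per batch
const newHttpRequests = 1;                              // now: a single request

// Saved: 39 round trips plus per-request JSON encode/decode of payloads.
console.log({ gpuBatches, oldHttpRequests, newHttpRequests });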

package/dist/mcp/constants.js
CHANGED

@@ -380,11 +380,11 @@ export const DEFAULTS = {
 // Parsing Configuration
 export const PARSING = {
     /** File count threshold to trigger parallel parsing with worker pool */
-    parallelThreshold: …
+    parallelThreshold: 250,
     /** File count threshold to trigger streaming import */
     streamingThreshold: 100,
     /** Default number of files per chunk */
-    defaultChunkSize: …
+    defaultChunkSize: 75,
     /** Worker timeout in milliseconds (30 minutes) */
     workerTimeoutMs: 30 * 60 * 1000,
 };
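
The dispatch code that consumes these constants is outside this diff; purely as a sketch of how the three values could interact, with invented names:

// The real strategy dispatch is not shown in this diff; names here are invented.
const PARSING = { parallelThreshold: 250, streamingThreshold: 100, defaultChunkSize: 75 };

type ImportStrategy = 'parallel' | 'streaming' | 'single-pass';

function pickStrategy(fileCount: number): ImportStrategy {
    if (fileCount >= PARSING.parallelThreshold)
        return 'parallel';   // worker pool, pipelined import
    if (fileCount >= PARSING.streamingThreshold)
        return 'streaming';  // sequential chunks of defaultChunkSize files
    return 'single-pass';
}

console.log(pickStrategy(300), pickStrategy(150), pickStrategy(50));
// -> parallel streaming single-pass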

package/dist/mcp/handlers/graph-generator.handler.js
CHANGED

@@ -24,19 +24,31 @@ export class GraphGeneratorHandler {
     }
     async generateGraph(graphJsonPath, batchSize = DEFAULTS.batchSize, clearExisting = true) {
         console.error(`Generating graph from JSON file: ${graphJsonPath}`);
-        …
+        const graphData = await this.loadGraphData(graphJsonPath);
+        return this.generateGraphFromData(graphData.nodes, graphData.edges, batchSize, clearExisting, graphData.metadata);
+    }
+    /**
+     * Import nodes and edges directly from in-memory data.
+     * Skips the file read/write round-trip used by generateGraph.
+     *
+     * @param skipIndexes - When true, skips index creation (caller manages indexes).
+     *   Use this for chunked imports where indexes are created once before/after all chunks.
+     */
+    async generateGraphFromData(nodes, edges, batchSize = DEFAULTS.batchSize, clearExisting = true, metadata = {}, skipIndexes = false) {
+        await debugLog('Starting graph generation', { nodeCount: nodes.length, edgeCount: edges.length, batchSize, clearExisting, skipIndexes, projectId: this.projectId });
         try {
-            const graphData = await this.loadGraphData(graphJsonPath);
-            const { nodes, edges, metadata } = graphData;
             console.error(`Generating graph with ${nodes.length} nodes and ${edges.length} edges`);
-            await debugLog('Graph data loaded', { nodeCount: nodes.length, edgeCount: edges.length });
             if (clearExisting) {
                 await this.clearExistingData();
             }
-            …
+            if (!skipIndexes) {
+                await this.createProjectIndexes();
+            }
             await this.importNodes(nodes, batchSize);
             await this.importEdges(edges, batchSize);
-            …
+            if (!skipIndexes) {
+                await this.createVectorIndexes();
+            }
             const result = {
                 nodesImported: nodes.length,
                 edgesImported: edges.length,

@@ -51,6 +63,13 @@ export class GraphGeneratorHandler {
             throw error;
         }
     }
+    /**
+     * Create all indexes. Call once before chunked imports start.
+     */
+    async ensureIndexes() {
+        await this.createProjectIndexes();
+        await this.createVectorIndexes();
+    }
     async loadGraphData(graphJsonPath) {
         const fileContent = await fs.readFile(graphJsonPath, 'utf-8');
         return JSON.parse(fileContent);
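
ensureIndexes plus the skipIndexes flag give chunked importers a create-once pattern: build the project and vector indexes a single time up front, then import every chunk without touching them. A sketch of the intended call sequence, assuming handler is a GraphGeneratorHandler instance and chunks holds already-parsed results (the structural type is a simplification):

// Sketch of the chunked-import call pattern enabled by this diff.
async function importChunks(
    handler: {
        ensureIndexes(): Promise<void>;
        generateGraphFromData(
            nodes: unknown[],
            edges: unknown[],
            batchSize: number,
            clearExisting: boolean,
            metadata: object,
            skipIndexes: boolean,
        ): Promise<unknown>;
    },
    chunks: Array<{ nodes: unknown[]; edges: unknown[] }>,
): Promise<void> {
    // Create project + vector indexes exactly once, before any chunk lands.
    await handler.ensureIndexes();
    for (const chunk of chunks) {
        // skipIndexes = true: per-chunk imports leave index management to the caller.
        await handler.generateGraphFromData(chunk.nodes, chunk.edges, 100, false, {}, true);
    }
}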

@@ -81,17 +100,26 @@ export class GraphGeneratorHandler {
     }
     async importNodes(nodes, batchSize) {
         console.error(`Importing ${nodes.length} nodes with embeddings...`);
+        // Pipelined: write batch N to Neo4j while embedding batch N+1.
+        // This overlaps GPU work with Neo4j I/O.
+        let pendingWrite = null;
         for (let i = 0; i < nodes.length; i += batchSize) {
+            // Embed this batch (GPU-bound, the slow part)
             const batch = await this.processNodeBatch(nodes.slice(i, i + batchSize));
-            …
+            // Wait for previous Neo4j write before starting next
+            if (pendingWrite)
+                await pendingWrite;
+            const batchStart = i + 1;
             const batchEnd = Math.min(i + batchSize, nodes.length);
-            …
-            …
-            …
-                batchEnd,
-                created: result[0].created,
+            // Start Neo4j write — don't await, overlap with next batch's embedding
+            pendingWrite = this.neo4jService.run(QUERIES.CREATE_NODE, { nodes: batch }).then(async (result) => {
+                console.error(`Created ${result[0].created} nodes in batch ${batchStart}-${batchEnd}`);
+                await debugLog('Node batch imported', { batchStart, batchEnd, created: result[0].created });
             });
         }
+        // Wait for the final write to complete
+        if (pendingWrite)
+            await pendingWrite;
     }
     /**
      * Process a batch of nodes with batched embedding calls.
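
Stripped of the Neo4j and embedding specifics, importNodes is a two-stage pipeline with at most one write in flight. A generic sketch of the same control flow; embed and write are stand-ins, not the package's functions:

// Generic two-stage pipeline: produce batch N while batch N-1 is still writing.
async function pipelined<T, R>(
    items: T[],
    batchSize: number,
    embed: (batch: T[]) => Promise<R>,
    write: (processed: R) => Promise<void>,
): Promise<void> {
    let pendingWrite: Promise<void> | null = null;
    for (let i = 0; i < items.length; i += batchSize) {
        // Stage 1: the slow, GPU-bound step runs first.
        const processed = await embed(items.slice(i, i + batchSize));
        // Back-pressure: let the previous write finish before queuing another.
        if (pendingWrite)
            await pendingWrite;
        // Stage 2: start the write but do not await it yet, so the next
        // embed call overlaps with this write.
        pendingWrite = write(processed);
    }
    // Drain the final in-flight write.
    if (pendingWrite)
        await pendingWrite;
}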

package/dist/mcp/handlers/parallel-import.handler.js
CHANGED

@@ -3,7 +3,6 @@
  * Orchestrates parallel chunk parsing using a worker pool with pipelined import.
  * Used for large codebases (>= PARSING.parallelThreshold files).
  */
-import { join } from 'path';
 import { ProgressReporter } from '../../core/utils/progress-reporter.js';
 import { debugLog } from '../utils.js';
 import { ChunkWorkerPool } from '../workers/chunk-worker-pool.js';

@@ -41,6 +40,8 @@ export class ParallelImportHandler {
             projectId: config.projectId,
             projectType: config.projectType,
         });
+        // Create indexes once before chunked imports start
+        await this.graphGeneratorHandler.ensureIndexes();
         // Pipelined: import starts as soon as each chunk completes parsing
         const poolResult = await pool.processChunks(chunks, async (result, stats) => {
             await this.importToNeo4j(result.nodes, result.edges);

@@ -116,21 +117,6 @@ export class ParallelImportHandler {
     async importToNeo4j(nodes, edges) {
         if (nodes.length === 0 && edges.length === 0)
             return;
-        …
-        const { randomBytes } = await import('crypto');
-        const { tmpdir } = await import('os');
-        const tempPath = join(tmpdir(), `chunk-${Date.now()}-${randomBytes(8).toString('hex')}.json`);
-        try {
-            await fs.writeFile(tempPath, JSON.stringify({ nodes, edges, metadata: { parallel: true } }));
-            await this.graphGeneratorHandler.generateGraph(tempPath, 100, false);
-        }
-        finally {
-            try {
-                await fs.unlink(tempPath);
-            }
-            catch {
-                // Ignore cleanup errors
-            }
-        }
+        await this.graphGeneratorHandler.generateGraphFromData(nodes, edges, 100, false, {}, true);
     }
 }

package/dist/mcp/handlers/streaming-import.handler.js
CHANGED

@@ -2,20 +2,9 @@
  * Streaming Import Handler
  * Orchestrates chunked parsing and import for large codebases
  */
-import { randomBytes } from 'crypto';
-import { tmpdir } from 'os';
-import { join } from 'path';
 import { ProgressReporter } from '../../core/utils/progress-reporter.js';
 import { DEFAULTS } from '../constants.js';
 import { debugLog } from '../utils.js';
-/**
- * Generate a secure temporary file path using crypto random bytes
- * to avoid race conditions and predictable filenames
- */
-const generateTempPath = (prefix) => {
-    const randomSuffix = randomBytes(16).toString('hex');
-    return join(tmpdir(), `${prefix}-${Date.now()}-${randomSuffix}.json`);
-};
 export class StreamingImportHandler {
     graphGeneratorHandler;
     progressReporter;

@@ -50,6 +39,8 @@ export class StreamingImportHandler {
         }
         let totalNodesImported = 0;
         let totalEdgesImported = 0;
+        // Create indexes once before chunked imports start
+        await this.graphGeneratorHandler.ensureIndexes();
         for (let chunkIndex = 0; chunkIndex < chunks.length; chunkIndex++) {
             const chunk = chunks[chunkIndex];
             const filesProcessed = chunkIndex * config.chunkSize + chunk.length;

@@ -129,37 +120,11 @@ export class StreamingImportHandler {
         return result;
     }
     async importChunkToNeo4j(nodes, edges) {
-        …
-        const fs = await import('fs/promises');
-        try {
-            await fs.writeFile(tempPath, JSON.stringify({ nodes, edges, metadata: { chunked: true } }));
-            await this.graphGeneratorHandler.generateGraph(tempPath, DEFAULTS.batchSize, false);
-        }
-        finally {
-            try {
-                await fs.unlink(tempPath);
-            }
-            catch {
-                // Ignore cleanup errors
-            }
-        }
+        await this.graphGeneratorHandler.generateGraphFromData(nodes, edges, DEFAULTS.batchSize, false, {}, true);
     }
     async importEdgesToNeo4j(edges) {
         if (edges.length === 0)
             return;
-        …
-        const fs = await import('fs/promises');
-        try {
-            await fs.writeFile(tempPath, JSON.stringify({ nodes: [], edges, metadata: { edgesOnly: true } }));
-            await this.graphGeneratorHandler.generateGraph(tempPath, DEFAULTS.batchSize, false);
-        }
-        finally {
-            try {
-                await fs.unlink(tempPath);
-            }
-            catch {
-                // Ignore cleanup errors
-            }
-        }
+        await this.graphGeneratorHandler.generateGraphFromData([], edges, DEFAULTS.batchSize, false, {}, true);
     }
 }
package/package.json
CHANGED