code-graph-context 2.13.0 → 2.13.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -25,9 +25,40 @@ export class EmbeddingSidecar {
|
|
|
25
25
|
stopping = false;
|
|
26
26
|
_exitHandler = null;
|
|
27
27
|
_idleTimer = null;
|
|
28
|
+
// Concurrency semaphore — model.encode() is GPU-bound and processes
// requests serially inside uvicorn. If 15 workers send requests at once,
// the first takes ~3s, the last waits ~45s and times out. By queuing
// excess requests here (no timeout pressure) we let only N through at a
// time, keeping each request well within the 60s timeout.
//
// EMBEDDING_MAX_CONCURRENT overrides the default of 2. The value is clamped
// to at least 1: a negative limit would make `inflight < maxConcurrent`
// permanently false, so every caller would be parked in waitQueue and, with
// nothing in flight to release a slot, would never be woken.
maxConcurrent = Math.max(1, Number.parseInt(process.env.EMBEDDING_MAX_CONCURRENT ?? '', 10) || 2);
// Number of callers currently holding a slot.
inflight = 0;
// FIFO queue of wake-up callbacks for callers waiting on a slot.
waitQueue = [];
|
|
28
36
|
constructor(config = {}) {
|
|
29
37
|
this.config = { ...DEFAULT_CONFIG, ...config };
|
|
30
38
|
}
|
|
39
|
+
/**
|
|
40
|
+
* Wait for a concurrency slot. If under the limit, returns immediately.
|
|
41
|
+
* Otherwise parks the caller in a FIFO queue until a slot opens.
|
|
42
|
+
*/
|
|
43
|
+
acquireSlot() {
|
|
44
|
+
if (this.inflight < this.maxConcurrent) {
|
|
45
|
+
this.inflight++;
|
|
46
|
+
return; // fast path — no allocation, no Promise
|
|
47
|
+
}
|
|
48
|
+
return new Promise((resolve) => this.waitQueue.push(() => {
|
|
49
|
+
this.inflight++;
|
|
50
|
+
resolve();
|
|
51
|
+
}));
|
|
52
|
+
}
|
|
53
|
+
/**
|
|
54
|
+
* Release a slot, unblocking the next queued caller if any.
|
|
55
|
+
*/
|
|
56
|
+
releaseSlot() {
|
|
57
|
+
this.inflight--;
|
|
58
|
+
const next = this.waitQueue.shift();
|
|
59
|
+
if (next)
|
|
60
|
+
next(); // wake one waiter — it will increment inflight
|
|
61
|
+
}
|
|
31
62
|
get baseUrl() {
|
|
32
63
|
return `http://${this.config.host}:${this.config.port}`;
|
|
33
64
|
}
|
|
@@ -183,9 +214,19 @@ export class EmbeddingSidecar {
|
|
|
183
214
|
}
|
|
184
215
|
/**
|
|
185
216
|
* Embed an array of texts. Lazily starts the sidecar if not running.
|
|
217
|
+
* Concurrency-limited: at most `maxConcurrent` requests hit the sidecar
|
|
218
|
+
* at once. Excess callers wait in a FIFO queue (no timeout pressure).
|
|
186
219
|
*/
|
|
187
220
|
async embed(texts, gpuBatchSize) {
|
|
188
221
|
await this.start();
|
|
222
|
+
// Wait for a concurrency slot — the timeout only starts AFTER we
|
|
223
|
+
// acquire the slot, so queued requests don't eat into their timeout.
|
|
224
|
+
const queuedAt = Date.now();
|
|
225
|
+
await this.acquireSlot();
|
|
226
|
+
const queueMs = Date.now() - queuedAt;
|
|
227
|
+
if (queueMs > 100) {
|
|
228
|
+
console.error(`[embedding-sidecar] Waited ${queueMs}ms for concurrency slot (inflight=${this.inflight}, queued=${this.waitQueue.length})`);
|
|
229
|
+
}
|
|
189
230
|
const controller = new AbortController();
|
|
190
231
|
const timeout = setTimeout(() => controller.abort(), this.config.requestTimeoutMs);
|
|
191
232
|
const startTime = Date.now();
|
|
@@ -203,8 +244,6 @@ export class EmbeddingSidecar {
|
|
|
203
244
|
const detail = await res.text();
|
|
204
245
|
const isOOM = detail.toLowerCase().includes('out of memory');
|
|
205
246
|
if (res.status === 500 && isOOM) {
|
|
206
|
-
// OOM leaves GPU memory in a corrupted state — kill the sidecar
|
|
207
|
-
// so the next request spawns a fresh process with clean memory
|
|
208
247
|
console.error('[embedding-sidecar] OOM detected, restarting sidecar to reclaim GPU memory');
|
|
209
248
|
await this.stop();
|
|
210
249
|
}
|
|
@@ -230,6 +269,7 @@ export class EmbeddingSidecar {
|
|
|
230
269
|
}
|
|
231
270
|
finally {
|
|
232
271
|
clearTimeout(timeout);
|
|
272
|
+
this.releaseSlot(); // always release, even on error
|
|
233
273
|
}
|
|
234
274
|
}
|
|
235
275
|
/**
|
|
@@ -7,6 +7,10 @@ import { debugLog } from '../../mcp/utils.js';
|
|
|
7
7
|
import { getEmbeddingSidecar } from './embedding-sidecar.js';
|
|
8
8
|
/**
 * Read a strictly positive integer from an environment variable.
 *
 * Unset, non-numeric, zero and negative values all yield `fallback`.
 * Rejecting negatives matters: a negative HTTP batch limit would make a
 * `for (i = 0; i < n; i += limit)` chunking loop run forever.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is not a positive integer.
 * @returns {number} The parsed positive integer, or `fallback`.
 */
function readPositiveIntEnv(name, fallback) {
    const parsed = Number.parseInt(process.env[name] ?? '', 10);
    return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
}
const BATCH_CONFIG = {
    // Batch size forwarded to the sidecar with every embed request.
    maxBatchSize: readPositiveIntEnv('EMBEDDING_BATCH_SIZE', 8),
    // Max texts per HTTP request to the sidecar. Keeps memory bounded when
    // multiple parallel workers call embedTextsInBatches concurrently.
    // The sidecar still handles GPU batching internally via batch_size.
    httpBatchLimit: readPositiveIntEnv('EMBEDDING_HTTP_BATCH_LIMIT', 50),
};
|
|
11
15
|
export class LocalEmbeddingsService {
|
|
12
16
|
async embedText(text) {
|
|
@@ -22,22 +26,30 @@ export class LocalEmbeddingsService {
|
|
|
22
26
|
async embedTextsInBatches(texts, _batchSize) {
|
|
23
27
|
if (texts.length === 0)
|
|
24
28
|
return [];
|
|
25
|
-
// GPU batch size controls how many texts the model processes at once (memory-bound).
|
|
26
|
-
// We send ALL texts in a single HTTP request and let the sidecar handle GPU batching
|
|
27
|
-
// internally via model.encode(batch_size=N). This eliminates HTTP round-trip overhead.
|
|
28
29
|
const gpuBatchSize = BATCH_CONFIG.maxBatchSize;
|
|
29
|
-
const
|
|
30
|
-
|
|
31
|
-
|
|
30
|
+
const httpLimit = BATCH_CONFIG.httpBatchLimit;
|
|
31
|
+
const httpBatches = Math.ceil(texts.length / httpLimit);
|
|
32
|
+
const gpuBatchesPerRequest = Math.ceil(httpLimit / gpuBatchSize);
|
|
33
|
+
console.error(`[embedding] ${texts.length} texts → ${httpBatches} HTTP requests (http_limit=${httpLimit}, gpu_batch_size=${gpuBatchSize}, ~${gpuBatchesPerRequest} GPU batches/req)`);
|
|
34
|
+
await debugLog('Batch embedding started', { provider: 'local', textCount: texts.length, gpuBatchSize, httpLimit, httpBatches });
|
|
32
35
|
const sidecar = getEmbeddingSidecar();
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
36
|
+
const allResults = [];
|
|
37
|
+
for (let i = 0; i < texts.length; i += httpLimit) {
|
|
38
|
+
const batch = texts.slice(i, i + httpLimit);
|
|
39
|
+
const batchNum = Math.floor(i / httpLimit) + 1;
|
|
40
|
+
try {
|
|
41
|
+
const results = await sidecar.embed(batch, gpuBatchSize);
|
|
42
|
+
allResults.push(...results);
|
|
43
|
+
if (httpBatches > 1) {
|
|
44
|
+
console.error(`[embedding] HTTP batch ${batchNum}/${httpBatches}: ${batch.length} texts embedded`);
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
catch (error) {
|
|
48
|
+
const msg = error instanceof Error ? error.message : String(error);
|
|
49
|
+
console.error(`[embedding] FAILED HTTP batch ${batchNum}/${httpBatches} (${batch.length} texts, gpuBatchSize=${gpuBatchSize}): ${msg}`);
|
|
50
|
+
throw error;
|
|
51
|
+
}
|
|
41
52
|
}
|
|
53
|
+
return allResults;
|
|
42
54
|
}
|
|
43
55
|
}
|
package/package.json
CHANGED