code-graph-context 2.13.0 → 2.13.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,9 +25,40 @@ export class EmbeddingSidecar {
25
25
  stopping = false;
26
26
  _exitHandler = null;
27
27
  _idleTimer = null;
28
+ // Concurrency semaphore — model.encode() is GPU-bound and processes
29
+ // requests serially inside uvicorn. If 15 workers send requests at once,
30
+ // the first takes ~3s, the last waits ~45s and times out. By queuing
31
+ // excess requests here (no timeout pressure) we let only N through at a
32
+ // time, keeping each request well within the 60s timeout.
33
+ maxConcurrent = parseInt(process.env.EMBEDDING_MAX_CONCURRENT ?? '', 10) || 2;
34
+ inflight = 0;
35
+ waitQueue = [];
28
36
  constructor(config = {}) {
29
37
  this.config = { ...DEFAULT_CONFIG, ...config };
30
38
  }
39
+ /**
40
+ * Wait for a concurrency slot. If under the limit, returns immediately.
41
+ * Otherwise parks the caller in a FIFO queue until a slot opens.
42
+ */
43
+ acquireSlot() {
44
+ if (this.inflight < this.maxConcurrent) {
45
+ this.inflight++;
46
+ return; // fast path — no allocation, no Promise
47
+ }
48
+ return new Promise((resolve) => this.waitQueue.push(() => {
49
+ this.inflight++;
50
+ resolve();
51
+ }));
52
+ }
53
+ /**
54
+ * Release a slot, unblocking the next queued caller if any.
55
+ */
56
+ releaseSlot() {
57
+ this.inflight--;
58
+ const next = this.waitQueue.shift();
59
+ if (next)
60
+ next(); // wake one waiter — it will increment inflight
61
+ }
31
62
  get baseUrl() {
32
63
  return `http://${this.config.host}:${this.config.port}`;
33
64
  }
@@ -183,9 +214,19 @@ export class EmbeddingSidecar {
183
214
  }
184
215
  /**
185
216
  * Embed an array of texts. Lazily starts the sidecar if not running.
217
+ * Concurrency-limited: at most `maxConcurrent` requests hit the sidecar
218
+ * at once. Excess callers wait in a FIFO queue (no timeout pressure).
186
219
  */
187
220
  async embed(texts, gpuBatchSize) {
188
221
  await this.start();
222
+ // Wait for a concurrency slot — the timeout only starts AFTER we
223
+ // acquire the slot, so queued requests don't eat into their timeout.
224
+ const queuedAt = Date.now();
225
+ await this.acquireSlot();
226
+ const queueMs = Date.now() - queuedAt;
227
+ if (queueMs > 100) {
228
+ console.error(`[embedding-sidecar] Waited ${queueMs}ms for concurrency slot (inflight=${this.inflight}, queued=${this.waitQueue.length})`);
229
+ }
189
230
  const controller = new AbortController();
190
231
  const timeout = setTimeout(() => controller.abort(), this.config.requestTimeoutMs);
191
232
  const startTime = Date.now();
@@ -203,8 +244,6 @@ export class EmbeddingSidecar {
203
244
  const detail = await res.text();
204
245
  const isOOM = detail.toLowerCase().includes('out of memory');
205
246
  if (res.status === 500 && isOOM) {
206
- // OOM leaves GPU memory in a corrupted state — kill the sidecar
207
- // so the next request spawns a fresh process with clean memory
208
247
  console.error('[embedding-sidecar] OOM detected, restarting sidecar to reclaim GPU memory');
209
248
  await this.stop();
210
249
  }
@@ -230,6 +269,7 @@ export class EmbeddingSidecar {
230
269
  }
231
270
  finally {
232
271
  clearTimeout(timeout);
272
+ this.releaseSlot(); // always release, even on error
233
273
  }
234
274
  }
235
275
  /**
@@ -7,6 +7,10 @@ import { debugLog } from '../../mcp/utils.js';
7
7
  import { getEmbeddingSidecar } from './embedding-sidecar.js';
8
8
  const BATCH_CONFIG = {
9
9
  maxBatchSize: parseInt(process.env.EMBEDDING_BATCH_SIZE ?? '', 10) || 8,
10
+ // Max texts per HTTP request to the sidecar. Keeps memory bounded when
11
+ // multiple parallel workers call embedTextsInBatches concurrently.
12
+ // The sidecar still handles GPU batching internally via batch_size.
13
+ httpBatchLimit: parseInt(process.env.EMBEDDING_HTTP_BATCH_LIMIT ?? '', 10) || 50,
10
14
  };
11
15
  export class LocalEmbeddingsService {
12
16
  async embedText(text) {
@@ -22,22 +26,30 @@ export class LocalEmbeddingsService {
22
26
  async embedTextsInBatches(texts, _batchSize) {
23
27
  if (texts.length === 0)
24
28
  return [];
25
- // GPU batch size controls how many texts the model processes at once (memory-bound).
26
- // We send ALL texts in a single HTTP request and let the sidecar handle GPU batching
27
- // internally via model.encode(batch_size=N). This eliminates HTTP round-trip overhead.
28
29
  const gpuBatchSize = BATCH_CONFIG.maxBatchSize;
29
- const gpuBatches = Math.ceil(texts.length / gpuBatchSize);
30
- console.error(`[embedding] Sending ${texts.length} texts in 1 request (gpu_batch_size=${gpuBatchSize}, ~${gpuBatches} GPU batches)`);
31
- await debugLog('Batch embedding started', { provider: 'local', textCount: texts.length, gpuBatchSize });
30
+ const httpLimit = BATCH_CONFIG.httpBatchLimit;
31
+ const httpBatches = Math.ceil(texts.length / httpLimit);
32
+ const gpuBatchesPerRequest = Math.ceil(httpLimit / gpuBatchSize);
33
+ console.error(`[embedding] ${texts.length} texts → ${httpBatches} HTTP requests (http_limit=${httpLimit}, gpu_batch_size=${gpuBatchSize}, ~${gpuBatchesPerRequest} GPU batches/req)`);
34
+ await debugLog('Batch embedding started', { provider: 'local', textCount: texts.length, gpuBatchSize, httpLimit, httpBatches });
32
35
  const sidecar = getEmbeddingSidecar();
33
- try {
34
- const results = await sidecar.embed(texts, gpuBatchSize);
35
- return results;
36
- }
37
- catch (error) {
38
- const msg = error instanceof Error ? error.message : String(error);
39
- console.error(`[embedding] FAILED (${texts.length} texts, gpuBatchSize=${gpuBatchSize}): ${msg}`);
40
- throw error;
36
+ const allResults = [];
37
+ for (let i = 0; i < texts.length; i += httpLimit) {
38
+ const batch = texts.slice(i, i + httpLimit);
39
+ const batchNum = Math.floor(i / httpLimit) + 1;
40
+ try {
41
+ const results = await sidecar.embed(batch, gpuBatchSize);
42
+ allResults.push(...results);
43
+ if (httpBatches > 1) {
44
+ console.error(`[embedding] HTTP batch ${batchNum}/${httpBatches}: ${batch.length} texts embedded`);
45
+ }
46
+ }
47
+ catch (error) {
48
+ const msg = error instanceof Error ? error.message : String(error);
49
+ console.error(`[embedding] FAILED HTTP batch ${batchNum}/${httpBatches} (${batch.length} texts, gpuBatchSize=${gpuBatchSize}): ${msg}`);
50
+ throw error;
51
+ }
41
52
  }
53
+ return allResults;
42
54
  }
43
55
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "code-graph-context",
3
- "version": "2.13.0",
3
+ "version": "2.13.2",
4
4
  "description": "MCP server that builds code graphs to provide rich context to LLMs",
5
5
  "type": "module",
6
6
  "homepage": "https://github.com/drewdrewH/code-graph-context#readme",