npm - code-graph-context - Versions diffs - 2.13.0 → 2.13.2 - Mend

code-graph-context 2.13.0 → 2.13.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/core/embeddings/embedding-sidecar.js +42 -2
package/dist/core/embeddings/local-embeddings.service.js +26 -14
package/package.json +1 -1

package/dist/core/embeddings/embedding-sidecar.js CHANGED Viewed

@@ -25,9 +25,40 @@ export class EmbeddingSidecar {
     stopping = false;
     _exitHandler = null;
     _idleTimer = null;
+    // Concurrency semaphore — model.encode() is GPU-bound and processes
+    // requests serially inside uvicorn. If 15 workers send requests at once,
+    // the first takes ~3s, the last waits ~45s and times out. By queuing
+    // excess requests here (no timeout pressure) we let only N through at a
+    // time, keeping each request well within the 60s timeout.
+    maxConcurrent = Math.max(1, parseInt(process.env.EMBEDDING_MAX_CONCURRENT ?? '', 10) || 2); // unset/invalid/0 -> 2; clamped to >= 1 because a negative limit would queue every caller forever
+    inflight = 0; // number of callers currently holding a slot
+    waitQueue = []; // FIFO of wake-up callbacks for callers waiting on a slot
     constructor(config = {}) {
         this.config = { ...DEFAULT_CONFIG, ...config };
     }
+    /**
+     * Wait for a concurrency slot. If under the limit, returns immediately.
+     * Otherwise parks the caller in a FIFO queue until a slot opens.
+     */
+    acquireSlot() {
+        const hasFreeSlot = this.inflight < this.maxConcurrent;
+        if (hasFreeSlot) {
+            this.inflight += 1; // claim the slot synchronously
+            return undefined; // nothing to wait for, so no Promise is created
+        }
+        return new Promise((wake) => { // at capacity: park a waker at the back of the queue
+            this.waitQueue.push(() => { this.inflight += 1; wake(); });
+        });
+    }
+    /**
+     * Release a slot, unblocking the next queued caller if any.
+     */
+    releaseSlot() {
+        this.inflight -= 1; // give back the slot held by the finished request
+        const wakeOldest = this.waitQueue.shift(); // undefined when nobody is queued
+        if (!wakeOldest) return;
+        wakeOldest(); // the queued callback increments inflight, then resolves its waiter
+    }
     get baseUrl() {
         return `http://${this.config.host}:${this.config.port}`;
     }
@@ -183,9 +214,19 @@ export class EmbeddingSidecar {
     }
     /**
      * Embed an array of texts. Lazily starts the sidecar if not running.
+     * Concurrency-limited: at most `maxConcurrent` requests hit the sidecar
+     * at once. Excess callers wait in a FIFO queue (no timeout pressure).
      */
     async embed(texts, gpuBatchSize) {
         await this.start();
+        // Wait for a concurrency slot — the timeout only starts AFTER we
+        // acquire the slot, so queued requests don't eat into their timeout.
+        const queuedAt = Date.now();
+        await this.acquireSlot();
+        const queueMs = Date.now() - queuedAt;
+        if (queueMs > 100) {
+            console.error(`[embedding-sidecar] Waited ${queueMs}ms for concurrency slot (inflight=${this.inflight}, queued=${this.waitQueue.length})`);
+        }
         const controller = new AbortController();
         const timeout = setTimeout(() => controller.abort(), this.config.requestTimeoutMs);
         const startTime = Date.now();
@@ -203,8 +244,6 @@ export class EmbeddingSidecar {
                 const detail = await res.text();
                 const isOOM = detail.toLowerCase().includes('out of memory');
                 if (res.status === 500 && isOOM) {
-                    // OOM leaves GPU memory in a corrupted state — kill the sidecar
-                    // so the next request spawns a fresh process with clean memory
                     console.error('[embedding-sidecar] OOM detected, restarting sidecar to reclaim GPU memory');
                     await this.stop();
                 }
@@ -230,6 +269,7 @@ export class EmbeddingSidecar {
         }
         finally {
             clearTimeout(timeout);
+            this.releaseSlot(); // always release, even on error
         }
     }
     /**

package/dist/core/embeddings/local-embeddings.service.js CHANGED Viewed

@@ -7,6 +7,10 @@ import { debugLog } from '../../mcp/utils.js';
 import { getEmbeddingSidecar } from './embedding-sidecar.js';
 const BATCH_CONFIG = {
     maxBatchSize: parseInt(process.env.EMBEDDING_BATCH_SIZE ?? '', 10) || 8,
+    // Max texts per HTTP request to the sidecar. Keeps memory bounded when
+    // multiple parallel workers call embedTextsInBatches concurrently.
+    // The sidecar still handles GPU batching internally via batch_size.
+    httpBatchLimit: Math.max(1, parseInt(process.env.EMBEDDING_HTTP_BATCH_LIMIT ?? '', 10) || 50), // unset/invalid/0 -> 50; clamped to >= 1 because the chunking loop advances by this value and a negative step never terminates
 };
 export class LocalEmbeddingsService {
     async embedText(text) {
@@ -22,22 +26,30 @@ export class LocalEmbeddingsService {
     async embedTextsInBatches(texts, _batchSize) {
         if (texts.length === 0)
             return [];
-        // GPU batch size controls how many texts the model processes at once (memory-bound).
-        // We send ALL texts in a single HTTP request and let the sidecar handle GPU batching
-        // internally via model.encode(batch_size=N). This eliminates HTTP round-trip overhead.
         const gpuBatchSize = BATCH_CONFIG.maxBatchSize;
-        const gpuBatches = Math.ceil(texts.length / gpuBatchSize);
-        console.error(`[embedding] Sending ${texts.length} texts in 1 request (gpu_batch_size=${gpuBatchSize}, ~${gpuBatches} GPU batches)`);
-        await debugLog('Batch embedding started', { provider: 'local', textCount: texts.length, gpuBatchSize });
+        const httpLimit = BATCH_CONFIG.httpBatchLimit; // max texts sent in one HTTP request
+        const httpBatches = Math.ceil(texts.length / httpLimit); // number of sequential requests needed
+        const gpuBatchesPerRequest = Math.ceil(Math.min(httpLimit, texts.length) / gpuBatchSize); // log-only estimate; capped by texts.length so small inputs are not overstated
+        console.error(`[embedding] ${texts.length} texts → ${httpBatches} HTTP requests (http_limit=${httpLimit}, gpu_batch_size=${gpuBatchSize}, ~${gpuBatchesPerRequest} GPU batches/req)`);
+        await debugLog('Batch embedding started', { provider: 'local', textCount: texts.length, gpuBatchSize, httpLimit, httpBatches });
         const sidecar = getEmbeddingSidecar();
-        try {
-            const results = await sidecar.embed(texts, gpuBatchSize);
-            return results;
-        }
-        catch (error) {
-            const msg = error instanceof Error ? error.message : String(error);
-            console.error(`[embedding] FAILED (${texts.length} texts, gpuBatchSize=${gpuBatchSize}): ${msg}`);
-            throw error;
+        const allResults = []; // results appended batch by batch, in slice order
+        for (let i = 0; i < texts.length; i += httpLimit) {
+            const batch = texts.slice(i, i + httpLimit);
+            const batchNum = Math.floor(i / httpLimit) + 1; // 1-based index, used only in log messages
+            try {
+                const results = await sidecar.embed(batch, gpuBatchSize); // awaited one request at a time
+                allResults.push(...results);
+                if (httpBatches > 1) {
+                    console.error(`[embedding] HTTP batch ${batchNum}/${httpBatches}: ${batch.length} texts embedded`);
+                }
+            }
+            catch (error) {
+                const msg = error instanceof Error ? error.message : String(error);
+                console.error(`[embedding] FAILED HTTP batch ${batchNum}/${httpBatches} (${batch.length} texts, gpuBatchSize=${gpuBatchSize}): ${msg}`);
+                throw error; // rethrow the original error after logging; remaining batches are not attempted
+            }
         }
+        return allResults;
     }
 }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "code-graph-context",
-  "version": "2.13.0",
+  "version": "2.13.2",
   "description": "MCP server that builds code graphs to provide rich context to LLMs",
   "type": "module",
   "homepage": "https://github.com/drewdrewH/code-graph-context#readme",