npm - code-graph-context - Versions diffs - 2.11.0 → 2.11.1 - Mend

code-graph-context 2.11.0 → 2.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/README.md +5 -2
package/package.json +1 -1
package/sidecar/embedding_server.py +13 -19

package/README.md CHANGED Viewed

@@ -191,6 +191,7 @@ If you prefer to edit the config files directly:
 | `NEO4J_PASSWORD` | No | `PASSWORD` | Neo4j password |
 | `EMBEDDING_MODEL` | No | `Qwen/Qwen3-Embedding-0.6B` | Local embedding model (see [Embedding Configuration](#embedding-configuration)) |
 | `EMBEDDING_SIDECAR_PORT` | No | `8787` | Port for local embedding server |
+| `EMBEDDING_DEVICE` | No | `cpu` | Device for embeddings (`cpu` or `mps`). CPU is the default to avoid MPS memory bloat |
 | `EMBEDDING_HALF_PRECISION` | No | `false` | Set `true` for float16 (uses ~0.5x memory) |
 | `OPENAI_ENABLED` | No | `false` | Set `true` to use OpenAI instead of local |
 | `OPENAI_API_KEY` | No* | - | Required when `OPENAI_ENABLED=true` |
@@ -536,9 +537,11 @@ This enables queries like "find all hooks that use context" while maintaining AS
 Local embeddings are the default — **no API key needed**. The Python sidecar starts automatically on first use and runs a local model for high-quality code embeddings.
-The sidecar uses **float16 (half precision)** by default, which halves memory usage with no meaningful quality loss. It also auto-shuts down after 3 minutes of inactivity to free memory, and restarts lazily when needed (~15-20s).
+The sidecar runs on **CPU by default** to avoid MPS memory pool bloat on Apple Silicon (MPS can pre-allocate 10+ GB even for small models). CPU is fast enough for models under 1B params. It also auto-shuts down after 3 minutes of inactivity to free memory, and restarts lazily when needed (~15-20s).
-> **Half precision mode:** To reduce memory usage at the cost of some accuracy, set `EMBEDDING_HALF_PRECISION=true`.
+> **GPU acceleration:** Set `EMBEDDING_DEVICE=mps` to use Apple Silicon GPU for larger models (1B+ params). Only recommended on machines with 32+ GB RAM.
+>
+> **Half precision:** Set `EMBEDDING_HALF_PRECISION=true` to load the model in float16, roughly halving memory usage.
 ### Available Models

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "code-graph-context",
-  "version": "2.11.0",
+  "version": "2.11.1",
   "description": "MCP server that builds code graphs to provide rich context to LLMs",
   "type": "module",
   "homepage": "https://github.com/drewdrewH/code-graph-context#readme",

package/sidecar/embedding_server.py CHANGED Viewed

@@ -48,16 +48,23 @@ def load_model():
         import torch
         from sentence_transformers import SentenceTransformer
-        device = "mps" if torch.backends.mps.is_available() else "cpu"
+        # Use CPU by default — MPS pre-allocates a massive memory pool (10+ GB)
+        # that bloats small models. CPU on Apple Silicon is fast enough for <1B models.
+        # Set EMBEDDING_DEVICE=mps to force GPU if needed for large models.
+        device_override = os.environ.get("EMBEDDING_DEVICE", "").lower()
+        if device_override:
+            device = device_override
+        else:
+            device = "cpu"
         logger.info(f"Loading {model_name} on {device}...")
         logger.info(f"PyTorch version: {torch.__version__}, MPS available: {torch.backends.mps.is_available()}")
         use_half = os.environ.get("EMBEDDING_HALF_PRECISION", "").lower() == "true"
-        model = SentenceTransformer(model_name, device=device)
         if use_half:
-            model.half()
+            model = SentenceTransformer(model_name, device=device, model_kwargs={"torch_dtype": "float16"})
             logger.info(f"Model loaded in float16 (half precision)")
         else:
+            model = SentenceTransformer(model_name, device=device)
             logger.info(f"Model loaded in float32 (full precision)")
         logger.info(f"Running warmup...")
@@ -114,7 +121,6 @@ async def embed(req: EmbedRequest):
 def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[float]]:
     """
     Encode texts, falling back to CPU if MPS runs out of memory.
-    Also retries with smaller batch sizes before giving up.
     """
     import torch
@@ -126,28 +132,18 @@ def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[fl
                 show_progress_bar=False,
                 normalize_embeddings=True,
             )
-        # Free intermediate tensors after each request
-        if hasattr(torch.mps, "empty_cache"):
-            torch.mps.empty_cache()
         return result.tolist()
-    except (torch.mps.OutOfMemoryError, RuntimeError) as e:
+    except (RuntimeError,) as e:
         if "out of memory" not in str(e).lower():
             raise
-        logger.warning(f"MPS OOM with batch_size={batch_size}, len={len(texts)}. Falling back to CPU.")
-        # Free MPS memory
-        if hasattr(torch.mps, "empty_cache"):
-            torch.mps.empty_cache()
+        logger.warning(f"OOM with batch_size={batch_size}, len={len(texts)}. Falling back to CPU.")
         gc.collect()
-        # Fall back to CPU for this request
         original_device = model.device
         model.to("cpu")
-        logger.info("Model moved to CPU for fallback encoding")
         try:
-            # Use smaller batches on CPU
             cpu_batch = min(batch_size, 4)
             with torch.no_grad():
                 result = model.encode(
@@ -159,12 +155,10 @@ def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[fl
             logger.info(f"CPU fallback encoding complete ({len(texts)} texts)")
             return result.tolist()
         finally:
-            # Move back to MPS for future requests
             try:
                 model.to(original_device)
-                logger.info(f"Model moved back to {original_device}")
             except Exception:
-                logger.warning("Could not move model back to MPS, staying on CPU")
+                logger.warning("Could not move model back, staying on CPU")
 def handle_signal(sig, _frame):