npm - code-graph-context - Versions diffs - 2.10.5 → 2.11.1 - Mend

code-graph-context 2.10.5 → 2.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/README.md +9 -5
package/dist/cli/cli.js +1 -1
package/dist/core/embeddings/embedding-sidecar.js +1 -1
package/dist/core/embeddings/embeddings.service.js +3 -2
package/package.json +1 -1
package/sidecar/embedding_server.py +16 -22
package/sidecar/requirements.txt +1 -0

package/README.md CHANGED Viewed

@@ -189,9 +189,10 @@ If you prefer to edit the config files directly:
 | `NEO4J_URI` | No | `bolt://localhost:7687` | Neo4j connection URI |
 | `NEO4J_USER` | No | `neo4j` | Neo4j username |
 | `NEO4J_PASSWORD` | No | `PASSWORD` | Neo4j password |
-| `EMBEDDING_MODEL` | No | `Qodo/Qodo-Embed-1-1.5B` | Local embedding model (see [Embedding Configuration](#embedding-configuration)) |
+| `EMBEDDING_MODEL` | No | `Qwen/Qwen3-Embedding-0.6B` | Local embedding model (see [Embedding Configuration](#embedding-configuration)) |
 | `EMBEDDING_SIDECAR_PORT` | No | `8787` | Port for local embedding server |
-| `EMBEDDING_FULL_PRECISION` | No | `false` | Set `true` for float32 (uses ~2x memory) |
+| `EMBEDDING_DEVICE` | No | `cpu` | Device for embeddings (`cpu` or `mps`). CPU is default to avoid MPS memory bloat |
+| `EMBEDDING_HALF_PRECISION` | No | `false` | Set `true` for float16 (uses ~0.5x memory) |
 | `OPENAI_ENABLED` | No | `false` | Set `true` to use OpenAI instead of local |
 | `OPENAI_API_KEY` | No* | - | Required when `OPENAI_ENABLED=true` |
@@ -536,9 +537,11 @@ This enables queries like "find all hooks that use context" while maintaining AS
 Local embeddings are the default — **no API key needed**. The Python sidecar starts automatically on first use and runs a local model for high-quality code embeddings.
-The sidecar uses **float16 (half precision)** by default, which halves memory usage with no meaningful quality loss. It also auto-shuts down after 3 minutes of inactivity to free memory, and restarts lazily when needed (~15-20s).
+The sidecar runs on **CPU by default** to avoid MPS memory pool bloat on Apple Silicon (MPS can pre-allocate 10+ GB even for small models). CPU is fast enough for models under 1B params. It also auto-shuts down after 3 minutes of inactivity to free memory, and restarts lazily when needed (~15-20s).
-> **Full precision mode:** If you have 32+ GB RAM and want float32, set `EMBEDDING_FULL_PRECISION=true`.
+> **GPU acceleration:** Set `EMBEDDING_DEVICE=mps` to use Apple Silicon GPU for larger models (1B+ params). Only recommended on machines with 32+ GB RAM.
+>
+> **Half precision:** Set `EMBEDDING_HALF_PRECISION=true` to load the model in float16, roughly halving memory usage.
 ### Available Models
@@ -546,7 +549,8 @@ Set via the `EMBEDDING_MODEL` environment variable:
 | Model | Dimensions | RAM (fp16) | Quality | Best For |
 |-------|-----------|-----|---------|----------|
-| `Qodo/Qodo-Embed-1-1.5B` (default) | 1536 | ~4.5 GB | Best | Default, works on 16GB machines |
+| `Qwen/Qwen3-Embedding-0.6B` (default) | 1024 | ~1.2 GB | Best | Default, code-aware, MTEB-Code #1 |
+| `Qodo/Qodo-Embed-1-1.5B` | 1536 | ~4.5 GB | Great | Machines with 32+ GB RAM |
 | `BAAI/bge-base-en-v1.5` | 768 | ~250 MB | Good | General purpose, low RAM |
 | `sentence-transformers/all-MiniLM-L6-v2` | 384 | ~100 MB | OK | Minimal RAM, fast |
 | `nomic-ai/nomic-embed-text-v1.5` | 768 | ~300 MB | Good | Code + prose mixed |

package/dist/cli/cli.js CHANGED Viewed

@@ -251,7 +251,7 @@ const setupSidecar = async () => {
         return;
     }
     // Pre-download the embedding model so first real use is fast
-    const modelName = process.env.EMBEDDING_MODEL ?? 'Qodo/Qodo-Embed-1-1.5B';
+    const modelName = process.env.EMBEDDING_MODEL ?? 'Qwen/Qwen3-Embedding-0.6B';
     await preDownloadModel(sidecarDir, python, modelName);
 };
 /**

package/dist/core/embeddings/embedding-sidecar.js CHANGED Viewed

@@ -12,7 +12,7 @@ const __dirname = dirname(__filename);
 const DEFAULT_CONFIG = {
     port: parseInt(process.env.EMBEDDING_SIDECAR_PORT ?? '', 10) || 8787,
     host: '127.0.0.1',
-    model: process.env.EMBEDDING_MODEL ?? 'Qodo/Qodo-Embed-1-1.5B',
+    model: process.env.EMBEDDING_MODEL ?? 'Qwen/Qwen3-Embedding-0.6B',
     startupTimeoutMs: 120_000, // 2 min — first run downloads the model
     requestTimeoutMs: 60_000,
     idleTimeoutMs: 180_000, // 3 min — auto-shutdown after no requests

package/dist/core/embeddings/embeddings.service.js CHANGED Viewed

@@ -5,7 +5,7 @@
  * and get the right implementation based on OPENAI_ENABLED.
  *
  *   OPENAI_ENABLED=true  → OpenAI text-embedding-3-large (requires OPENAI_API_KEY)
- *   default              → Local Python sidecar with Qodo-Embed-1-1.5B
+ *   default              → Local Python sidecar with Qwen3-Embedding-0.6B
  */
 import { LocalEmbeddingsService } from './local-embeddings.service.js';
 import { OpenAIEmbeddingsService } from './openai-embeddings.service.js';
@@ -24,6 +24,7 @@ export const EMBEDDING_DIMENSIONS = {
     'text-embedding-3-large': 3072,
     'text-embedding-3-small': 1536,
     // Local models (via sidecar)
+    'Qwen/Qwen3-Embedding-0.6B': 1024,
     'Qodo/Qodo-Embed-1-1.5B': 1536,
     'sentence-transformers/all-MiniLM-L6-v2': 384,
     'sentence-transformers/all-mpnet-base-v2': 768,
@@ -45,7 +46,7 @@ export const getEmbeddingDimensions = () => {
         const model = process.env.OPENAI_EMBEDDING_MODEL ?? 'text-embedding-3-large';
         return EMBEDDING_DIMENSIONS[model] ?? 3072;
     }
-    const model = process.env.EMBEDDING_MODEL ?? 'Qodo/Qodo-Embed-1-1.5B';
+    const model = process.env.EMBEDDING_MODEL ?? 'Qwen/Qwen3-Embedding-0.6B';
     return EMBEDDING_DIMENSIONS[model] ?? 1536;
 };
 /**

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "code-graph-context",
-  "version": "2.10.5",
+  "version": "2.11.1",
   "description": "MCP server that builds code graphs to provide rich context to LLMs",
   "type": "module",
   "homepage": "https://github.com/drewdrewH/code-graph-context#readme",

package/sidecar/embedding_server.py CHANGED Viewed

@@ -1,6 +1,6 @@
 """
 Local embedding server for code-graph-context.
-Uses Qodo-Embed-1-1.5B for high-quality code embeddings without OpenAI dependency.
+Uses Qwen3-Embedding-0.6B for high-quality code embeddings without OpenAI dependency.
 Runs as a sidecar process managed by the Node.js MCP server.
 """
@@ -27,7 +27,7 @@ logger.info(f"Sidecar process starting (pid={os.getpid()})")
 app = FastAPI(title="code-graph-context embedding sidecar")
 model = None
-model_name = os.environ.get("EMBEDDING_MODEL", "Qodo/Qodo-Embed-1-1.5B")
+model_name = os.environ.get("EMBEDDING_MODEL", "Qwen/Qwen3-Embedding-0.6B")
 class EmbedRequest(BaseModel):
@@ -48,16 +48,23 @@ def load_model():
         import torch
         from sentence_transformers import SentenceTransformer
-        device = "mps" if torch.backends.mps.is_available() else "cpu"
+        # Use CPU by default — MPS pre-allocates a massive memory pool (10+ GB)
+        # that bloats small models. CPU on Apple Silicon is fast enough for <1B models.
+        # Set EMBEDDING_DEVICE=mps to force GPU if needed for large models.
+        device_override = os.environ.get("EMBEDDING_DEVICE", "").lower()
+        if device_override:
+            device = device_override
+        else:
+            device = "cpu"
         logger.info(f"Loading {model_name} on {device}...")
         logger.info(f"PyTorch version: {torch.__version__}, MPS available: {torch.backends.mps.is_available()}")
-        use_half = os.environ.get("EMBEDDING_FULL_PRECISION", "").lower() != "true"
-        model = SentenceTransformer(model_name, device=device)
+        use_half = os.environ.get("EMBEDDING_HALF_PRECISION", "").lower() == "true"
         if use_half:
-            model.half()
+            model = SentenceTransformer(model_name, device=device, model_kwargs={"torch_dtype": "float16"})
             logger.info(f"Model loaded in float16 (half precision)")
         else:
+            model = SentenceTransformer(model_name, device=device)
             logger.info(f"Model loaded in float32 (full precision)")
         logger.info(f"Running warmup...")
@@ -114,7 +121,6 @@ async def embed(req: EmbedRequest):
 def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[float]]:
     """
     Encode texts, falling back to CPU if MPS runs out of memory.
-    Also retries with smaller batch sizes before giving up.
     """
     import torch
@@ -126,28 +132,18 @@ def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[fl
                 show_progress_bar=False,
                 normalize_embeddings=True,
             )
-        # Free intermediate tensors after each request
-        if hasattr(torch.mps, "empty_cache"):
-            torch.mps.empty_cache()
         return result.tolist()
-    except (torch.mps.OutOfMemoryError, RuntimeError) as e:
+    except (RuntimeError,) as e:
         if "out of memory" not in str(e).lower():
             raise
-        logger.warning(f"MPS OOM with batch_size={batch_size}, len={len(texts)}. Falling back to CPU.")
-        # Free MPS memory
-        if hasattr(torch.mps, "empty_cache"):
-            torch.mps.empty_cache()
+        logger.warning(f"OOM with batch_size={batch_size}, len={len(texts)}. Falling back to CPU.")
         gc.collect()
-        # Fall back to CPU for this request
         original_device = model.device
         model.to("cpu")
-        logger.info("Model moved to CPU for fallback encoding")
         try:
-            # Use smaller batches on CPU
             cpu_batch = min(batch_size, 4)
             with torch.no_grad():
                 result = model.encode(
@@ -159,12 +155,10 @@ def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[fl
             logger.info(f"CPU fallback encoding complete ({len(texts)} texts)")
             return result.tolist()
         finally:
-            # Move back to MPS for future requests
             try:
                 model.to(original_device)
-                logger.info(f"Model moved back to {original_device}")
             except Exception:
-                logger.warning("Could not move model back to MPS, staying on CPU")
+                logger.warning("Could not move model back, staying on CPU")
 def handle_signal(sig, _frame):

package/sidecar/requirements.txt CHANGED Viewed

@@ -3,3 +3,4 @@ uvicorn>=0.24.0
 sentence-transformers>=3.0.0
 torch>=2.0.0
 pydantic>=2.0.0
+transformers>=4.51.0