npm - code-graph-context - Versions diffs - 2.11.1 → 2.12.0 - Mend

code-graph-context 2.11.1 → 2.12.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (6)

package/README.md +5 -5
package/dist/cli/cli.js +1 -1
package/dist/core/embeddings/embedding-sidecar.js +1 -1
package/dist/core/embeddings/embeddings.service.js +2 -2
package/package.json +1 -1
package/sidecar/embedding_server.py +2 -5

package/README.md CHANGED Viewed

@@ -189,9 +189,9 @@ If you prefer to edit the config files directly:
 | `NEO4J_URI` | No | `bolt://localhost:7687` | Neo4j connection URI |
 | `NEO4J_USER` | No | `neo4j` | Neo4j username |
 | `NEO4J_PASSWORD` | No | `PASSWORD` | Neo4j password |
-| `EMBEDDING_MODEL` | No | `Qwen/Qwen3-Embedding-0.6B` | Local embedding model (see [Embedding Configuration](#embedding-configuration)) |
+| `EMBEDDING_MODEL` | No | `codesage/codesage-base-v2` | Local embedding model (see [Embedding Configuration](#embedding-configuration)) |
 | `EMBEDDING_SIDECAR_PORT` | No | `8787` | Port for local embedding server |
-| `EMBEDDING_DEVICE` | No | `cpu` | Device for embeddings (`cpu` or `mps`). CPU is default to avoid MPS memory bloat |
+| `EMBEDDING_DEVICE` | No | auto (`mps`/`cpu`) | Device for embeddings. Auto-detects MPS on Apple Silicon |
 | `EMBEDDING_HALF_PRECISION` | No | `false` | Set `true` for float16 (uses ~0.5x memory) |
 | `OPENAI_ENABLED` | No | `false` | Set `true` to use OpenAI instead of local |
 | `OPENAI_API_KEY` | No* | - | Required when `OPENAI_ENABLED=true` |
@@ -537,9 +537,9 @@ This enables queries like "find all hooks that use context" while maintaining AS
 Local embeddings are the default — **no API key needed**. The Python sidecar starts automatically on first use and runs a local model for high-quality code embeddings.
-The sidecar runs on **CPU by default** to avoid MPS memory pool bloat on Apple Silicon (MPS can pre-allocate 10+ GB even for small models). CPU is fast enough for models under 1B params. It also auto-shuts down after 3 minutes of inactivity to free memory, and restarts lazily when needed (~15-20s).
+The sidecar uses **MPS (Apple Silicon GPU)** when available, falling back to CPU. It auto-shuts down after 3 minutes of inactivity to free memory, and restarts lazily when needed (~15-20s).
-> **GPU acceleration:** Set `EMBEDDING_DEVICE=mps` to use Apple Silicon GPU for larger models (1B+ params). Only recommended on machines with 32+ GB RAM.
+> **Device override:** Set `EMBEDDING_DEVICE=cpu` to force CPU if MPS causes issues.
 >
 > **Half precision:** Set `EMBEDDING_HALF_PRECISION=true` to load the model in float16, roughly halving memory usage.
@@ -549,7 +549,7 @@ Set via the `EMBEDDING_MODEL` environment variable:
 | Model | Dimensions | RAM (fp16) | Quality | Best For |
 |-------|-----------|-----|---------|----------|
-| `Qwen/Qwen3-Embedding-0.6B` (default) | 1024 | ~1.2 GB | Best | Default, code-aware, MTEB-Code #1 |
+| `codesage/codesage-base-v2` (default) | 1024 | ~700 MB | Best | Default, code-specific encoder, fast |
 | `Qodo/Qodo-Embed-1-1.5B` | 1536 | ~4.5 GB | Great | Machines with 32+ GB RAM |
 | `BAAI/bge-base-en-v1.5` | 768 | ~250 MB | Good | General purpose, low RAM |
 | `sentence-transformers/all-MiniLM-L6-v2` | 384 | ~100 MB | OK | Minimal RAM, fast |

package/dist/cli/cli.js CHANGED Viewed

@@ -251,7 +251,7 @@ const setupSidecar = async () => {
         return;
     }
     // Pre-download the embedding model so first real use is fast
-    const modelName = process.env.EMBEDDING_MODEL ?? 'Qwen/Qwen3-Embedding-0.6B';
+    const modelName = process.env.EMBEDDING_MODEL ?? 'codesage/codesage-base-v2';
     await preDownloadModel(sidecarDir, python, modelName);
 };
 /**

package/dist/core/embeddings/embedding-sidecar.js CHANGED Viewed

@@ -12,7 +12,7 @@ const __dirname = dirname(__filename);
 const DEFAULT_CONFIG = {
     port: parseInt(process.env.EMBEDDING_SIDECAR_PORT ?? '', 10) || 8787,
     host: '127.0.0.1',
-    model: process.env.EMBEDDING_MODEL ?? 'Qwen/Qwen3-Embedding-0.6B',
+    model: process.env.EMBEDDING_MODEL ?? 'codesage/codesage-base-v2',
     startupTimeoutMs: 120_000, // 2 min — first run downloads the model
     requestTimeoutMs: 60_000,
     idleTimeoutMs: 180_000, // 3 min — auto-shutdown after no requests

package/dist/core/embeddings/embeddings.service.js CHANGED Viewed

@@ -24,7 +24,7 @@ export const EMBEDDING_DIMENSIONS = {
     'text-embedding-3-large': 3072,
     'text-embedding-3-small': 1536,
     // Local models (via sidecar)
-    'Qwen/Qwen3-Embedding-0.6B': 1024,
+    'codesage/codesage-base-v2': 1024,
     'Qodo/Qodo-Embed-1-1.5B': 1536,
     'sentence-transformers/all-MiniLM-L6-v2': 384,
     'sentence-transformers/all-mpnet-base-v2': 768,
@@ -46,7 +46,7 @@ export const getEmbeddingDimensions = () => {
         const model = process.env.OPENAI_EMBEDDING_MODEL ?? 'text-embedding-3-large';
         return EMBEDDING_DIMENSIONS[model] ?? 3072;
     }
-    const model = process.env.EMBEDDING_MODEL ?? 'Qwen/Qwen3-Embedding-0.6B';
+    const model = process.env.EMBEDDING_MODEL ?? 'codesage/codesage-base-v2';
     return EMBEDDING_DIMENSIONS[model] ?? 1536;
 };
 /**

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "code-graph-context",
-  "version": "2.11.1",
+  "version": "2.12.0",
   "description": "MCP server that builds code graphs to provide rich context to LLMs",
   "type": "module",
   "homepage": "https://github.com/drewdrewH/code-graph-context#readme",

package/sidecar/embedding_server.py CHANGED Viewed

@@ -27,7 +27,7 @@ logger.info(f"Sidecar process starting (pid={os.getpid()})")
 app = FastAPI(title="code-graph-context embedding sidecar")
 model = None
-model_name = os.environ.get("EMBEDDING_MODEL", "Qwen/Qwen3-Embedding-0.6B")
+model_name = os.environ.get("EMBEDDING_MODEL", "codesage/codesage-base-v2")
 class EmbedRequest(BaseModel):
@@ -48,14 +48,11 @@ def load_model():
         import torch
         from sentence_transformers import SentenceTransformer
-        # Use CPU by default — MPS pre-allocates a massive memory pool (10+ GB)
-        # that bloats small models. CPU on Apple Silicon is fast enough for <1B models.
-        # Set EMBEDDING_DEVICE=mps to force GPU if needed for large models.
         device_override = os.environ.get("EMBEDDING_DEVICE", "").lower()
         if device_override:
             device = device_override
         else:
-            device = "cpu"
+            device = "mps" if torch.backends.mps.is_available() else "cpu"
         logger.info(f"Loading {model_name} on {device}...")
         logger.info(f"PyTorch version: {torch.__version__}, MPS available: {torch.backends.mps.is_available()}")