code-graph-context 2.11.1 → 2.12.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -189,9 +189,9 @@ If you prefer to edit the config files directly:
189
189
  | `NEO4J_URI` | No | `bolt://localhost:7687` | Neo4j connection URI |
190
190
  | `NEO4J_USER` | No | `neo4j` | Neo4j username |
191
191
  | `NEO4J_PASSWORD` | No | `PASSWORD` | Neo4j password |
192
- | `EMBEDDING_MODEL` | No | `Qwen/Qwen3-Embedding-0.6B` | Local embedding model (see [Embedding Configuration](#embedding-configuration)) |
192
+ | `EMBEDDING_MODEL` | No | `codesage/codesage-base-v2` | Local embedding model (see [Embedding Configuration](#embedding-configuration)) |
193
193
  | `EMBEDDING_SIDECAR_PORT` | No | `8787` | Port for local embedding server |
194
- | `EMBEDDING_DEVICE` | No | `cpu` | Device for embeddings (`cpu` or `mps`). CPU is default to avoid MPS memory bloat |
194
+ | `EMBEDDING_DEVICE` | No | auto (`mps`/`cpu`) | Device for embeddings. When unset, auto-detects MPS on Apple Silicon and falls back to CPU otherwise; set it explicitly (e.g. `cpu`) to override |
195
195
  | `EMBEDDING_HALF_PRECISION` | No | `false` | Set `true` for float16 (uses ~0.5x memory) |
196
196
  | `OPENAI_ENABLED` | No | `false` | Set `true` to use OpenAI instead of local |
197
197
  | `OPENAI_API_KEY` | No* | - | Required when `OPENAI_ENABLED=true` |
@@ -537,9 +537,9 @@ This enables queries like "find all hooks that use context" while maintaining AS
537
537
 
538
538
  Local embeddings are the default — **no API key needed**. The Python sidecar starts automatically on first use and runs a local model for high-quality code embeddings.
539
539
 
540
- The sidecar runs on **CPU by default** to avoid MPS memory pool bloat on Apple Silicon (MPS can pre-allocate 10+ GB even for small models). CPU is fast enough for models under 1B params. It also auto-shuts down after 3 minutes of inactivity to free memory, and restarts lazily when needed (~15-20s).
540
+ The sidecar uses **MPS (Apple Silicon GPU)** when available, falling back to CPU. It auto-shuts down after 3 minutes of inactivity to free memory, and restarts lazily when needed (~15-20s).
541
541
 
542
- > **GPU acceleration:** Set `EMBEDDING_DEVICE=mps` to use Apple Silicon GPU for larger models (1B+ params). Only recommended on machines with 32+ GB RAM.
542
+ > **Device override:** Set `EMBEDDING_DEVICE=cpu` to force CPU if MPS causes issues.
543
543
  >
544
544
  > **Half precision:** Set `EMBEDDING_HALF_PRECISION=true` to load the model in float16, roughly halving memory usage.
545
545
 
@@ -549,7 +549,7 @@ Set via the `EMBEDDING_MODEL` environment variable:
549
549
 
550
550
  | Model | Dimensions | RAM (fp16) | Quality | Best For |
551
551
  |-------|-----------|-----|---------|----------|
552
- | `Qwen/Qwen3-Embedding-0.6B` (default) | 1024 | ~1.2 GB | Best | Default, code-aware, MTEB-Code #1 |
552
+ | `codesage/codesage-base-v2` (default) | 1024 | ~700 MB | Best | Default, code-specific encoder, fast |
553
553
  | `Qodo/Qodo-Embed-1-1.5B` | 1536 | ~4.5 GB | Great | Machines with 32+ GB RAM |
554
554
  | `BAAI/bge-base-en-v1.5` | 768 | ~250 MB | Good | General purpose, low RAM |
555
555
  | `sentence-transformers/all-MiniLM-L6-v2` | 384 | ~100 MB | OK | Minimal RAM, fast |
package/dist/cli/cli.js CHANGED
@@ -251,7 +251,7 @@ const setupSidecar = async () => {
251
251
  return;
252
252
  }
253
253
  // Pre-download the embedding model so first real use is fast
254
- const modelName = process.env.EMBEDDING_MODEL ?? 'Qwen/Qwen3-Embedding-0.6B';
254
+ const modelName = process.env.EMBEDDING_MODEL ?? 'codesage/codesage-base-v2';
255
255
  await preDownloadModel(sidecarDir, python, modelName);
256
256
  };
257
257
  /**
@@ -12,7 +12,7 @@ const __dirname = dirname(__filename);
12
12
  const DEFAULT_CONFIG = {
13
13
  port: parseInt(process.env.EMBEDDING_SIDECAR_PORT ?? '', 10) || 8787,
14
14
  host: '127.0.0.1',
15
- model: process.env.EMBEDDING_MODEL ?? 'Qwen/Qwen3-Embedding-0.6B',
15
+ model: process.env.EMBEDDING_MODEL ?? 'codesage/codesage-base-v2',
16
16
  startupTimeoutMs: 120_000, // 2 min — first run downloads the model
17
17
  requestTimeoutMs: 60_000,
18
18
  idleTimeoutMs: 180_000, // 3 min — auto-shutdown after no requests
@@ -24,7 +24,7 @@ export const EMBEDDING_DIMENSIONS = {
24
24
  'text-embedding-3-large': 3072,
25
25
  'text-embedding-3-small': 1536,
26
26
  // Local models (via sidecar)
27
- 'Qwen/Qwen3-Embedding-0.6B': 1024,
27
+ 'codesage/codesage-base-v2': 1024,
28
28
  'Qodo/Qodo-Embed-1-1.5B': 1536,
29
29
  'sentence-transformers/all-MiniLM-L6-v2': 384,
30
30
  'sentence-transformers/all-mpnet-base-v2': 768,
@@ -46,7 +46,7 @@ export const getEmbeddingDimensions = () => {
46
46
  const model = process.env.OPENAI_EMBEDDING_MODEL ?? 'text-embedding-3-large';
47
47
  return EMBEDDING_DIMENSIONS[model] ?? 3072;
48
48
  }
49
- const model = process.env.EMBEDDING_MODEL ?? 'Qwen/Qwen3-Embedding-0.6B';
49
+ const model = process.env.EMBEDDING_MODEL ?? 'codesage/codesage-base-v2';
50
50
  return EMBEDDING_DIMENSIONS[model] ?? 1536;
51
51
  };
52
52
  /**
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "code-graph-context",
3
- "version": "2.11.1",
3
+ "version": "2.12.0",
4
4
  "description": "MCP server that builds code graphs to provide rich context to LLMs",
5
5
  "type": "module",
6
6
  "homepage": "https://github.com/drewdrewH/code-graph-context#readme",
@@ -27,7 +27,7 @@ logger.info(f"Sidecar process starting (pid={os.getpid()})")
27
27
  app = FastAPI(title="code-graph-context embedding sidecar")
28
28
 
29
29
  model = None
30
- model_name = os.environ.get("EMBEDDING_MODEL", "Qwen/Qwen3-Embedding-0.6B")
30
+ model_name = os.environ.get("EMBEDDING_MODEL", "codesage/codesage-base-v2")
31
31
 
32
32
 
33
33
  class EmbedRequest(BaseModel):
@@ -48,14 +48,11 @@ def load_model():
48
48
  import torch
49
49
  from sentence_transformers import SentenceTransformer
50
50
 
51
- # Use CPU by default — MPS pre-allocates a massive memory pool (10+ GB)
52
- # that bloats small models. CPU on Apple Silicon is fast enough for <1B models.
53
- # Set EMBEDDING_DEVICE=mps to force GPU if needed for large models.
54
51
  device_override = os.environ.get("EMBEDDING_DEVICE", "").lower()
55
52
  if device_override:
56
53
  device = device_override
57
54
  else:
58
- device = "cpu"
55
+ device = "mps" if torch.backends.mps.is_available() else "cpu"
59
56
  logger.info(f"Loading {model_name} on {device}...")
60
57
  logger.info(f"PyTorch version: {torch.__version__}, MPS available: {torch.backends.mps.is_available()}")
61
58