code-graph-context 2.10.5 → 2.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -189,9 +189,10 @@ If you prefer to edit the config files directly:
189
189
  | `NEO4J_URI` | No | `bolt://localhost:7687` | Neo4j connection URI |
190
190
  | `NEO4J_USER` | No | `neo4j` | Neo4j username |
191
191
  | `NEO4J_PASSWORD` | No | `PASSWORD` | Neo4j password |
192
- | `EMBEDDING_MODEL` | No | `Qodo/Qodo-Embed-1-1.5B` | Local embedding model (see [Embedding Configuration](#embedding-configuration)) |
192
+ | `EMBEDDING_MODEL` | No | `Qwen/Qwen3-Embedding-0.6B` | Local embedding model (see [Embedding Configuration](#embedding-configuration)) |
193
193
  | `EMBEDDING_SIDECAR_PORT` | No | `8787` | Port for local embedding server |
194
- | `EMBEDDING_FULL_PRECISION` | No | `false` | Set `true` for float32 (uses ~2x memory) |
194
+ | `EMBEDDING_DEVICE` | No | `cpu` | Device for embeddings (`cpu` or `mps`). CPU is the default to avoid MPS memory bloat |
195
+ | `EMBEDDING_HALF_PRECISION` | No | `false` | Set `true` for float16 (roughly halves memory usage) |
195
196
  | `OPENAI_ENABLED` | No | `false` | Set `true` to use OpenAI instead of local |
196
197
  | `OPENAI_API_KEY` | No* | - | Required when `OPENAI_ENABLED=true` |
197
198
 
@@ -536,9 +537,11 @@ This enables queries like "find all hooks that use context" while maintaining AS
536
537
 
537
538
  Local embeddings are the default — **no API key needed**. The Python sidecar starts automatically on first use and runs a local model for high-quality code embeddings.
538
539
 
539
- The sidecar uses **float16 (half precision)** by default, which halves memory usage with no meaningful quality loss. It also auto-shuts down after 3 minutes of inactivity to free memory, and restarts lazily when needed (~15-20s).
540
+ The sidecar runs on **CPU by default** to avoid MPS memory pool bloat on Apple Silicon (MPS can pre-allocate 10+ GB even for small models). CPU is fast enough for models under 1B params. It also auto-shuts down after 3 minutes of inactivity to free memory, and restarts lazily when needed (~15-20s).
540
541
 
541
- > **Full precision mode:** If you have 32+ GB RAM and want float32, set `EMBEDDING_FULL_PRECISION=true`.
542
+ > **GPU acceleration:** Set `EMBEDDING_DEVICE=mps` to use Apple Silicon GPU for larger models (1B+ params). Only recommended on machines with 32+ GB RAM.
543
+ >
544
+ > **Half precision:** Set `EMBEDDING_HALF_PRECISION=true` to load the model in float16, roughly halving memory usage.
542
545
 
543
546
  ### Available Models
544
547
 
@@ -546,7 +549,8 @@ Set via the `EMBEDDING_MODEL` environment variable:
546
549
 
547
550
  | Model | Dimensions | RAM (fp16) | Quality | Best For |
548
551
  |-------|-----------|-----|---------|----------|
549
- | `Qodo/Qodo-Embed-1-1.5B` (default) | 1536 | ~4.5 GB | Best | Default, works on 16GB machines |
552
+ | `Qwen/Qwen3-Embedding-0.6B` (default) | 1024 | ~1.2 GB | Best | Default, code-aware, MTEB-Code #1 |
553
+ | `Qodo/Qodo-Embed-1-1.5B` | 1536 | ~4.5 GB | Great | Machines with 32+ GB RAM |
550
554
  | `BAAI/bge-base-en-v1.5` | 768 | ~250 MB | Good | General purpose, low RAM |
551
555
  | `sentence-transformers/all-MiniLM-L6-v2` | 384 | ~100 MB | OK | Minimal RAM, fast |
552
556
  | `nomic-ai/nomic-embed-text-v1.5` | 768 | ~300 MB | Good | Code + prose mixed |
package/dist/cli/cli.js CHANGED
@@ -251,7 +251,7 @@ const setupSidecar = async () => {
251
251
  return;
252
252
  }
253
253
  // Pre-download the embedding model so first real use is fast
254
- const modelName = process.env.EMBEDDING_MODEL ?? 'Qodo/Qodo-Embed-1-1.5B';
254
+ const modelName = process.env.EMBEDDING_MODEL ?? 'Qwen/Qwen3-Embedding-0.6B';
255
255
  await preDownloadModel(sidecarDir, python, modelName);
256
256
  };
257
257
  /**
@@ -12,7 +12,7 @@ const __dirname = dirname(__filename);
12
12
  const DEFAULT_CONFIG = {
13
13
  port: parseInt(process.env.EMBEDDING_SIDECAR_PORT ?? '', 10) || 8787,
14
14
  host: '127.0.0.1',
15
- model: process.env.EMBEDDING_MODEL ?? 'Qodo/Qodo-Embed-1-1.5B',
15
+ model: process.env.EMBEDDING_MODEL ?? 'Qwen/Qwen3-Embedding-0.6B',
16
16
  startupTimeoutMs: 120_000, // 2 min — first run downloads the model
17
17
  requestTimeoutMs: 60_000,
18
18
  idleTimeoutMs: 180_000, // 3 min — auto-shutdown after no requests
@@ -5,7 +5,7 @@
5
5
  * and get the right implementation based on OPENAI_ENABLED.
6
6
  *
7
7
  * OPENAI_ENABLED=true → OpenAI text-embedding-3-large (requires OPENAI_API_KEY)
8
- * default → Local Python sidecar with Qodo-Embed-1-1.5B
8
+ * default → Local Python sidecar with Qwen3-Embedding-0.6B
9
9
  */
10
10
  import { LocalEmbeddingsService } from './local-embeddings.service.js';
11
11
  import { OpenAIEmbeddingsService } from './openai-embeddings.service.js';
@@ -24,6 +24,7 @@ export const EMBEDDING_DIMENSIONS = {
24
24
  'text-embedding-3-large': 3072,
25
25
  'text-embedding-3-small': 1536,
26
26
  // Local models (via sidecar)
27
+ 'Qwen/Qwen3-Embedding-0.6B': 1024,
27
28
  'Qodo/Qodo-Embed-1-1.5B': 1536,
28
29
  'sentence-transformers/all-MiniLM-L6-v2': 384,
29
30
  'sentence-transformers/all-mpnet-base-v2': 768,
@@ -45,7 +46,7 @@ export const getEmbeddingDimensions = () => {
45
46
  const model = process.env.OPENAI_EMBEDDING_MODEL ?? 'text-embedding-3-large';
46
47
  return EMBEDDING_DIMENSIONS[model] ?? 3072;
47
48
  }
48
- const model = process.env.EMBEDDING_MODEL ?? 'Qodo/Qodo-Embed-1-1.5B';
49
+ const model = process.env.EMBEDDING_MODEL ?? 'Qwen/Qwen3-Embedding-0.6B';
49
50
  return EMBEDDING_DIMENSIONS[model] ?? 1536;
50
51
  };
51
52
  /**
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "code-graph-context",
3
- "version": "2.10.5",
3
+ "version": "2.11.1",
4
4
  "description": "MCP server that builds code graphs to provide rich context to LLMs",
5
5
  "type": "module",
6
6
  "homepage": "https://github.com/drewdrewH/code-graph-context#readme",
@@ -1,6 +1,6 @@
1
1
  """
2
2
  Local embedding server for code-graph-context.
3
- Uses Qodo-Embed-1-1.5B for high-quality code embeddings without OpenAI dependency.
3
+ Uses Qwen3-Embedding-0.6B for high-quality code embeddings without OpenAI dependency.
4
4
  Runs as a sidecar process managed by the Node.js MCP server.
5
5
  """
6
6
 
@@ -27,7 +27,7 @@ logger.info(f"Sidecar process starting (pid={os.getpid()})")
27
27
  app = FastAPI(title="code-graph-context embedding sidecar")
28
28
 
29
29
  model = None
30
- model_name = os.environ.get("EMBEDDING_MODEL", "Qodo/Qodo-Embed-1-1.5B")
30
+ model_name = os.environ.get("EMBEDDING_MODEL", "Qwen/Qwen3-Embedding-0.6B")
31
31
 
32
32
 
33
33
  class EmbedRequest(BaseModel):
@@ -48,16 +48,23 @@ def load_model():
48
48
  import torch
49
49
  from sentence_transformers import SentenceTransformer
50
50
 
51
- device = "mps" if torch.backends.mps.is_available() else "cpu"
51
+ # Use CPU by default — MPS pre-allocates a massive memory pool (10+ GB)
52
+ # that bloats small models. CPU on Apple Silicon is fast enough for <1B models.
53
+ # Set EMBEDDING_DEVICE=mps to force GPU if needed for large models.
54
+ device_override = os.environ.get("EMBEDDING_DEVICE", "").lower()
55
+ if device_override:
56
+ device = device_override
57
+ else:
58
+ device = "cpu"
52
59
  logger.info(f"Loading {model_name} on {device}...")
53
60
  logger.info(f"PyTorch version: {torch.__version__}, MPS available: {torch.backends.mps.is_available()}")
54
61
 
55
- use_half = os.environ.get("EMBEDDING_FULL_PRECISION", "").lower() != "true"
56
- model = SentenceTransformer(model_name, device=device)
62
+ use_half = os.environ.get("EMBEDDING_HALF_PRECISION", "").lower() == "true"
57
63
  if use_half:
58
- model.half()
64
+ model = SentenceTransformer(model_name, device=device, model_kwargs={"torch_dtype": "float16"})
59
65
  logger.info(f"Model loaded in float16 (half precision)")
60
66
  else:
67
+ model = SentenceTransformer(model_name, device=device)
61
68
  logger.info(f"Model loaded in float32 (full precision)")
62
69
  logger.info(f"Running warmup...")
63
70
 
@@ -114,7 +121,6 @@ async def embed(req: EmbedRequest):
114
121
  def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[float]]:
115
122
  """
116
123
  Encode texts, falling back to CPU if MPS runs out of memory.
117
- Also retries with smaller batch sizes before giving up.
118
124
  """
119
125
  import torch
120
126
 
@@ -126,28 +132,18 @@ def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[fl
126
132
  show_progress_bar=False,
127
133
  normalize_embeddings=True,
128
134
  )
129
- # Free intermediate tensors after each request
130
- if hasattr(torch.mps, "empty_cache"):
131
- torch.mps.empty_cache()
132
135
  return result.tolist()
133
- except (torch.mps.OutOfMemoryError, RuntimeError) as e:
136
+ except (RuntimeError,) as e:
134
137
  if "out of memory" not in str(e).lower():
135
138
  raise
136
139
 
137
- logger.warning(f"MPS OOM with batch_size={batch_size}, len={len(texts)}. Falling back to CPU.")
138
-
139
- # Free MPS memory
140
- if hasattr(torch.mps, "empty_cache"):
141
- torch.mps.empty_cache()
140
+ logger.warning(f"OOM with batch_size={batch_size}, len={len(texts)}. Falling back to CPU.")
142
141
  gc.collect()
143
142
 
144
- # Fall back to CPU for this request
145
143
  original_device = model.device
146
144
  model.to("cpu")
147
- logger.info("Model moved to CPU for fallback encoding")
148
145
 
149
146
  try:
150
- # Use smaller batches on CPU
151
147
  cpu_batch = min(batch_size, 4)
152
148
  with torch.no_grad():
153
149
  result = model.encode(
@@ -159,12 +155,10 @@ def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[fl
159
155
  logger.info(f"CPU fallback encoding complete ({len(texts)} texts)")
160
156
  return result.tolist()
161
157
  finally:
162
- # Move back to MPS for future requests
163
158
  try:
164
159
  model.to(original_device)
165
- logger.info(f"Model moved back to {original_device}")
166
160
  except Exception:
167
- logger.warning("Could not move model back to MPS, staying on CPU")
161
+ logger.warning("Could not move model back, staying on CPU")
168
162
 
169
163
 
170
164
  def handle_signal(sig, _frame):
@@ -3,3 +3,4 @@ uvicorn>=0.24.0
3
3
  sentence-transformers>=3.0.0
4
4
  torch>=2.0.0
5
5
  pydantic>=2.0.0
6
+ transformers>=4.51.0