code-graph-context 2.11.0 → 2.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -191,6 +191,7 @@ If you prefer to edit the config files directly:
191
191
  | `NEO4J_PASSWORD` | No | `PASSWORD` | Neo4j password |
192
192
  | `EMBEDDING_MODEL` | No | `Qwen/Qwen3-Embedding-0.6B` | Local embedding model (see [Embedding Configuration](#embedding-configuration)) |
193
193
  | `EMBEDDING_SIDECAR_PORT` | No | `8787` | Port for local embedding server |
194
+ | `EMBEDDING_DEVICE` | No | `cpu` | Device for embeddings (`cpu` or `mps`). CPU is the default to avoid MPS memory bloat |
194
195
  | `EMBEDDING_HALF_PRECISION` | No | `false` | Set `true` for float16 (uses ~0.5x memory) |
195
196
  | `OPENAI_ENABLED` | No | `false` | Set `true` to use OpenAI instead of local |
196
197
  | `OPENAI_API_KEY` | No* | - | Required when `OPENAI_ENABLED=true` |
@@ -536,9 +537,11 @@ This enables queries like "find all hooks that use context" while maintaining AS
536
537
 
537
538
  Local embeddings are the default — **no API key needed**. The Python sidecar starts automatically on first use and runs a local model for high-quality code embeddings.
538
539
 
539
- The sidecar uses **float16 (half precision)** by default, which halves memory usage with no meaningful quality loss. It also auto-shuts down after 3 minutes of inactivity to free memory, and restarts lazily when needed (~15-20s).
540
+ The sidecar runs on **CPU by default** to avoid MPS memory pool bloat on Apple Silicon (MPS can pre-allocate 10+ GB even for small models). CPU is fast enough for models under 1B params. It also auto-shuts down after 3 minutes of inactivity to free memory, and restarts lazily when needed (~15-20s).
540
541
 
541
- > **Half precision mode:** To reduce memory usage at the cost of some accuracy, set `EMBEDDING_HALF_PRECISION=true`.
542
+ > **GPU acceleration:** Set `EMBEDDING_DEVICE=mps` to use Apple Silicon GPU for larger models (1B+ params). Only recommended on machines with 32+ GB RAM.
543
+ >
544
+ > **Half precision:** Set `EMBEDDING_HALF_PRECISION=true` to load the model in float16, roughly halving memory usage.
542
545
 
543
546
  ### Available Models
544
547
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "code-graph-context",
3
- "version": "2.11.0",
3
+ "version": "2.11.1",
4
4
  "description": "MCP server that builds code graphs to provide rich context to LLMs",
5
5
  "type": "module",
6
6
  "homepage": "https://github.com/drewdrewH/code-graph-context#readme",
@@ -48,16 +48,23 @@ def load_model():
48
48
  import torch
49
49
  from sentence_transformers import SentenceTransformer
50
50
 
51
- device = "mps" if torch.backends.mps.is_available() else "cpu"
51
+ # Use CPU by default — MPS pre-allocates a massive memory pool (10+ GB)
52
+ # that bloats small models. CPU on Apple Silicon is fast enough for <1B models.
53
+ # Set EMBEDDING_DEVICE=mps to force GPU if needed for large models.
54
+ device_override = os.environ.get("EMBEDDING_DEVICE", "").lower()
55
+ if device_override:
56
+ device = device_override
57
+ else:
58
+ device = "cpu"
52
59
  logger.info(f"Loading {model_name} on {device}...")
53
60
  logger.info(f"PyTorch version: {torch.__version__}, MPS available: {torch.backends.mps.is_available()}")
54
61
 
55
62
  use_half = os.environ.get("EMBEDDING_HALF_PRECISION", "").lower() == "true"
56
- model = SentenceTransformer(model_name, device=device)
57
63
  if use_half:
58
- model.half()
64
+ model = SentenceTransformer(model_name, device=device, model_kwargs={"torch_dtype": "float16"})
59
65
  logger.info(f"Model loaded in float16 (half precision)")
60
66
  else:
67
+ model = SentenceTransformer(model_name, device=device)
61
68
  logger.info(f"Model loaded in float32 (full precision)")
62
69
  logger.info(f"Running warmup...")
63
70
 
@@ -114,7 +121,6 @@ async def embed(req: EmbedRequest):
114
121
  def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[float]]:
115
122
  """
116
123
  Encode texts, falling back to CPU if MPS runs out of memory.
117
- Also retries with smaller batch sizes before giving up.
118
124
  """
119
125
  import torch
120
126
 
@@ -126,28 +132,18 @@ def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[fl
126
132
  show_progress_bar=False,
127
133
  normalize_embeddings=True,
128
134
  )
129
- # Free intermediate tensors after each request
130
- if hasattr(torch.mps, "empty_cache"):
131
- torch.mps.empty_cache()
132
135
  return result.tolist()
133
- except (torch.mps.OutOfMemoryError, RuntimeError) as e:
136
+ except (RuntimeError,) as e:
134
137
  if "out of memory" not in str(e).lower():
135
138
  raise
136
139
 
137
- logger.warning(f"MPS OOM with batch_size={batch_size}, len={len(texts)}. Falling back to CPU.")
138
-
139
- # Free MPS memory
140
- if hasattr(torch.mps, "empty_cache"):
141
- torch.mps.empty_cache()
140
+ logger.warning(f"OOM with batch_size={batch_size}, len={len(texts)}. Falling back to CPU.")
142
141
  gc.collect()
143
142
 
144
- # Fall back to CPU for this request
145
143
  original_device = model.device
146
144
  model.to("cpu")
147
- logger.info("Model moved to CPU for fallback encoding")
148
145
 
149
146
  try:
150
- # Use smaller batches on CPU
151
147
  cpu_batch = min(batch_size, 4)
152
148
  with torch.no_grad():
153
149
  result = model.encode(
@@ -159,12 +155,10 @@ def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[fl
159
155
  logger.info(f"CPU fallback encoding complete ({len(texts)} texts)")
160
156
  return result.tolist()
161
157
  finally:
162
- # Move back to MPS for future requests
163
158
  try:
164
159
  model.to(original_device)
165
- logger.info(f"Model moved back to {original_device}")
166
160
  except Exception:
167
- logger.warning("Could not move model back to MPS, staying on CPU")
161
+ logger.warning("Could not move model back, staying on CPU")
168
162
 
169
163
 
170
164
  def handle_signal(sig, _frame):