code-graph-context 2.11.0 → 2.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -189,8 +189,9 @@ If you prefer to edit the config files directly:
189
189
  | `NEO4J_URI` | No | `bolt://localhost:7687` | Neo4j connection URI |
190
190
  | `NEO4J_USER` | No | `neo4j` | Neo4j username |
191
191
  | `NEO4J_PASSWORD` | No | `PASSWORD` | Neo4j password |
192
- | `EMBEDDING_MODEL` | No | `Qwen/Qwen3-Embedding-0.6B` | Local embedding model (see [Embedding Configuration](#embedding-configuration)) |
192
+ | `EMBEDDING_MODEL` | No | `codesage/codesage-base-v2` | Local embedding model (see [Embedding Configuration](#embedding-configuration)) |
193
193
  | `EMBEDDING_SIDECAR_PORT` | No | `8787` | Port for local embedding server |
194
+ | `EMBEDDING_DEVICE` | No | auto (`mps`/`cpu`) | Device for embeddings. When unset, auto-detects MPS on Apple Silicon and otherwise uses CPU; set it (e.g. `cpu`) to override auto-detection |
194
195
  | `EMBEDDING_HALF_PRECISION` | No | `false` | Set `true` to load the model in float16 (roughly halves memory usage) |
195
196
  | `OPENAI_ENABLED` | No | `false` | Set `true` to use OpenAI instead of local |
196
197
  | `OPENAI_API_KEY` | No* | - | Required when `OPENAI_ENABLED=true` |
@@ -536,9 +537,11 @@ This enables queries like "find all hooks that use context" while maintaining AS
536
537
 
537
538
  Local embeddings are the default — **no API key needed**. The Python sidecar starts automatically on first use and runs a local model for high-quality code embeddings.
538
539
 
539
- The sidecar uses **float16 (half precision)** by default, which halves memory usage with no meaningful quality loss. It also auto-shuts down after 3 minutes of inactivity to free memory, and restarts lazily when needed (~15-20s).
540
+ The sidecar uses **MPS (Apple Silicon GPU)** when available, falling back to CPU. It auto-shuts down after 3 minutes of inactivity to free memory, and restarts lazily when needed (~15-20s).
540
541
 
541
- > **Half precision mode:** To reduce memory usage at the cost of some accuracy, set `EMBEDDING_HALF_PRECISION=true`.
542
+ > **Device override:** Set `EMBEDDING_DEVICE=cpu` to force CPU if MPS causes issues.
543
+ >
544
+ > **Half precision:** Set `EMBEDDING_HALF_PRECISION=true` to load the model in float16, roughly halving memory usage.
542
545
 
543
546
  ### Available Models
544
547
 
@@ -546,7 +549,7 @@ Set via the `EMBEDDING_MODEL` environment variable:
546
549
 
547
550
  | Model | Dimensions | RAM (fp16) | Quality | Best For |
548
551
  |-------|-----------|-----|---------|----------|
549
- | `Qwen/Qwen3-Embedding-0.6B` (default) | 1024 | ~1.2 GB | Best | Default, code-aware, MTEB-Code #1 |
552
+ | `codesage/codesage-base-v2` (default) | 1024 | ~700 MB | Best | Default, code-specific encoder, fast |
550
553
  | `Qodo/Qodo-Embed-1-1.5B` | 1536 | ~4.5 GB | Great | Machines with 32+ GB RAM |
551
554
  | `BAAI/bge-base-en-v1.5` | 768 | ~250 MB | Good | General purpose, low RAM |
552
555
  | `sentence-transformers/all-MiniLM-L6-v2` | 384 | ~100 MB | OK | Minimal RAM, fast |
package/dist/cli/cli.js CHANGED
@@ -251,7 +251,7 @@ const setupSidecar = async () => {
251
251
  return;
252
252
  }
253
253
  // Pre-download the embedding model so first real use is fast
254
- const modelName = process.env.EMBEDDING_MODEL ?? 'Qwen/Qwen3-Embedding-0.6B';
254
+ const modelName = process.env.EMBEDDING_MODEL ?? 'codesage/codesage-base-v2';
255
255
  await preDownloadModel(sidecarDir, python, modelName);
256
256
  };
257
257
  /**
@@ -12,7 +12,7 @@ const __dirname = dirname(__filename);
12
12
  const DEFAULT_CONFIG = {
13
13
  port: parseInt(process.env.EMBEDDING_SIDECAR_PORT ?? '', 10) || 8787,
14
14
  host: '127.0.0.1',
15
- model: process.env.EMBEDDING_MODEL ?? 'Qwen/Qwen3-Embedding-0.6B',
15
+ model: process.env.EMBEDDING_MODEL ?? 'codesage/codesage-base-v2',
16
16
  startupTimeoutMs: 120_000, // 2 min — first run downloads the model
17
17
  requestTimeoutMs: 60_000,
18
18
  idleTimeoutMs: 180_000, // 3 min — auto-shutdown after no requests
@@ -24,7 +24,7 @@ export const EMBEDDING_DIMENSIONS = {
24
24
  'text-embedding-3-large': 3072,
25
25
  'text-embedding-3-small': 1536,
26
26
  // Local models (via sidecar)
27
- 'Qwen/Qwen3-Embedding-0.6B': 1024,
27
+ 'codesage/codesage-base-v2': 1024,
28
28
  'Qodo/Qodo-Embed-1-1.5B': 1536,
29
29
  'sentence-transformers/all-MiniLM-L6-v2': 384,
30
30
  'sentence-transformers/all-mpnet-base-v2': 768,
@@ -46,7 +46,7 @@ export const getEmbeddingDimensions = () => {
46
46
  const model = process.env.OPENAI_EMBEDDING_MODEL ?? 'text-embedding-3-large';
47
47
  return EMBEDDING_DIMENSIONS[model] ?? 3072;
48
48
  }
49
- const model = process.env.EMBEDDING_MODEL ?? 'Qwen/Qwen3-Embedding-0.6B';
49
+ const model = process.env.EMBEDDING_MODEL ?? 'codesage/codesage-base-v2';
50
50
  return EMBEDDING_DIMENSIONS[model] ?? 1536;
51
51
  };
52
52
  /**
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "code-graph-context",
3
- "version": "2.11.0",
3
+ "version": "2.12.0",
4
4
  "description": "MCP server that builds code graphs to provide rich context to LLMs",
5
5
  "type": "module",
6
6
  "homepage": "https://github.com/drewdrewH/code-graph-context#readme",
@@ -27,7 +27,7 @@ logger.info(f"Sidecar process starting (pid={os.getpid()})")
27
27
  app = FastAPI(title="code-graph-context embedding sidecar")
28
28
 
29
29
  model = None
30
- model_name = os.environ.get("EMBEDDING_MODEL", "Qwen/Qwen3-Embedding-0.6B")
30
+ model_name = os.environ.get("EMBEDDING_MODEL", "codesage/codesage-base-v2")
31
31
 
32
32
 
33
33
  class EmbedRequest(BaseModel):
@@ -48,16 +48,20 @@ def load_model():
48
48
  import torch
49
49
  from sentence_transformers import SentenceTransformer
50
50
 
51
- device = "mps" if torch.backends.mps.is_available() else "cpu"
51
+ device_override = os.environ.get("EMBEDDING_DEVICE", "").lower()
52
+ if device_override:
53
+ device = device_override
54
+ else:
55
+ device = "mps" if torch.backends.mps.is_available() else "cpu"
52
56
  logger.info(f"Loading {model_name} on {device}...")
53
57
  logger.info(f"PyTorch version: {torch.__version__}, MPS available: {torch.backends.mps.is_available()}")
54
58
 
55
59
  use_half = os.environ.get("EMBEDDING_HALF_PRECISION", "").lower() == "true"
56
- model = SentenceTransformer(model_name, device=device)
57
60
  if use_half:
58
- model.half()
61
+ model = SentenceTransformer(model_name, device=device, model_kwargs={"torch_dtype": "float16"})
59
62
  logger.info(f"Model loaded in float16 (half precision)")
60
63
  else:
64
+ model = SentenceTransformer(model_name, device=device)
61
65
  logger.info(f"Model loaded in float32 (full precision)")
62
66
  logger.info(f"Running warmup...")
63
67
 
@@ -114,7 +118,6 @@ async def embed(req: EmbedRequest):
114
118
  def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[float]]:
115
119
  """
116
120
  Encode texts, falling back to CPU if MPS runs out of memory.
117
- Also retries with smaller batch sizes before giving up.
118
121
  """
119
122
  import torch
120
123
 
@@ -126,28 +129,18 @@ def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[fl
126
129
  show_progress_bar=False,
127
130
  normalize_embeddings=True,
128
131
  )
129
- # Free intermediate tensors after each request
130
- if hasattr(torch.mps, "empty_cache"):
131
- torch.mps.empty_cache()
132
132
  return result.tolist()
133
- except (torch.mps.OutOfMemoryError, RuntimeError) as e:
133
+ except (RuntimeError,) as e:
134
134
  if "out of memory" not in str(e).lower():
135
135
  raise
136
136
 
137
- logger.warning(f"MPS OOM with batch_size={batch_size}, len={len(texts)}. Falling back to CPU.")
138
-
139
- # Free MPS memory
140
- if hasattr(torch.mps, "empty_cache"):
141
- torch.mps.empty_cache()
137
+ logger.warning(f"OOM with batch_size={batch_size}, len={len(texts)}. Falling back to CPU.")
142
138
  gc.collect()
143
139
 
144
- # Fall back to CPU for this request
145
140
  original_device = model.device
146
141
  model.to("cpu")
147
- logger.info("Model moved to CPU for fallback encoding")
148
142
 
149
143
  try:
150
- # Use smaller batches on CPU
151
144
  cpu_batch = min(batch_size, 4)
152
145
  with torch.no_grad():
153
146
  result = model.encode(
@@ -159,12 +152,10 @@ def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[fl
159
152
  logger.info(f"CPU fallback encoding complete ({len(texts)} texts)")
160
153
  return result.tolist()
161
154
  finally:
162
- # Move back to MPS for future requests
163
155
  try:
164
156
  model.to(original_device)
165
- logger.info(f"Model moved back to {original_device}")
166
157
  except Exception:
167
- logger.warning("Could not move model back to MPS, staying on CPU")
158
+ logger.warning("Could not move model back, staying on CPU")
168
159
 
169
160
 
170
161
  def handle_signal(sig, _frame):