code-graph-context 2.10.3 → 2.10.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -191,6 +191,7 @@ If you prefer to edit the config files directly:
191
191
  | `NEO4J_PASSWORD` | No | `PASSWORD` | Neo4j password |
192
192
  | `EMBEDDING_MODEL` | No | `Qodo/Qodo-Embed-1-1.5B` | Local embedding model (see [Embedding Configuration](#embedding-configuration)) |
193
193
  | `EMBEDDING_SIDECAR_PORT` | No | `8787` | Port for local embedding server |
194
+ | `EMBEDDING_FULL_PRECISION` | No | `false` | Set `true` for float32 (~2x the memory of the default float16) |
194
195
  | `OPENAI_ENABLED` | No | `false` | Set `true` to use OpenAI instead of local |
195
196
  | `OPENAI_API_KEY` | No* | - | Required when `OPENAI_ENABLED=true` |
196
197
 
@@ -535,20 +536,24 @@ This enables queries like "find all hooks that use context" while maintaining AS
535
536
 
536
537
  Local embeddings are the default — **no API key needed**. The Python sidecar starts automatically on first use and runs a local model for high-quality code embeddings.
537
538
 
539
+ The sidecar uses **float16 (half precision)** by default, which halves memory usage with no meaningful quality loss. It also auto-shuts down after 3 minutes of inactivity to free memory, and restarts lazily when needed (restart takes ~15–20 s).
540
+
541
+ > **Full precision mode:** If you have 32+ GB RAM and want float32, set `EMBEDDING_FULL_PRECISION=true`.
542
+
538
543
  ### Available Models
539
544
 
540
545
  Set via the `EMBEDDING_MODEL` environment variable:
541
546
 
542
- | Model | Dimensions | RAM | Quality | Best For |
547
+ | Model | Dimensions | RAM (fp16) | Quality | Best For |
543
548
  |-------|-----------|-----|---------|----------|
544
- | `Qodo/Qodo-Embed-1-1.5B` (default) | 1536 | ~9 GB | Best | Machines with 32+ GB RAM |
545
- | `BAAI/bge-base-en-v1.5` | 768 | ~500 MB | Good | General purpose, low RAM |
546
- | `sentence-transformers/all-MiniLM-L6-v2` | 384 | ~200 MB | OK | Minimal RAM, fast |
547
- | `nomic-ai/nomic-embed-text-v1.5` | 768 | ~600 MB | Good | Code + prose mixed |
548
- | `sentence-transformers/all-mpnet-base-v2` | 768 | ~500 MB | Good | Balanced quality/speed |
549
- | `BAAI/bge-small-en-v1.5` | 384 | ~130 MB | OK | Smallest footprint |
550
-
551
- **Example:** Use a lightweight model on a 16GB machine:
549
+ | `Qodo/Qodo-Embed-1-1.5B` (default) | 1536 | ~4.5 GB | Best | Default, works on 16GB machines |
550
+ | `BAAI/bge-base-en-v1.5` | 768 | ~250 MB | Good | General purpose, low RAM |
551
+ | `sentence-transformers/all-MiniLM-L6-v2` | 384 | ~100 MB | OK | Minimal RAM, fast |
552
+ | `nomic-ai/nomic-embed-text-v1.5` | 768 | ~300 MB | Good | Code + prose mixed |
553
+ | `sentence-transformers/all-mpnet-base-v2` | 768 | ~250 MB | Good | Balanced quality/speed |
554
+ | `BAAI/bge-small-en-v1.5` | 384 | ~65 MB | OK | Smallest footprint |
555
+
556
+ **Example:** Use a lightweight model on a low-memory machine:
552
557
  ```bash
553
558
  claude mcp add --scope user code-graph-context \
554
559
  -e EMBEDDING_MODEL=BAAI/bge-base-en-v1.5 \
@@ -15,6 +15,7 @@ const DEFAULT_CONFIG = {
15
15
  model: process.env.EMBEDDING_MODEL ?? 'Qodo/Qodo-Embed-1-1.5B',
16
16
  startupTimeoutMs: 120_000, // 2 min — first run downloads the model
17
17
  requestTimeoutMs: 60_000,
18
+ idleTimeoutMs: 180_000, // 3 min — auto-shutdown after no requests
18
19
  };
19
20
  export class EmbeddingSidecar {
20
21
  process = null;
@@ -23,6 +24,7 @@ export class EmbeddingSidecar {
23
24
  _dimensions = null;
24
25
  stopping = false;
25
26
  _exitHandler = null;
27
+ _idleTimer = null;
26
28
  constructor(config = {}) {
27
29
  this.config = { ...DEFAULT_CONFIG, ...config };
28
30
  }
@@ -205,6 +207,7 @@ export class EmbeddingSidecar {
205
207
  const data = (await res.json());
206
208
  if (data.dimensions)
207
209
  this._dimensions = data.dimensions;
210
+ this.resetIdleTimer();
208
211
  return data.embeddings;
209
212
  }
210
213
  catch (err) {
@@ -251,7 +254,21 @@ export class EmbeddingSidecar {
251
254
  }
252
255
  this.cleanup();
253
256
  }
257
+ resetIdleTimer() {
258
+ if (this._idleTimer)
259
+ clearTimeout(this._idleTimer);
260
+ this._idleTimer = setTimeout(() => {
261
+ console.error(`[embedding-sidecar] Idle for ${this.config.idleTimeoutMs / 1000}s, shutting down to free memory`);
262
+ this.stop();
263
+ }, this.config.idleTimeoutMs);
264
+ // Don't let the timer prevent Node from exiting
265
+ this._idleTimer.unref();
266
+ }
254
267
  cleanup() {
268
+ if (this._idleTimer) {
269
+ clearTimeout(this._idleTimer);
270
+ this._idleTimer = null;
271
+ }
255
272
  if (this._exitHandler) {
256
273
  process.removeListener('exit', this._exitHandler);
257
274
  this._exitHandler = null;
@@ -10,6 +10,7 @@ import { fileURLToPath } from 'url';
10
10
  import { Worker } from 'worker_threads';
11
11
  import { z } from 'zod';
12
12
  import { CORE_TYPESCRIPT_SCHEMA } from '../../core/config/schema.js';
13
+ import { stopEmbeddingSidecar } from '../../core/embeddings/embedding-sidecar.js';
13
14
  import { EmbeddingsService } from '../../core/embeddings/embeddings.service.js';
14
15
  import { ParserFactory } from '../../core/parsers/parser-factory.js';
15
16
  import { detectChangedFiles } from '../../core/utils/file-change-detection.js';
@@ -165,6 +166,7 @@ export const createParseTypescriptProjectTool = (server) => {
165
166
  const job = jobManager.getJob(jobId);
166
167
  if (job && job.status === 'running') {
167
168
  jobManager.failJob(jobId, `Worker timed out after ${PARSING.workerTimeoutMs / 60000} minutes`);
169
+ await stopEmbeddingSidecar();
168
170
  await terminateWorker('timeout');
169
171
  }
170
172
  }, PARSING.workerTimeoutMs);
@@ -183,6 +185,7 @@ export const createParseTypescriptProjectTool = (server) => {
183
185
  clearTimeout(timeoutId);
184
186
  jobManager.failJob(jobId, msg.error);
185
187
  debugLog('Async parsing failed', { jobId, error: msg.error });
188
+ stopEmbeddingSidecar();
186
189
  terminateWorker('error');
187
190
  }
188
191
  });
@@ -191,6 +194,7 @@ export const createParseTypescriptProjectTool = (server) => {
191
194
  clearTimeout(timeoutId);
192
195
  jobManager.failJob(jobId, err.message ?? String(err));
193
196
  console.error('Worker thread error:', err);
197
+ stopEmbeddingSidecar();
194
198
  terminateWorker('worker-error');
195
199
  });
196
200
  // Handle worker exit
@@ -371,12 +375,16 @@ export const createParseTypescriptProjectTool = (server) => {
371
375
  edgeCount: 0,
372
376
  });
373
377
  await debugLog('Project status updated to failed', { projectId: finalProjectId });
378
+ // Stop sidecar to free memory (restarts lazily on next embed request)
379
+ await stopEmbeddingSidecar();
374
380
  return createSuccessResponse(formatParsePartialSuccess(nodes.length, edges.length, outputPath, neo4jError.message));
375
381
  }
376
382
  }
377
383
  catch (error) {
378
384
  console.error('Parse tool error:', error);
379
385
  await debugLog('Parse tool error', { projectPath, tsconfigPath, error });
386
+ // Stop sidecar to free memory (restarts lazily on next embed request)
387
+ await stopEmbeddingSidecar();
380
388
  return createErrorResponse(error);
381
389
  }
382
390
  });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "code-graph-context",
3
- "version": "2.10.3",
3
+ "version": "2.10.5",
4
4
  "description": "MCP server that builds code graphs to provide rich context to LLMs",
5
5
  "type": "module",
6
6
  "homepage": "https://github.com/drewdrewH/code-graph-context#readme",
@@ -9,6 +9,8 @@ import os
9
9
  import sys
10
10
  import signal
11
11
  import logging
12
+ import threading
13
+ import time
12
14
 
13
15
  from fastapi import FastAPI, HTTPException
14
16
  from pydantic import BaseModel
@@ -20,6 +22,8 @@ logging.basicConfig(
20
22
  )
21
23
  logger = logging.getLogger("embedding-sidecar")
22
24
 
25
+ logger.info(f"Sidecar process starting (pid={os.getpid()})")
26
+
23
27
  app = FastAPI(title="code-graph-context embedding sidecar")
24
28
 
25
29
  model = None
@@ -46,15 +50,25 @@ def load_model():
46
50
 
47
51
  device = "mps" if torch.backends.mps.is_available() else "cpu"
48
52
  logger.info(f"Loading {model_name} on {device}...")
53
+ logger.info(f"PyTorch version: {torch.__version__}, MPS available: {torch.backends.mps.is_available()}")
54
+
55
+ use_half = os.environ.get("EMBEDDING_FULL_PRECISION", "").lower() != "true"
49
56
  model = SentenceTransformer(model_name, device=device)
57
+ if use_half:
58
+ model.half()
59
+ logger.info(f"Model loaded in float16 (half precision)")
60
+ else:
61
+ logger.info(f"Model loaded in float32 (full precision)")
62
+ logger.info(f"Running warmup...")
50
63
 
51
64
  # Warm up with a test embedding
52
65
  with torch.no_grad():
53
66
  test = model.encode(["warmup"], show_progress_bar=False)
54
67
  dims = len(test[0])
55
- logger.info(f"Model loaded: {dims} dimensions, device={device}")
68
+ logger.info(f"Warmup complete: {dims} dimensions, device={device}")
69
+ logger.info(f"Sidecar ready (pid={os.getpid()})")
56
70
  except Exception as e:
57
- logger.error(f"Failed to load model: {e}")
71
+ logger.error(f"Failed to load model: {e}", exc_info=True)
58
72
  raise
59
73
 
60
74
 
@@ -79,16 +93,21 @@ async def embed(req: EmbedRequest):
79
93
  if not req.texts:
80
94
  return EmbedResponse(embeddings=[], dimensions=0, model=model_name)
81
95
 
96
+ logger.info(f"Embed request: {len(req.texts)} texts, batch_size={req.batch_size}")
97
+ start = time.time()
98
+
82
99
  try:
83
100
  embeddings = _encode_with_oom_fallback(req.texts, req.batch_size)
84
101
  dims = len(embeddings[0])
102
+ elapsed = time.time() - start
103
+ logger.info(f"Embed complete: {len(embeddings)} embeddings in {elapsed:.2f}s")
85
104
  return EmbedResponse(
86
105
  embeddings=embeddings,
87
106
  dimensions=dims,
88
107
  model=model_name,
89
108
  )
90
109
  except Exception as e:
91
- logger.error(f"Embedding error: {e}")
110
+ logger.error(f"Embedding error: {e}", exc_info=True)
92
111
  raise HTTPException(status_code=500, detail=str(e))
93
112
 
94
113
 
@@ -125,6 +144,7 @@ def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[fl
125
144
  # Fall back to CPU for this request
126
145
  original_device = model.device
127
146
  model.to("cpu")
147
+ logger.info("Model moved to CPU for fallback encoding")
128
148
 
129
149
  try:
130
150
  # Use smaller batches on CPU
@@ -136,17 +156,19 @@ def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[fl
136
156
  show_progress_bar=False,
137
157
  normalize_embeddings=True,
138
158
  )
159
+ logger.info(f"CPU fallback encoding complete ({len(texts)} texts)")
139
160
  return result.tolist()
140
161
  finally:
141
162
  # Move back to MPS for future requests
142
163
  try:
143
164
  model.to(original_device)
165
+ logger.info(f"Model moved back to {original_device}")
144
166
  except Exception:
145
167
  logger.warning("Could not move model back to MPS, staying on CPU")
146
168
 
147
169
 
148
170
  def handle_signal(sig, _frame):
149
- logger.info(f"Received signal {sig}, shutting down")
171
+ logger.info(f"Received signal {sig}, shutting down (pid={os.getpid()})")
150
172
  sys.exit(0)
151
173
 
152
174
 
@@ -159,14 +181,18 @@ def _watch_stdin():
159
181
  the pipe breaks and stdin closes. This is our most reliable way to detect
160
182
  parent death and self-terminate instead of becoming an orphan.
161
183
  """
162
- import threading
163
184
 
164
185
  def _watcher():
186
+ logger.info("Stdin watcher thread started")
165
187
  try:
166
188
  # Blocks until stdin is closed (parent died)
167
- sys.stdin.read()
168
- except Exception:
169
- pass
189
+ while True:
190
+ data = sys.stdin.read(1)
191
+ if not data:
192
+ # EOF — parent closed the pipe
193
+ break
194
+ except Exception as e:
195
+ logger.info(f"Stdin watcher exception: {e}")
170
196
  logger.info("Parent process died (stdin closed), shutting down")
171
197
  os._exit(0)
172
198
 
@@ -174,4 +200,8 @@ def _watch_stdin():
174
200
  t.start()
175
201
 
176
202
 
177
- _watch_stdin()
203
+ # Only watch stdin if it's a pipe (not a TTY) — avoids issues when run manually
204
+ if not sys.stdin.isatty():
205
+ _watch_stdin()
206
+ else:
207
+ logger.info("Running in terminal mode, stdin watcher disabled")