code-graph-context 2.10.3 → 2.10.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md
CHANGED
|
@@ -191,6 +191,7 @@ If you prefer to edit the config files directly:
|
|
|
191
191
|
| `NEO4J_PASSWORD` | No | `PASSWORD` | Neo4j password |
|
|
192
192
|
| `EMBEDDING_MODEL` | No | `Qodo/Qodo-Embed-1-1.5B` | Local embedding model (see [Embedding Configuration](#embedding-configuration)) |
|
|
193
193
|
| `EMBEDDING_SIDECAR_PORT` | No | `8787` | Port for local embedding server |
|
|
194
|
+
| `EMBEDDING_FULL_PRECISION` | No | `false` | Set `true` for float32 (uses ~2x memory) |
|
|
194
195
|
| `OPENAI_ENABLED` | No | `false` | Set `true` to use OpenAI instead of local |
|
|
195
196
|
| `OPENAI_API_KEY` | No* | - | Required when `OPENAI_ENABLED=true` |
|
|
196
197
|
|
|
@@ -535,20 +536,24 @@ This enables queries like "find all hooks that use context" while maintaining AS
|
|
|
535
536
|
|
|
536
537
|
Local embeddings are the default — **no API key needed**. The Python sidecar starts automatically on first use and runs a local model for high-quality code embeddings.
|
|
537
538
|
|
|
539
|
+
The sidecar uses **float16 (half precision)** by default, which halves memory usage with no meaningful quality loss. It also shuts down automatically after 3 minutes of inactivity to free memory, and restarts lazily on the next embedding request (startup takes ~15-20s).
|
|
540
|
+
|
|
541
|
+
> **Full precision mode:** If you have 32+ GB RAM and want float32, set `EMBEDDING_FULL_PRECISION=true`.
|
|
542
|
+
|
|
538
543
|
### Available Models
|
|
539
544
|
|
|
540
545
|
Set via the `EMBEDDING_MODEL` environment variable:
|
|
541
546
|
|
|
542
|
-
| Model | Dimensions | RAM | Quality | Best For |
|
|
547
|
+
| Model | Dimensions | RAM (fp16) | Quality | Best For |
|
|
543
548
|
|-------|-----------|-----|---------|----------|
|
|
544
|
-
| `Qodo/Qodo-Embed-1-1.5B` (default) | 1536 | ~
|
|
545
|
-
| `BAAI/bge-base-en-v1.5` | 768 | ~
|
|
546
|
-
| `sentence-transformers/all-MiniLM-L6-v2` | 384 | ~
|
|
547
|
-
| `nomic-ai/nomic-embed-text-v1.5` | 768 | ~
|
|
548
|
-
| `sentence-transformers/all-mpnet-base-v2` | 768 | ~
|
|
549
|
-
| `BAAI/bge-small-en-v1.5` | 384 | ~
|
|
550
|
-
|
|
551
|
-
**Example:** Use a lightweight model on a
|
|
549
|
+
| `Qodo/Qodo-Embed-1-1.5B` (default) | 1536 | ~4.5 GB | Best | Default, works on 16GB machines |
|
|
550
|
+
| `BAAI/bge-base-en-v1.5` | 768 | ~250 MB | Good | General purpose, low RAM |
|
|
551
|
+
| `sentence-transformers/all-MiniLM-L6-v2` | 384 | ~100 MB | OK | Minimal RAM, fast |
|
|
552
|
+
| `nomic-ai/nomic-embed-text-v1.5` | 768 | ~300 MB | Good | Code + prose mixed |
|
|
553
|
+
| `sentence-transformers/all-mpnet-base-v2` | 768 | ~250 MB | Good | Balanced quality/speed |
|
|
554
|
+
| `BAAI/bge-small-en-v1.5` | 384 | ~65 MB | OK | Smallest footprint |
|
|
555
|
+
|
|
556
|
+
**Example:** Use a lightweight model on a low-memory machine:
|
|
552
557
|
```bash
|
|
553
558
|
claude mcp add --scope user code-graph-context \
|
|
554
559
|
-e EMBEDDING_MODEL=BAAI/bge-base-en-v1.5 \
|
|
@@ -15,6 +15,7 @@ const DEFAULT_CONFIG = {
|
|
|
15
15
|
model: process.env.EMBEDDING_MODEL ?? 'Qodo/Qodo-Embed-1-1.5B',
|
|
16
16
|
startupTimeoutMs: 120_000, // 2 min — first run downloads the model
|
|
17
17
|
requestTimeoutMs: 60_000,
|
|
18
|
+
idleTimeoutMs: 180_000, // 3 min — auto-shutdown after no requests
|
|
18
19
|
};
|
|
19
20
|
export class EmbeddingSidecar {
|
|
20
21
|
process = null;
|
|
@@ -23,6 +24,7 @@ export class EmbeddingSidecar {
|
|
|
23
24
|
_dimensions = null;
|
|
24
25
|
stopping = false;
|
|
25
26
|
_exitHandler = null;
|
|
27
|
+
_idleTimer = null;
|
|
26
28
|
constructor(config = {}) {
|
|
27
29
|
this.config = { ...DEFAULT_CONFIG, ...config };
|
|
28
30
|
}
|
|
@@ -205,6 +207,7 @@ export class EmbeddingSidecar {
|
|
|
205
207
|
const data = (await res.json());
|
|
206
208
|
if (data.dimensions)
|
|
207
209
|
this._dimensions = data.dimensions;
|
|
210
|
+
this.resetIdleTimer();
|
|
208
211
|
return data.embeddings;
|
|
209
212
|
}
|
|
210
213
|
catch (err) {
|
|
@@ -251,7 +254,21 @@ export class EmbeddingSidecar {
|
|
|
251
254
|
}
|
|
252
255
|
this.cleanup();
|
|
253
256
|
}
|
|
257
|
+
resetIdleTimer() {
|
|
258
|
+
if (this._idleTimer)
|
|
259
|
+
clearTimeout(this._idleTimer);
|
|
260
|
+
this._idleTimer = setTimeout(() => {
|
|
261
|
+
console.error(`[embedding-sidecar] Idle for ${this.config.idleTimeoutMs / 1000}s, shutting down to free memory`);
|
|
262
|
+
this.stop();
|
|
263
|
+
}, this.config.idleTimeoutMs);
|
|
264
|
+
// Don't let the timer prevent Node from exiting
|
|
265
|
+
this._idleTimer.unref();
|
|
266
|
+
}
|
|
254
267
|
cleanup() {
|
|
268
|
+
if (this._idleTimer) {
|
|
269
|
+
clearTimeout(this._idleTimer);
|
|
270
|
+
this._idleTimer = null;
|
|
271
|
+
}
|
|
255
272
|
if (this._exitHandler) {
|
|
256
273
|
process.removeListener('exit', this._exitHandler);
|
|
257
274
|
this._exitHandler = null;
|
|
@@ -10,6 +10,7 @@ import { fileURLToPath } from 'url';
|
|
|
10
10
|
import { Worker } from 'worker_threads';
|
|
11
11
|
import { z } from 'zod';
|
|
12
12
|
import { CORE_TYPESCRIPT_SCHEMA } from '../../core/config/schema.js';
|
|
13
|
+
import { stopEmbeddingSidecar } from '../../core/embeddings/embedding-sidecar.js';
|
|
13
14
|
import { EmbeddingsService } from '../../core/embeddings/embeddings.service.js';
|
|
14
15
|
import { ParserFactory } from '../../core/parsers/parser-factory.js';
|
|
15
16
|
import { detectChangedFiles } from '../../core/utils/file-change-detection.js';
|
|
@@ -165,6 +166,7 @@ export const createParseTypescriptProjectTool = (server) => {
|
|
|
165
166
|
const job = jobManager.getJob(jobId);
|
|
166
167
|
if (job && job.status === 'running') {
|
|
167
168
|
jobManager.failJob(jobId, `Worker timed out after ${PARSING.workerTimeoutMs / 60000} minutes`);
|
|
169
|
+
await stopEmbeddingSidecar();
|
|
168
170
|
await terminateWorker('timeout');
|
|
169
171
|
}
|
|
170
172
|
}, PARSING.workerTimeoutMs);
|
|
@@ -183,6 +185,7 @@ export const createParseTypescriptProjectTool = (server) => {
|
|
|
183
185
|
clearTimeout(timeoutId);
|
|
184
186
|
jobManager.failJob(jobId, msg.error);
|
|
185
187
|
debugLog('Async parsing failed', { jobId, error: msg.error });
|
|
188
|
+
stopEmbeddingSidecar();
|
|
186
189
|
terminateWorker('error');
|
|
187
190
|
}
|
|
188
191
|
});
|
|
@@ -191,6 +194,7 @@ export const createParseTypescriptProjectTool = (server) => {
|
|
|
191
194
|
clearTimeout(timeoutId);
|
|
192
195
|
jobManager.failJob(jobId, err.message ?? String(err));
|
|
193
196
|
console.error('Worker thread error:', err);
|
|
197
|
+
stopEmbeddingSidecar();
|
|
194
198
|
terminateWorker('worker-error');
|
|
195
199
|
});
|
|
196
200
|
// Handle worker exit
|
|
@@ -371,12 +375,16 @@ export const createParseTypescriptProjectTool = (server) => {
|
|
|
371
375
|
edgeCount: 0,
|
|
372
376
|
});
|
|
373
377
|
await debugLog('Project status updated to failed', { projectId: finalProjectId });
|
|
378
|
+
// Stop sidecar to free memory (restarts lazily on next embed request)
|
|
379
|
+
await stopEmbeddingSidecar();
|
|
374
380
|
return createSuccessResponse(formatParsePartialSuccess(nodes.length, edges.length, outputPath, neo4jError.message));
|
|
375
381
|
}
|
|
376
382
|
}
|
|
377
383
|
catch (error) {
|
|
378
384
|
console.error('Parse tool error:', error);
|
|
379
385
|
await debugLog('Parse tool error', { projectPath, tsconfigPath, error });
|
|
386
|
+
// Stop sidecar to free memory (restarts lazily on next embed request)
|
|
387
|
+
await stopEmbeddingSidecar();
|
|
380
388
|
return createErrorResponse(error);
|
|
381
389
|
}
|
|
382
390
|
});
|
package/package.json
CHANGED
|
@@ -9,6 +9,8 @@ import os
|
|
|
9
9
|
import sys
|
|
10
10
|
import signal
|
|
11
11
|
import logging
|
|
12
|
+
import threading
|
|
13
|
+
import time
|
|
12
14
|
|
|
13
15
|
from fastapi import FastAPI, HTTPException
|
|
14
16
|
from pydantic import BaseModel
|
|
@@ -20,6 +22,8 @@ logging.basicConfig(
|
|
|
20
22
|
)
|
|
21
23
|
logger = logging.getLogger("embedding-sidecar")
|
|
22
24
|
|
|
25
|
+
logger.info(f"Sidecar process starting (pid={os.getpid()})")
|
|
26
|
+
|
|
23
27
|
app = FastAPI(title="code-graph-context embedding sidecar")
|
|
24
28
|
|
|
25
29
|
model = None
|
|
@@ -46,15 +50,25 @@ def load_model():
|
|
|
46
50
|
|
|
47
51
|
device = "mps" if torch.backends.mps.is_available() else "cpu"
|
|
48
52
|
logger.info(f"Loading {model_name} on {device}...")
|
|
53
|
+
logger.info(f"PyTorch version: {torch.__version__}, MPS available: {torch.backends.mps.is_available()}")
|
|
54
|
+
|
|
55
|
+
use_half = os.environ.get("EMBEDDING_FULL_PRECISION", "").lower() != "true"
|
|
49
56
|
model = SentenceTransformer(model_name, device=device)
|
|
57
|
+
if use_half:
|
|
58
|
+
model.half()
|
|
59
|
+
logger.info(f"Model loaded in float16 (half precision)")
|
|
60
|
+
else:
|
|
61
|
+
logger.info(f"Model loaded in float32 (full precision)")
|
|
62
|
+
logger.info(f"Running warmup...")
|
|
50
63
|
|
|
51
64
|
# Warm up with a test embedding
|
|
52
65
|
with torch.no_grad():
|
|
53
66
|
test = model.encode(["warmup"], show_progress_bar=False)
|
|
54
67
|
dims = len(test[0])
|
|
55
|
-
logger.info(f"
|
|
68
|
+
logger.info(f"Warmup complete: {dims} dimensions, device={device}")
|
|
69
|
+
logger.info(f"Sidecar ready (pid={os.getpid()})")
|
|
56
70
|
except Exception as e:
|
|
57
|
-
logger.error(f"Failed to load model: {e}")
|
|
71
|
+
logger.error(f"Failed to load model: {e}", exc_info=True)
|
|
58
72
|
raise
|
|
59
73
|
|
|
60
74
|
|
|
@@ -79,16 +93,21 @@ async def embed(req: EmbedRequest):
|
|
|
79
93
|
if not req.texts:
|
|
80
94
|
return EmbedResponse(embeddings=[], dimensions=0, model=model_name)
|
|
81
95
|
|
|
96
|
+
logger.info(f"Embed request: {len(req.texts)} texts, batch_size={req.batch_size}")
|
|
97
|
+
start = time.time()
|
|
98
|
+
|
|
82
99
|
try:
|
|
83
100
|
embeddings = _encode_with_oom_fallback(req.texts, req.batch_size)
|
|
84
101
|
dims = len(embeddings[0])
|
|
102
|
+
elapsed = time.time() - start
|
|
103
|
+
logger.info(f"Embed complete: {len(embeddings)} embeddings in {elapsed:.2f}s")
|
|
85
104
|
return EmbedResponse(
|
|
86
105
|
embeddings=embeddings,
|
|
87
106
|
dimensions=dims,
|
|
88
107
|
model=model_name,
|
|
89
108
|
)
|
|
90
109
|
except Exception as e:
|
|
91
|
-
logger.error(f"Embedding error: {e}")
|
|
110
|
+
logger.error(f"Embedding error: {e}", exc_info=True)
|
|
92
111
|
raise HTTPException(status_code=500, detail=str(e))
|
|
93
112
|
|
|
94
113
|
|
|
@@ -125,6 +144,7 @@ def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[fl
|
|
|
125
144
|
# Fall back to CPU for this request
|
|
126
145
|
original_device = model.device
|
|
127
146
|
model.to("cpu")
|
|
147
|
+
logger.info("Model moved to CPU for fallback encoding")
|
|
128
148
|
|
|
129
149
|
try:
|
|
130
150
|
# Use smaller batches on CPU
|
|
@@ -136,17 +156,19 @@ def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[fl
|
|
|
136
156
|
show_progress_bar=False,
|
|
137
157
|
normalize_embeddings=True,
|
|
138
158
|
)
|
|
159
|
+
logger.info(f"CPU fallback encoding complete ({len(texts)} texts)")
|
|
139
160
|
return result.tolist()
|
|
140
161
|
finally:
|
|
141
162
|
# Move back to MPS for future requests
|
|
142
163
|
try:
|
|
143
164
|
model.to(original_device)
|
|
165
|
+
logger.info(f"Model moved back to {original_device}")
|
|
144
166
|
except Exception:
|
|
145
167
|
logger.warning("Could not move model back to MPS, staying on CPU")
|
|
146
168
|
|
|
147
169
|
|
|
148
170
|
def handle_signal(sig, _frame):
|
|
149
|
-
logger.info(f"Received signal {sig}, shutting down")
|
|
171
|
+
logger.info(f"Received signal {sig}, shutting down (pid={os.getpid()})")
|
|
150
172
|
sys.exit(0)
|
|
151
173
|
|
|
152
174
|
|
|
@@ -159,14 +181,18 @@ def _watch_stdin():
|
|
|
159
181
|
the pipe breaks and stdin closes. This is our most reliable way to detect
|
|
160
182
|
parent death and self-terminate instead of becoming an orphan.
|
|
161
183
|
"""
|
|
162
|
-
import threading
|
|
163
184
|
|
|
164
185
|
def _watcher():
|
|
186
|
+
logger.info("Stdin watcher thread started")
|
|
165
187
|
try:
|
|
166
188
|
# Blocks until stdin is closed (parent died)
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
189
|
+
while True:
|
|
190
|
+
data = sys.stdin.read(1)
|
|
191
|
+
if not data:
|
|
192
|
+
# EOF — parent closed the pipe
|
|
193
|
+
break
|
|
194
|
+
except Exception as e:
|
|
195
|
+
logger.info(f"Stdin watcher exception: {e}")
|
|
170
196
|
logger.info("Parent process died (stdin closed), shutting down")
|
|
171
197
|
os._exit(0)
|
|
172
198
|
|
|
@@ -174,4 +200,8 @@ def _watch_stdin():
|
|
|
174
200
|
t.start()
|
|
175
201
|
|
|
176
202
|
|
|
177
|
-
|
|
203
|
+
# Only watch stdin if it's a pipe (not a TTY) — avoids issues when run manually
|
|
204
|
+
if not sys.stdin.isatty():
|
|
205
|
+
_watch_stdin()
|
|
206
|
+
else:
|
|
207
|
+
logger.info("Running in terminal mode, stdin watcher disabled")
|