code-graph-context 2.10.5 → 2.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md
CHANGED
|
@@ -189,9 +189,10 @@ If you prefer to edit the config files directly:
|
|
|
189
189
|
| `NEO4J_URI` | No | `bolt://localhost:7687` | Neo4j connection URI |
|
|
190
190
|
| `NEO4J_USER` | No | `neo4j` | Neo4j username |
|
|
191
191
|
| `NEO4J_PASSWORD` | No | `PASSWORD` | Neo4j password |
|
|
192
|
-
| `EMBEDDING_MODEL` | No | `
|
|
192
|
+
| `EMBEDDING_MODEL` | No | `Qwen/Qwen3-Embedding-0.6B` | Local embedding model (see [Embedding Configuration](#embedding-configuration)) |
|
|
193
193
|
| `EMBEDDING_SIDECAR_PORT` | No | `8787` | Port for local embedding server |
|
|
194
|
-
| `
|
|
194
|
+
| `EMBEDDING_DEVICE` | No | `cpu` | Device for embeddings (`cpu` or `mps`). CPU is the default to avoid MPS memory bloat |
|
|
195
|
+
| `EMBEDDING_HALF_PRECISION` | No | `false` | Set `true` to load the model in float16 (roughly halves memory usage) |
|
|
195
196
|
| `OPENAI_ENABLED` | No | `false` | Set `true` to use OpenAI instead of local |
|
|
196
197
|
| `OPENAI_API_KEY` | No* | - | Required when `OPENAI_ENABLED=true` |
|
|
197
198
|
|
|
@@ -536,9 +537,11 @@ This enables queries like "find all hooks that use context" while maintaining AS
|
|
|
536
537
|
|
|
537
538
|
Local embeddings are the default — **no API key needed**. The Python sidecar starts automatically on first use and runs a local model for high-quality code embeddings.
|
|
538
539
|
|
|
539
|
-
The sidecar
|
|
540
|
+
The sidecar runs on **CPU by default** to avoid MPS memory pool bloat on Apple Silicon (MPS can pre-allocate 10+ GB even for small models). CPU is fast enough for models under 1B params. The sidecar also shuts down automatically after 3 minutes of inactivity to free memory, and restarts lazily when needed (~15-20s).
|
|
540
541
|
|
|
541
|
-
> **
|
|
542
|
+
> **GPU acceleration:** Set `EMBEDDING_DEVICE=mps` to use Apple Silicon GPU for larger models (1B+ params). Only recommended on machines with 32+ GB RAM.
|
|
543
|
+
>
|
|
544
|
+
> **Half precision:** Set `EMBEDDING_HALF_PRECISION=true` to load the model in float16, roughly halving memory usage.
|
|
542
545
|
|
|
543
546
|
### Available Models
|
|
544
547
|
|
|
@@ -546,7 +549,8 @@ Set via the `EMBEDDING_MODEL` environment variable:
|
|
|
546
549
|
|
|
547
550
|
| Model | Dimensions | RAM (fp16) | Quality | Best For |
|
|
548
551
|
|-------|-----------|-----|---------|----------|
|
|
549
|
-
| `
|
|
552
|
+
| `Qwen/Qwen3-Embedding-0.6B` (default) | 1024 | ~1.2 GB | Best | Default, code-aware, MTEB-Code #1 |
|
|
553
|
+
| `Qodo/Qodo-Embed-1-1.5B` | 1536 | ~4.5 GB | Great | Machines with 32+ GB RAM |
|
|
550
554
|
| `BAAI/bge-base-en-v1.5` | 768 | ~250 MB | Good | General purpose, low RAM |
|
|
551
555
|
| `sentence-transformers/all-MiniLM-L6-v2` | 384 | ~100 MB | OK | Minimal RAM, fast |
|
|
552
556
|
| `nomic-ai/nomic-embed-text-v1.5` | 768 | ~300 MB | Good | Code + prose mixed |
|
package/dist/cli/cli.js
CHANGED
|
@@ -251,7 +251,7 @@ const setupSidecar = async () => {
|
|
|
251
251
|
return;
|
|
252
252
|
}
|
|
253
253
|
// Pre-download the embedding model so first real use is fast
|
|
254
|
-
const modelName = process.env.EMBEDDING_MODEL ?? '
|
|
254
|
+
const modelName = process.env.EMBEDDING_MODEL ?? 'Qwen/Qwen3-Embedding-0.6B';
|
|
255
255
|
await preDownloadModel(sidecarDir, python, modelName);
|
|
256
256
|
};
|
|
257
257
|
/**
|
|
@@ -12,7 +12,7 @@ const __dirname = dirname(__filename);
|
|
|
12
12
|
const DEFAULT_CONFIG = {
|
|
13
13
|
port: parseInt(process.env.EMBEDDING_SIDECAR_PORT ?? '', 10) || 8787,
|
|
14
14
|
host: '127.0.0.1',
|
|
15
|
-
model: process.env.EMBEDDING_MODEL ?? '
|
|
15
|
+
model: process.env.EMBEDDING_MODEL ?? 'Qwen/Qwen3-Embedding-0.6B',
|
|
16
16
|
startupTimeoutMs: 120_000, // 2 min — first run downloads the model
|
|
17
17
|
requestTimeoutMs: 60_000,
|
|
18
18
|
idleTimeoutMs: 180_000, // 3 min — auto-shutdown after no requests
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* and get the right implementation based on OPENAI_ENABLED.
|
|
6
6
|
*
|
|
7
7
|
* OPENAI_ENABLED=true → OpenAI text-embedding-3-large (requires OPENAI_API_KEY)
|
|
8
|
-
* default → Local Python sidecar with
|
|
8
|
+
* default → Local Python sidecar with Qwen3-Embedding-0.6B
|
|
9
9
|
*/
|
|
10
10
|
import { LocalEmbeddingsService } from './local-embeddings.service.js';
|
|
11
11
|
import { OpenAIEmbeddingsService } from './openai-embeddings.service.js';
|
|
@@ -24,6 +24,7 @@ export const EMBEDDING_DIMENSIONS = {
|
|
|
24
24
|
'text-embedding-3-large': 3072,
|
|
25
25
|
'text-embedding-3-small': 1536,
|
|
26
26
|
// Local models (via sidecar)
|
|
27
|
+
'Qwen/Qwen3-Embedding-0.6B': 1024,
|
|
27
28
|
'Qodo/Qodo-Embed-1-1.5B': 1536,
|
|
28
29
|
'sentence-transformers/all-MiniLM-L6-v2': 384,
|
|
29
30
|
'sentence-transformers/all-mpnet-base-v2': 768,
|
|
@@ -45,7 +46,7 @@ export const getEmbeddingDimensions = () => {
|
|
|
45
46
|
const model = process.env.OPENAI_EMBEDDING_MODEL ?? 'text-embedding-3-large';
|
|
46
47
|
return EMBEDDING_DIMENSIONS[model] ?? 3072;
|
|
47
48
|
}
|
|
48
|
-
const model = process.env.EMBEDDING_MODEL ?? '
|
|
49
|
+
const model = process.env.EMBEDDING_MODEL ?? 'Qwen/Qwen3-Embedding-0.6B';
|
|
49
50
|
return EMBEDDING_DIMENSIONS[model] ?? 1536;
|
|
50
51
|
};
|
|
51
52
|
/**
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Local embedding server for code-graph-context.
|
|
3
|
-
Uses
|
|
3
|
+
Uses Qwen3-Embedding-0.6B for high-quality code embeddings without OpenAI dependency.
|
|
4
4
|
Runs as a sidecar process managed by the Node.js MCP server.
|
|
5
5
|
"""
|
|
6
6
|
|
|
@@ -27,7 +27,7 @@ logger.info(f"Sidecar process starting (pid={os.getpid()})")
|
|
|
27
27
|
app = FastAPI(title="code-graph-context embedding sidecar")
|
|
28
28
|
|
|
29
29
|
model = None
|
|
30
|
-
model_name = os.environ.get("EMBEDDING_MODEL", "
|
|
30
|
+
model_name = os.environ.get("EMBEDDING_MODEL", "Qwen/Qwen3-Embedding-0.6B")
|
|
31
31
|
|
|
32
32
|
|
|
33
33
|
class EmbedRequest(BaseModel):
|
|
@@ -48,16 +48,23 @@ def load_model():
|
|
|
48
48
|
import torch
|
|
49
49
|
from sentence_transformers import SentenceTransformer
|
|
50
50
|
|
|
51
|
-
|
|
51
|
+
# Use CPU by default — MPS pre-allocates a massive memory pool (10+ GB)
|
|
52
|
+
# that bloats small models. CPU on Apple Silicon is fast enough for <1B models.
|
|
53
|
+
# Set EMBEDDING_DEVICE=mps to force GPU if needed for large models.
|
|
54
|
+
device_override = os.environ.get("EMBEDDING_DEVICE", "").lower()
|
|
55
|
+
if device_override:
|
|
56
|
+
device = device_override
|
|
57
|
+
else:
|
|
58
|
+
device = "cpu"
|
|
52
59
|
logger.info(f"Loading {model_name} on {device}...")
|
|
53
60
|
logger.info(f"PyTorch version: {torch.__version__}, MPS available: {torch.backends.mps.is_available()}")
|
|
54
61
|
|
|
55
|
-
use_half = os.environ.get("
|
|
56
|
-
model = SentenceTransformer(model_name, device=device)
|
|
62
|
+
use_half = os.environ.get("EMBEDDING_HALF_PRECISION", "").lower() == "true"
|
|
57
63
|
if use_half:
|
|
58
|
-
model
|
|
64
|
+
model = SentenceTransformer(model_name, device=device, model_kwargs={"torch_dtype": "float16"})
|
|
59
65
|
logger.info(f"Model loaded in float16 (half precision)")
|
|
60
66
|
else:
|
|
67
|
+
model = SentenceTransformer(model_name, device=device)
|
|
61
68
|
logger.info(f"Model loaded in float32 (full precision)")
|
|
62
69
|
logger.info(f"Running warmup...")
|
|
63
70
|
|
|
@@ -114,7 +121,6 @@ async def embed(req: EmbedRequest):
|
|
|
114
121
|
def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[float]]:
|
|
115
122
|
"""
|
|
116
123
|
Encode texts, falling back to CPU if MPS runs out of memory.
|
|
117
|
-
Also retries with smaller batch sizes before giving up.
|
|
118
124
|
"""
|
|
119
125
|
import torch
|
|
120
126
|
|
|
@@ -126,28 +132,18 @@ def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[fl
|
|
|
126
132
|
show_progress_bar=False,
|
|
127
133
|
normalize_embeddings=True,
|
|
128
134
|
)
|
|
129
|
-
# Free intermediate tensors after each request
|
|
130
|
-
if hasattr(torch.mps, "empty_cache"):
|
|
131
|
-
torch.mps.empty_cache()
|
|
132
135
|
return result.tolist()
|
|
133
|
-
except (
|
|
136
|
+
except (RuntimeError,) as e:
|
|
134
137
|
if "out of memory" not in str(e).lower():
|
|
135
138
|
raise
|
|
136
139
|
|
|
137
|
-
logger.warning(f"
|
|
138
|
-
|
|
139
|
-
# Free MPS memory
|
|
140
|
-
if hasattr(torch.mps, "empty_cache"):
|
|
141
|
-
torch.mps.empty_cache()
|
|
140
|
+
logger.warning(f"OOM with batch_size={batch_size}, len={len(texts)}. Falling back to CPU.")
|
|
142
141
|
gc.collect()
|
|
143
142
|
|
|
144
|
-
# Fall back to CPU for this request
|
|
145
143
|
original_device = model.device
|
|
146
144
|
model.to("cpu")
|
|
147
|
-
logger.info("Model moved to CPU for fallback encoding")
|
|
148
145
|
|
|
149
146
|
try:
|
|
150
|
-
# Use smaller batches on CPU
|
|
151
147
|
cpu_batch = min(batch_size, 4)
|
|
152
148
|
with torch.no_grad():
|
|
153
149
|
result = model.encode(
|
|
@@ -159,12 +155,10 @@ def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[fl
|
|
|
159
155
|
logger.info(f"CPU fallback encoding complete ({len(texts)} texts)")
|
|
160
156
|
return result.tolist()
|
|
161
157
|
finally:
|
|
162
|
-
# Move back to MPS for future requests
|
|
163
158
|
try:
|
|
164
159
|
model.to(original_device)
|
|
165
|
-
logger.info(f"Model moved back to {original_device}")
|
|
166
160
|
except Exception:
|
|
167
|
-
logger.warning("Could not move model back
|
|
161
|
+
logger.warning("Could not move model back, staying on CPU")
|
|
168
162
|
|
|
169
163
|
|
|
170
164
|
def handle_signal(sig, _frame):
|
package/sidecar/requirements.txt
CHANGED