code-graph-context 2.11.0 → 2.12.0
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/README.md
CHANGED
|
@@ -189,8 +189,9 @@ If you prefer to edit the config files directly:
|
|
|
189
189
|
| `NEO4J_URI` | No | `bolt://localhost:7687` | Neo4j connection URI |
|
|
190
190
|
| `NEO4J_USER` | No | `neo4j` | Neo4j username |
|
|
191
191
|
| `NEO4J_PASSWORD` | No | `PASSWORD` | Neo4j password |
|
|
192
|
-
| `EMBEDDING_MODEL` | No | `
|
|
192
|
+
| `EMBEDDING_MODEL` | No | `codesage/codesage-base-v2` | Local embedding model (see [Embedding Configuration](#embedding-configuration)) |
|
|
193
193
|
| `EMBEDDING_SIDECAR_PORT` | No | `8787` | Port for local embedding server |
|
|
194
|
+
| `EMBEDDING_DEVICE` | No | auto (`mps`/`cpu`) | Device for embeddings. Auto-detects MPS on Apple Silicon |
|
|
194
195
|
| `EMBEDDING_HALF_PRECISION` | No | `false` | Set `true` for float16 (uses ~0.5x memory) |
|
|
195
196
|
| `OPENAI_ENABLED` | No | `false` | Set `true` to use OpenAI instead of local |
|
|
196
197
|
| `OPENAI_API_KEY` | No* | - | Required when `OPENAI_ENABLED=true` |
|
|
@@ -536,9 +537,11 @@ This enables queries like "find all hooks that use context" while maintaining AS
|
|
|
536
537
|
|
|
537
538
|
Local embeddings are the default — **no API key needed**. The Python sidecar starts automatically on first use and runs a local model for high-quality code embeddings.
|
|
538
539
|
|
|
539
|
-
The sidecar uses **
|
|
540
|
+
The sidecar uses **MPS (Apple Silicon GPU)** when available, falling back to CPU. It shuts down automatically after 3 minutes of inactivity to free memory and restarts lazily when needed (a restart takes ~15-20s).
|
|
540
541
|
|
|
541
|
-
> **
|
|
542
|
+
> **Device override:** Set `EMBEDDING_DEVICE=cpu` to force CPU if MPS causes issues.
|
|
543
|
+
>
|
|
544
|
+
> **Half precision:** Set `EMBEDDING_HALF_PRECISION=true` to load the model in float16, roughly halving memory usage.
|
|
542
545
|
|
|
543
546
|
### Available Models
|
|
544
547
|
|
|
@@ -546,7 +549,7 @@ Set via the `EMBEDDING_MODEL` environment variable:
|
|
|
546
549
|
|
|
547
550
|
| Model | Dimensions | RAM (fp16) | Quality | Best For |
|
|
548
551
|
|-------|-----------|-----|---------|----------|
|
|
549
|
-
| `
|
|
552
|
+
| `codesage/codesage-base-v2` (default) | 1024 | ~700 MB | Best | Default, code-specific encoder, fast |
|
|
550
553
|
| `Qodo/Qodo-Embed-1-1.5B` | 1536 | ~4.5 GB | Great | Machines with 32+ GB RAM |
|
|
551
554
|
| `BAAI/bge-base-en-v1.5` | 768 | ~250 MB | Good | General purpose, low RAM |
|
|
552
555
|
| `sentence-transformers/all-MiniLM-L6-v2` | 384 | ~100 MB | OK | Minimal RAM, fast |
|
package/dist/cli/cli.js
CHANGED
|
@@ -251,7 +251,7 @@ const setupSidecar = async () => {
|
|
|
251
251
|
return;
|
|
252
252
|
}
|
|
253
253
|
// Pre-download the embedding model so first real use is fast
|
|
254
|
-
const modelName = process.env.EMBEDDING_MODEL ?? '
|
|
254
|
+
const modelName = process.env.EMBEDDING_MODEL ?? 'codesage/codesage-base-v2';
|
|
255
255
|
await preDownloadModel(sidecarDir, python, modelName);
|
|
256
256
|
};
|
|
257
257
|
/**
|
|
@@ -12,7 +12,7 @@ const __dirname = dirname(__filename);
|
|
|
12
12
|
const DEFAULT_CONFIG = {
|
|
13
13
|
port: parseInt(process.env.EMBEDDING_SIDECAR_PORT ?? '', 10) || 8787,
|
|
14
14
|
host: '127.0.0.1',
|
|
15
|
-
model: process.env.EMBEDDING_MODEL ?? '
|
|
15
|
+
model: process.env.EMBEDDING_MODEL ?? 'codesage/codesage-base-v2',
|
|
16
16
|
startupTimeoutMs: 120_000, // 2 min — first run downloads the model
|
|
17
17
|
requestTimeoutMs: 60_000,
|
|
18
18
|
idleTimeoutMs: 180_000, // 3 min — auto-shutdown after no requests
|
|
@@ -24,7 +24,7 @@ export const EMBEDDING_DIMENSIONS = {
|
|
|
24
24
|
'text-embedding-3-large': 3072,
|
|
25
25
|
'text-embedding-3-small': 1536,
|
|
26
26
|
// Local models (via sidecar)
|
|
27
|
-
'
|
|
27
|
+
'codesage/codesage-base-v2': 1024,
|
|
28
28
|
'Qodo/Qodo-Embed-1-1.5B': 1536,
|
|
29
29
|
'sentence-transformers/all-MiniLM-L6-v2': 384,
|
|
30
30
|
'sentence-transformers/all-mpnet-base-v2': 768,
|
|
@@ -46,7 +46,7 @@ export const getEmbeddingDimensions = () => {
|
|
|
46
46
|
const model = process.env.OPENAI_EMBEDDING_MODEL ?? 'text-embedding-3-large';
|
|
47
47
|
return EMBEDDING_DIMENSIONS[model] ?? 3072;
|
|
48
48
|
}
|
|
49
|
-
const model = process.env.EMBEDDING_MODEL ?? '
|
|
49
|
+
const model = process.env.EMBEDDING_MODEL ?? 'codesage/codesage-base-v2';
|
|
50
50
|
return EMBEDDING_DIMENSIONS[model] ?? 1536;
|
|
51
51
|
};
|
|
52
52
|
/**
|
package/package.json
CHANGED
|
@@ -27,7 +27,7 @@ logger.info(f"Sidecar process starting (pid={os.getpid()})")
|
|
|
27
27
|
app = FastAPI(title="code-graph-context embedding sidecar")
|
|
28
28
|
|
|
29
29
|
model = None
|
|
30
|
-
model_name = os.environ.get("EMBEDDING_MODEL", "
|
|
30
|
+
model_name = os.environ.get("EMBEDDING_MODEL", "codesage/codesage-base-v2")
|
|
31
31
|
|
|
32
32
|
|
|
33
33
|
class EmbedRequest(BaseModel):
|
|
@@ -48,16 +48,20 @@ def load_model():
|
|
|
48
48
|
import torch
|
|
49
49
|
from sentence_transformers import SentenceTransformer
|
|
50
50
|
|
|
51
|
-
|
|
51
|
+
device_override = os.environ.get("EMBEDDING_DEVICE", "").lower()
|
|
52
|
+
if device_override:
|
|
53
|
+
device = device_override
|
|
54
|
+
else:
|
|
55
|
+
device = "mps" if torch.backends.mps.is_available() else "cpu"
|
|
52
56
|
logger.info(f"Loading {model_name} on {device}...")
|
|
53
57
|
logger.info(f"PyTorch version: {torch.__version__}, MPS available: {torch.backends.mps.is_available()}")
|
|
54
58
|
|
|
55
59
|
use_half = os.environ.get("EMBEDDING_HALF_PRECISION", "").lower() == "true"
|
|
56
|
-
model = SentenceTransformer(model_name, device=device)
|
|
57
60
|
if use_half:
|
|
58
|
-
model
|
|
61
|
+
model = SentenceTransformer(model_name, device=device, model_kwargs={"torch_dtype": "float16"})
|
|
59
62
|
logger.info(f"Model loaded in float16 (half precision)")
|
|
60
63
|
else:
|
|
64
|
+
model = SentenceTransformer(model_name, device=device)
|
|
61
65
|
logger.info(f"Model loaded in float32 (full precision)")
|
|
62
66
|
logger.info(f"Running warmup...")
|
|
63
67
|
|
|
@@ -114,7 +118,6 @@ async def embed(req: EmbedRequest):
|
|
|
114
118
|
def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[float]]:
|
|
115
119
|
"""
|
|
116
120
|
Encode texts, falling back to CPU if MPS runs out of memory.
|
|
117
|
-
Also retries with smaller batch sizes before giving up.
|
|
118
121
|
"""
|
|
119
122
|
import torch
|
|
120
123
|
|
|
@@ -126,28 +129,18 @@ def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[fl
|
|
|
126
129
|
show_progress_bar=False,
|
|
127
130
|
normalize_embeddings=True,
|
|
128
131
|
)
|
|
129
|
-
# Free intermediate tensors after each request
|
|
130
|
-
if hasattr(torch.mps, "empty_cache"):
|
|
131
|
-
torch.mps.empty_cache()
|
|
132
132
|
return result.tolist()
|
|
133
|
-
except (
|
|
133
|
+
except (RuntimeError,) as e:
|
|
134
134
|
if "out of memory" not in str(e).lower():
|
|
135
135
|
raise
|
|
136
136
|
|
|
137
|
-
logger.warning(f"
|
|
138
|
-
|
|
139
|
-
# Free MPS memory
|
|
140
|
-
if hasattr(torch.mps, "empty_cache"):
|
|
141
|
-
torch.mps.empty_cache()
|
|
137
|
+
logger.warning(f"OOM with batch_size={batch_size}, len={len(texts)}. Falling back to CPU.")
|
|
142
138
|
gc.collect()
|
|
143
139
|
|
|
144
|
-
# Fall back to CPU for this request
|
|
145
140
|
original_device = model.device
|
|
146
141
|
model.to("cpu")
|
|
147
|
-
logger.info("Model moved to CPU for fallback encoding")
|
|
148
142
|
|
|
149
143
|
try:
|
|
150
|
-
# Use smaller batches on CPU
|
|
151
144
|
cpu_batch = min(batch_size, 4)
|
|
152
145
|
with torch.no_grad():
|
|
153
146
|
result = model.encode(
|
|
@@ -159,12 +152,10 @@ def _encode_with_oom_fallback(texts: list[str], batch_size: int) -> list[list[fl
|
|
|
159
152
|
logger.info(f"CPU fallback encoding complete ({len(texts)} texts)")
|
|
160
153
|
return result.tolist()
|
|
161
154
|
finally:
|
|
162
|
-
# Move back to MPS for future requests
|
|
163
155
|
try:
|
|
164
156
|
model.to(original_device)
|
|
165
|
-
logger.info(f"Model moved back to {original_device}")
|
|
166
157
|
except Exception:
|
|
167
|
-
logger.warning("Could not move model back
|
|
158
|
+
logger.warning("Could not move model back, staying on CPU")
|
|
168
159
|
|
|
169
160
|
|
|
170
161
|
def handle_signal(sig, _frame):
|