superlocalmemory 3.0.16 → 3.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,35 +2,29 @@
2
2
  # Licensed under the MIT License - see LICENSE file
3
3
  # Part of SuperLocalMemory V3 | https://qualixar.com | https://varunpratap.com
4
4
 
5
- """SuperLocalMemory V3 — Embedding Service.
5
+ """SuperLocalMemory V3 — Embedding Service (Subprocess-Isolated).
6
6
 
7
- Thread-safe, dimension-validated embedding with Fisher variance computation.
8
- Supports local (768-dim nomic) and cloud (3072-dim) models with EXPLICIT errors
9
- on dimension mismatch — NEVER silently falls back to a different dimension.
7
+ All PyTorch/model work runs in a SEPARATE subprocess. The main process
8
+ (dashboard, MCP, CLI) never imports torch and stays at ~60 MB.
10
9
 
11
- Memory management: Forces CPU-only inference to prevent GPU memory accumulation.
12
- Auto-unloads model after idle timeout to keep long-running MCP servers lean.
10
+ The worker subprocess auto-kills after 2 minutes idle, returning all
11
+ memory to the OS. It respawns on next embed call (~3 sec cold start).
13
12
 
14
13
  Part of Qualixar | Author: Varun Pratap Bhardwaj
15
14
  """
16
15
 
17
16
  from __future__ import annotations
18
17
 
18
+ import json
19
19
  import logging
20
20
  import os
21
+ import subprocess
22
+ import sys
21
23
  import threading
22
24
  import time
25
+ from pathlib import Path
23
26
  from typing import TYPE_CHECKING
24
27
 
25
- # Force CPU before any torch/sentence-transformers import.
26
- # On Apple Silicon, PyTorch defaults to Metal (MPS) which allocates 4-6 GB
27
- # of GPU shader buffers that grow over time and never get released.
28
- # On Windows/Linux with CUDA, similar GPU memory issues occur.
29
- # CPU-only keeps footprint under 1 GB (vs 6+ GB with GPU).
30
- os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")
31
- os.environ.setdefault("PYTORCH_MPS_HIGH_WATERMARK_RATIO", "0.0")
32
- os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
33
-
34
28
  import numpy as np
35
29
 
36
30
  if TYPE_CHECKING:
@@ -40,276 +34,215 @@ from superlocalmemory.core.config import EmbeddingConfig
40
34
 
41
35
  logger = logging.getLogger(__name__)
42
36
 
43
- # ---------------------------------------------------------------------------
44
37
  # Fisher variance constants
45
- # ---------------------------------------------------------------------------
46
38
  _FISHER_VAR_MIN = 0.05
47
39
  _FISHER_VAR_MAX = 2.0
48
- _FISHER_VAR_RANGE = _FISHER_VAR_MAX - _FISHER_VAR_MIN # 1.95
40
+ _FISHER_VAR_RANGE = _FISHER_VAR_MAX - _FISHER_VAR_MIN
49
41
 
50
42
 
51
43
  class DimensionMismatchError(RuntimeError):
52
- """Raised when the actual embedding dimension differs from config.
53
-
54
- This is a HARD failure — V1 silently fell back to local embeddings
55
- when Azure failed, changing dimension from 3072 to 768 mid-run.
56
- We crash loudly instead.
57
- """
44
+ """Raised when the actual embedding dimension differs from config."""
58
45
 
59
46
 
60
- _IDLE_TIMEOUT_SECONDS = 300 # 5 minutes — unload model after idle
47
+ _IDLE_TIMEOUT_SECONDS = 120 # 2 minutes — kill worker after idle
61
48
 
62
49
 
63
50
  class EmbeddingService:
64
- """Thread-safe embedding service with strict dimension validation.
51
+ """Subprocess-isolated embedding service.
65
52
 
66
- Lazy-loads the underlying model on first embed call.
67
- Validates every output dimension against the configured expectation.
68
- Auto-unloads after 5 minutes idle to keep MCP server memory low.
69
- Forces CPU-only inference to prevent GPU memory accumulation.
53
+ All model inference runs in a child process. The main process never
54
+ imports torch/sentence-transformers, keeping its memory at ~60 MB.
55
+
56
+ The worker auto-kills after 2 min idle. First embed after idle takes
57
+ ~3 sec (model reload). Subsequent embeds are instant (<100ms).
70
58
  """
71
59
 
72
60
  def __init__(self, config: EmbeddingConfig) -> None:
73
61
  self._config = config
74
- self._model: object | None = None
75
62
  self._lock = threading.Lock()
76
- self._loaded = False
77
- self._available = True # Set False if model can't load
63
+ self._worker_proc: subprocess.Popen | None = None
64
+ self._available = True
78
65
  self._last_used: float = 0.0
79
66
  self._idle_timer: threading.Timer | None = None
67
+ self._worker_ready = False
80
68
 
81
69
  @property
82
70
  def is_available(self) -> bool:
83
- """Check if embedding service has a usable model."""
84
- if not self._loaded:
85
- self._ensure_loaded()
86
- return self._available and self._model is not None
71
+ """Check if embedding service can produce embeddings."""
72
+ if self._config.is_cloud:
73
+ return bool(self._config.api_endpoint and self._config.api_key)
74
+ return self._available
87
75
 
88
- def unload(self) -> None:
89
- """Explicitly unload the model to free memory.
76
+ @property
77
+ def dimension(self) -> int:
78
+ return self._config.dimension
90
79
 
91
- Called automatically after idle timeout, or manually for cleanup.
92
- The model will lazy-reload on next embed call.
93
- """
80
+ def unload(self) -> None:
81
+ """Kill the worker subprocess to free all memory."""
94
82
  with self._lock:
95
- if self._model is not None:
96
- del self._model
97
- self._model = None
98
- self._loaded = False
99
- import gc
100
- gc.collect()
101
- logger.info("EmbeddingService: model unloaded (idle timeout)")
102
-
103
- def _reset_idle_timer(self) -> None:
104
- """Reset the idle unload timer after each use."""
105
- if self._idle_timer is not None:
106
- self._idle_timer.cancel()
107
- self._idle_timer = threading.Timer(
108
- _IDLE_TIMEOUT_SECONDS, self.unload,
109
- )
110
- self._idle_timer.daemon = True
111
- self._idle_timer.start()
112
- self._last_used = time.time()
83
+ self._kill_worker()
84
+ logger.info("EmbeddingService: worker killed (idle timeout)")
113
85
 
114
86
  # ------------------------------------------------------------------
115
87
  # Public API
116
88
  # ------------------------------------------------------------------
117
89
 
118
- @property
119
- def dimension(self) -> int:
120
- """Expected embedding dimension (from config)."""
121
- return self._config.dimension
122
-
123
- def embed(self, text: str) -> list[float]:
124
- """Embed a single text string.
125
-
126
- Returns:
127
- L2-normalized embedding of exactly ``self.dimension`` floats.
128
-
129
- Raises:
130
- ValueError: If text is empty.
131
- DimensionMismatchError: If output dimension != config.
132
- """
90
+ def embed(self, text: str) -> list[float] | None:
91
+ """Embed a single text string. Returns list of floats or None."""
133
92
  if not text or not text.strip():
134
93
  raise ValueError("Cannot embed empty text")
135
- self._ensure_loaded()
136
- if self._model is None:
94
+ if self._config.is_cloud:
95
+ return self._cloud_embed_single(text)
96
+ result = self._subprocess_embed([text])
97
+ if result is None:
137
98
  return None
138
- vec = self._encode_single(text)
139
- self._validate_dimension(vec)
140
- self._reset_idle_timer()
141
- return vec.tolist()
142
-
143
- def embed_batch(self, texts: list[str]) -> list[list[float]]:
144
- """Embed a batch of texts.
99
+ vec = result[0]
100
+ self._validate_dimension(np.asarray(vec))
101
+ return vec
145
102
 
146
- Returns:
147
- List of L2-normalized embeddings, each ``self.dimension`` floats.
148
-
149
- Raises:
150
- ValueError: If any text is empty or list is empty.
151
- DimensionMismatchError: If any output dimension != config.
152
- """
103
+ def embed_batch(self, texts: list[str]) -> list[list[float] | None]:
104
+ """Embed a batch of texts."""
153
105
  if not texts:
154
106
  raise ValueError("Cannot embed empty batch")
155
- for i, t in enumerate(texts):
156
- if not t or not t.strip():
157
- raise ValueError(f"Text at index {i} is empty")
158
-
159
- self._ensure_loaded()
160
- if self._model is None:
107
+ if self._config.is_cloud:
108
+ return self._cloud_embed_batch(texts)
109
+ result = self._subprocess_embed(texts)
110
+ if result is None:
161
111
  return [None] * len(texts)
162
- vectors = self._encode_batch(texts)
163
- for vec in vectors:
164
- self._validate_dimension(vec)
165
- self._reset_idle_timer()
166
- return [v.tolist() for v in vectors]
112
+ for vec in result:
113
+ if vec is not None:
114
+ self._validate_dimension(np.asarray(vec))
115
+ return result
167
116
 
168
117
  def compute_fisher_params(
169
- self,
170
- embedding: list[float],
118
+ self, embedding: list[float],
171
119
  ) -> tuple[list[float], list[float]]:
172
- """Compute Fisher-Rao parameters from a raw embedding.
173
-
174
- Variance is content-derived (NOT uniform). Dimensions with strong
175
- signal (high absolute value) get LOW variance (high confidence).
176
- Weak-signal dimensions get HIGH variance (uncertainty).
177
-
178
- This heterogeneous variance is what gives Fisher-Rao metric
179
- discriminative power beyond simple cosine similarity.
180
-
181
- Args:
182
- embedding: Raw embedding vector (already L2-normalized).
183
-
184
- Returns:
185
- (mean, variance) — both lists of ``self.dimension`` floats.
186
- Variance values are clamped to [0.3, 2.0].
187
- """
120
+ """Compute Fisher-Rao parameters from a raw embedding."""
188
121
  arr = np.asarray(embedding, dtype=np.float64)
189
122
  norm = float(np.linalg.norm(arr))
190
-
191
123
  if norm < 1e-10:
192
124
  mean = np.zeros(len(arr), dtype=np.float64)
193
125
  variance = np.full(len(arr), _FISHER_VAR_MAX, dtype=np.float64)
194
126
  return mean.tolist(), variance.tolist()
195
-
196
127
  mean = arr / norm
197
-
198
- # Content-derived heterogeneous variance
199
128
  abs_mean = np.abs(mean)
200
129
  max_val = float(np.max(abs_mean)) + 1e-10
201
- signal_strength = abs_mean / max_val # [0, 1]
202
-
203
- # Inverse: strong signal -> low variance, weak -> high
130
+ signal_strength = abs_mean / max_val
204
131
  variance = _FISHER_VAR_MAX - _FISHER_VAR_RANGE * signal_strength
205
132
  variance = np.clip(variance, _FISHER_VAR_MIN, _FISHER_VAR_MAX)
206
-
207
133
  return mean.tolist(), variance.tolist()
208
134
 
209
135
  # ------------------------------------------------------------------
210
- # Internals — model loading
136
+ # Subprocess worker management
211
137
  # ------------------------------------------------------------------
212
138
 
213
- def _ensure_loaded(self) -> None:
214
- """Lazy-load the model on first use (thread-safe)."""
215
- if self._loaded:
216
- return
139
+ def _subprocess_embed(self, texts: list[str]) -> list[list[float]] | None:
140
+ """Send texts to worker subprocess, get embeddings back."""
217
141
  with self._lock:
218
- if self._loaded:
219
- return
220
- if self._config.is_cloud:
221
- # Cloud mode: no local model needed, validate config
222
- if not self._config.api_endpoint or not self._config.api_key:
223
- raise RuntimeError(
224
- "Cloud embedding requires api_endpoint and api_key"
225
- )
226
- logger.info(
227
- "EmbeddingService: cloud mode (%s, %d-dim)",
228
- self._config.deployment_name,
229
- self._config.dimension,
230
- )
231
- else:
232
- self._load_local_model()
233
- self._loaded = True
234
-
235
- def _load_local_model(self) -> None:
236
- """Load sentence-transformers model for local embedding.
237
-
238
- Forces CPU device to prevent GPU memory accumulation:
239
- - Apple Silicon MPS: allocates 4-6 GB Metal shader buffers
240
- - NVIDIA CUDA: allocates GPU VRAM that never releases
241
- - CPU-only: stable ~880 MB footprint, no growth over time
242
- """
142
+ self._ensure_worker()
143
+ if self._worker_proc is None:
144
+ return None
145
+
146
+ req = json.dumps({
147
+ "cmd": "embed",
148
+ "texts": texts,
149
+ "model_name": self._config.model_name,
150
+ "dimension": self._config.dimension,
151
+ }) + "\n"
152
+
153
+ try:
154
+ self._worker_proc.stdin.write(req)
155
+ self._worker_proc.stdin.flush()
156
+ resp_line = self._worker_proc.stdout.readline()
157
+ if not resp_line:
158
+ logger.warning("Worker returned empty response, restarting")
159
+ self._kill_worker()
160
+ return None
161
+ resp = json.loads(resp_line)
162
+ if not resp.get("ok"):
163
+ logger.warning("Worker error: %s", resp.get("error"))
164
+ return None
165
+ self._reset_idle_timer()
166
+ return resp["vectors"]
167
+ except (BrokenPipeError, OSError, json.JSONDecodeError) as exc:
168
+ logger.warning("Worker communication failed: %s", exc)
169
+ self._kill_worker()
170
+ return None
171
+
172
+ def _ensure_worker(self) -> None:
173
+ """Spawn worker subprocess if not running."""
174
+ if self._worker_proc is not None and self._worker_proc.poll() is None:
175
+ return
176
+ self._worker_proc = None
177
+ worker_module = "superlocalmemory.core.embedding_worker"
243
178
  try:
244
- from sentence_transformers import SentenceTransformer
245
- except ImportError:
246
- logger.warning(
247
- "sentence-transformers not installed. Embeddings disabled. "
248
- "Install with: pip install sentence-transformers"
179
+ env = {
180
+ **os.environ,
181
+ "CUDA_VISIBLE_DEVICES": "",
182
+ "PYTORCH_MPS_HIGH_WATERMARK_RATIO": "0.0",
183
+ "PYTORCH_MPS_MEM_LIMIT": "0",
184
+ "PYTORCH_ENABLE_MPS_FALLBACK": "1",
185
+ "TOKENIZERS_PARALLELISM": "false",
186
+ "TORCH_DEVICE": "cpu",
187
+ }
188
+ self._worker_proc = subprocess.Popen(
189
+ [sys.executable, "-m", worker_module],
190
+ stdin=subprocess.PIPE,
191
+ stdout=subprocess.PIPE,
192
+ stderr=subprocess.DEVNULL,
193
+ text=True,
194
+ bufsize=1,
195
+ env=env,
249
196
  )
250
- self._model = None
251
- self._loaded = True
197
+ logger.info("Embedding worker spawned (PID %d)", self._worker_proc.pid)
198
+ self._worker_ready = True
199
+ except Exception as exc:
200
+ logger.warning("Failed to spawn embedding worker: %s", exc)
252
201
  self._available = False
253
- return
254
- model = SentenceTransformer(
255
- self._config.model_name, trust_remote_code=True,
256
- device="cpu",
257
- )
258
- actual_dim = model.get_sentence_embedding_dimension()
259
- if actual_dim != self._config.dimension:
260
- raise DimensionMismatchError(
261
- f"Model '{self._config.model_name}' produces {actual_dim}-dim "
262
- f"embeddings but config expects {self._config.dimension}-dim"
263
- )
264
- self._model = model
265
- logger.info(
266
- "EmbeddingService: local model loaded (%s, %d-dim, device=cpu)",
267
- self._config.model_name,
268
- actual_dim,
202
+ self._worker_proc = None
203
+
204
+ def _kill_worker(self) -> None:
205
+ """Terminate worker subprocess."""
206
+ if self._idle_timer is not None:
207
+ self._idle_timer.cancel()
208
+ self._idle_timer = None
209
+ if self._worker_proc is not None:
210
+ try:
211
+ self._worker_proc.stdin.write('{"cmd":"quit"}\n')
212
+ self._worker_proc.stdin.flush()
213
+ self._worker_proc.wait(timeout=3)
214
+ except Exception:
215
+ try:
216
+ self._worker_proc.kill()
217
+ except Exception:
218
+ pass
219
+ self._worker_proc = None
220
+ self._worker_ready = False
221
+
222
+ def _reset_idle_timer(self) -> None:
223
+ """Reset idle timer — kills worker after 2 min inactivity."""
224
+ if self._idle_timer is not None:
225
+ self._idle_timer.cancel()
226
+ self._idle_timer = threading.Timer(
227
+ _IDLE_TIMEOUT_SECONDS, self.unload,
269
228
  )
229
+ self._idle_timer.daemon = True
230
+ self._idle_timer.start()
231
+ self._last_used = time.time()
270
232
 
271
233
  # ------------------------------------------------------------------
272
- # Internals — encoding
234
+ # Cloud embedding (no subprocess needed — just HTTP)
273
235
  # ------------------------------------------------------------------
274
236
 
275
- def _encode_single(self, text: str) -> NDArray[np.float32]:
276
- """Encode one text. Dispatches to local or cloud."""
277
- self._ensure_loaded()
278
- if self._config.is_cloud:
279
- return self._cloud_embed([text])[0]
280
- return self._local_embed_batch([text])[0]
237
+ def _cloud_embed_single(self, text: str) -> list[float]:
238
+ vecs = self._cloud_embed_batch([text])
239
+ return vecs[0]
281
240
 
282
- def _encode_batch(self, texts: list[str]) -> list[NDArray[np.float32]]:
283
- """Encode a batch. Dispatches to local or cloud."""
284
- self._ensure_loaded()
285
- if self._config.is_cloud:
286
- return self._cloud_embed(texts)
287
- return self._local_embed_batch(texts)
288
-
289
- def _local_embed_batch(
290
- self,
291
- texts: list[str],
292
- ) -> list[NDArray[np.float32]]:
293
- """Encode via local sentence-transformers (L2-normalized)."""
294
- if self._model is None:
295
- raise RuntimeError("Local model not loaded")
296
- vecs = self._model.encode(texts, normalize_embeddings=True)
297
- if isinstance(vecs, np.ndarray) and vecs.ndim == 2:
298
- return [vecs[i] for i in range(vecs.shape[0])]
299
- return [np.asarray(v, dtype=np.float32) for v in vecs]
300
-
301
- def _cloud_embed(
302
- self,
303
- texts: list[str],
304
- *,
305
- max_retries: int = 3,
306
- ) -> list[NDArray[np.float32]]:
307
- """Encode via Azure OpenAI embedding API with retry logic.
308
-
309
- Raises on failure — NEVER falls back to local model.
310
- """
241
+ def _cloud_embed_batch(
242
+ self, texts: list[str], *, max_retries: int = 3,
243
+ ) -> list[list[float]]:
244
+ """Encode via Azure OpenAI embedding API with retry."""
311
245
  import httpx
312
-
313
246
  url = (
314
247
  f"{self._config.api_endpoint.rstrip('/')}/openai/deployments/"
315
248
  f"{self._config.deployment_name}/embeddings"
@@ -320,7 +253,6 @@ class EmbeddingService:
320
253
  "api-key": self._config.api_key,
321
254
  }
322
255
  body = {"input": texts, "model": self._config.deployment_name}
323
-
324
256
  last_error: Exception | None = None
325
257
  for attempt in range(max_retries):
326
258
  try:
@@ -328,39 +260,23 @@ class EmbeddingService:
328
260
  resp = client.post(url, headers=headers, json=body)
329
261
  resp.raise_for_status()
330
262
  data = resp.json()
331
- results: list[NDArray[np.float32]] = []
263
+ results = []
332
264
  for item in sorted(data["data"], key=lambda d: d["index"]):
333
- vec = np.asarray(item["embedding"], dtype=np.float32)
334
- results.append(vec)
265
+ results.append(item["embedding"])
335
266
  return results
336
267
  except Exception as exc:
337
268
  last_error = exc
338
- wait = 2 ** attempt # 1s, 2s, 4s
339
- logger.warning(
340
- "Cloud embed attempt %d/%d failed: %s (retry in %ds)",
341
- attempt + 1,
342
- max_retries,
343
- exc,
344
- wait,
345
- )
346
269
  if attempt < max_retries - 1:
347
- time.sleep(wait)
348
-
349
- raise RuntimeError(
350
- f"Cloud embedding failed after {max_retries} attempts: "
351
- f"{last_error}"
352
- )
270
+ time.sleep(2 ** attempt)
271
+ raise RuntimeError(f"Cloud embedding failed: {last_error}")
353
272
 
354
273
  # ------------------------------------------------------------------
355
274
  # Validation
356
275
  # ------------------------------------------------------------------
357
276
 
358
277
  def _validate_dimension(self, vec: NDArray) -> None:
359
- """Hard validation — crash on mismatch, never silently fall back."""
360
278
  actual = len(vec)
361
279
  if actual != self._config.dimension:
362
280
  raise DimensionMismatchError(
363
- f"Embedding dimension {actual} != "
364
- f"expected {self._config.dimension}. "
365
- f"This is a HARD failure — check your model/API config."
281
+ f"Embedding dimension {actual} != expected {self._config.dimension}"
366
282
  )