superlocalmemory 3.3.16 → 3.3.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "superlocalmemory",
3
- "version": "3.3.16",
3
+ "version": "3.3.18",
4
4
  "description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
5
5
  "keywords": [
6
6
  "ai-memory",
package/pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "superlocalmemory"
3
- version = "3.3.16"
3
+ version = "3.3.18"
4
4
  description = "Information-geometric agent memory with mathematical guarantees"
5
5
  readme = "README.md"
6
6
  license = {text = "MIT"}
@@ -155,7 +155,7 @@ class RetrievalConfig:
155
155
  # Reranking (V3.3.2: ONNX backend enabled for all modes)
156
156
  use_cross_encoder: bool = True
157
157
  cross_encoder_model: str = "cross-encoder/ms-marco-MiniLM-L-12-v2"
158
- cross_encoder_backend: str = "onnx" # "onnx" (~200MB) or "" (PyTorch, ~1.5GB)
158
+ cross_encoder_backend: str = "" # "" = PyTorch (~500MB stable), "onnx" = ONNX (leaks on ARM64 CoreML)
159
159
 
160
160
  # Agentic (Mode C only)
161
161
  agentic_max_rounds: int = 3
@@ -35,6 +35,8 @@ os.environ["PYTORCH_MPS_MEM_LIMIT"] = "0"
35
35
  os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
36
36
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
37
37
  os.environ["TORCH_DEVICE"] = "cpu"
38
+ # V3.3.17: Disable CoreML EP for ONNX Runtime — uses 3-5GB on ARM64 Mac.
39
+ os.environ["ORT_DISABLE_COREML"] = "1"
38
40
 
39
41
  # SIGTERM bridge: Docker/systemd send SIGTERM to stop processes.
40
42
  # Without this, the worker ignores SIGTERM and becomes a zombie.
@@ -65,6 +67,34 @@ def _start_parent_watchdog() -> None:
65
67
  t.start()
66
68
 
67
69
 
70
+ def _load_embedding_model(name: str) -> tuple:
71
+ """Load embedding model. ONNX first (no memory leak), PyTorch fallback.
72
+
73
+ V3.3.17: PyTorch SentenceTransformer on ARM64 Mac leaks memory —
74
+ grows from 300MB to 17GB after ~200 encode calls. ONNX Runtime
75
+ has no such issue. Same approach as CrossEncoder ONNX migration.
76
+
77
+ Returns (model, backend_name) or (None, "").
78
+ """
79
+ from sentence_transformers import SentenceTransformer
80
+
81
+ # Tier 1: ONNX (stable memory, ~200MB footprint)
82
+ try:
83
+ m = SentenceTransformer(name, backend="onnx", trust_remote_code=True)
84
+ return m, "onnx"
85
+ except Exception:
86
+ pass
87
+
88
+ # Tier 2: PyTorch CPU (stable at ~1.4GB after 100+ calls, verified)
89
+ try:
90
+ import torch
91
+ with torch.inference_mode():
92
+ m = SentenceTransformer(name, trust_remote_code=True, device="cpu")
93
+ return m, "pytorch"
94
+ except Exception:
95
+ return None, ""
96
+
97
+
68
98
  def _worker_main() -> None:
69
99
  """Main loop: read JSON requests from stdin, write responses to stdout."""
70
100
  _start_parent_watchdog() # V3.3.7: self-terminate if parent dies
@@ -97,18 +127,17 @@ def _worker_main() -> None:
97
127
  if cmd == "load":
98
128
  name = req.get("model_name", "nomic-ai/nomic-embed-text-v1.5")
99
129
  expected_dim = req.get("dimension", 768)
100
- try:
101
- from sentence_transformers import SentenceTransformer
102
- model = SentenceTransformer(name, trust_remote_code=True, device="cpu")
130
+ model, active_backend = _load_embedding_model(name)
131
+ if model is not None:
103
132
  dim = model.get_sentence_embedding_dimension()
104
133
  if dim != expected_dim:
105
134
  _respond({"ok": False, "error": f"Dimension mismatch: {dim} != {expected_dim}"})
106
135
  model = None
107
136
  continue
108
137
  model_name = name
109
- _respond({"ok": True, "dim": dim, "model": name})
110
- except Exception as exc:
111
- _respond({"ok": False, "error": str(exc)})
138
+ _respond({"ok": True, "dim": dim, "model": name, "backend": active_backend})
139
+ else:
140
+ _respond({"ok": False, "error": "Model load failed"})
112
141
  continue
113
142
 
114
143
  if cmd == "embed":
@@ -117,26 +146,16 @@ def _worker_main() -> None:
117
146
  _respond({"ok": False, "error": "No texts provided"})
118
147
  continue
119
148
  if model is None:
120
- # Auto-load if not yet loaded
121
149
  name = req.get("model_name", "nomic-ai/nomic-embed-text-v1.5")
122
- expected_dim = req.get("dimension", 768)
123
- try:
124
- from sentence_transformers import SentenceTransformer
125
- model = SentenceTransformer(name, trust_remote_code=True, device="cpu")
150
+ model, active_backend = _load_embedding_model(name)
151
+ if model is not None:
126
152
  dim = model.get_sentence_embedding_dimension()
127
153
  model_name = name
128
- except Exception as exc:
129
- _respond({"ok": False, "error": f"Model load failed: {exc}"})
154
+ else:
155
+ _respond({"ok": False, "error": "Model load failed"})
130
156
  continue
131
157
  try:
132
- # torch.inference_mode prevents autograd graph accumulation
133
- # which causes silent memory leaks over long-running sessions.
134
- try:
135
- import torch
136
- with torch.inference_mode():
137
- vecs = model.encode(texts, normalize_embeddings=True)
138
- except ImportError:
139
- vecs = model.encode(texts, normalize_embeddings=True)
158
+ vecs = model.encode(texts, normalize_embeddings=True)
140
159
  if isinstance(vecs, np.ndarray) and vecs.ndim == 2:
141
160
  result = [vecs[i].tolist() for i in range(vecs.shape[0])]
142
161
  else:
@@ -144,6 +163,16 @@ def _worker_main() -> None:
144
163
  _respond({"ok": True, "vectors": result, "dim": dim})
145
164
  except Exception as exc:
146
165
  _respond({"ok": False, "error": str(exc)})
166
+
167
+ # V3.3.16: RSS watchdog — self-terminate if memory exceeds ~2.5GB (2500 MB threshold below).
168
+ # PyTorch on ARM64 Mac never returns memory to OS. After ~200 embeds
169
+ # a worker that started at 300MB grows to 17GB+. Parent auto-respawns
170
+ # a fresh worker on next request (existing mechanism in embeddings.py).
171
+ import resource
172
+ rss_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 / 1024
173
+ if rss_mb > 2500:
174
+ sys.exit(0)
175
+
147
176
  continue
148
177
 
149
178
  _respond({"ok": False, "error": f"Unknown command: {cmd}"})
@@ -207,11 +207,29 @@ class EmbeddingService:
207
207
  return resp["vectors"]
208
208
  except (BrokenPipeError, OSError, json.JSONDecodeError) as exc:
209
209
  logger.warning(
210
- "Embedding worker communication failed: %s. "
211
- "Run 'slm doctor' to check dependencies and Python version.",
210
+ "Embedding worker communication failed: %s — respawning.",
212
211
  exc,
213
212
  )
214
213
  self._kill_worker()
214
+ # V3.3.16: Auto-retry once after worker death (RSS watchdog
215
+ # or crash). Respawn + re-send instead of returning None.
216
+ try:
217
+ self._ensure_worker()
218
+ if self._worker_proc is not None:
219
+ self._worker_proc.stdin.write(req)
220
+ self._worker_proc.stdin.flush()
221
+ resp_line = self._readline_with_timeout(
222
+ self._worker_proc.stdout,
223
+ _SUBPROCESS_RESPONSE_TIMEOUT,
224
+ )
225
+ if resp_line:
226
+ resp = json.loads(resp_line)
227
+ if resp.get("ok"):
228
+ self._reset_idle_timer()
229
+ self._request_count = 1
230
+ return resp["vectors"]
231
+ except Exception:
232
+ self._kill_worker()
215
233
  return None
216
234
 
217
235
  @staticmethod
@@ -263,16 +263,32 @@ def run_recall(
263
263
  for r in response.results:
264
264
  trust_scorer.update_on_access("fact", r.fact.fact_id, profile_id)
265
265
 
266
- # V3.3.16: Access count update only — no redundant embedding call.
267
- # Fisher Bayesian variance update moved to store_pipeline (write-time)
268
- # to avoid per-recall memory pressure from numpy array creation.
269
- # Previously: embedder.embed(query) here duplicated the embed call
270
- # already done in retrieval engine, creating 768-dim numpy arrays
271
- # 304 times during benchmark → pymalloc arena fragmentation → 25GB.
266
+ # Fisher Bayesian update on recall narrows variance on accessed facts
267
+ # so they score higher on subsequent recalls (critical for benchmark: +24pp).
268
+ # V3.3.16: Reuse query embedding from retrieval engine cache instead of
269
+ # calling embedder.embed() again (which was the memory leak source).
270
+ q_var_arr = None
271
+ if embedder and hasattr(retrieval_engine, '_query_embedding_cache'):
272
+ cached_emb = retrieval_engine._query_embedding_cache.get(query)
273
+ if cached_emb is not None:
274
+ import numpy as _np
275
+ _, q_var_list = embedder.compute_fisher_params(cached_emb)
276
+ q_var_arr = _np.array(q_var_list, dtype=_np.float64)
277
+
272
278
  for r in response.results:
273
- db.update_fact(r.fact.fact_id, {
279
+ updates: dict[str, object] = {
274
280
  "access_count": r.fact.access_count + 1,
275
- })
281
+ }
282
+ if (q_var_arr is not None
283
+ and r.fact.fisher_variance
284
+ and len(r.fact.fisher_variance) == len(q_var_arr)
285
+ and r.fact.access_count >= 3):
286
+ import numpy as _np
287
+ f_var = _np.array(r.fact.fisher_variance, dtype=_np.float64)
288
+ new_var = 1.0 / (1.0 / _np.maximum(f_var, 0.05) + 1.0 / _np.maximum(q_var_arr, 0.05))
289
+ new_var = _np.clip(new_var, 0.05, 2.0)
290
+ updates["fisher_variance"] = new_var.tolist()
291
+ db.update_fact(r.fact.fact_id, updates)
276
292
 
277
293
  # Post-operation hooks (audit, trust signal, learning)
278
294
  hook_ctx["result_count"] = len(response.results)
@@ -321,6 +321,13 @@ def _worker_main() -> None:
321
321
  except Exception as exc:
322
322
  _respond({"ok": False, "error": str(exc)})
323
323
 
324
+ # V3.3.16: RSS watchdog — self-terminate if memory exceeds ~2.5GB (2500 MB threshold below).
325
+ # Parent auto-respawns a fresh worker on next request.
326
+ import resource
327
+ rss_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 / 1024
328
+ if rss_mb > 2500:
329
+ sys.exit(0)
330
+
324
331
 
325
332
  def _respond(data: dict) -> None:
326
333
  sys.stdout.write(json.dumps(data) + "\n")
@@ -328,4 +335,7 @@ def _respond(data: dict) -> None:
328
335
 
329
336
 
330
337
  if __name__ == "__main__":
331
- _worker_main()
338
+ try:
339
+ _worker_main()
340
+ except KeyboardInterrupt:
341
+ sys.exit(0)
@@ -40,6 +40,9 @@ os.environ["PYTORCH_MPS_MEM_LIMIT"] = "0"
40
40
  os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
41
41
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
42
42
  os.environ["TORCH_DEVICE"] = "cpu"
43
+ # V3.3.17: Disable CoreML EP for ONNX Runtime. CoreML compiles execution
44
+ # plans that consume 3-5GB on ARM64 Mac. CPU EP is ~500MB and fast enough.
45
+ os.environ["ORT_DISABLE_COREML"] = "1"
43
46
 
44
47
  # SIGTERM bridge for Docker/systemd
45
48
  if sys.platform != "win32":
@@ -124,10 +127,12 @@ def _worker_main() -> None:
124
127
  warmup_ok = False
125
128
  if model is not None:
126
129
  try:
130
+ # Use 60 pairs (realistic batch size) to trigger CoreML
131
+ # compilation for the actual workload. 3 pairs compiled a
132
+ # different execution plan that got recompiled on 60 pairs.
127
133
  dummy_pairs = [
128
- ("What is the capital of France?", "Paris is the capital of France."),
129
- ("Who wrote Hamlet?", "Shakespeare wrote many plays."),
130
- ("What color is the sky?", "The sky is blue on a clear day."),
134
+ (f"What happened to person {i}?", f"Person {i} went to location {i} and did activity {i} last summer with friends.")
135
+ for i in range(60)
131
136
  ]
132
137
  try:
133
138
  import torch
@@ -174,6 +179,13 @@ def _worker_main() -> None:
174
179
  })
175
180
  except Exception as exc:
176
181
  _respond({"ok": False, "error": str(exc)})
182
+
183
+ # V3.3.16: RSS watchdog — same as embedding_worker
184
+ import resource
185
+ rss_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 / 1024
186
+ if rss_mb > 2500:
187
+ sys.exit(0)
188
+
177
189
  continue
178
190
 
179
191
  if cmd == "score":
@@ -94,8 +94,10 @@ class CrossEncoderReranker:
94
94
  def _start_background_warmup(self) -> None:
95
95
  """Start worker and load model in background thread.
96
96
 
97
- Returns immediately. The worker loads the model in parallel
98
- with the rest of engine initialization and the first recall.
97
+ V3.3.16: Uses _send_request (lock-protected) instead of raw
98
+ stdin/stdout access. Previous code wrote to stdin without the
99
+ lock, creating a race where the warmup's readline thread could
100
+ steal responses meant for _send_request → deadlock → timeout.
99
101
  """
100
102
  if self._worker_loading or self._model_loaded:
101
103
  return
@@ -106,26 +108,18 @@ class CrossEncoderReranker:
106
108
  self._ensure_worker()
107
109
  if self._worker_proc is None:
108
110
  return
109
- # Send load command and wait for response
110
- req = json.dumps({
111
+ resp = self._send_request({
111
112
  "cmd": "load",
112
113
  "model_name": self._model_name,
113
114
  "backend": self._backend,
114
- }) + "\n"
115
- self._worker_proc.stdin.write(req)
116
- self._worker_proc.stdin.flush()
117
- resp_line = self._readline_with_timeout(
118
- self._worker_proc.stdout, _SUBPROCESS_RESPONSE_TIMEOUT,
119
- )
120
- if resp_line:
121
- resp = json.loads(resp_line)
122
- if resp.get("ok"):
123
- self._model_loaded = True
124
- logger.info(
125
- "Reranker worker warm (backend=%s)",
126
- resp.get("backend", "?"),
127
- )
128
- self._reset_idle_timer()
115
+ }, timeout=_SUBPROCESS_RESPONSE_TIMEOUT)
116
+ if resp and resp.get("ok"):
117
+ self._model_loaded = True
118
+ logger.info(
119
+ "Reranker worker warm (backend=%s, warmup_inference=%s)",
120
+ resp.get("backend", "?"),
121
+ resp.get("warmup_inference", False),
122
+ )
129
123
  except Exception as exc:
130
124
  logger.debug("Background reranker warmup failed: %s", exc)
131
125
  finally: