superlocalmemory 3.3.15 → 3.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "superlocalmemory",
3
- "version": "3.3.15",
3
+ "version": "3.3.17",
4
4
  "description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
5
5
  "keywords": [
6
6
  "ai-memory",
package/pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "superlocalmemory"
3
- version = "3.3.15"
3
+ version = "3.3.17"
4
4
  description = "Information-geometric agent memory with mathematical guarantees"
5
5
  readme = "README.md"
6
6
  license = {text = "MIT"}
@@ -144,6 +144,16 @@ def _worker_main() -> None:
144
144
  _respond({"ok": True, "vectors": result, "dim": dim})
145
145
  except Exception as exc:
146
146
  _respond({"ok": False, "error": str(exc)})
147
+
148
+ # V3.3.16: RSS watchdog — self-terminate if memory exceeds ~2.5GB (the 2500 MB check below).
149
+ # PyTorch on ARM64 Mac never returns memory to OS. After ~200 embeds
150
+ # a worker that started at 300MB grows to 17GB+. Parent auto-respawns
151
+ # a fresh worker on next request (existing mechanism in embeddings.py).
152
+ import resource
153
+ rss_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 / 1024
154
+ if rss_mb > 2500:
155
+ sys.exit(0)
156
+
147
157
  continue
148
158
 
149
159
  _respond({"ok": False, "error": f"Unknown command: {cmd}"})
@@ -207,11 +207,29 @@ class EmbeddingService:
207
207
  return resp["vectors"]
208
208
  except (BrokenPipeError, OSError, json.JSONDecodeError) as exc:
209
209
  logger.warning(
210
- "Embedding worker communication failed: %s. "
211
- "Run 'slm doctor' to check dependencies and Python version.",
210
+ "Embedding worker communication failed: %s — respawning.",
212
211
  exc,
213
212
  )
214
213
  self._kill_worker()
214
+ # V3.3.16: Auto-retry once after worker death (RSS watchdog
215
+ # or crash). Respawn + re-send instead of returning None.
216
+ try:
217
+ self._ensure_worker()
218
+ if self._worker_proc is not None:
219
+ self._worker_proc.stdin.write(req)
220
+ self._worker_proc.stdin.flush()
221
+ resp_line = self._readline_with_timeout(
222
+ self._worker_proc.stdout,
223
+ _SUBPROCESS_RESPONSE_TIMEOUT,
224
+ )
225
+ if resp_line:
226
+ resp = json.loads(resp_line)
227
+ if resp.get("ok"):
228
+ self._reset_idle_timer()
229
+ self._request_count = 1
230
+ return resp["vectors"]
231
+ except Exception:
232
+ self._kill_worker()
215
233
  return None
216
234
 
217
235
  @staticmethod
@@ -24,6 +24,36 @@ from superlocalmemory.storage.models import Mode, RecallResponse
24
24
  logger = logging.getLogger(__name__)
25
25
 
26
26
 
27
+ # ---------------------------------------------------------------------------
28
+ # V3.3.16: Module-level singletons for recall hot-path objects.
29
+ # Prevents creating new BehavioralTracker / ForgettingScheduler per recall
30
+ # (304 recalls = 304 objects that fragment pymalloc arenas → 25GB).
31
+ # ---------------------------------------------------------------------------
32
+
33
+ _behavioral_tracker_cache: dict[int, object] = {}
34
+ _forgetting_scheduler_cache: dict[int, object] = {}
35
+
36
+
37
+ def _get_behavioral_tracker(db: Any) -> Any:
38
+ """Get or create a cached BehavioralTracker for this DB instance."""
39
+ key = id(db)
40
+ if key not in _behavioral_tracker_cache:
41
+ from superlocalmemory.learning.behavioral import BehavioralTracker
42
+ _behavioral_tracker_cache[key] = BehavioralTracker(db)
43
+ return _behavioral_tracker_cache[key]
44
+
45
+
46
+ def _get_forgetting_scheduler(db: Any, config: Any) -> Any:
47
+ """Get or create a cached ForgettingScheduler for this DB instance."""
48
+ key = id(db)
49
+ if key not in _forgetting_scheduler_cache:
50
+ from superlocalmemory.learning.forgetting_scheduler import ForgettingScheduler
51
+ from superlocalmemory.math.ebbinghaus import EbbinghausCurve
52
+ ebbinghaus = EbbinghausCurve(config.forgetting)
53
+ _forgetting_scheduler_cache[key] = ForgettingScheduler(db, ebbinghaus, config.forgetting)
54
+ return _forgetting_scheduler_cache[key]
55
+
56
+
27
57
  # ---------------------------------------------------------------------------
28
58
  # apply_adaptive_ranking (was MemoryEngine._apply_adaptive_ranking)
29
59
  # ---------------------------------------------------------------------------
@@ -192,11 +222,11 @@ def run_recall(
192
222
  except Exception as exc:
193
223
  logger.debug("Access log batch store failed: %s", exc)
194
224
 
195
- # V3.3.12: Wire BehavioralTracker.record_query() into live recall pipeline
225
+ # V3.3.16: Behavioral tracking + spaced repetition use module-level
226
+ # singletons to avoid creating new objects per recall (was causing
227
+ # object accumulation across 304 benchmark recalls).
196
228
  try:
197
- from superlocalmemory.learning.behavioral import BehavioralTracker
198
- _tracker = BehavioralTracker(db)
199
- _tracker.record_query(
229
+ _get_behavioral_tracker(db).record_query(
200
230
  profile_id=profile_id, query=query,
201
231
  query_type=response.query_type,
202
232
  result_count=len(response.results),
@@ -204,15 +234,11 @@ def run_recall(
204
234
  except Exception as exc:
205
235
  logger.debug("Behavioral tracking: %s", exc)
206
236
 
207
- # V3.3.12: Spaced repetition update on recall (Ebbinghaus on_access_event)
208
237
  if response.results:
209
238
  try:
210
- from superlocalmemory.learning.forgetting_scheduler import ForgettingScheduler
211
- from superlocalmemory.math.ebbinghaus import EbbinghausCurve
212
- _ebbinghaus = EbbinghausCurve(config.forgetting)
213
- _fsched = ForgettingScheduler(db, _ebbinghaus, config.forgetting)
239
+ fsched = _get_forgetting_scheduler(db, config)
214
240
  for r in response.results[:10]:
215
- _fsched.on_access_event(r.fact.fact_id, profile_id)
241
+ fsched.on_access_event(r.fact.fact_id, profile_id)
216
242
  except Exception as exc:
217
243
  logger.debug("Spaced repetition update: %s", exc)
218
244
 
@@ -237,30 +263,31 @@ def run_recall(
237
263
  for r in response.results:
238
264
  trust_scorer.update_on_access("fact", r.fact.fact_id, profile_id)
239
265
 
240
- # Fisher Bayesian update on recall
241
- q_emb = embedder.embed(query) if embedder else None
266
+ # Fisher Bayesian update on recall — narrows variance on accessed facts
267
+ # so they score higher on subsequent recalls (critical for benchmark: +24pp).
268
+ # V3.3.16: Reuse query embedding from retrieval engine cache instead of
269
+ # calling embedder.embed() again (which was the memory leak source).
242
270
  q_var_arr = None
243
- if embedder and q_emb:
244
- _, q_var_list = embedder.compute_fisher_params(q_emb)
245
- import numpy as _np
246
- q_var_arr = _np.array(q_var_list, dtype=_np.float64)
271
+ if embedder and hasattr(retrieval_engine, '_query_embedding_cache'):
272
+ cached_emb = retrieval_engine._query_embedding_cache.get(query)
273
+ if cached_emb is not None:
274
+ import numpy as _np
275
+ _, q_var_list = embedder.compute_fisher_params(cached_emb)
276
+ q_var_arr = _np.array(q_var_list, dtype=_np.float64)
247
277
 
248
278
  for r in response.results:
249
279
  updates: dict[str, object] = {
250
280
  "access_count": r.fact.access_count + 1,
251
281
  }
252
- # Bayesian variance narrowing after 3+ accesses
253
282
  if (q_var_arr is not None
254
283
  and r.fact.fisher_variance
255
284
  and len(r.fact.fisher_variance) == len(q_var_arr)
256
285
  and r.fact.access_count >= 3):
257
286
  import numpy as _np
258
287
  f_var = _np.array(r.fact.fisher_variance, dtype=_np.float64)
259
- # Conjugate Gaussian update: 1/new_var = 1/f_var + 1/q_var
260
288
  new_var = 1.0 / (1.0 / _np.maximum(f_var, 0.05) + 1.0 / _np.maximum(q_var_arr, 0.05))
261
289
  new_var = _np.clip(new_var, 0.05, 2.0)
262
290
  updates["fisher_variance"] = new_var.tolist()
263
-
264
291
  db.update_fact(r.fact.fact_id, updates)
265
292
 
266
293
  # Post-operation hooks (audit, trust signal, learning)
@@ -321,6 +321,13 @@ def _worker_main() -> None:
321
321
  except Exception as exc:
322
322
  _respond({"ok": False, "error": str(exc)})
323
323
 
324
+ # V3.3.16: RSS watchdog — self-terminate if memory exceeds ~2.5GB (the 2500 MB check below).
325
+ # Parent auto-respawns a fresh worker on next request.
326
+ import resource
327
+ rss_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 / 1024
328
+ if rss_mb > 2500:
329
+ sys.exit(0)
330
+
324
331
 
325
332
  def _respond(data: dict) -> None:
326
333
  sys.stdout.write(json.dumps(data) + "\n")
@@ -328,4 +335,7 @@ def _respond(data: dict) -> None:
328
335
 
329
336
 
330
337
  if __name__ == "__main__":
331
- _worker_main()
338
+ try:
339
+ _worker_main()
340
+ except KeyboardInterrupt:
341
+ sys.exit(0)
@@ -118,10 +118,33 @@ def _worker_main() -> None:
118
118
  name = req.get("model_name", "cross-encoder/ms-marco-MiniLM-L-12-v2")
119
119
  backend = req.get("backend", "onnx")
120
120
  model, active_backend, model_name = _load_model(name, backend)
121
+ # V3.3.16: Run real inference to trigger ONNX CoreML JIT compilation.
122
+ # Without this, first real rerank call triggers 30-60s compilation
123
+ # that exceeds the caller's timeout, killing the worker.
124
+ warmup_ok = False
125
+ if model is not None:
126
+ try:
127
+ # Use 60 pairs (realistic batch size) to trigger CoreML
128
+ # compilation for the actual workload. 3 pairs compiled a
129
+ # different execution plan that got recompiled on 60 pairs.
130
+ dummy_pairs = [
131
+ (f"What happened to person {i}?", f"Person {i} went to location {i} and did activity {i} last summer with friends.")
132
+ for i in range(60)
133
+ ]
134
+ try:
135
+ import torch
136
+ with torch.inference_mode():
137
+ _scores = model.predict(dummy_pairs)
138
+ except ImportError:
139
+ _scores = model.predict(dummy_pairs)
140
+ warmup_ok = True
141
+ except Exception:
142
+ pass
121
143
  _respond({
122
144
  "ok": model is not None,
123
145
  "backend": active_backend,
124
146
  "model": model_name,
147
+ "warmup_inference": warmup_ok,
125
148
  })
126
149
  continue
127
150
 
@@ -153,6 +176,13 @@ def _worker_main() -> None:
153
176
  })
154
177
  except Exception as exc:
155
178
  _respond({"ok": False, "error": str(exc)})
179
+
180
+ # V3.3.16: RSS watchdog — same as embedding_worker
181
+ import resource
182
+ rss_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 / 1024
183
+ if rss_mb > 2500:
184
+ sys.exit(0)
185
+
156
186
  continue
157
187
 
158
188
  if cmd == "score":
@@ -338,31 +338,27 @@ class RetrievalEngine:
338
338
  if not candidates:
339
339
  return fused
340
340
 
341
- # Bug 3 fix: strip speaker tags from content before CE scoring
342
- clean_candidates: list[tuple[AtomicFact, float]] = []
343
- for fact, score in candidates:
344
- cleaned_content = re.sub(r'^\[[A-Za-z]+\]:\s*', '', fact.content)
345
- clean_fact = AtomicFact(
346
- fact_id=fact.fact_id, memory_id=fact.memory_id,
347
- profile_id=fact.profile_id, content=cleaned_content,
348
- fact_type=fact.fact_type, entities=fact.entities,
349
- canonical_entities=fact.canonical_entities,
350
- observation_date=fact.observation_date,
351
- referenced_date=fact.referenced_date,
352
- confidence=fact.confidence, importance=fact.importance,
353
- evidence_count=fact.evidence_count,
354
- access_count=fact.access_count,
355
- embedding=fact.embedding, created_at=fact.created_at,
356
- )
357
- clean_candidates.append((clean_fact, score))
341
+ # V3.3.16: Strip speaker tags WITHOUT copying full AtomicFact objects.
342
+ # Previously created full copies including 768-dim embeddings (~6KB each),
343
+ # which over 304 recalls caused pymalloc arena fragmentation → 25GB.
344
+ # Now: temporarily patch .content on originals, rerank, then restore.
345
+ originals: list[tuple[AtomicFact, str]] = [] # (fact, original_content)
346
+ for fact, _ in candidates:
347
+ orig = fact.content
348
+ fact.content = re.sub(r'^\[[A-Za-z]+\]:\s*', '', orig)
349
+ originals.append((fact, orig))
358
350
 
359
351
  try:
360
352
  scored = self._reranker.rerank( # type: ignore[union-attr]
361
- query, clean_candidates, top_k=len(clean_candidates),
353
+ query, candidates, top_k=len(candidates),
362
354
  )
363
355
  except Exception as exc:
364
356
  logger.warning("Cross-encoder rerank failed: %s", exc)
365
357
  return fused
358
+ finally:
359
+ # Restore original content (with speaker tags)
360
+ for fact, orig_content in originals:
361
+ fact.content = orig_content
366
362
 
367
363
  score_map = {fact.fact_id: score for fact, score in scored}
368
364
 
@@ -94,8 +94,10 @@ class CrossEncoderReranker:
94
94
  def _start_background_warmup(self) -> None:
95
95
  """Start worker and load model in background thread.
96
96
 
97
- Returns immediately. The worker loads the model in parallel
98
- with the rest of engine initialization and the first recall.
97
+ V3.3.16: Uses _send_request (lock-protected) instead of raw
98
+ stdin/stdout access. Previous code wrote to stdin without the
99
+ lock, creating a race where the warmup's readline thread could
100
+ steal responses meant for _send_request → deadlock → timeout.
99
101
  """
100
102
  if self._worker_loading or self._model_loaded:
101
103
  return
@@ -106,26 +108,18 @@ class CrossEncoderReranker:
106
108
  self._ensure_worker()
107
109
  if self._worker_proc is None:
108
110
  return
109
- # Send load command and wait for response
110
- req = json.dumps({
111
+ resp = self._send_request({
111
112
  "cmd": "load",
112
113
  "model_name": self._model_name,
113
114
  "backend": self._backend,
114
- }) + "\n"
115
- self._worker_proc.stdin.write(req)
116
- self._worker_proc.stdin.flush()
117
- resp_line = self._readline_with_timeout(
118
- self._worker_proc.stdout, _SUBPROCESS_RESPONSE_TIMEOUT,
119
- )
120
- if resp_line:
121
- resp = json.loads(resp_line)
122
- if resp.get("ok"):
123
- self._model_loaded = True
124
- logger.info(
125
- "Reranker worker warm (backend=%s)",
126
- resp.get("backend", "?"),
127
- )
128
- self._reset_idle_timer()
115
+ }, timeout=_SUBPROCESS_RESPONSE_TIMEOUT)
116
+ if resp and resp.get("ok"):
117
+ self._model_loaded = True
118
+ logger.info(
119
+ "Reranker worker warm (backend=%s, warmup_inference=%s)",
120
+ resp.get("backend", "?"),
121
+ resp.get("warmup_inference", False),
122
+ )
129
123
  except Exception as exc:
130
124
  logger.debug("Background reranker warmup failed: %s", exc)
131
125
  finally:
@@ -321,13 +315,14 @@ class CrossEncoderReranker:
321
315
 
322
316
  documents = [fact.content for fact, _ in candidates]
323
317
 
324
- # V3.3.12: Increased timeout 10s→60s — L-12-v2 needs PyTorch + ONNX load.
325
- # Critical: Paper 2 ablation showed -30.7pp without reranking.
318
+ # V3.3.16: Timeout 180s — ONNX CoreML compilation can take 30-60s on
319
+ # first inference even after model load. The warmup_inference in the
320
+ # worker should prevent this, but 180s is a safety net.
326
321
  resp = self._send_request({
327
322
  "cmd": "rerank",
328
323
  "query": query,
329
324
  "documents": documents,
330
- }, timeout=60.0)
325
+ }, timeout=180.0)
331
326
 
332
327
  if resp is None or not resp.get("ok"):
333
328
  # Fallback: return by existing score