npm - superlocalmemory - Versions diffs - 3.3.16 → 3.3.17 - Mend

superlocalmemory 3.3.16 → 3.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/package.json +1 -1
package/pyproject.toml +1 -1
package/src/superlocalmemory/core/embedding_worker.py +10 -0
package/src/superlocalmemory/core/embeddings.py +20 -2
package/src/superlocalmemory/core/recall_pipeline.py +24 -8
package/src/superlocalmemory/core/recall_worker.py +11 -1
package/src/superlocalmemory/core/reranker_worker.py +12 -3
package/src/superlocalmemory/retrieval/reranker.py +13 -19

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "superlocalmemory",
-  "version": "3.3.16",
+  "version": "3.3.17",
   "description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
   "keywords": [
     "ai-memory",

package/pyproject.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "superlocalmemory"
-version = "3.3.16"
+version = "3.3.17"
 description = "Information-geometric agent memory with mathematical guarantees"
 readme = "README.md"
 license = {text = "MIT"}

package/src/superlocalmemory/core/embedding_worker.py CHANGED Viewed

@@ -144,6 +144,16 @@ def _worker_main() -> None:
                 _respond({"ok": True, "vectors": result, "dim": dim})
             except Exception as exc:
                 _respond({"ok": False, "error": str(exc)})
+            # V3.3.16: RSS watchdog — self-terminate once peak RSS exceeds the 2500 MB threshold checked below.
+            # PyTorch on ARM64 Mac never returns memory to OS. After ~200 embeds
+            # a worker that started at 300MB grows to 17GB+. Parent auto-respawns
+            # a fresh worker on next request (existing mechanism in embeddings.py).
+            import resource
+            rss_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 / 1024
+            if rss_mb > 2500:
+                sys.exit(0)
             continue
         _respond({"ok": False, "error": f"Unknown command: {cmd}"})

package/src/superlocalmemory/core/embeddings.py CHANGED Viewed

@@ -207,11 +207,29 @@ class EmbeddingService:
                 return resp["vectors"]
             except (BrokenPipeError, OSError, json.JSONDecodeError) as exc:
                 logger.warning(
-                    "Embedding worker communication failed: %s. "
-                    "Run 'slm doctor' to check dependencies and Python version.",
+                    "Embedding worker communication failed: %s — respawning.",
                     exc,
                 )
                 self._kill_worker()
+                # V3.3.16: Auto-retry once after worker death (RSS watchdog
+                # or crash). Respawn + re-send instead of returning None.
+                try:
+                    self._ensure_worker()
+                    if self._worker_proc is not None:
+                        self._worker_proc.stdin.write(req)
+                        self._worker_proc.stdin.flush()
+                        resp_line = self._readline_with_timeout(
+                            self._worker_proc.stdout,
+                            _SUBPROCESS_RESPONSE_TIMEOUT,
+                        )
+                        if resp_line:
+                            resp = json.loads(resp_line)
+                            if resp.get("ok"):
+                                self._reset_idle_timer()
+                                self._request_count = 1
+                                return resp["vectors"]
+                except Exception:
+                    self._kill_worker()
                 return None
     @staticmethod

package/src/superlocalmemory/core/recall_pipeline.py CHANGED Viewed

@@ -263,16 +263,32 @@ def run_recall(
         for r in response.results:
             trust_scorer.update_on_access("fact", r.fact.fact_id, profile_id)
-    # V3.3.16: Access count update only — no redundant embedding call.
-    # Fisher Bayesian variance update moved to store_pipeline (write-time)
-    # to avoid per-recall memory pressure from numpy array creation.
-    # Previously: embedder.embed(query) here duplicated the embed call
-    # already done in retrieval engine, creating 768-dim numpy arrays
-    # 304 times during benchmark → pymalloc arena fragmentation → 25GB.
+    # Fisher Bayesian update on recall — narrows variance on accessed facts
+    # so they score higher on subsequent recalls (critical for benchmark: +24pp).
+    # V3.3.16: Reuse query embedding from retrieval engine cache instead of
+    # calling embedder.embed() again (which was the memory leak source).
+    q_var_arr = None
+    if embedder and hasattr(retrieval_engine, '_query_embedding_cache'):
+        cached_emb = retrieval_engine._query_embedding_cache.get(query)
+        if cached_emb is not None:
+            import numpy as _np
+            _, q_var_list = embedder.compute_fisher_params(cached_emb)
+            q_var_arr = _np.array(q_var_list, dtype=_np.float64)
     for r in response.results:
-        db.update_fact(r.fact.fact_id, {
+        updates: dict[str, object] = {
             "access_count": r.fact.access_count + 1,
-        })
+        }
+        if (q_var_arr is not None
+                and r.fact.fisher_variance
+                and len(r.fact.fisher_variance) == len(q_var_arr)
+                and r.fact.access_count >= 3):
+            import numpy as _np
+            f_var = _np.array(r.fact.fisher_variance, dtype=_np.float64)
+            new_var = 1.0 / (1.0 / _np.maximum(f_var, 0.05) + 1.0 / _np.maximum(q_var_arr, 0.05))
+            new_var = _np.clip(new_var, 0.05, 2.0)
+            updates["fisher_variance"] = new_var.tolist()
+        db.update_fact(r.fact.fact_id, updates)
     # Post-operation hooks (audit, trust signal, learning)
     hook_ctx["result_count"] = len(response.results)

package/src/superlocalmemory/core/recall_worker.py CHANGED Viewed

@@ -321,6 +321,13 @@ def _worker_main() -> None:
         except Exception as exc:
             _respond({"ok": False, "error": str(exc)})
+        # V3.3.16: RSS watchdog — self-terminate once peak RSS exceeds the 2500 MB threshold checked below.
+        # Parent auto-respawns a fresh worker on next request.
+        import resource
+        rss_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 / 1024
+        if rss_mb > 2500:
+            sys.exit(0)
 def _respond(data: dict) -> None:
     sys.stdout.write(json.dumps(data) + "\n")
@@ -328,4 +335,7 @@ def _respond(data: dict) -> None:
 if __name__ == "__main__":
-    _worker_main()
+    try:
+        _worker_main()
+    except KeyboardInterrupt:
+        sys.exit(0)

package/src/superlocalmemory/core/reranker_worker.py CHANGED Viewed

@@ -124,10 +124,12 @@ def _worker_main() -> None:
             warmup_ok = False
             if model is not None:
                 try:
+                    # Use 60 pairs (realistic batch size) to trigger CoreML
+                    # compilation for the actual workload. 3 pairs compiled a
+                    # different execution plan that got recompiled on 60 pairs.
                     dummy_pairs = [
-                        ("What is the capital of France?", "Paris is the capital of France."),
-                        ("Who wrote Hamlet?", "Shakespeare wrote many plays."),
-                        ("What color is the sky?", "The sky is blue on a clear day."),
+                        (f"What happened to person {i}?", f"Person {i} went to location {i} and did activity {i} last summer with friends.")
+                        for i in range(60)
                     ]
                     try:
                         import torch
@@ -174,6 +176,13 @@ def _worker_main() -> None:
                 })
             except Exception as exc:
                 _respond({"ok": False, "error": str(exc)})
+            # V3.3.16: RSS watchdog — same as embedding_worker
+            import resource
+            rss_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 / 1024
+            if rss_mb > 2500:
+                sys.exit(0)
             continue
         if cmd == "score":

package/src/superlocalmemory/retrieval/reranker.py CHANGED Viewed

@@ -94,8 +94,10 @@ class CrossEncoderReranker:
     def _start_background_warmup(self) -> None:
         """Start worker and load model in background thread.
-        Returns immediately. The worker loads the model in parallel
-        with the rest of engine initialization and the first recall.
+        V3.3.16: Uses _send_request (lock-protected) instead of raw
+        stdin/stdout access. Previous code wrote to stdin without the
+        lock, creating a race where the warmup's readline thread could
+        steal responses meant for _send_request → deadlock → timeout.
         """
         if self._worker_loading or self._model_loaded:
             return
@@ -106,26 +108,18 @@ class CrossEncoderReranker:
                 self._ensure_worker()
                 if self._worker_proc is None:
                     return
-                # Send load command and wait for response
-                req = json.dumps({
+                resp = self._send_request({
                     "cmd": "load",
                     "model_name": self._model_name,
                     "backend": self._backend,
-                }) + "\n"
-                self._worker_proc.stdin.write(req)
-                self._worker_proc.stdin.flush()
-                resp_line = self._readline_with_timeout(
-                    self._worker_proc.stdout, _SUBPROCESS_RESPONSE_TIMEOUT,
-                )
-                if resp_line:
-                    resp = json.loads(resp_line)
-                    if resp.get("ok"):
-                        self._model_loaded = True
-                        logger.info(
-                            "Reranker worker warm (backend=%s)",
-                            resp.get("backend", "?"),
-                        )
-                        self._reset_idle_timer()
+                }, timeout=_SUBPROCESS_RESPONSE_TIMEOUT)
+                if resp and resp.get("ok"):
+                    self._model_loaded = True
+                    logger.info(
+                        "Reranker worker warm (backend=%s, warmup_inference=%s)",
+                        resp.get("backend", "?"),
+                        resp.get("warmup_inference", False),
+                    )
             except Exception as exc:
                 logger.debug("Background reranker warmup failed: %s", exc)
             finally: