npm - superlocalmemory - Versions diffs - 3.3.17 → 3.3.18 - Mend

superlocalmemory 3.3.17 → 3.3.18

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (5)

package/package.json +1 -1
package/pyproject.toml +1 -1
package/src/superlocalmemory/core/config.py +1 -1
package/src/superlocalmemory/core/embedding_worker.py +40 -21
package/src/superlocalmemory/core/reranker_worker.py +3 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "superlocalmemory",
-  "version": "3.3.17",
+  "version": "3.3.18",
   "description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
   "keywords": [
     "ai-memory",

package/pyproject.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "superlocalmemory"
-version = "3.3.17"
+version = "3.3.18"
 description = "Information-geometric agent memory with mathematical guarantees"
 readme = "README.md"
 license = {text = "MIT"}

package/src/superlocalmemory/core/config.py CHANGED Viewed

@@ -155,7 +155,7 @@ class RetrievalConfig:
     # Reranking (V3.3.2: ONNX backend enabled for all modes)
     use_cross_encoder: bool = True
     cross_encoder_model: str = "cross-encoder/ms-marco-MiniLM-L-12-v2"
-    cross_encoder_backend: str = "onnx"  # "onnx" (~200MB) or "" (PyTorch, ~1.5GB)
+    cross_encoder_backend: str = ""  # "" = PyTorch (~500MB stable), "onnx" = ONNX (leaks on ARM64 CoreML)
     # Agentic (Mode C only)
     agentic_max_rounds: int = 3

package/src/superlocalmemory/core/embedding_worker.py CHANGED Viewed

@@ -35,6 +35,8 @@ os.environ["PYTORCH_MPS_MEM_LIMIT"] = "0"
 os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 os.environ["TORCH_DEVICE"] = "cpu"
+# V3.3.17: Disable CoreML EP for ONNX Runtime — uses 3-5GB on ARM64 Mac.
+os.environ["ORT_DISABLE_COREML"] = "1"
 # SIGTERM bridge: Docker/systemd send SIGTERM to stop processes.
 # Without this, the worker ignores SIGTERM and becomes a zombie.
@@ -65,6 +67,34 @@ def _start_parent_watchdog() -> None:
     t.start()
+def _load_embedding_model(name: str) -> tuple:
+    """Load embedding model. ONNX first (no memory leak), PyTorch fallback.
+    V3.3.17: PyTorch SentenceTransformer on ARM64 Mac leaks memory —
+    grows from 300MB to 17GB after ~200 encode calls. ONNX Runtime
+    has no such issue. Same approach as CrossEncoder ONNX migration.
+    Returns (model, backend_name) or (None, "").
+    """
+    from sentence_transformers import SentenceTransformer
+    # Tier 1: ONNX (stable memory, ~200MB footprint)
+    try:
+        m = SentenceTransformer(name, backend="onnx", trust_remote_code=True)
+        return m, "onnx"
+    except Exception:
+        pass
+    # Tier 2: PyTorch CPU (stable at ~1.4GB after 100+ calls, verified)
+    try:
+        import torch
+        with torch.inference_mode():
+            m = SentenceTransformer(name, trust_remote_code=True, device="cpu")
+        return m, "pytorch"
+    except Exception:
+        return None, ""
 def _worker_main() -> None:
     """Main loop: read JSON requests from stdin, write responses to stdout."""
     _start_parent_watchdog()  # V3.3.7: self-terminate if parent dies
@@ -97,18 +127,17 @@ def _worker_main() -> None:
         if cmd == "load":
             name = req.get("model_name", "nomic-ai/nomic-embed-text-v1.5")
             expected_dim = req.get("dimension", 768)
-            try:
-                from sentence_transformers import SentenceTransformer
-                model = SentenceTransformer(name, trust_remote_code=True, device="cpu")
+            model, active_backend = _load_embedding_model(name)
+            if model is not None:
                 dim = model.get_sentence_embedding_dimension()
                 if dim != expected_dim:
                     _respond({"ok": False, "error": f"Dimension mismatch: {dim} != {expected_dim}"})
                     model = None
                     continue
                 model_name = name
-                _respond({"ok": True, "dim": dim, "model": name})
-            except Exception as exc:
-                _respond({"ok": False, "error": str(exc)})
+                _respond({"ok": True, "dim": dim, "model": name, "backend": active_backend})
+            else:
+                _respond({"ok": False, "error": "Model load failed"})
             continue
         if cmd == "embed":
@@ -117,26 +146,16 @@ def _worker_main() -> None:
                 _respond({"ok": False, "error": "No texts provided"})
                 continue
             if model is None:
-                # Auto-load if not yet loaded
                 name = req.get("model_name", "nomic-ai/nomic-embed-text-v1.5")
-                expected_dim = req.get("dimension", 768)
-                try:
-                    from sentence_transformers import SentenceTransformer
-                    model = SentenceTransformer(name, trust_remote_code=True, device="cpu")
+                model, active_backend = _load_embedding_model(name)
+                if model is not None:
                     dim = model.get_sentence_embedding_dimension()
                     model_name = name
-                except Exception as exc:
-                    _respond({"ok": False, "error": f"Model load failed: {exc}"})
+                else:
+                    _respond({"ok": False, "error": "Model load failed"})
                     continue
             try:
-                # torch.inference_mode prevents autograd graph accumulation
-                # which causes silent memory leaks over long-running sessions.
-                try:
-                    import torch
-                    with torch.inference_mode():
-                        vecs = model.encode(texts, normalize_embeddings=True)
-                except ImportError:
-                    vecs = model.encode(texts, normalize_embeddings=True)
+                vecs = model.encode(texts, normalize_embeddings=True)
                 if isinstance(vecs, np.ndarray) and vecs.ndim == 2:
                     result = [vecs[i].tolist() for i in range(vecs.shape[0])]
                 else:

package/src/superlocalmemory/core/reranker_worker.py CHANGED Viewed

@@ -40,6 +40,9 @@ os.environ["PYTORCH_MPS_MEM_LIMIT"] = "0"
 os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 os.environ["TORCH_DEVICE"] = "cpu"
+# V3.3.17: Disable CoreML EP for ONNX Runtime. CoreML compiles execution
+# plans that consume 3-5GB on ARM64 Mac. CPU EP is ~500MB and fast enough.
+os.environ["ORT_DISABLE_COREML"] = "1"
 # SIGTERM bridge for Docker/systemd
 if sys.platform != "win32":