npm - superlocalmemory - Versions diffs - 3.4.36 → 3.4.37 - Mend

superlocalmemory 3.4.36 → 3.4.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

package/CHANGELOG.md +42 -0
package/package.json +1 -1
package/pyproject.toml +1 -1
package/src/superlocalmemory/__init__.py +1 -1
package/src/superlocalmemory/cli/commands.py +1 -0
package/src/superlocalmemory/core/embedding_worker.py +1 -1
package/src/superlocalmemory/core/embeddings.py +5 -8
package/src/superlocalmemory/core/health_monitor.py +2 -2
package/src/superlocalmemory/core/recall_worker.py +3 -1
package/src/superlocalmemory/retrieval/reranker.py +2 -1
package/src/superlocalmemory/server/unified_daemon.py +81 -10
package/src/superlocalmemory.egg-info/PKG-INFO +663 -0
package/src/superlocalmemory.egg-info/SOURCES.txt +451 -0
package/src/superlocalmemory.egg-info/dependency_links.txt +1 -0
package/src/superlocalmemory.egg-info/entry_points.txt +2 -0
package/src/superlocalmemory.egg-info/requires.txt +59 -0
package/src/superlocalmemory.egg-info/top_level.txt +1 -0

package/CHANGELOG.md CHANGED Viewed

@@ -10,6 +10,48 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ---
+## [3.4.37] - 2026-04-26
+**P0 RAM fix.** Total SLM footprint reduced from ~14 GB peak to ~2.3 GB peak
+(84% reduction). Idle dropped from ~2.5 GB to ~1.0 GB. Users with 16 GB
+laptops can now run SLM without uninstalling.
+### Fixed
+- **CoreML EP allocation** — Added `ORT_DISABLE_COREML=1` to
+  `recall_worker.py`, `cli/commands.py` (warmup diagnose path), and the
+  Popen environment dicts in `core/embeddings.py` and
+  `retrieval/reranker.py`. Previously only `embedding_worker.py` and
+  `reranker_worker.py` set this. On ARM64 Mac, ONNX Runtime's CoreML
+  Execution Provider allocated 3-5 GB per missing guard.
+- **Duplicate MemoryEngine** — The QueueConsumer (recall_queue.db drain)
+  was routing through `WorkerPool` → `recall_worker` subprocess, which
+  loaded a SECOND full MemoryEngine inside the daemon. Now routes through
+  the daemon's in-process engine via the new `EngineRecallAdapter`.
+  Eliminates ~800 MB of duplication.
+- **Eager warmup** — Removed `WorkerPool.shared().warmup()` from daemon
+  startup. The recall_worker subprocess no longer spawns at boot. It
+  remains available as a fallback for dashboard/chat routes.
+### Changed
+- **RSS limits tightened:**
+  - `embedding_worker` self-kill: 4000 MB → 1800 MB
+  - `recall_worker` self-kill: 2500 MB → 1500 MB
+  - Daemon watchdog `MAX_WORKER_MB`: 4096 MB → 1800 MB
+  - `HealthMonitor.global_rss_budget_mb`: 4096 MB → 2500 MB
+- **Watchdog interval:** daemon watchdog 60s → 15s; HealthMonitor
+  `check_interval_sec` 30s → 15s. Catches memory spikes faster.
+- **Idle timeouts:**
+  - `SLM_EMBED_IDLE_TIMEOUT`: 1800s (30 min) → 300s (5 min)
+  - `SLM_RERANKER_IDLE_TIMEOUT`: 1800s → 300s
+  - Reduces idle RAM held by ML model subprocesses.
+### Added
+- **`EngineRecallAdapter`** in `unified_daemon.py` — wraps the in-process
+  MemoryEngine to satisfy `RecallPoolProtocol` for the QueueConsumer.
+  Eliminates the recall_worker subprocess on the hot path.
+---
 ## [3.4.36] - 2026-04-25
 Persistent hook daemon: recall latency drops from ~2.2s to sub-second by

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "superlocalmemory",
-  "version": "3.4.36",
+  "version": "3.4.37",
   "description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
   "keywords": [
     "ai-memory",

package/pyproject.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "superlocalmemory"
-version = "3.4.36"
+version = "3.4.37"
 description = "Information-geometric agent memory with mathematical guarantees"
 readme = "README.md"
 license = {text = "AGPL-3.0-or-later"}

package/src/superlocalmemory/__init__.py CHANGED Viewed

@@ -1,3 +1,3 @@
 """SuperLocalMemory — information-geometric agent memory."""
-__version__ = "3.4.36"
+__version__ = "3.4.37"

package/src/superlocalmemory/cli/commands.py CHANGED Viewed

@@ -1710,6 +1710,7 @@ def _warmup_diagnose() -> None:
     """Diagnostic helper when warmup fails."""
     print("\nDiagnosing...")
     print(f"  Python executable: {sys.executable}")
+    os.environ["ORT_DISABLE_COREML"] = "1"
     try:
         from sentence_transformers import SentenceTransformer
         print("  sentence-transformers: importable")

package/src/superlocalmemory/core/embedding_worker.py CHANGED Viewed

@@ -151,7 +151,7 @@ def _worker_main() -> None:
                 _respond({"ok": False, "error": str(exc)})
             # V3.3.16: RSS watchdog — V3.4.24: cross-platform via platform_utils.
-            _rss_limit = int(os.environ.get("SLM_EMBED_WORKER_RSS_LIMIT_MB", 4000))
+            _rss_limit = int(os.environ.get("SLM_EMBED_WORKER_RSS_LIMIT_MB", 1800))
             rss_mb = get_rss_mb()
             if rss_mb > 0 and rss_mb > _rss_limit:
                 sys.exit(0)

package/src/superlocalmemory/core/embeddings.py CHANGED Viewed

@@ -140,14 +140,10 @@ def release_embedding_lock() -> None:
         _embedding_lock_fd = None
-_IDLE_TIMEOUT_SECONDS = 1800  # 30 minutes — keep model warm across bursty use.
-# V3.3.12: Configurable via SLM_EMBED_IDLE_TIMEOUT env var (seconds).
-# V3.4.19: Bumped from 120 → 1800 to eliminate the 30-60s cold-start pain
-# when the embedding worker was killed too aggressively. Safety: the
-# per-embed RSS self-check (SLM_EMBED_WORKER_RSS_LIMIT_MB, 4GB default) and
-# the daemon memory watchdog (unified_daemon.py, 4GB/60s) still cap any
-# runaway. To restore the old aggressive policy without redeploying, set
-# ``SLM_EMBED_IDLE_TIMEOUT=120`` and ``slm restart``.
+_IDLE_TIMEOUT_SECONDS = 300  # 5 minutes — balance cold-start vs RAM.
+# V3.4.37: Reduced from 1800 → 300. Holding 1.1 GB for 30 min idle
+# wastes RAM on laptops. 5 min covers bursty session_init+recall
+# patterns while freeing memory between sessions.
 _IDLE_TIMEOUT_SECONDS = int(os.environ.get("SLM_EMBED_IDLE_TIMEOUT", _IDLE_TIMEOUT_SECONDS))
 # V3.3.21: Configurable response timeout — 180s default, but batch ingestion
 # (2-turn chunks across 10 conversations) needs 600s+ to survive cold-start
@@ -476,6 +472,7 @@ class EmbeddingService:
                 "PYTORCH_ENABLE_MPS_FALLBACK": "1",
                 "TOKENIZERS_PARALLELISM": "false",
                 "TORCH_DEVICE": "cpu",
+                "ORT_DISABLE_COREML": "1",
             }
             from superlocalmemory.core.platform_utils import popen_platform_kwargs
             self._worker_proc = subprocess.Popen(

package/src/superlocalmemory/core/health_monitor.py CHANGED Viewed

@@ -133,9 +133,9 @@ class HealthMonitor:
     def __init__(
         self,
-        global_rss_budget_mb: int = 4096,
+        global_rss_budget_mb: int = 2500,
         heartbeat_timeout_sec: int = 60,
-        check_interval_sec: int = 30,
+        check_interval_sec: int = 15,
         enable_structured_logging: bool = True,
     ):
         self._budget_mb = global_rss_budget_mb

package/src/superlocalmemory/core/recall_worker.py CHANGED Viewed

@@ -28,6 +28,8 @@ os.environ["PYTORCH_MPS_MEM_LIMIT"] = "0"
 os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 os.environ["TORCH_DEVICE"] = "cpu"
+# V3.4.37: Disable CoreML EP — uses 3-5GB on ARM64 Mac.
+os.environ["ORT_DISABLE_COREML"] = "1"
 # SIGTERM bridge: Docker/systemd send SIGTERM to stop processes.
 # Without this, the worker ignores SIGTERM and becomes a zombie.
@@ -324,7 +326,7 @@ def _worker_main() -> None:
         # V3.3.16: RSS watchdog — V3.4.24: cross-platform via platform_utils.
         rss_mb = get_rss_mb()
-        if rss_mb > 0 and rss_mb > 2500:
+        if rss_mb > 0 and rss_mb > 1500:
             sys.exit(0)

package/src/superlocalmemory/retrieval/reranker.py CHANGED Viewed

@@ -51,7 +51,7 @@ _live_rerankers: set[weakref.ref] = set()
 logger = logging.getLogger(__name__)
-_IDLE_TIMEOUT_SECONDS = 1800  # 30 min — keep cross-encoder warm for active sessions.
+_IDLE_TIMEOUT_SECONDS = 300  # V3.4.37: 5 min (was 30) — balance cold-start vs RAM.
 # V3.3.12: Configurable via SLM_RERANKER_IDLE_TIMEOUT env var.
 # V3.4.19: Bumped from 120 → 1800 in lock-step with the embedding worker.
 # Set ``SLM_RERANKER_IDLE_TIMEOUT=120`` + ``slm restart`` to revert.
@@ -192,6 +192,7 @@ class CrossEncoderReranker:
                 "PYTORCH_ENABLE_MPS_FALLBACK": "1",
                 "TOKENIZERS_PARALLELISM": "false",
                 "TORCH_DEVICE": "cpu",
+                "ORT_DISABLE_COREML": "1",
             }
             from superlocalmemory.core.platform_utils import popen_platform_kwargs
             self._worker_proc = subprocess.Popen(

package/src/superlocalmemory/server/unified_daemon.py CHANGED Viewed

@@ -66,6 +66,75 @@ class ObserveRequest(BaseModel):
     content: str
+# ---------------------------------------------------------------------------
+# V3.4.37: Engine recall adapter — routes QueueConsumer through the daemon's
+# in-process MemoryEngine instead of spawning a recall_worker subprocess.
+# Saves ~800 MB by eliminating the duplicate engine.
+# ---------------------------------------------------------------------------
+class EngineRecallAdapter:
+    """Adapts MemoryEngine.recall() to RecallPoolProtocol for QueueConsumer.
+    The daemon already has a full MemoryEngine in-process. The QueueConsumer
+    previously routed through WorkerPool → recall_worker subprocess, which
+    loaded a SECOND MemoryEngine. This adapter eliminates that duplication.
+    """
+    def __init__(self, engine) -> None:
+        self._engine = engine
+    def recall(self, query: str, limit: int = 10, session_id: str = "") -> dict:
+        response = self._engine.recall(
+            query, limit=limit, session_id=session_id or None,
+        )
+        memory_ids = list({
+            r.fact.memory_id for r in response.results[:limit]
+            if r.fact.memory_id
+        })
+        memory_map = (
+            self._engine._db.get_memory_content_batch(memory_ids)
+            if memory_ids else {}
+        )
+        results = []
+        for r in response.results[:limit]:
+            fact_type = getattr(r.fact, "fact_type", None)
+            lifecycle = getattr(r.fact, "lifecycle", None)
+            results.append({
+                "fact_id": r.fact.fact_id,
+                "memory_id": r.fact.memory_id,
+                "content": r.fact.content[:300],
+                "source_content": memory_map.get(r.fact.memory_id, ""),
+                "score": round(r.score, 4),
+                "confidence": round(r.confidence, 4),
+                "trust_score": round(r.trust_score, 4),
+                "channel_scores": {
+                    k: round(v, 4)
+                    for k, v in (r.channel_scores or {}).items()
+                },
+                "fact_type": fact_type.value
+                    if fact_type and hasattr(fact_type, "value") else "",
+                "lifecycle": lifecycle.value
+                    if lifecycle and hasattr(lifecycle, "value") else "",
+                "access_count": getattr(r.fact, "access_count", 0),
+                "evidence_chain": list(
+                    getattr(r, "evidence_chain", []) or []
+                ),
+            })
+        return {
+            "ok": True,
+            "query": query,
+            "query_type": response.query_type,
+            "result_count": len(results),
+            "retrieval_time_ms": round(response.retrieval_time_ms, 1),
+            "channel_weights": {
+                k: round(v, 3)
+                for k, v in (response.channel_weights or {}).items()
+            },
+            "total_candidates": getattr(response, "total_candidates", 0),
+            "results": results,
+        }
 # ---------------------------------------------------------------------------
 # v3.4.32: Recall-priority gate for the pending materializer.
 # All /remember writes go to pending.db and return fast; a background
@@ -397,9 +466,10 @@ async def lifespan(application: FastAPI):
         # Set up observe buffer
         _observe_buffer.set_engine(engine)
-        # Pre-warm workers (background)
-        from superlocalmemory.core.worker_pool import WorkerPool
-        WorkerPool.shared().warmup()
+        # V3.4.37: Removed WorkerPool.warmup() — the recall_worker subprocess
+        # duplicated the daemon's MemoryEngine (800+ MB). QueueConsumer now
+        # uses the daemon's engine directly via EngineRecallAdapter.
+        # WorkerPool is still available as fallback for dashboard/chat routes.
         # Force reranker warmup
         retrieval_eng = getattr(engine, '_retrieval_engine', None)
@@ -422,8 +492,9 @@ async def lifespan(application: FastAPI):
                 logger.warning("Embedding warmup failed: %s", exc)
         threading.Thread(target=_warmup_embedder, daemon=True, name="embed-warmup").start()
-        # v3.4.26: Start QueueConsumer — drains recall_queue.db via pool.recall().
-        # Must start AFTER WorkerPool.warmup() so the worker is ready.
+        # v3.4.37: QueueConsumer uses daemon's engine directly via adapter.
+        # Previously routed through WorkerPool → recall_worker subprocess,
+        # which loaded a duplicate MemoryEngine (~800 MB waste).
         try:
             from pathlib import Path as _QP
             from superlocalmemory.core.queue_consumer import QueueConsumer
@@ -432,7 +503,7 @@ async def lifespan(application: FastAPI):
             _recall_queue = RecallQueue(_queue_db)
             _queue_consumer = QueueConsumer(
                 queue=_recall_queue,
-                pool=WorkerPool.shared(),
+                pool=EngineRecallAdapter(engine),
             )
             _queue_consumer.start()
             application.state.queue_consumer = _queue_consumer
@@ -466,9 +537,9 @@ async def lifespan(application: FastAPI):
         from superlocalmemory.core.health_monitor import HealthMonitor
         health_config = getattr(config, 'health', None)
         monitor = HealthMonitor(
-            global_rss_budget_mb=getattr(health_config, 'global_rss_budget_mb', 4096) if health_config else 4096,
+            global_rss_budget_mb=getattr(health_config, 'global_rss_budget_mb', 2500) if health_config else 2500,
             heartbeat_timeout_sec=getattr(health_config, 'heartbeat_timeout_sec', 60) if health_config else 60,
-            check_interval_sec=getattr(health_config, 'health_check_interval_sec', 30) if health_config else 30,
+            check_interval_sec=getattr(health_config, 'health_check_interval_sec', 15) if health_config else 15,
             enable_structured_logging=getattr(health_config, 'enable_structured_logging', True) if health_config else True,
         )
         monitor.start()
@@ -1259,11 +1330,11 @@ def _start_memory_watchdog() -> None:
     """
     import threading
-    MAX_WORKER_MB = 4096  # 4GB per worker — ONNX full model is 1.6GB + overhead
+    MAX_WORKER_MB = 1800  # V3.4.37: 1.8GB — ONNX nomic-embed is ~1.7GB loaded
     def watchdog_loop():
         while True:
-            time.sleep(60)
+            time.sleep(15)  # V3.4.37: 15s (was 60s) — catch spikes faster
             try:
                 import psutil
                 parent = psutil.Process(os.getpid())