npm - superlocalmemory - Versions diffs - 3.0.34 → 3.0.36 - Mend

superlocalmemory 3.0.34 → 3.0.36

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (9)

package/package.json +1 -1
package/pyproject.toml +1 -1
package/src/superlocalmemory/core/config.py +2 -2
package/src/superlocalmemory/core/recall_worker.py +14 -0
package/src/superlocalmemory/core/summarizer.py +42 -5
package/src/superlocalmemory/core/worker_pool.py +32 -2
package/src/superlocalmemory/llm/backbone.py +25 -2
package/src/superlocalmemory/server/routes/helpers.py +26 -20
package/src/superlocalmemory/server/ui.py +15 -4

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "superlocalmemory",
-  "version": "3.0.34",
+  "version": "3.0.36",
   "description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
   "keywords": [
     "ai-memory",

package/pyproject.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "superlocalmemory"
-version = "3.0.34"
+version = "3.0.36"
 description = "Information-geometric agent memory with mathematical guarantees"
 readme = "README.md"
 license = {text = "MIT"}

package/src/superlocalmemory/core/config.py CHANGED Viewed

@@ -348,7 +348,7 @@ class SLMConfig:
                 ),
                 llm=LLMConfig(),  # No LLM
                 retrieval=RetrievalConfig(
-                    use_cross_encoder=False,  # Disabled: 30s PyTorch cold start kills UX
+                    use_cross_encoder=True,
                 ),
                 math=MathConfig(
                     sheaf_contradiction_threshold=0.45,  # 768d threshold
@@ -370,7 +370,7 @@ class SLMConfig:
                     api_base=llm_api_base or "http://localhost:11434",
                     api_key=llm_api_key or "",
                 ),
-                retrieval=RetrievalConfig(use_cross_encoder=False),
+                retrieval=RetrievalConfig(use_cross_encoder=True),
             )
         # Mode C — FULL POWER, UNRESTRICTED

package/src/superlocalmemory/core/recall_worker.py CHANGED Viewed

@@ -222,6 +222,20 @@ def _worker_main() -> None:
             _respond({"ok": True})
             continue
+        if cmd == "warmup":
+            # Pre-load engine + all models (embedding, reranker, BM25, LLM)
+            # Called at dashboard/MCP startup so first real request is fast.
+            # A dummy recall triggers lazy-loaded components (cross-encoder, BM25 index).
+            try:
+                engine = _get_engine()
+                fact_count = engine._db.get_fact_count(engine._profile_id) if engine._db else 0
+                if fact_count > 0:
+                    engine.recall("warmup", limit=1)
+                _respond({"ok": True, "message": "Engine warm", "facts": fact_count})
+            except Exception as exc:
+                _respond({"ok": False, "error": f"Warmup failed: {exc}"})
+            continue
         try:
             if cmd == "recall":
                 result = _handle_recall(req.get("query", ""), req.get("limit", 10))

package/src/superlocalmemory/core/summarizer.py CHANGED Viewed

@@ -94,9 +94,14 @@ class Summarizer:
     # ------------------------------------------------------------------
     def _has_llm(self) -> bool:
-        """Check if LLM is available."""
+        """Check if LLM is available (AND warm for Ollama).
+        For Mode B (Ollama): only returns True if the model is already
+        loaded in memory. NEVER triggers a cold model load — that would
+        spike 5+ GB of RAM on every recall, unacceptable on ≤32 GB machines.
+        """
         if self._mode == "b":
-            return True  # Ollama assumed running
+            return self._is_ollama_model_warm()
         if self._mode == "c":
             return bool(
                 os.environ.get("OPENROUTER_API_KEY")
@@ -104,6 +109,27 @@ class Summarizer:
             )
         return False
+    def _is_ollama_model_warm(self) -> bool:
+        """Check if the LLM model is already loaded in Ollama memory.
+        Queries Ollama /api/ps. Returns True only if our model is loaded,
+        preventing cold-load memory spikes during recall.
+        """
+        try:
+            import httpx
+            model = getattr(self._config.llm, 'model', None) or "llama3.1:8b"
+            model_base = model.split(":")[0]
+            with httpx.Client(timeout=httpx.Timeout(2.0)) as client:
+                resp = client.get("http://localhost:11434/api/ps")
+                if resp.status_code != 200:
+                    return False
+                for m in resp.json().get("models", []):
+                    if model_base in m.get("name", ""):
+                        return True
+            return False
+        except Exception:
+            return False
     def _call_llm(self, prompt: str, max_tokens: int = 200) -> str:
         """Route to Ollama (B) or OpenRouter (C)."""
         if self._mode == "b":
@@ -111,15 +137,26 @@ class Summarizer:
         return self._call_openrouter(prompt, max_tokens)
     def _call_ollama(self, prompt: str, max_tokens: int = 200) -> str:
-        """Call local Ollama for summary generation."""
+        """Call local Ollama for summary generation.
+        CRITICAL: num_ctx MUST be set. Without it, Ollama defaults to the
+        model's native context (128K for llama3.1) which allocates ~30 GB
+        of KV cache — fatal on machines with ≤32 GB RAM.
+        SLM prompts are <500 tokens; 4096 context is more than enough.
+        """
         import httpx
         model = getattr(self._config.llm, 'model', None) or "llama3.1:8b"
-        with httpx.Client(timeout=httpx.Timeout(20.0)) as client:
+        with httpx.Client(timeout=httpx.Timeout(30.0)) as client:
             resp = client.post("http://localhost:11434/api/generate", json={
                 "model": model,
                 "prompt": prompt,
                 "stream": False,
-                "options": {"num_predict": max_tokens, "temperature": 0.3},
+                "keep_alive": "30s",
+                "options": {
+                    "num_predict": max_tokens,
+                    "temperature": 0.3,
+                    "num_ctx": 4096,
+                },
             })
             resp.raise_for_status()
             return resp.json().get("response", "").strip()

package/src/superlocalmemory/core/worker_pool.py CHANGED Viewed

@@ -28,8 +28,9 @@ import time
 logger = logging.getLogger(__name__)
-_IDLE_TIMEOUT = 120  # 2 min — kill worker after idle
+_IDLE_TIMEOUT = 120   # 2 min — kill worker after idle
 _REQUEST_TIMEOUT = 60  # 60 sec max per request
+_WARMUP_TIMEOUT = 120  # 2 min — first cold start loads PyTorch + models
 class WorkerPool:
@@ -102,6 +103,31 @@ class WorkerPool:
         with self._lock:
             self._kill()
+    def warmup(self) -> None:
+        """Pre-spawn and warm up the worker in a background thread.
+        Spawns the recall_worker subprocess so that PyTorch, models, and
+        the engine are all loaded BEFORE the first user request. This
+        amortizes the 30s cold-start at dashboard/MCP startup time.
+        Call from startup events — non-blocking, runs in background.
+        """
+        def _do_warmup() -> None:
+            logger.info("Worker warmup starting (background)...")
+            try:
+                result = self._send_with_timeout(
+                    {"cmd": "warmup"}, timeout=_WARMUP_TIMEOUT,
+                )
+                if result.get("ok"):
+                    logger.info("Worker warmup complete (engine + models ready)")
+                else:
+                    logger.warning("Worker warmup returned: %s", result)
+            except Exception as exc:
+                logger.warning("Worker warmup failed: %s", exc)
+        t = threading.Thread(target=_do_warmup, daemon=True, name="worker-warmup")
+        t.start()
     @property
     def worker_pid(self) -> int | None:
         """PID of the worker process, or None if not running."""
@@ -115,6 +141,10 @@ class WorkerPool:
     def _send(self, request: dict) -> dict:
         """Send request to worker and get response. Thread-safe."""
+        return self._send_with_timeout(request, timeout=_REQUEST_TIMEOUT)
+    def _send_with_timeout(self, request: dict, timeout: float) -> dict:
+        """Send request with configurable timeout. Thread-safe."""
         with self._lock:
             self._ensure_worker()
             if self._proc is None:
@@ -129,7 +159,7 @@ class WorkerPool:
                 import selectors
                 sel = selectors.DefaultSelector()
                 sel.register(self._proc.stdout, selectors.EVENT_READ)
-                ready = sel.select(timeout=_REQUEST_TIMEOUT)
+                ready = sel.select(timeout=timeout)
                 sel.close()
                 if not ready:

package/src/superlocalmemory/llm/backbone.py CHANGED Viewed

@@ -127,13 +127,34 @@ class LLMBackbone:
     # -- Properties ---------------------------------------------------------
     def is_available(self) -> bool:
-        """True when the provider is ready for requests."""
+        """True when the provider is ready for requests.
+        For Ollama: only returns True if the model is already loaded in
+        memory. Prevents cold-load memory spikes (5+ GB) during recall.
+        """
         if not self._provider:
             return False
         if self._provider == "ollama":
-            return True
+            return self._is_ollama_model_warm()
         return bool(self._api_key)
+    def _is_ollama_model_warm(self) -> bool:
+        """Check if the LLM model is already loaded in Ollama."""
+        try:
+            model_base = self._model.split(":")[0]
+            resp = httpx.get(
+                f"{_OLLAMA_DEFAULT_BASE}/api/ps",
+                timeout=httpx.Timeout(2.0),
+            )
+            if resp.status_code != 200:
+                return False
+            for m in resp.json().get("models", []):
+                if model_base in m.get("name", ""):
+                    return True
+            return False
+        except Exception:
+            return False
     @property
     def provider(self) -> str:
         return self._provider
@@ -250,6 +271,8 @@ class LLMBackbone:
             "messages": messages,
             "max_tokens": max_tokens,
             "temperature": temperature,
+            "keep_alive": "30s",
+            "options": {"num_ctx": 4096},
         }
         return self._base_url, headers, payload

package/src/superlocalmemory/server/routes/helpers.py CHANGED Viewed

@@ -22,26 +22,32 @@ from pydantic import BaseModel, Field
 # ---------------------------------------------------------------------------
 def _get_version() -> str:
-    """Read version from package.json / pyproject.toml / importlib."""
-    try:
-        import json as _json
-        pkg_root = Path(__file__).resolve().parent.parent.parent.parent
-        pkg_json = pkg_root / "package.json"
-        if pkg_json.exists():
-            with open(pkg_json) as f:
-                v = _json.load(f).get("version", "")
-                if v:
-                    return v
-    except Exception:
-        pass
-    try:
-        import tomllib
-        toml_path = Path(__file__).resolve().parent.parent.parent.parent / "pyproject.toml"
-        if toml_path.exists():
-            with open(toml_path, "rb") as f:
-                return tomllib.load(f)["project"]["version"]
-    except Exception:
-        pass
+    """Read version from package.json / pyproject.toml / importlib.
+    Walks up from this file to find the project root. In the src layout
+    (running from source tree), package.json is 5 parents up; for an
+    installed package it won't exist, so we fall through to importlib.
+    """
+    here = Path(__file__).resolve()
+    for depth in (5, 4):
+        try:
+            import json as _json
+            root = here
+            for _ in range(depth):
+                root = root.parent
+            pkg_json = root / "package.json"
+            if pkg_json.exists():
+                with open(pkg_json) as f:
+                    v = _json.load(f).get("version", "")
+                    if v:
+                        return v
+            toml_path = root / "pyproject.toml"
+            if toml_path.exists():
+                import tomllib
+                with open(toml_path, "rb") as f:
+                    return tomllib.load(f)["project"]["version"]
+        except Exception:
+            continue
     try:
         from importlib.metadata import version
         return version("superlocalmemory")

package/src/superlocalmemory/server/ui.py CHANGED Viewed

@@ -199,14 +199,25 @@ def create_app() -> FastAPI:
     @application.on_event("startup")
     async def startup_event():
-        """Initialize event bus. Engine runs in subprocess worker (never in this process)."""
-        # Engine is NEVER loaded in the dashboard process.
-        # All recall/search operations go through WorkerPool subprocess.
-        # This keeps the dashboard permanently at ~60 MB.
+        """Initialize event bus and warm up worker subprocess.
+        Engine runs in subprocess worker (never in this process).
+        Background warmup pre-loads PyTorch + models so first recall is fast.
+        """
         application.state.engine = None
         logger.info("Dashboard started (~60 MB, engine runs in subprocess worker)")
         register_event_listener()
+        # Background warmup: pre-spawn worker and load all models.
+        # This runs in a daemon thread — dashboard is responsive immediately.
+        # Worker will be ready by the time user does first search (~10-30s).
+        try:
+            from superlocalmemory.core.worker_pool import WorkerPool
+            WorkerPool.shared().warmup()
+            logger.info("Worker warmup initiated (background)")
+        except Exception as exc:
+            logger.warning("Worker warmup failed to start: %s", exc)
     @application.on_event("shutdown")
     async def shutdown_event():
         """Kill worker subprocess on dashboard shutdown."""