npm - superlocalmemory - Versions diffs - 3.0.35 → 3.0.36 - Mend

superlocalmemory 3.0.35 → 3.0.36

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/package.json +1 -1
package/pyproject.toml +1 -1
package/src/superlocalmemory/core/summarizer.py +42 -5
package/src/superlocalmemory/llm/backbone.py +25 -2
package/src/superlocalmemory/server/routes/helpers.py +26 -20

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "superlocalmemory",
-  "version": "3.0.35",
+  "version": "3.0.36",
   "description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
   "keywords": [
     "ai-memory",

package/pyproject.toml CHANGED

@@ -1,6 +1,6 @@
 [project]
 name = "superlocalmemory"
-version = "3.0.35"
+version = "3.0.36"
 description = "Information-geometric agent memory with mathematical guarantees"
 readme = "README.md"
 license = {text = "MIT"}

package/src/superlocalmemory/core/summarizer.py CHANGED

@@ -94,9 +94,14 @@ class Summarizer:
     # ------------------------------------------------------------------
     def _has_llm(self) -> bool:
-        """Check if LLM is available."""
+        """Check if LLM is available (AND warm for Ollama).
+        For Mode B (Ollama): only returns True if the model is already
+        loaded in memory. NEVER triggers a cold model load — that would
+        spike 5+ GB of RAM on every recall, unacceptable on ≤32 GB machines.
+        """
         if self._mode == "b":
-            return True  # Ollama assumed running
+            return self._is_ollama_model_warm()
         if self._mode == "c":
             return bool(
                 os.environ.get("OPENROUTER_API_KEY")
@@ -104,6 +109,27 @@ class Summarizer:
             )
         return False
+    def _is_ollama_model_warm(self) -> bool:
+        """Check if the LLM model is already loaded in Ollama memory.
+        Queries Ollama /api/ps. Returns True only if our model is loaded,
+        preventing cold-load memory spikes during recall.
+        """
+        try:
+            import httpx
+            model = getattr(self._config.llm, 'model', None) or "llama3.1:8b"
+            model_base = model.split(":")[0]
+            with httpx.Client(timeout=httpx.Timeout(2.0)) as client:
+                resp = client.get("http://localhost:11434/api/ps")
+                if resp.status_code != 200:
+                    return False
+                for m in resp.json().get("models", []):
+                    if model_base in m.get("name", ""):
+                        return True
+            return False
+        except Exception:
+            return False
     def _call_llm(self, prompt: str, max_tokens: int = 200) -> str:
         """Route to Ollama (B) or OpenRouter (C)."""
         if self._mode == "b":
@@ -111,15 +137,26 @@ class Summarizer:
         return self._call_openrouter(prompt, max_tokens)
     def _call_ollama(self, prompt: str, max_tokens: int = 200) -> str:
-        """Call local Ollama for summary generation."""
+        """Call local Ollama for summary generation.
+        CRITICAL: num_ctx MUST be set. Without it, Ollama defaults to the
+        model's native context (128K for llama3.1) which allocates ~30 GB
+        of KV cache — fatal on machines with ≤32 GB RAM.
+        SLM prompts are <500 tokens; 4096 context is more than enough.
+        """
         import httpx
         model = getattr(self._config.llm, 'model', None) or "llama3.1:8b"
-        with httpx.Client(timeout=httpx.Timeout(20.0)) as client:
+        with httpx.Client(timeout=httpx.Timeout(30.0)) as client:
             resp = client.post("http://localhost:11434/api/generate", json={
                 "model": model,
                 "prompt": prompt,
                 "stream": False,
-                "options": {"num_predict": max_tokens, "temperature": 0.3},
+                "keep_alive": "30s",
+                "options": {
+                    "num_predict": max_tokens,
+                    "temperature": 0.3,
+                    "num_ctx": 4096,
+                },
             })
             resp.raise_for_status()
             return resp.json().get("response", "").strip()

package/src/superlocalmemory/llm/backbone.py CHANGED

@@ -127,13 +127,34 @@ class LLMBackbone:
     # -- Properties ---------------------------------------------------------
     def is_available(self) -> bool:
-        """True when the provider is ready for requests."""
+        """True when the provider is ready for requests.
+        For Ollama: only returns True if the model is already loaded in
+        memory. Prevents cold-load memory spikes (5+ GB) during recall.
+        """
         if not self._provider:
             return False
         if self._provider == "ollama":
-            return True
+            return self._is_ollama_model_warm()
         return bool(self._api_key)
+    def _is_ollama_model_warm(self) -> bool:
+        """Check if the LLM model is already loaded in Ollama."""
+        try:
+            model_base = self._model.split(":")[0]
+            resp = httpx.get(
+                f"{_OLLAMA_DEFAULT_BASE}/api/ps",
+                timeout=httpx.Timeout(2.0),
+            )
+            if resp.status_code != 200:
+                return False
+            for m in resp.json().get("models", []):
+                if model_base in m.get("name", ""):
+                    return True
+            return False
+        except Exception:
+            return False
     @property
     def provider(self) -> str:
         return self._provider
@@ -250,6 +271,8 @@ class LLMBackbone:
             "messages": messages,
             "max_tokens": max_tokens,
             "temperature": temperature,
+            "keep_alive": "30s",
+            "options": {"num_ctx": 4096},
         }
         return self._base_url, headers, payload

package/src/superlocalmemory/server/routes/helpers.py CHANGED

@@ -22,26 +22,32 @@ from pydantic import BaseModel, Field
 # ---------------------------------------------------------------------------
 def _get_version() -> str:
-    """Read version from package.json / pyproject.toml / importlib."""
-    try:
-        import json as _json
-        pkg_root = Path(__file__).resolve().parent.parent.parent.parent
-        pkg_json = pkg_root / "package.json"
-        if pkg_json.exists():
-            with open(pkg_json) as f:
-                v = _json.load(f).get("version", "")
-                if v:
-                    return v
-    except Exception:
-        pass
-    try:
-        import tomllib
-        toml_path = Path(__file__).resolve().parent.parent.parent.parent / "pyproject.toml"
-        if toml_path.exists():
-            with open(toml_path, "rb") as f:
-                return tomllib.load(f)["project"]["version"]
-    except Exception:
-        pass
+    """Read version from package.json / pyproject.toml / importlib.
+    Walks up from this file to find the project root. In the src layout
+    (running from source tree), package.json is 5 parents up; for an
+    installed package it won't exist, so we fall through to importlib.
+    """
+    here = Path(__file__).resolve()
+    for depth in (5, 4):
+        try:
+            import json as _json
+            root = here
+            for _ in range(depth):
+                root = root.parent
+            pkg_json = root / "package.json"
+            if pkg_json.exists():
+                with open(pkg_json) as f:
+                    v = _json.load(f).get("version", "")
+                    if v:
+                        return v
+            toml_path = root / "pyproject.toml"
+            if toml_path.exists():
+                import tomllib
+                with open(toml_path, "rb") as f:
+                    return tomllib.load(f)["project"]["version"]
+        except Exception:
+            continue
     try:
         from importlib.metadata import version
         return version("superlocalmemory")