superlocalmemory 3.0.34 → 3.0.36

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "superlocalmemory",
3
- "version": "3.0.34",
3
+ "version": "3.0.36",
4
4
  "description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
5
5
  "keywords": [
6
6
  "ai-memory",
package/pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "superlocalmemory"
3
- version = "3.0.34"
3
+ version = "3.0.36"
4
4
  description = "Information-geometric agent memory with mathematical guarantees"
5
5
  readme = "README.md"
6
6
  license = {text = "MIT"}
@@ -348,7 +348,7 @@ class SLMConfig:
348
348
  ),
349
349
  llm=LLMConfig(), # No LLM
350
350
  retrieval=RetrievalConfig(
351
- use_cross_encoder=False, # Disabled: 30s PyTorch cold start kills UX
351
+ use_cross_encoder=True,
352
352
  ),
353
353
  math=MathConfig(
354
354
  sheaf_contradiction_threshold=0.45, # 768d threshold
@@ -370,7 +370,7 @@ class SLMConfig:
370
370
  api_base=llm_api_base or "http://localhost:11434",
371
371
  api_key=llm_api_key or "",
372
372
  ),
373
- retrieval=RetrievalConfig(use_cross_encoder=False),
373
+ retrieval=RetrievalConfig(use_cross_encoder=True),
374
374
  )
375
375
 
376
376
  # Mode C — FULL POWER, UNRESTRICTED
@@ -222,6 +222,20 @@ def _worker_main() -> None:
222
222
  _respond({"ok": True})
223
223
  continue
224
224
 
225
+ if cmd == "warmup":
226
+ # Pre-load engine + all models (embedding, reranker, BM25, LLM)
227
+ # Called at dashboard/MCP startup so first real request is fast.
228
+ # A dummy recall triggers lazy-loaded components (cross-encoder, BM25 index).
229
+ try:
230
+ engine = _get_engine()
231
+ fact_count = engine._db.get_fact_count(engine._profile_id) if engine._db else 0
232
+ if fact_count > 0:
233
+ engine.recall("warmup", limit=1)
234
+ _respond({"ok": True, "message": "Engine warm", "facts": fact_count})
235
+ except Exception as exc:
236
+ _respond({"ok": False, "error": f"Warmup failed: {exc}"})
237
+ continue
238
+
225
239
  try:
226
240
  if cmd == "recall":
227
241
  result = _handle_recall(req.get("query", ""), req.get("limit", 10))
@@ -94,9 +94,14 @@ class Summarizer:
94
94
  # ------------------------------------------------------------------
95
95
 
96
96
  def _has_llm(self) -> bool:
97
- """Check if LLM is available."""
97
+ """Check if LLM is available (AND warm for Ollama).
98
+
99
+ For Mode B (Ollama): only returns True if the model is already
100
+ loaded in memory. NEVER triggers a cold model load — that would
101
+ spike 5+ GB of RAM on every recall, unacceptable on ≤32 GB machines.
102
+ """
98
103
  if self._mode == "b":
99
- return True # Ollama assumed running
104
+ return self._is_ollama_model_warm()
100
105
  if self._mode == "c":
101
106
  return bool(
102
107
  os.environ.get("OPENROUTER_API_KEY")
@@ -104,6 +109,27 @@ class Summarizer:
104
109
  )
105
110
  return False
106
111
 
112
+ def _is_ollama_model_warm(self) -> bool:
113
+ """Check if the LLM model is already loaded in Ollama memory.
114
+
115
+ Queries Ollama /api/ps. Returns True only if our model is loaded,
116
+ preventing cold-load memory spikes during recall.
117
+ """
118
+ try:
119
+ import httpx
120
+ model = getattr(self._config.llm, 'model', None) or "llama3.1:8b"
121
+ model_base = model.split(":")[0]
122
+ with httpx.Client(timeout=httpx.Timeout(2.0)) as client:
123
+ resp = client.get("http://localhost:11434/api/ps")
124
+ if resp.status_code != 200:
125
+ return False
126
+ for m in resp.json().get("models", []):
127
+ if model_base in m.get("name", ""):
128
+ return True
129
+ return False
130
+ except Exception:
131
+ return False
132
+
107
133
  def _call_llm(self, prompt: str, max_tokens: int = 200) -> str:
108
134
  """Route to Ollama (B) or OpenRouter (C)."""
109
135
  if self._mode == "b":
@@ -111,15 +137,26 @@ class Summarizer:
111
137
  return self._call_openrouter(prompt, max_tokens)
112
138
 
113
139
  def _call_ollama(self, prompt: str, max_tokens: int = 200) -> str:
114
- """Call local Ollama for summary generation."""
140
+ """Call local Ollama for summary generation.
141
+
142
+ CRITICAL: num_ctx MUST be set. Without it, Ollama defaults to the
143
+ model's native context (128K for llama3.1) which allocates ~30 GB
144
+ of KV cache — fatal on machines with ≤32 GB RAM.
145
+ SLM prompts are <500 tokens; 4096 context is more than enough.
146
+ """
115
147
  import httpx
116
148
  model = getattr(self._config.llm, 'model', None) or "llama3.1:8b"
117
- with httpx.Client(timeout=httpx.Timeout(20.0)) as client:
149
+ with httpx.Client(timeout=httpx.Timeout(30.0)) as client:
118
150
  resp = client.post("http://localhost:11434/api/generate", json={
119
151
  "model": model,
120
152
  "prompt": prompt,
121
153
  "stream": False,
122
- "options": {"num_predict": max_tokens, "temperature": 0.3},
154
+ "keep_alive": "30s",
155
+ "options": {
156
+ "num_predict": max_tokens,
157
+ "temperature": 0.3,
158
+ "num_ctx": 4096,
159
+ },
123
160
  })
124
161
  resp.raise_for_status()
125
162
  return resp.json().get("response", "").strip()
@@ -28,8 +28,9 @@ import time
28
28
 
29
29
  logger = logging.getLogger(__name__)
30
30
 
31
- _IDLE_TIMEOUT = 120 # 2 min — kill worker after idle
31
+ _IDLE_TIMEOUT = 120 # 2 min — kill worker after idle
32
32
  _REQUEST_TIMEOUT = 60 # 60 sec max per request
33
+ _WARMUP_TIMEOUT = 120 # 2 min — first cold start loads PyTorch + models
33
34
 
34
35
 
35
36
  class WorkerPool:
@@ -102,6 +103,31 @@ class WorkerPool:
102
103
  with self._lock:
103
104
  self._kill()
104
105
 
106
+ def warmup(self) -> None:
107
+ """Pre-spawn and warm up the worker in a background thread.
108
+
109
+ Spawns the recall_worker subprocess so that PyTorch, models, and
110
+ the engine are all loaded BEFORE the first user request. This
111
+ amortizes the 30s cold-start at dashboard/MCP startup time.
112
+
113
+ Call from startup events — non-blocking, runs in background.
114
+ """
115
+ def _do_warmup() -> None:
116
+ logger.info("Worker warmup starting (background)...")
117
+ try:
118
+ result = self._send_with_timeout(
119
+ {"cmd": "warmup"}, timeout=_WARMUP_TIMEOUT,
120
+ )
121
+ if result.get("ok"):
122
+ logger.info("Worker warmup complete (engine + models ready)")
123
+ else:
124
+ logger.warning("Worker warmup returned: %s", result)
125
+ except Exception as exc:
126
+ logger.warning("Worker warmup failed: %s", exc)
127
+
128
+ t = threading.Thread(target=_do_warmup, daemon=True, name="worker-warmup")
129
+ t.start()
130
+
105
131
  @property
106
132
  def worker_pid(self) -> int | None:
107
133
  """PID of the worker process, or None if not running."""
@@ -115,6 +141,10 @@ class WorkerPool:
115
141
 
116
142
  def _send(self, request: dict) -> dict:
117
143
  """Send request to worker and get response. Thread-safe."""
144
+ return self._send_with_timeout(request, timeout=_REQUEST_TIMEOUT)
145
+
146
+ def _send_with_timeout(self, request: dict, timeout: float) -> dict:
147
+ """Send request with configurable timeout. Thread-safe."""
118
148
  with self._lock:
119
149
  self._ensure_worker()
120
150
  if self._proc is None:
@@ -129,7 +159,7 @@ class WorkerPool:
129
159
  import selectors
130
160
  sel = selectors.DefaultSelector()
131
161
  sel.register(self._proc.stdout, selectors.EVENT_READ)
132
- ready = sel.select(timeout=_REQUEST_TIMEOUT)
162
+ ready = sel.select(timeout=timeout)
133
163
  sel.close()
134
164
 
135
165
  if not ready:
@@ -127,13 +127,34 @@ class LLMBackbone:
127
127
  # -- Properties ---------------------------------------------------------
128
128
 
129
129
  def is_available(self) -> bool:
130
- """True when the provider is ready for requests."""
130
+ """True when the provider is ready for requests.
131
+
132
+ For Ollama: only returns True if the model is already loaded in
133
+ memory. Prevents cold-load memory spikes (5+ GB) during recall.
134
+ """
131
135
  if not self._provider:
132
136
  return False
133
137
  if self._provider == "ollama":
134
- return True
138
+ return self._is_ollama_model_warm()
135
139
  return bool(self._api_key)
136
140
 
141
+ def _is_ollama_model_warm(self) -> bool:
142
+ """Check if the LLM model is already loaded in Ollama."""
143
+ try:
144
+ model_base = self._model.split(":")[0]
145
+ resp = httpx.get(
146
+ f"{_OLLAMA_DEFAULT_BASE}/api/ps",
147
+ timeout=httpx.Timeout(2.0),
148
+ )
149
+ if resp.status_code != 200:
150
+ return False
151
+ for m in resp.json().get("models", []):
152
+ if model_base in m.get("name", ""):
153
+ return True
154
+ return False
155
+ except Exception:
156
+ return False
157
+
137
158
  @property
138
159
  def provider(self) -> str:
139
160
  return self._provider
@@ -250,6 +271,8 @@ class LLMBackbone:
250
271
  "messages": messages,
251
272
  "max_tokens": max_tokens,
252
273
  "temperature": temperature,
274
+ "keep_alive": "30s",
275
+ "options": {"num_ctx": 4096},
253
276
  }
254
277
  return self._base_url, headers, payload
255
278
 
@@ -22,26 +22,32 @@ from pydantic import BaseModel, Field
22
22
  # ---------------------------------------------------------------------------
23
23
 
24
24
  def _get_version() -> str:
25
- """Read version from package.json / pyproject.toml / importlib."""
26
- try:
27
- import json as _json
28
- pkg_root = Path(__file__).resolve().parent.parent.parent.parent
29
- pkg_json = pkg_root / "package.json"
30
- if pkg_json.exists():
31
- with open(pkg_json) as f:
32
- v = _json.load(f).get("version", "")
33
- if v:
34
- return v
35
- except Exception:
36
- pass
37
- try:
38
- import tomllib
39
- toml_path = Path(__file__).resolve().parent.parent.parent.parent / "pyproject.toml"
40
- if toml_path.exists():
41
- with open(toml_path, "rb") as f:
42
- return tomllib.load(f)["project"]["version"]
43
- except Exception:
44
- pass
25
+ """Read version from package.json / pyproject.toml / importlib.
26
+
27
+ Walks up from this file to find the project root. In the src layout
28
+ (running from source tree), package.json is 5 parents up; for an
29
+ installed package it won't exist, so we fall through to importlib.
30
+ """
31
+ here = Path(__file__).resolve()
32
+ for depth in (5, 4):
33
+ try:
34
+ import json as _json
35
+ root = here
36
+ for _ in range(depth):
37
+ root = root.parent
38
+ pkg_json = root / "package.json"
39
+ if pkg_json.exists():
40
+ with open(pkg_json) as f:
41
+ v = _json.load(f).get("version", "")
42
+ if v:
43
+ return v
44
+ toml_path = root / "pyproject.toml"
45
+ if toml_path.exists():
46
+ import tomllib
47
+ with open(toml_path, "rb") as f:
48
+ return tomllib.load(f)["project"]["version"]
49
+ except Exception:
50
+ continue
45
51
  try:
46
52
  from importlib.metadata import version
47
53
  return version("superlocalmemory")
@@ -199,14 +199,25 @@ def create_app() -> FastAPI:
199
199
 
200
200
  @application.on_event("startup")
201
201
  async def startup_event():
202
- """Initialize event bus. Engine runs in subprocess worker (never in this process)."""
203
- # Engine is NEVER loaded in the dashboard process.
204
- # All recall/search operations go through WorkerPool subprocess.
205
- # This keeps the dashboard permanently at ~60 MB.
202
+ """Initialize event bus and warm up worker subprocess.
203
+
204
+ Engine runs in subprocess worker (never in this process).
205
+ Background warmup pre-loads PyTorch + models so first recall is fast.
206
+ """
206
207
  application.state.engine = None
207
208
  logger.info("Dashboard started (~60 MB, engine runs in subprocess worker)")
208
209
  register_event_listener()
209
210
 
211
+ # Background warmup: pre-spawn worker and load all models.
212
+ # This runs in a daemon thread — dashboard is responsive immediately.
213
+ # Worker will be ready by the time user does first search (~10-30s).
214
+ try:
215
+ from superlocalmemory.core.worker_pool import WorkerPool
216
+ WorkerPool.shared().warmup()
217
+ logger.info("Worker warmup initiated (background)")
218
+ except Exception as exc:
219
+ logger.warning("Worker warmup failed to start: %s", exc)
220
+
210
221
  @application.on_event("shutdown")
211
222
  async def shutdown_event():
212
223
  """Kill worker subprocess on dashboard shutdown."""