superlocalmemory 3.0.34 → 3.0.36
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/pyproject.toml +1 -1
- package/src/superlocalmemory/core/config.py +2 -2
- package/src/superlocalmemory/core/recall_worker.py +14 -0
- package/src/superlocalmemory/core/summarizer.py +42 -5
- package/src/superlocalmemory/core/worker_pool.py +32 -2
- package/src/superlocalmemory/llm/backbone.py +25 -2
- package/src/superlocalmemory/server/routes/helpers.py +26 -20
- package/src/superlocalmemory/server/ui.py +15 -4
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "superlocalmemory",
|
|
3
|
-
"version": "3.0.34",
|
|
3
|
+
"version": "3.0.36",
|
|
4
4
|
"description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"ai-memory",
|
package/pyproject.toml
CHANGED
(hunk not present in this capture)
|
package/src/superlocalmemory/core/config.py
CHANGED
|
@@ -348,7 +348,7 @@ class SLMConfig:
|
|
|
348
348
|
),
|
|
349
349
|
llm=LLMConfig(), # No LLM
|
|
350
350
|
retrieval=RetrievalConfig(
|
|
351
|
-
use_cross_encoder=False,
|
|
351
|
+
use_cross_encoder=True,
|
|
352
352
|
),
|
|
353
353
|
math=MathConfig(
|
|
354
354
|
sheaf_contradiction_threshold=0.45, # 768d threshold
|
|
@@ -370,7 +370,7 @@ class SLMConfig:
|
|
|
370
370
|
api_base=llm_api_base or "http://localhost:11434",
|
|
371
371
|
api_key=llm_api_key or "",
|
|
372
372
|
),
|
|
373
|
-
retrieval=RetrievalConfig(use_cross_encoder=False),
|
|
373
|
+
retrieval=RetrievalConfig(use_cross_encoder=True),
|
|
374
374
|
)
|
|
375
375
|
|
|
376
376
|
# Mode C — FULL POWER, UNRESTRICTED
|
package/src/superlocalmemory/core/recall_worker.py
CHANGED
|
@@ -222,6 +222,20 @@ def _worker_main() -> None:
|
|
|
222
222
|
_respond({"ok": True})
|
|
223
223
|
continue
|
|
224
224
|
|
|
225
|
+
if cmd == "warmup":
|
|
226
|
+
# Pre-load engine + all models (embedding, reranker, BM25, LLM)
|
|
227
|
+
# Called at dashboard/MCP startup so first real request is fast.
|
|
228
|
+
# A dummy recall triggers lazy-loaded components (cross-encoder, BM25 index).
|
|
229
|
+
try:
|
|
230
|
+
engine = _get_engine()
|
|
231
|
+
fact_count = engine._db.get_fact_count(engine._profile_id) if engine._db else 0
|
|
232
|
+
if fact_count > 0:
|
|
233
|
+
engine.recall("warmup", limit=1)
|
|
234
|
+
_respond({"ok": True, "message": "Engine warm", "facts": fact_count})
|
|
235
|
+
except Exception as exc:
|
|
236
|
+
_respond({"ok": False, "error": f"Warmup failed: {exc}"})
|
|
237
|
+
continue
|
|
238
|
+
|
|
225
239
|
try:
|
|
226
240
|
if cmd == "recall":
|
|
227
241
|
result = _handle_recall(req.get("query", ""), req.get("limit", 10))
|
package/src/superlocalmemory/core/summarizer.py
CHANGED
|
@@ -94,9 +94,14 @@ class Summarizer:
|
|
|
94
94
|
# ------------------------------------------------------------------
|
|
95
95
|
|
|
96
96
|
def _has_llm(self) -> bool:
|
|
97
|
-
"""Check if LLM is available.
|
|
97
|
+
"""Check if LLM is available (AND warm for Ollama).
|
|
98
|
+
|
|
99
|
+
For Mode B (Ollama): only returns True if the model is already
|
|
100
|
+
loaded in memory. NEVER triggers a cold model load — that would
|
|
101
|
+
spike 5+ GB of RAM on every recall, unacceptable on ≤32 GB machines.
|
|
102
|
+
"""
|
|
98
103
|
if self._mode == "b":
|
|
99
|
-
return
|
|
104
|
+
return self._is_ollama_model_warm()
|
|
100
105
|
if self._mode == "c":
|
|
101
106
|
return bool(
|
|
102
107
|
os.environ.get("OPENROUTER_API_KEY")
|
|
@@ -104,6 +109,27 @@ class Summarizer:
|
|
|
104
109
|
)
|
|
105
110
|
return False
|
|
106
111
|
|
|
112
|
+
def _is_ollama_model_warm(self) -> bool:
|
|
113
|
+
"""Check if the LLM model is already loaded in Ollama memory.
|
|
114
|
+
|
|
115
|
+
Queries Ollama /api/ps. Returns True only if our model is loaded,
|
|
116
|
+
preventing cold-load memory spikes during recall.
|
|
117
|
+
"""
|
|
118
|
+
try:
|
|
119
|
+
import httpx
|
|
120
|
+
model = getattr(self._config.llm, 'model', None) or "llama3.1:8b"
|
|
121
|
+
model_base = model.split(":")[0]
|
|
122
|
+
with httpx.Client(timeout=httpx.Timeout(2.0)) as client:
|
|
123
|
+
resp = client.get("http://localhost:11434/api/ps")
|
|
124
|
+
if resp.status_code != 200:
|
|
125
|
+
return False
|
|
126
|
+
for m in resp.json().get("models", []):
|
|
127
|
+
if model_base in m.get("name", ""):
|
|
128
|
+
return True
|
|
129
|
+
return False
|
|
130
|
+
except Exception:
|
|
131
|
+
return False
|
|
132
|
+
|
|
107
133
|
def _call_llm(self, prompt: str, max_tokens: int = 200) -> str:
|
|
108
134
|
"""Route to Ollama (B) or OpenRouter (C)."""
|
|
109
135
|
if self._mode == "b":
|
|
@@ -111,15 +137,26 @@ class Summarizer:
|
|
|
111
137
|
return self._call_openrouter(prompt, max_tokens)
|
|
112
138
|
|
|
113
139
|
def _call_ollama(self, prompt: str, max_tokens: int = 200) -> str:
|
|
114
|
-
"""Call local Ollama for summary generation.
|
|
140
|
+
"""Call local Ollama for summary generation.
|
|
141
|
+
|
|
142
|
+
CRITICAL: num_ctx MUST be set. Without it, Ollama defaults to the
|
|
143
|
+
model's native context (128K for llama3.1) which allocates ~30 GB
|
|
144
|
+
of KV cache — fatal on machines with ≤32 GB RAM.
|
|
145
|
+
SLM prompts are <500 tokens; 4096 context is more than enough.
|
|
146
|
+
"""
|
|
115
147
|
import httpx
|
|
116
148
|
model = getattr(self._config.llm, 'model', None) or "llama3.1:8b"
|
|
117
|
-
with httpx.Client(timeout=httpx.Timeout(
|
|
149
|
+
with httpx.Client(timeout=httpx.Timeout(30.0)) as client:
|
|
118
150
|
resp = client.post("http://localhost:11434/api/generate", json={
|
|
119
151
|
"model": model,
|
|
120
152
|
"prompt": prompt,
|
|
121
153
|
"stream": False,
|
|
122
|
-
"
|
|
154
|
+
"keep_alive": "30s",
|
|
155
|
+
"options": {
|
|
156
|
+
"num_predict": max_tokens,
|
|
157
|
+
"temperature": 0.3,
|
|
158
|
+
"num_ctx": 4096,
|
|
159
|
+
},
|
|
123
160
|
})
|
|
124
161
|
resp.raise_for_status()
|
|
125
162
|
return resp.json().get("response", "").strip()
|
package/src/superlocalmemory/core/worker_pool.py
CHANGED
|
@@ -28,8 +28,9 @@ import time
|
|
|
28
28
|
|
|
29
29
|
logger = logging.getLogger(__name__)
|
|
30
30
|
|
|
31
|
-
_IDLE_TIMEOUT = 120
|
|
31
|
+
_IDLE_TIMEOUT = 120 # 2 min — kill worker after idle
|
|
32
32
|
_REQUEST_TIMEOUT = 60 # 60 sec max per request
|
|
33
|
+
_WARMUP_TIMEOUT = 120 # 2 min — first cold start loads PyTorch + models
|
|
33
34
|
|
|
34
35
|
|
|
35
36
|
class WorkerPool:
|
|
@@ -102,6 +103,31 @@ class WorkerPool:
|
|
|
102
103
|
with self._lock:
|
|
103
104
|
self._kill()
|
|
104
105
|
|
|
106
|
+
def warmup(self) -> None:
|
|
107
|
+
"""Pre-spawn and warm up the worker in a background thread.
|
|
108
|
+
|
|
109
|
+
Spawns the recall_worker subprocess so that PyTorch, models, and
|
|
110
|
+
the engine are all loaded BEFORE the first user request. This
|
|
111
|
+
amortizes the 30s cold-start at dashboard/MCP startup time.
|
|
112
|
+
|
|
113
|
+
Call from startup events — non-blocking, runs in background.
|
|
114
|
+
"""
|
|
115
|
+
def _do_warmup() -> None:
|
|
116
|
+
logger.info("Worker warmup starting (background)...")
|
|
117
|
+
try:
|
|
118
|
+
result = self._send_with_timeout(
|
|
119
|
+
{"cmd": "warmup"}, timeout=_WARMUP_TIMEOUT,
|
|
120
|
+
)
|
|
121
|
+
if result.get("ok"):
|
|
122
|
+
logger.info("Worker warmup complete (engine + models ready)")
|
|
123
|
+
else:
|
|
124
|
+
logger.warning("Worker warmup returned: %s", result)
|
|
125
|
+
except Exception as exc:
|
|
126
|
+
logger.warning("Worker warmup failed: %s", exc)
|
|
127
|
+
|
|
128
|
+
t = threading.Thread(target=_do_warmup, daemon=True, name="worker-warmup")
|
|
129
|
+
t.start()
|
|
130
|
+
|
|
105
131
|
@property
|
|
106
132
|
def worker_pid(self) -> int | None:
|
|
107
133
|
"""PID of the worker process, or None if not running."""
|
|
@@ -115,6 +141,10 @@ class WorkerPool:
|
|
|
115
141
|
|
|
116
142
|
def _send(self, request: dict) -> dict:
|
|
117
143
|
"""Send request to worker and get response. Thread-safe."""
|
|
144
|
+
return self._send_with_timeout(request, timeout=_REQUEST_TIMEOUT)
|
|
145
|
+
|
|
146
|
+
def _send_with_timeout(self, request: dict, timeout: float) -> dict:
|
|
147
|
+
"""Send request with configurable timeout. Thread-safe."""
|
|
118
148
|
with self._lock:
|
|
119
149
|
self._ensure_worker()
|
|
120
150
|
if self._proc is None:
|
|
@@ -129,7 +159,7 @@ class WorkerPool:
|
|
|
129
159
|
import selectors
|
|
130
160
|
sel = selectors.DefaultSelector()
|
|
131
161
|
sel.register(self._proc.stdout, selectors.EVENT_READ)
|
|
132
|
-
ready = sel.select(timeout=_REQUEST_TIMEOUT)
|
|
162
|
+
ready = sel.select(timeout=timeout)
|
|
133
163
|
sel.close()
|
|
134
164
|
|
|
135
165
|
if not ready:
|
package/src/superlocalmemory/llm/backbone.py
CHANGED
|
@@ -127,13 +127,34 @@ class LLMBackbone:
|
|
|
127
127
|
# -- Properties ---------------------------------------------------------
|
|
128
128
|
|
|
129
129
|
def is_available(self) -> bool:
|
|
130
|
-
"""True when the provider is ready for requests.
|
|
130
|
+
"""True when the provider is ready for requests.
|
|
131
|
+
|
|
132
|
+
For Ollama: only returns True if the model is already loaded in
|
|
133
|
+
memory. Prevents cold-load memory spikes (5+ GB) during recall.
|
|
134
|
+
"""
|
|
131
135
|
if not self._provider:
|
|
132
136
|
return False
|
|
133
137
|
if self._provider == "ollama":
|
|
134
|
-
return
|
|
138
|
+
return self._is_ollama_model_warm()
|
|
135
139
|
return bool(self._api_key)
|
|
136
140
|
|
|
141
|
+
def _is_ollama_model_warm(self) -> bool:
|
|
142
|
+
"""Check if the LLM model is already loaded in Ollama."""
|
|
143
|
+
try:
|
|
144
|
+
model_base = self._model.split(":")[0]
|
|
145
|
+
resp = httpx.get(
|
|
146
|
+
f"{_OLLAMA_DEFAULT_BASE}/api/ps",
|
|
147
|
+
timeout=httpx.Timeout(2.0),
|
|
148
|
+
)
|
|
149
|
+
if resp.status_code != 200:
|
|
150
|
+
return False
|
|
151
|
+
for m in resp.json().get("models", []):
|
|
152
|
+
if model_base in m.get("name", ""):
|
|
153
|
+
return True
|
|
154
|
+
return False
|
|
155
|
+
except Exception:
|
|
156
|
+
return False
|
|
157
|
+
|
|
137
158
|
@property
|
|
138
159
|
def provider(self) -> str:
|
|
139
160
|
return self._provider
|
|
@@ -250,6 +271,8 @@ class LLMBackbone:
|
|
|
250
271
|
"messages": messages,
|
|
251
272
|
"max_tokens": max_tokens,
|
|
252
273
|
"temperature": temperature,
|
|
274
|
+
"keep_alive": "30s",
|
|
275
|
+
"options": {"num_ctx": 4096},
|
|
253
276
|
}
|
|
254
277
|
return self._base_url, headers, payload
|
|
255
278
|
|
package/src/superlocalmemory/server/routes/helpers.py
CHANGED
|
@@ -22,26 +22,32 @@ from pydantic import BaseModel, Field
|
|
|
22
22
|
# ---------------------------------------------------------------------------
|
|
23
23
|
|
|
24
24
|
def _get_version() -> str:
|
|
25
|
-
"""Read version from package.json / pyproject.toml / importlib.
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
25
|
+
"""Read version from package.json / pyproject.toml / importlib.
|
|
26
|
+
|
|
27
|
+
Walks up from this file to find the project root. In the src layout
|
|
28
|
+
(running from source tree), package.json is 5 parents up; for an
|
|
29
|
+
installed package it won't exist, so we fall through to importlib.
|
|
30
|
+
"""
|
|
31
|
+
here = Path(__file__).resolve()
|
|
32
|
+
for depth in (5, 4):
|
|
33
|
+
try:
|
|
34
|
+
import json as _json
|
|
35
|
+
root = here
|
|
36
|
+
for _ in range(depth):
|
|
37
|
+
root = root.parent
|
|
38
|
+
pkg_json = root / "package.json"
|
|
39
|
+
if pkg_json.exists():
|
|
40
|
+
with open(pkg_json) as f:
|
|
41
|
+
v = _json.load(f).get("version", "")
|
|
42
|
+
if v:
|
|
43
|
+
return v
|
|
44
|
+
toml_path = root / "pyproject.toml"
|
|
45
|
+
if toml_path.exists():
|
|
46
|
+
import tomllib
|
|
47
|
+
with open(toml_path, "rb") as f:
|
|
48
|
+
return tomllib.load(f)["project"]["version"]
|
|
49
|
+
except Exception:
|
|
50
|
+
continue
|
|
45
51
|
try:
|
|
46
52
|
from importlib.metadata import version
|
|
47
53
|
return version("superlocalmemory")
|
package/src/superlocalmemory/server/ui.py
CHANGED
|
@@ -199,14 +199,25 @@ def create_app() -> FastAPI:
|
|
|
199
199
|
|
|
200
200
|
@application.on_event("startup")
|
|
201
201
|
async def startup_event():
|
|
202
|
-
"""Initialize event bus
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
202
|
+
"""Initialize event bus and warm up worker subprocess.
|
|
203
|
+
|
|
204
|
+
Engine runs in subprocess worker (never in this process).
|
|
205
|
+
Background warmup pre-loads PyTorch + models so first recall is fast.
|
|
206
|
+
"""
|
|
206
207
|
application.state.engine = None
|
|
207
208
|
logger.info("Dashboard started (~60 MB, engine runs in subprocess worker)")
|
|
208
209
|
register_event_listener()
|
|
209
210
|
|
|
211
|
+
# Background warmup: pre-spawn worker and load all models.
|
|
212
|
+
# This runs in a daemon thread — dashboard is responsive immediately.
|
|
213
|
+
# Worker will be ready by the time user does first search (~10-30s).
|
|
214
|
+
try:
|
|
215
|
+
from superlocalmemory.core.worker_pool import WorkerPool
|
|
216
|
+
WorkerPool.shared().warmup()
|
|
217
|
+
logger.info("Worker warmup initiated (background)")
|
|
218
|
+
except Exception as exc:
|
|
219
|
+
logger.warning("Worker warmup failed to start: %s", exc)
|
|
220
|
+
|
|
210
221
|
@application.on_event("shutdown")
|
|
211
222
|
async def shutdown_event():
|
|
212
223
|
"""Kill worker subprocess on dashboard shutdown."""
|