superlocalmemory 3.4.35 → 3.4.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -10,6 +10,74 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
10
10
 
11
11
  ---
12
12
 
13
+ ## [3.4.37] - 2026-04-26
14
+
15
+ **P0 RAM fix.** Total SLM footprint reduced from ~14 GB peak to ~2.3 GB peak
16
+ (84% reduction). Idle dropped from ~2.5 GB to ~1.0 GB. Users with 16 GB
17
+ laptops can now run SLM without uninstalling.
18
+
19
+ ### Fixed
20
+ - **CoreML EP allocation** — Added `ORT_DISABLE_COREML=1` to
21
+ `recall_worker.py`, `cli/commands.py` (warmup diagnose path), and the
22
+ Popen environment dicts in `core/embeddings.py` and
23
+ `retrieval/reranker.py`. Previously only `embedding_worker.py` and
24
+ `reranker_worker.py` set this. On ARM64 Mac, ONNX Runtime's CoreML
25
+ Execution Provider allocated 3-5 GB per missing guard.
26
+ - **Duplicate MemoryEngine** — The QueueConsumer (recall_queue.db drain)
27
+ was routing through `WorkerPool` → `recall_worker` subprocess, which
28
+ loaded a SECOND full MemoryEngine inside the daemon. Now routes through
29
+ the daemon's in-process engine via the new `EngineRecallAdapter`.
30
+ Eliminates ~800 MB of duplication.
31
+ - **Eager warmup** — Removed `WorkerPool.shared().warmup()` from daemon
32
+ startup. The recall_worker subprocess no longer spawns at boot. It
33
+ remains available as a fallback for dashboard/chat routes.
34
+
35
+ ### Changed
36
+ - **RSS limits tightened:**
37
+ - `embedding_worker` self-kill: 4000 MB → 1800 MB
38
+ - `recall_worker` self-kill: 2500 MB → 1500 MB
39
+ - Daemon watchdog `MAX_WORKER_MB`: 4096 MB → 1800 MB
40
+ - `HealthMonitor.global_rss_budget_mb`: 4096 MB → 2500 MB
41
+ - **Watchdog interval:** 60s → 15s in both daemon watchdog and
42
+ HealthMonitor `check_interval_sec`. Catches memory spikes faster.
43
+ - **Idle timeouts:**
44
+ - `SLM_EMBED_IDLE_TIMEOUT`: 1800s (30 min) → 300s (5 min)
45
+ - `SLM_RERANKER_IDLE_TIMEOUT`: 1800s → 300s
46
+ - Reduces idle RAM held by ML model subprocesses.
47
+
48
+ ### Added
49
+ - **`EngineRecallAdapter`** in `unified_daemon.py` — wraps the in-process
50
+ MemoryEngine to satisfy `RecallPoolProtocol` for the QueueConsumer.
51
+ Eliminates the recall_worker subprocess on the hot path.
52
+
53
+ ---
54
+
55
+ ## [3.4.36] - 2026-04-25
56
+
57
+ Persistent hook daemon: recall latency drops from ~2.2s to sub-second by
58
+ eliminating Python subprocess startup on every prompt.
59
+
60
+ ### Added
61
+ - **`hooks/hook_daemon.py`** — Unix domain socket server that keeps a
62
+ long-lived process for recall requests. Claude Code connects via socket
63
+ instead of spawning a fresh Python interpreter per prompt. Eliminates
64
+ ~300-500ms of subprocess overhead. Starts/stops with the SLM daemon.
65
+ - **Auto-restart watchdog:** `ensure_hook_daemon()` checks socket health
66
+ and restarts the daemon if it died. Claude Code hooks call this before
67
+ connecting, so a crashed daemon is transparent to the user.
68
+ - **Graceful fallback:** if the socket is unavailable, the hook
69
+ automatically falls back to the v3.4.35 subprocess path. Claude Code
70
+ performance is NEVER impacted by daemon failure.
71
+ - **9 new tests** for daemon lifecycle, socket protocol, ack detection,
72
+ watchdog, fallback, and memory safety.
73
+
74
+ ### Performance
75
+ - Ack prompts: ~5ms via socket (was 30ms via subprocess)
76
+ - Substantive recall: target sub-1s (was 2.2s p50 via subprocess)
77
+ - Hook daemon RSS: ~15-20MB (no engine, no ONNX, no PyTorch)
78
+
79
+ ---
80
+
13
81
  ## [3.4.35] - 2026-04-25
14
82
 
15
83
  Production auto-recall: every Claude Code prompt automatically retrieves the
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "superlocalmemory",
3
- "version": "3.4.35",
3
+ "version": "3.4.37",
4
4
  "description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
5
5
  "keywords": [
6
6
  "ai-memory",
package/pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "superlocalmemory"
3
- version = "3.4.35"
3
+ version = "3.4.37"
4
4
  description = "Information-geometric agent memory with mathematical guarantees"
5
5
  readme = "README.md"
6
6
  license = {text = "AGPL-3.0-or-later"}
@@ -1,3 +1,3 @@
1
1
  """SuperLocalMemory — information-geometric agent memory."""
2
2
 
3
- __version__ = "3.4.35"
3
+ __version__ = "3.4.37"
@@ -1710,6 +1710,7 @@ def _warmup_diagnose() -> None:
1710
1710
  """Diagnostic helper when warmup fails."""
1711
1711
  print("\nDiagnosing...")
1712
1712
  print(f" Python executable: {sys.executable}")
1713
+ os.environ["ORT_DISABLE_COREML"] = "1"
1713
1714
  try:
1714
1715
  from sentence_transformers import SentenceTransformer
1715
1716
  print(" sentence-transformers: importable")
@@ -151,7 +151,7 @@ def _worker_main() -> None:
151
151
  _respond({"ok": False, "error": str(exc)})
152
152
 
153
153
  # V3.3.16: RSS watchdog — V3.4.24: cross-platform via platform_utils.
154
- _rss_limit = int(os.environ.get("SLM_EMBED_WORKER_RSS_LIMIT_MB", 4000))
154
+ _rss_limit = int(os.environ.get("SLM_EMBED_WORKER_RSS_LIMIT_MB", 1800))
155
155
  rss_mb = get_rss_mb()
156
156
  if rss_mb > 0 and rss_mb > _rss_limit:
157
157
  sys.exit(0)
@@ -140,14 +140,10 @@ def release_embedding_lock() -> None:
140
140
  _embedding_lock_fd = None
141
141
 
142
142
 
143
- _IDLE_TIMEOUT_SECONDS = 1800 # 30 minutes — keep model warm across bursty use.
144
- # V3.3.12: Configurable via SLM_EMBED_IDLE_TIMEOUT env var (seconds).
145
- # V3.4.19: Bumped from 120 → 1800 to eliminate the 30-60s cold-start pain
146
- # when the embedding worker was killed too aggressively. Safety: the
147
- # per-embed RSS self-check (SLM_EMBED_WORKER_RSS_LIMIT_MB, 4GB default) and
148
- # the daemon memory watchdog (unified_daemon.py, 4GB/60s) still cap any
149
- # runaway. To restore the old aggressive policy without redeploying, set
150
- # ``SLM_EMBED_IDLE_TIMEOUT=120`` and ``slm restart``.
143
+ _IDLE_TIMEOUT_SECONDS = 300 # 5 minutes — balance cold-start vs RAM.
144
+ # V3.4.37: Reduced from 1800 → 300. Holding 1.1 GB for 30 min idle
145
+ # wastes RAM on laptops. 5 min covers bursty session_init+recall
146
+ # patterns while freeing memory between sessions.
151
147
  _IDLE_TIMEOUT_SECONDS = int(os.environ.get("SLM_EMBED_IDLE_TIMEOUT", _IDLE_TIMEOUT_SECONDS))
152
148
  # V3.3.21: Configurable response timeout — 180s default, but batch ingestion
153
149
  # (2-turn chunks across 10 conversations) needs 600s+ to survive cold-start
@@ -476,6 +472,7 @@ class EmbeddingService:
476
472
  "PYTORCH_ENABLE_MPS_FALLBACK": "1",
477
473
  "TOKENIZERS_PARALLELISM": "false",
478
474
  "TORCH_DEVICE": "cpu",
475
+ "ORT_DISABLE_COREML": "1",
479
476
  }
480
477
  from superlocalmemory.core.platform_utils import popen_platform_kwargs
481
478
  self._worker_proc = subprocess.Popen(
@@ -133,9 +133,9 @@ class HealthMonitor:
133
133
 
134
134
  def __init__(
135
135
  self,
136
- global_rss_budget_mb: int = 4096,
136
+ global_rss_budget_mb: int = 2500,
137
137
  heartbeat_timeout_sec: int = 60,
138
- check_interval_sec: int = 30,
138
+ check_interval_sec: int = 15,
139
139
  enable_structured_logging: bool = True,
140
140
  ):
141
141
  self._budget_mb = global_rss_budget_mb
@@ -28,6 +28,8 @@ os.environ["PYTORCH_MPS_MEM_LIMIT"] = "0"
28
28
  os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
29
29
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
30
30
  os.environ["TORCH_DEVICE"] = "cpu"
31
+ # V3.4.37: Disable CoreML EP — uses 3-5GB on ARM64 Mac.
32
+ os.environ["ORT_DISABLE_COREML"] = "1"
31
33
 
32
34
  # SIGTERM bridge: Docker/systemd send SIGTERM to stop processes.
33
35
  # Without this, the worker ignores SIGTERM and becomes a zombie.
@@ -324,7 +326,7 @@ def _worker_main() -> None:
324
326
 
325
327
  # V3.3.16: RSS watchdog — V3.4.24: cross-platform via platform_utils.
326
328
  rss_mb = get_rss_mb()
327
- if rss_mb > 0 and rss_mb > 2500:
329
+ if rss_mb > 0 and rss_mb > 1500:
328
330
  sys.exit(0)
329
331
 
330
332
 
@@ -76,6 +76,30 @@ def _get_queue_db_path():
76
76
  return slm_dir / "recall_queue.db"
77
77
 
78
78
 
79
+ def _try_socket_first(prompt: str, session_id: str) -> dict | None:
80
+ """Try the persistent hook daemon socket. Returns full envelope or None.
81
+
82
+ The socket path returns an already-formatted Claude Code envelope.
83
+ If this returns a non-None dict, the caller writes it to stdout directly
84
+ (skip _do_recall + _format_envelope). Returns None on any failure,
85
+ triggering the subprocess fallback.
86
+ """
87
+ try:
88
+ from superlocalmemory.hooks.hook_daemon import try_socket_recall
89
+ response = try_socket_recall(
90
+ prompt=prompt,
91
+ session_id=session_id,
92
+ timeout=_get_mode_timeout(_detect_mode()),
93
+ )
94
+ if response is None or not isinstance(response, dict):
95
+ return None
96
+ if not response:
97
+ return {}
98
+ return response
99
+ except Exception:
100
+ return None
101
+
102
+
79
103
  def _do_recall(query: str, limit: int = _DEFAULT_LIMIT, session_id: str = "") -> list[dict] | None:
80
104
  """Enqueue recall to queue, poll for result. Returns list of dicts or None."""
81
105
  try:
@@ -192,6 +216,14 @@ def main() -> int:
192
216
  sys.stdout.write("{}")
193
217
  return 0
194
218
 
219
+ try:
220
+ socket_result = _try_socket_first(prompt, session_id)
221
+ if socket_result is not None:
222
+ sys.stdout.write(json.dumps(socket_result) if socket_result else "{}")
223
+ return 0
224
+ except Exception:
225
+ pass
226
+
195
227
  try:
196
228
  results = _do_recall(prompt, limit=_DEFAULT_LIMIT, session_id=session_id)
197
229
  except Exception:
@@ -0,0 +1,276 @@
1
+ # Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
2
+ # Licensed under AGPL-3.0-or-later - see LICENSE file
3
+ # Part of SuperLocalMemory V3 | https://qualixar.com | https://varunpratap.com
4
+
5
+ """Persistent hook daemon — Unix socket server for sub-200ms recall.
6
+
7
+ Eliminates Python subprocess startup (~300-500ms) by keeping a long-lived
8
+ process that Claude Code hooks talk to via Unix domain socket.
9
+
10
+ Protocol (newline-delimited JSON):
11
+ Client → {"prompt": "...", "session_id": "..."}\n
12
+ Server → {"hookSpecificOutput": {...}}\n (or {}\n for ack/empty)
13
+
14
+ MEMORY SAFETY: This module NEVER imports MemoryEngine. All recall goes
15
+ through recall_queue.db → QueueConsumer → pool.recall(). The hook daemon
16
+ stays at ~15-20MB RSS.
17
+
18
+ Lifecycle: started by unified_daemon.py alongside QueueConsumer. If it
19
+ crashes, auto_recall_hook.py falls back to subprocess (v3.4.35 path).
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import json
25
+ import logging
26
+ import os
27
+ import socket
28
+ import threading
29
+ import time
30
+ from pathlib import Path
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+ _DEFAULT_SOCK_NAME = "hook_daemon.sock"
35
+
36
+
37
+ def _default_sock_path() -> Path:
38
+ return Path.home() / ".superlocalmemory" / _DEFAULT_SOCK_NAME
39
+
40
+
41
+ def _default_queue_db_path() -> Path:
42
+ return Path.home() / ".superlocalmemory" / "recall_queue.db"
43
+
44
+
45
+ class HookDaemon:
46
+ """Unix socket server for persistent auto-recall.
47
+
48
+ Accepts newline-delimited JSON requests, runs the same logic as
49
+ auto_recall_hook.main() but without Python startup cost.
50
+ """
51
+
52
+ def __init__(
53
+ self,
54
+ sock_path: Path | None = None,
55
+ queue_db_path: Path | None = None,
56
+ ) -> None:
57
+ self._sock_path = sock_path or _default_sock_path()
58
+ self._queue_db_path = queue_db_path or _default_queue_db_path()
59
+ self._running = False
60
+ self._stop_event = threading.Event()
61
+ self._thread: threading.Thread | None = None
62
+ self._server_sock: socket.socket | None = None
63
+ self._queue = None
64
+
65
+ @property
66
+ def running(self) -> bool:
67
+ return self._running
68
+
69
+ def start(self) -> None:
70
+ if self._running:
71
+ return
72
+ if self._sock_path.exists():
73
+ self._sock_path.unlink()
74
+
75
+ from superlocalmemory.core.recall_queue import RecallQueue
76
+ self._queue = RecallQueue(self._queue_db_path)
77
+
78
+ self._server_sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
79
+ self._server_sock.bind(str(self._sock_path))
80
+ self._server_sock.listen(8)
81
+ self._server_sock.settimeout(1.0)
82
+
83
+ self._stop_event.clear()
84
+ self._running = True
85
+ self._thread = threading.Thread(
86
+ target=self._accept_loop,
87
+ daemon=True,
88
+ name="slm-hook-daemon",
89
+ )
90
+ self._thread.start()
91
+ logger.info("HookDaemon started on %s", self._sock_path)
92
+
93
+ def stop(self) -> None:
94
+ if not self._running:
95
+ return
96
+ self._stop_event.set()
97
+ self._running = False
98
+ if self._server_sock is not None:
99
+ try:
100
+ self._server_sock.close()
101
+ except Exception:
102
+ pass
103
+ self._server_sock = None
104
+ if self._thread is not None:
105
+ self._thread.join(timeout=3.0)
106
+ self._thread = None
107
+ if self._sock_path.exists():
108
+ try:
109
+ self._sock_path.unlink()
110
+ except Exception:
111
+ pass
112
+ if self._queue is not None:
113
+ try:
114
+ self._queue.close()
115
+ except Exception:
116
+ pass
117
+ self._queue = None
118
+ logger.info("HookDaemon stopped")
119
+
120
+ def _accept_loop(self) -> None:
121
+ while not self._stop_event.is_set():
122
+ try:
123
+ client, _ = self._server_sock.accept()
124
+ except socket.timeout:
125
+ continue
126
+ except OSError:
127
+ if self._stop_event.is_set():
128
+ break
129
+ continue
130
+ threading.Thread(
131
+ target=self._handle_client,
132
+ args=(client,),
133
+ daemon=True,
134
+ name="slm-hook-client",
135
+ ).start()
136
+
137
+ def _handle_client(self, client: socket.socket) -> None:
138
+ try:
139
+ client.settimeout(30.0)
140
+ data = b""
141
+ while b"\n" not in data:
142
+ chunk = client.recv(4096)
143
+ if not chunk:
144
+ return
145
+ data += chunk
146
+
147
+ line = data.decode("utf-8").strip()
148
+ if not line:
149
+ client.sendall(b"{}\n")
150
+ return
151
+
152
+ try:
153
+ payload = json.loads(line)
154
+ except Exception:
155
+ client.sendall(b"{}\n")
156
+ return
157
+
158
+ response = self._process_request(payload)
159
+ client.sendall((json.dumps(response) + "\n").encode("utf-8"))
160
+ except Exception:
161
+ try:
162
+ client.sendall(b"{}\n")
163
+ except Exception:
164
+ pass
165
+ finally:
166
+ try:
167
+ client.close()
168
+ except Exception:
169
+ pass
170
+
171
+ def _process_request(self, payload: dict) -> dict:
172
+ from superlocalmemory.hooks.auto_recall_hook import (
173
+ _is_ack, _get_mode_timeout, _detect_mode, _format_envelope,
174
+ _DEFAULT_LIMIT,
175
+ )
176
+ from superlocalmemory.core.recall_queue import QueueTimeoutError
177
+
178
+ prompt = payload.get("prompt", "")
179
+ session_id = payload.get("session_id", "")
180
+
181
+ if not prompt or not isinstance(prompt, str):
182
+ return {}
183
+
184
+ if _is_ack(prompt):
185
+ return {}
186
+
187
+ try:
188
+ mode = _detect_mode()
189
+ timeout = _get_mode_timeout(mode)
190
+ stall_timeout = max(timeout - 5.0, 5.0)
191
+
192
+ request_id = self._queue.enqueue(
193
+ query=prompt,
194
+ limit_n=_DEFAULT_LIMIT,
195
+ mode=mode,
196
+ agent_id="hook_daemon",
197
+ session_id=session_id,
198
+ priority="high",
199
+ stall_timeout_s=stall_timeout,
200
+ )
201
+
202
+ result = self._queue.poll_result(request_id, timeout_s=timeout)
203
+
204
+ if isinstance(result, dict) and result.get("ok") is not False:
205
+ results = result.get("results", [])
206
+ if results:
207
+ return _format_envelope(results)
208
+ return {}
209
+ except (QueueTimeoutError, Exception):
210
+ return {}
211
+
212
+
213
+ def try_socket_recall(
214
+ sock_path: Path | None = None,
215
+ prompt: str = "",
216
+ session_id: str = "",
217
+ timeout: float = 15.0,
218
+ ) -> dict | None:
219
+ """Try to get recall result via the persistent hook daemon socket.
220
+
221
+ Returns the hook envelope dict on success, or None if the daemon
222
+ is unavailable (triggers subprocess fallback in auto_recall_hook).
223
+ """
224
+ path = sock_path or _default_sock_path()
225
+ if not path.exists():
226
+ return None
227
+
228
+ try:
229
+ client = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
230
+ client.settimeout(timeout)
231
+ client.connect(str(path))
232
+
233
+ request = json.dumps({"prompt": prompt, "session_id": session_id}) + "\n"
234
+ client.sendall(request.encode("utf-8"))
235
+
236
+ data = b""
237
+ while b"\n" not in data:
238
+ chunk = client.recv(8192)
239
+ if not chunk:
240
+ break
241
+ data += chunk
242
+
243
+ client.close()
244
+
245
+ if not data.strip():
246
+ return None
247
+
248
+ response = json.loads(data.decode("utf-8").strip())
249
+ return response if isinstance(response, dict) else None
250
+ except Exception:
251
+ return None
252
+
253
+
254
+ def ensure_hook_daemon(
255
+ sock_path: Path | None = None,
256
+ queue_db_path: Path | None = None,
257
+ ) -> HookDaemon | None:
258
+ """Start hook daemon if not already running. Returns daemon or None."""
259
+ path = sock_path or _default_sock_path()
260
+
261
+ if path.exists():
262
+ try:
263
+ test = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
264
+ test.settimeout(1.0)
265
+ test.connect(str(path))
266
+ test.close()
267
+ return None
268
+ except Exception:
269
+ pass
270
+
271
+ daemon = HookDaemon(
272
+ sock_path=path,
273
+ queue_db_path=queue_db_path or _default_queue_db_path(),
274
+ )
275
+ daemon.start()
276
+ return daemon
@@ -51,7 +51,7 @@ _live_rerankers: set[weakref.ref] = set()
51
51
 
52
52
  logger = logging.getLogger(__name__)
53
53
 
54
- _IDLE_TIMEOUT_SECONDS = 1800 # 30 min — keep cross-encoder warm for active sessions.
54
+ _IDLE_TIMEOUT_SECONDS = 300 # V3.4.37: 5 min (was 30) — balance cold-start vs RAM.
55
55
  # V3.3.12: Configurable via SLM_RERANKER_IDLE_TIMEOUT env var.
56
56
  # V3.4.19: Bumped from 120 → 1800 in lock-step with the embedding worker.
57
57
  # Set ``SLM_RERANKER_IDLE_TIMEOUT=120`` + ``slm restart`` to revert.
@@ -192,6 +192,7 @@ class CrossEncoderReranker:
192
192
  "PYTORCH_ENABLE_MPS_FALLBACK": "1",
193
193
  "TOKENIZERS_PARALLELISM": "false",
194
194
  "TORCH_DEVICE": "cpu",
195
+ "ORT_DISABLE_COREML": "1",
195
196
  }
196
197
  from superlocalmemory.core.platform_utils import popen_platform_kwargs
197
198
  self._worker_proc = subprocess.Popen(