superlocalmemory 3.4.36 → 3.4.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -10,6 +10,48 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
10
10
 
11
11
  ---
12
12
 
13
+ ## [3.4.37] - 2026-04-26
14
+
15
+ **P0 RAM fix.** Total SLM footprint reduced from ~14 GB peak to ~2.3 GB peak
16
+ (84% reduction). Idle dropped from ~2.5 GB to ~1.0 GB. Users with 16 GB
17
+ laptops can now run SLM without uninstalling.
18
+
19
+ ### Fixed
20
+ - **CoreML EP allocation** — Added `ORT_DISABLE_COREML=1` to
21
+ `recall_worker.py`, `cli/commands.py` (warmup diagnose path), and the
22
+ Popen environment dicts in `core/embeddings.py` and
23
+ `retrieval/reranker.py`. Previously only `embedding_worker.py` and
24
+ `reranker_worker.py` set this. On ARM64 Mac, ONNX Runtime's CoreML
25
+ Execution Provider allocated 3-5 GB in each process that lacked this guard.
26
+ - **Duplicate MemoryEngine** — The QueueConsumer (recall_queue.db drain)
27
+ was routing through `WorkerPool` → `recall_worker` subprocess, which
28
+ loaded a SECOND full MemoryEngine inside the daemon. Now routes through
29
+ the daemon's in-process engine via the new `EngineRecallAdapter`.
30
+ Eliminates ~800 MB of duplication.
31
+ - **Eager warmup** — Removed `WorkerPool.shared().warmup()` from daemon
32
+ startup. The recall_worker subprocess no longer spawns at boot. It
33
+ remains available as a fallback for dashboard/chat routes.
34
+
35
+ ### Changed
36
+ - **RSS limits tightened:**
37
+ - `embedding_worker` self-kill: 4000 MB → 1800 MB
38
+ - `recall_worker` self-kill: 2500 MB → 1500 MB
39
+ - Daemon watchdog `MAX_WORKER_MB`: 4096 MB → 1800 MB
40
+ - `HealthMonitor.global_rss_budget_mb`: 4096 MB → 2500 MB
41
+ - **Watchdog interval:** daemon watchdog 60s → 15s; HealthMonitor
42
+ `check_interval_sec` 30s → 15s. Catches memory spikes faster.
43
+ - **Idle timeouts:**
44
+ - `SLM_EMBED_IDLE_TIMEOUT`: 1800s (30 min) → 300s (5 min)
45
+ - `SLM_RERANKER_IDLE_TIMEOUT`: 1800s → 300s
46
+ - Reduces idle RAM held by ML model subprocesses.
47
+
48
+ ### Added
49
+ - **`EngineRecallAdapter`** in `unified_daemon.py` — wraps the in-process
50
+ MemoryEngine to satisfy `RecallPoolProtocol` for the QueueConsumer.
51
+ Eliminates the recall_worker subprocess on the hot path.
52
+
53
+ ---
54
+
13
55
  ## [3.4.36] - 2026-04-25
14
56
 
15
57
  Persistent hook daemon: recall latency drops from ~2.2s to sub-second by
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "superlocalmemory",
3
- "version": "3.4.36",
3
+ "version": "3.4.37",
4
4
  "description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
5
5
  "keywords": [
6
6
  "ai-memory",
package/pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "superlocalmemory"
3
- version = "3.4.36"
3
+ version = "3.4.37"
4
4
  description = "Information-geometric agent memory with mathematical guarantees"
5
5
  readme = "README.md"
6
6
  license = {text = "AGPL-3.0-or-later"}
@@ -1,3 +1,3 @@
1
1
  """SuperLocalMemory — information-geometric agent memory."""
2
2
 
3
- __version__ = "3.4.36"
3
+ __version__ = "3.4.37"
@@ -1710,6 +1710,7 @@ def _warmup_diagnose() -> None:
1710
1710
  """Diagnostic helper when warmup fails."""
1711
1711
  print("\nDiagnosing...")
1712
1712
  print(f" Python executable: {sys.executable}")
1713
+ os.environ["ORT_DISABLE_COREML"] = "1"
1713
1714
  try:
1714
1715
  from sentence_transformers import SentenceTransformer
1715
1716
  print(" sentence-transformers: importable")
@@ -151,7 +151,7 @@ def _worker_main() -> None:
151
151
  _respond({"ok": False, "error": str(exc)})
152
152
 
153
153
  # V3.3.16: RSS watchdog — V3.4.24: cross-platform via platform_utils.
154
- _rss_limit = int(os.environ.get("SLM_EMBED_WORKER_RSS_LIMIT_MB", 4000))
154
+ _rss_limit = int(os.environ.get("SLM_EMBED_WORKER_RSS_LIMIT_MB", 1800))
155
155
  rss_mb = get_rss_mb()
156
156
  if rss_mb > 0 and rss_mb > _rss_limit:
157
157
  sys.exit(0)
@@ -140,14 +140,10 @@ def release_embedding_lock() -> None:
140
140
  _embedding_lock_fd = None
141
141
 
142
142
 
143
- _IDLE_TIMEOUT_SECONDS = 1800 # 30 minutes — keep model warm across bursty use.
144
- # V3.3.12: Configurable via SLM_EMBED_IDLE_TIMEOUT env var (seconds).
145
- # V3.4.19: Bumped from 120 → 1800 to eliminate the 30-60s cold-start pain
146
- # when the embedding worker was killed too aggressively. Safety: the
147
- # per-embed RSS self-check (SLM_EMBED_WORKER_RSS_LIMIT_MB, 4GB default) and
148
- # the daemon memory watchdog (unified_daemon.py, 4GB/60s) still cap any
149
- # runaway. To restore the old aggressive policy without redeploying, set
150
- # ``SLM_EMBED_IDLE_TIMEOUT=120`` and ``slm restart``.
143
+ _IDLE_TIMEOUT_SECONDS = 300 # 5 minutes — balance cold-start vs RAM.
144
+ # V3.4.37: Reduced from 1800 → 300. Holding 1.1 GB for 30 min idle
145
+ # wastes RAM on laptops. 5 min covers bursty session_init+recall
146
+ # patterns while freeing memory between sessions.
151
147
  _IDLE_TIMEOUT_SECONDS = int(os.environ.get("SLM_EMBED_IDLE_TIMEOUT", _IDLE_TIMEOUT_SECONDS))
152
148
  # V3.3.21: Configurable response timeout — 180s default, but batch ingestion
153
149
  # (2-turn chunks across 10 conversations) needs 600s+ to survive cold-start
@@ -476,6 +472,7 @@ class EmbeddingService:
476
472
  "PYTORCH_ENABLE_MPS_FALLBACK": "1",
477
473
  "TOKENIZERS_PARALLELISM": "false",
478
474
  "TORCH_DEVICE": "cpu",
475
+ "ORT_DISABLE_COREML": "1",
479
476
  }
480
477
  from superlocalmemory.core.platform_utils import popen_platform_kwargs
481
478
  self._worker_proc = subprocess.Popen(
@@ -133,9 +133,9 @@ class HealthMonitor:
133
133
 
134
134
  def __init__(
135
135
  self,
136
- global_rss_budget_mb: int = 4096,
136
+ global_rss_budget_mb: int = 2500,
137
137
  heartbeat_timeout_sec: int = 60,
138
- check_interval_sec: int = 30,
138
+ check_interval_sec: int = 15,
139
139
  enable_structured_logging: bool = True,
140
140
  ):
141
141
  self._budget_mb = global_rss_budget_mb
@@ -28,6 +28,8 @@ os.environ["PYTORCH_MPS_MEM_LIMIT"] = "0"
28
28
  os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
29
29
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
30
30
  os.environ["TORCH_DEVICE"] = "cpu"
31
+ # V3.4.37: Disable CoreML EP — uses 3-5GB on ARM64 Mac.
32
+ os.environ["ORT_DISABLE_COREML"] = "1"
31
33
 
32
34
  # SIGTERM bridge: Docker/systemd send SIGTERM to stop processes.
33
35
  # Without this, the worker ignores SIGTERM and becomes a zombie.
@@ -324,7 +326,7 @@ def _worker_main() -> None:
324
326
 
325
327
  # V3.3.16: RSS watchdog — V3.4.24: cross-platform via platform_utils.
326
328
  rss_mb = get_rss_mb()
327
- if rss_mb > 0 and rss_mb > 2500:
329
+ if rss_mb > 0 and rss_mb > 1500:
328
330
  sys.exit(0)
329
331
 
330
332
 
@@ -51,7 +51,7 @@ _live_rerankers: set[weakref.ref] = set()
51
51
 
52
52
  logger = logging.getLogger(__name__)
53
53
 
54
- _IDLE_TIMEOUT_SECONDS = 1800 # 30 min — keep cross-encoder warm for active sessions.
54
+ _IDLE_TIMEOUT_SECONDS = 300 # V3.4.37: 5 min (was 30) — balance cold-start vs RAM.
55
55
  # V3.3.12: Configurable via SLM_RERANKER_IDLE_TIMEOUT env var.
56
56
  # V3.4.19: Bumped from 120 → 1800 in lock-step with the embedding worker.
57
57
  # Set ``SLM_RERANKER_IDLE_TIMEOUT=120`` + ``slm restart`` to revert.
@@ -192,6 +192,7 @@ class CrossEncoderReranker:
192
192
  "PYTORCH_ENABLE_MPS_FALLBACK": "1",
193
193
  "TOKENIZERS_PARALLELISM": "false",
194
194
  "TORCH_DEVICE": "cpu",
195
+ "ORT_DISABLE_COREML": "1",
195
196
  }
196
197
  from superlocalmemory.core.platform_utils import popen_platform_kwargs
197
198
  self._worker_proc = subprocess.Popen(
@@ -66,6 +66,75 @@ class ObserveRequest(BaseModel):
66
66
  content: str
67
67
 
68
68
 
69
+ # ---------------------------------------------------------------------------
70
+ # V3.4.37: Engine recall adapter — routes QueueConsumer through the daemon's
71
+ # in-process MemoryEngine instead of spawning a recall_worker subprocess.
72
+ # Saves ~800 MB by eliminating the duplicate engine.
73
+ # ---------------------------------------------------------------------------
74
+
75
+ class EngineRecallAdapter:
76
+ """Adapts MemoryEngine.recall() to RecallPoolProtocol for QueueConsumer.
77
+
78
+ The daemon already has a full MemoryEngine in-process. The QueueConsumer
79
+ previously routed through WorkerPool → recall_worker subprocess, which
80
+ loaded a SECOND MemoryEngine. This adapter eliminates that duplication.
81
+ """
82
+
83
+ def __init__(self, engine) -> None:
84
+ self._engine = engine
85
+
86
+ def recall(self, query: str, limit: int = 10, session_id: str = "") -> dict:
87
+ response = self._engine.recall(
88
+ query, limit=limit, session_id=session_id or None,
89
+ )
90
+ memory_ids = list({
91
+ r.fact.memory_id for r in response.results[:limit]
92
+ if r.fact.memory_id
93
+ })
94
+ memory_map = (
95
+ self._engine._db.get_memory_content_batch(memory_ids)
96
+ if memory_ids else {}
97
+ )
98
+ results = []
99
+ for r in response.results[:limit]:
100
+ fact_type = getattr(r.fact, "fact_type", None)
101
+ lifecycle = getattr(r.fact, "lifecycle", None)
102
+ results.append({
103
+ "fact_id": r.fact.fact_id,
104
+ "memory_id": r.fact.memory_id,
105
+ "content": r.fact.content[:300],
106
+ "source_content": memory_map.get(r.fact.memory_id, ""),
107
+ "score": round(r.score, 4),
108
+ "confidence": round(r.confidence, 4),
109
+ "trust_score": round(r.trust_score, 4),
110
+ "channel_scores": {
111
+ k: round(v, 4)
112
+ for k, v in (r.channel_scores or {}).items()
113
+ },
114
+ "fact_type": fact_type.value
115
+ if fact_type and hasattr(fact_type, "value") else "",
116
+ "lifecycle": lifecycle.value
117
+ if lifecycle and hasattr(lifecycle, "value") else "",
118
+ "access_count": getattr(r.fact, "access_count", 0),
119
+ "evidence_chain": list(
120
+ getattr(r, "evidence_chain", []) or []
121
+ ),
122
+ })
123
+ return {
124
+ "ok": True,
125
+ "query": query,
126
+ "query_type": response.query_type,
127
+ "result_count": len(results),
128
+ "retrieval_time_ms": round(response.retrieval_time_ms, 1),
129
+ "channel_weights": {
130
+ k: round(v, 3)
131
+ for k, v in (response.channel_weights or {}).items()
132
+ },
133
+ "total_candidates": getattr(response, "total_candidates", 0),
134
+ "results": results,
135
+ }
136
+
137
+
69
138
  # ---------------------------------------------------------------------------
70
139
  # v3.4.32: Recall-priority gate for the pending materializer.
71
140
  # All /remember writes go to pending.db and return fast; a background
@@ -397,9 +466,10 @@ async def lifespan(application: FastAPI):
397
466
  # Set up observe buffer
398
467
  _observe_buffer.set_engine(engine)
399
468
 
400
- # Pre-warm workers (background)
401
- from superlocalmemory.core.worker_pool import WorkerPool
402
- WorkerPool.shared().warmup()
469
+ # V3.4.37: Removed WorkerPool.warmup() — the recall_worker subprocess
470
+ # duplicated the daemon's MemoryEngine (800+ MB). QueueConsumer now
471
+ # uses the daemon's engine directly via EngineRecallAdapter.
472
+ # WorkerPool is still available as fallback for dashboard/chat routes.
403
473
 
404
474
  # Force reranker warmup
405
475
  retrieval_eng = getattr(engine, '_retrieval_engine', None)
@@ -422,8 +492,9 @@ async def lifespan(application: FastAPI):
422
492
  logger.warning("Embedding warmup failed: %s", exc)
423
493
  threading.Thread(target=_warmup_embedder, daemon=True, name="embed-warmup").start()
424
494
 
425
- # v3.4.26: Start QueueConsumer — drains recall_queue.db via pool.recall().
426
- # Must start AFTER WorkerPool.warmup() so the worker is ready.
495
+ # v3.4.37: QueueConsumer uses daemon's engine directly via adapter.
496
+ # Previously routed through WorkerPool → recall_worker subprocess,
497
+ # which loaded a duplicate MemoryEngine (~800 MB waste).
427
498
  try:
428
499
  from pathlib import Path as _QP
429
500
  from superlocalmemory.core.queue_consumer import QueueConsumer
@@ -432,7 +503,7 @@ async def lifespan(application: FastAPI):
432
503
  _recall_queue = RecallQueue(_queue_db)
433
504
  _queue_consumer = QueueConsumer(
434
505
  queue=_recall_queue,
435
- pool=WorkerPool.shared(),
506
+ pool=EngineRecallAdapter(engine),
436
507
  )
437
508
  _queue_consumer.start()
438
509
  application.state.queue_consumer = _queue_consumer
@@ -466,9 +537,9 @@ async def lifespan(application: FastAPI):
466
537
  from superlocalmemory.core.health_monitor import HealthMonitor
467
538
  health_config = getattr(config, 'health', None)
468
539
  monitor = HealthMonitor(
469
- global_rss_budget_mb=getattr(health_config, 'global_rss_budget_mb', 4096) if health_config else 4096,
540
+ global_rss_budget_mb=getattr(health_config, 'global_rss_budget_mb', 2500) if health_config else 2500,
470
541
  heartbeat_timeout_sec=getattr(health_config, 'heartbeat_timeout_sec', 60) if health_config else 60,
471
- check_interval_sec=getattr(health_config, 'health_check_interval_sec', 30) if health_config else 30,
542
+ check_interval_sec=getattr(health_config, 'health_check_interval_sec', 15) if health_config else 15,
472
543
  enable_structured_logging=getattr(health_config, 'enable_structured_logging', True) if health_config else True,
473
544
  )
474
545
  monitor.start()
@@ -1259,11 +1330,11 @@ def _start_memory_watchdog() -> None:
1259
1330
  """
1260
1331
  import threading
1261
1332
 
1262
- MAX_WORKER_MB = 4096 # 4GB per worker — ONNX full model is 1.6GB + overhead
1333
+ MAX_WORKER_MB = 1800 # V3.4.37: 1.8GB — ONNX nomic-embed is ~1.7GB loaded
1263
1334
 
1264
1335
  def watchdog_loop():
1265
1336
  while True:
1266
- time.sleep(60)
1337
+ time.sleep(15) # V3.4.37: 15s (was 60s) — catch spikes faster
1267
1338
  try:
1268
1339
  import psutil
1269
1340
  parent = psutil.Process(os.getpid())