superlocalmemory 3.4.36 → 3.4.38
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +100 -0
- package/package.json +1 -1
- package/pyproject.toml +1 -1
- package/src/superlocalmemory/__init__.py +1 -1
- package/src/superlocalmemory/cli/commands.py +1 -0
- package/src/superlocalmemory/cli/pending_store.py +13 -4
- package/src/superlocalmemory/core/embedding_worker.py +1 -1
- package/src/superlocalmemory/core/embeddings.py +5 -8
- package/src/superlocalmemory/core/health_monitor.py +2 -2
- package/src/superlocalmemory/core/recall_worker.py +3 -1
- package/src/superlocalmemory/core/store_pipeline.py +9 -0
- package/src/superlocalmemory/encoding/scene_builder.py +15 -1
- package/src/superlocalmemory/retrieval/reranker.py +2 -1
- package/src/superlocalmemory/server/unified_daemon.py +107 -11
- package/src/superlocalmemory.egg-info/PKG-INFO +663 -0
- package/src/superlocalmemory.egg-info/SOURCES.txt +451 -0
- package/src/superlocalmemory.egg-info/dependency_links.txt +1 -0
- package/src/superlocalmemory.egg-info/entry_points.txt +2 -0
- package/src/superlocalmemory.egg-info/requires.txt +59 -0
- package/src/superlocalmemory.egg-info/top_level.txt +1 -0
package/CHANGELOG.md
CHANGED
|
@@ -10,6 +10,106 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
10
10
|
|
|
11
11
|
---
|
|
12
12
|
|
|
13
|
+
## [3.4.38] - 2026-04-26
|
|
14
|
+
|
|
15
|
+
**P0 silent data loss fix.** The async `/remember` pipeline was broken since
|
|
16
|
+
v3.4.32 — memories were being marked "queued" and acknowledged but never
|
|
17
|
+
actually persisting to memory.db during runtime. Only daemon-restart drained
|
|
18
|
+
the pending queue (limit 20 per restart). 18 memories were marked as failed
|
|
19
|
+
after a NoneType iterable crash between April 15-26, 2026; all are recoverable
|
|
20
|
+
because the content was preserved in pending.db.
|
|
21
|
+
|
|
22
|
+
### Fixed
|
|
23
|
+
- **Materializer `_engine` NameError** (`unified_daemon.py`). The background
|
|
24
|
+
pending materializer thread referenced a module-level `_engine` global
|
|
25
|
+
that was never declared. Result: every iteration threw `NameError: name
|
|
26
|
+
'_engine' is not defined`, the exception was caught and logged as
|
|
27
|
+
"materializer loop error", and the thread slept 5s and retried forever
|
|
28
|
+
without ever processing pending memories. Bug present since v3.4.32.
|
|
29
|
+
Fixed by declaring `_engine = None` at module level and assigning
|
|
30
|
+
`_engine = engine` in the FastAPI lifespan after `engine.initialize()`.
|
|
31
|
+
- **scene_builder NoneType crash** (`encoding/scene_builder.py:assign_to_scene`).
|
|
32
|
+
When the embedding worker was unavailable (cold-start timeout, crash),
|
|
33
|
+
`embedder.embed()` returned None. The code checked `theme_emb is None`
|
|
34
|
+
but never checked `fact_emb is None`, so `_cosine(None, theme_emb)`
|
|
35
|
+
called `zip(None, theme_emb)` → `'NoneType' object is not iterable`,
|
|
36
|
+
propagating up through `engine.store()` → mark_failed → permanent loss.
|
|
37
|
+
Fixed by guarding `fact_emb is None` (skip matching against existing scenes
|
|
38
|
+
and create a new scene instead) and adding a defensive `None` check to `_cosine()` itself.
|
|
39
|
+
- **Retry-aware mark_failed** (`cli/pending_store.py`). Previously, ANY
|
|
40
|
+
exception during materialization permanently marked the memory as
|
|
41
|
+
failed — even transient errors like embedding worker timeout. Now uses
|
|
42
|
+
the existing `retry_count` column: keeps status as `pending` until 3
|
|
43
|
+
retries, only marks `failed` after all retries are exhausted.
|
|
44
|
+
|
|
45
|
+
### Added
|
|
46
|
+
- **Diagnostic logging in materializer** — "Materializer: waiting for
|
|
47
|
+
engine to init...", "engine acquired, starting drain loop", "processing
|
|
48
|
+
N pending memories" — so operators can verify the materializer is alive
|
|
49
|
+
without grepping for absence of error messages.
|
|
50
|
+
- **`tests/test_integration/test_async_remember_e2e.py`** — full
|
|
51
|
+
production pipeline test: POST `/remember` (async, default mode) →
|
|
52
|
+
wait up to 60s → verify content in `memory.db` → recall returns it.
|
|
53
|
+
This is the test that was missing for 8+ months. The 4,501 existing
|
|
54
|
+
test functions test components in isolation (mocking `store_pending`)
|
|
55
|
+
and never exercise the full async flow that real users hit.
|
|
56
|
+
|
|
57
|
+
### Recovery
|
|
58
|
+
On install, if you have existing failed records in `pending.db`, they will
|
|
59
|
+
be auto-retried on the next daemon restart by `engine._process_pending_memories()`.
|
|
60
|
+
To manually recover, run:
|
|
61
|
+
```python
|
|
62
|
+
import os
import sqlite3
|
|
63
|
+
db = sqlite3.connect(os.path.expanduser('~/.superlocalmemory/pending.db'))
|
|
64
|
+
db.execute("UPDATE pending_memories SET status='pending', retry_count=0, error=NULL WHERE status='failed'")
|
|
65
|
+
db.commit()
|
|
66
|
+
```
|
|
67
|
+
Then `slm restart`.
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## [3.4.37] - 2026-04-26
|
|
72
|
+
|
|
73
|
+
**P0 RAM fix.** Total SLM footprint reduced from ~14 GB peak to ~2.3 GB peak
|
|
74
|
+
(84% reduction). Idle dropped from ~2.5 GB to ~1.0 GB. Users with 16 GB
|
|
75
|
+
laptops can now run SLM without uninstalling.
|
|
76
|
+
|
|
77
|
+
### Fixed
|
|
78
|
+
- **CoreML EP allocation** — Added `ORT_DISABLE_COREML=1` to
|
|
79
|
+
`recall_worker.py`, `cli/commands.py` (warmup diagnose path), and the
|
|
80
|
+
Popen environment dicts in `core/embeddings.py` and
|
|
81
|
+
`retrieval/reranker.py`. Previously only `embedding_worker.py` and
|
|
82
|
+
`reranker_worker.py` set this. On ARM64 Mac, ONNX Runtime's CoreML
|
|
83
|
+
Execution Provider allocated 3-5 GB per missing guard.
|
|
84
|
+
- **Duplicate MemoryEngine** — The QueueConsumer (recall_queue.db drain)
|
|
85
|
+
was routing through `WorkerPool` → `recall_worker` subprocess, which
|
|
86
|
+
loaded a SECOND full MemoryEngine inside the daemon. Now routes through
|
|
87
|
+
the daemon's in-process engine via the new `EngineRecallAdapter`.
|
|
88
|
+
Eliminates ~800 MB of duplication.
|
|
89
|
+
- **Eager warmup** — Removed `WorkerPool.shared().warmup()` from daemon
|
|
90
|
+
startup. The recall_worker subprocess no longer spawns at boot. It
|
|
91
|
+
remains available as a fallback for dashboard/chat routes.
|
|
92
|
+
|
|
93
|
+
### Changed
|
|
94
|
+
- **RSS limits tightened:**
|
|
95
|
+
- `embedding_worker` self-kill: 4000 MB → 1800 MB
|
|
96
|
+
- `recall_worker` self-kill: 2500 MB → 1500 MB
|
|
97
|
+
- Daemon watchdog `MAX_WORKER_MB`: 4096 MB → 1800 MB
|
|
98
|
+
- `HealthMonitor.global_rss_budget_mb`: 4096 MB → 2500 MB
|
|
99
|
+
- **Watchdog interval:** 60s → 15s in both daemon watchdog and
|
|
100
|
+
HealthMonitor `check_interval_sec`. Catches memory spikes faster.
|
|
101
|
+
- **Idle timeouts:**
|
|
102
|
+
- `SLM_EMBED_IDLE_TIMEOUT`: 1800s (30 min) → 300s (5 min)
|
|
103
|
+
- `SLM_RERANKER_IDLE_TIMEOUT`: 1800s → 300s
|
|
104
|
+
- Reduces idle RAM held by ML model subprocesses.
|
|
105
|
+
|
|
106
|
+
### Added
|
|
107
|
+
- **`EngineRecallAdapter`** in `unified_daemon.py` — wraps the in-process
|
|
108
|
+
MemoryEngine to satisfy `RecallPoolProtocol` for the QueueConsumer.
|
|
109
|
+
Eliminates the recall_worker subprocess on the hot path.
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
13
113
|
## [3.4.36] - 2026-04-25
|
|
14
114
|
|
|
15
115
|
Persistent hook daemon: recall latency drops from ~2.2s to sub-second by
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "superlocalmemory",
|
|
3
|
-
"version": "3.4.36",
|
|
3
|
+
"version": "3.4.38",
|
|
4
4
|
"description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"ai-memory",
|
package/pyproject.toml
CHANGED
|
@@ -1710,6 +1710,7 @@ def _warmup_diagnose() -> None:
|
|
|
1710
1710
|
"""Diagnostic helper when warmup fails."""
|
|
1711
1711
|
print("\nDiagnosing...")
|
|
1712
1712
|
print(f" Python executable: {sys.executable}")
|
|
1713
|
+
os.environ["ORT_DISABLE_COREML"] = "1"
|
|
1713
1714
|
try:
|
|
1714
1715
|
from sentence_transformers import SentenceTransformer
|
|
1715
1716
|
print(" sentence-transformers: importable")
|
|
@@ -122,13 +122,22 @@ def mark_done(row_id: int, base_dir: Path | None = None) -> None:
|
|
|
122
122
|
|
|
123
123
|
|
|
124
124
|
def mark_failed(row_id: int, error: str, base_dir: Path | None = None) -> None:
|
|
125
|
-
"""Mark a pending memory as failed with error message.
|
|
125
|
+
"""Mark a pending memory as failed with error message.
|
|
126
|
+
|
|
127
|
+
v3.4.38: Now retry-aware. If retry_count < _MAX_RETRIES, keeps status as
|
|
128
|
+
'pending' so the materializer will retry on next iteration. Only marks
|
|
129
|
+
permanently failed after _MAX_RETRIES (3) attempts. The previous behavior
|
|
130
|
+
permanently lost 18 memories between April 15-26, 2026 to transient errors.
|
|
131
|
+
"""
|
|
126
132
|
conn = _get_db(base_dir)
|
|
127
133
|
try:
|
|
134
|
+
# Increment retry count and conditionally update status
|
|
128
135
|
conn.execute(
|
|
129
|
-
"UPDATE pending_memories SET
|
|
130
|
-
"retry_count = retry_count + 1
|
|
131
|
-
|
|
136
|
+
"UPDATE pending_memories SET error = ?, "
|
|
137
|
+
"retry_count = retry_count + 1, "
|
|
138
|
+
"status = CASE WHEN retry_count + 1 >= ? THEN 'failed' ELSE 'pending' END "
|
|
139
|
+
"WHERE id = ?",
|
|
140
|
+
(error, _MAX_RETRIES, row_id),
|
|
132
141
|
)
|
|
133
142
|
conn.commit()
|
|
134
143
|
finally:
|
|
@@ -151,7 +151,7 @@ def _worker_main() -> None:
|
|
|
151
151
|
_respond({"ok": False, "error": str(exc)})
|
|
152
152
|
|
|
153
153
|
# V3.3.16: RSS watchdog — V3.4.24: cross-platform via platform_utils.
|
|
154
|
-
_rss_limit = int(os.environ.get("SLM_EMBED_WORKER_RSS_LIMIT_MB",
|
|
154
|
+
_rss_limit = int(os.environ.get("SLM_EMBED_WORKER_RSS_LIMIT_MB", 1800))
|
|
155
155
|
rss_mb = get_rss_mb()
|
|
156
156
|
if rss_mb > 0 and rss_mb > _rss_limit:
|
|
157
157
|
sys.exit(0)
|
|
@@ -140,14 +140,10 @@ def release_embedding_lock() -> None:
|
|
|
140
140
|
_embedding_lock_fd = None
|
|
141
141
|
|
|
142
142
|
|
|
143
|
-
_IDLE_TIMEOUT_SECONDS =
|
|
144
|
-
# V3.
|
|
145
|
-
#
|
|
146
|
-
#
|
|
147
|
-
# per-embed RSS self-check (SLM_EMBED_WORKER_RSS_LIMIT_MB, 4GB default) and
|
|
148
|
-
# the daemon memory watchdog (unified_daemon.py, 4GB/60s) still cap any
|
|
149
|
-
# runaway. To restore the old aggressive policy without redeploying, set
|
|
150
|
-
# ``SLM_EMBED_IDLE_TIMEOUT=120`` and ``slm restart``.
|
|
143
|
+
_IDLE_TIMEOUT_SECONDS = 300 # 5 minutes — balance cold-start vs RAM.
|
|
144
|
+
# V3.4.37: Reduced from 1800 → 300. Holding 1.1 GB for 30 min idle
|
|
145
|
+
# wastes RAM on laptops. 5 min covers bursty session_init+recall
|
|
146
|
+
# patterns while freeing memory between sessions.
|
|
151
147
|
_IDLE_TIMEOUT_SECONDS = int(os.environ.get("SLM_EMBED_IDLE_TIMEOUT", _IDLE_TIMEOUT_SECONDS))
|
|
152
148
|
# V3.3.21: Configurable response timeout — 180s default, but batch ingestion
|
|
153
149
|
# (2-turn chunks across 10 conversations) needs 600s+ to survive cold-start
|
|
@@ -476,6 +472,7 @@ class EmbeddingService:
|
|
|
476
472
|
"PYTORCH_ENABLE_MPS_FALLBACK": "1",
|
|
477
473
|
"TOKENIZERS_PARALLELISM": "false",
|
|
478
474
|
"TORCH_DEVICE": "cpu",
|
|
475
|
+
"ORT_DISABLE_COREML": "1",
|
|
479
476
|
}
|
|
480
477
|
from superlocalmemory.core.platform_utils import popen_platform_kwargs
|
|
481
478
|
self._worker_proc = subprocess.Popen(
|
|
@@ -133,9 +133,9 @@ class HealthMonitor:
|
|
|
133
133
|
|
|
134
134
|
def __init__(
|
|
135
135
|
self,
|
|
136
|
-
global_rss_budget_mb: int =
|
|
136
|
+
global_rss_budget_mb: int = 2500,
|
|
137
137
|
heartbeat_timeout_sec: int = 60,
|
|
138
|
-
check_interval_sec: int =
|
|
138
|
+
check_interval_sec: int = 15,
|
|
139
139
|
enable_structured_logging: bool = True,
|
|
140
140
|
):
|
|
141
141
|
self._budget_mb = global_rss_budget_mb
|
|
@@ -28,6 +28,8 @@ os.environ["PYTORCH_MPS_MEM_LIMIT"] = "0"
|
|
|
28
28
|
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
|
|
29
29
|
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
30
30
|
os.environ["TORCH_DEVICE"] = "cpu"
|
|
31
|
+
# V3.4.37: Disable CoreML EP — uses 3-5GB on ARM64 Mac.
|
|
32
|
+
os.environ["ORT_DISABLE_COREML"] = "1"
|
|
31
33
|
|
|
32
34
|
# SIGTERM bridge: Docker/systemd send SIGTERM to stop processes.
|
|
33
35
|
# Without this, the worker ignores SIGTERM and becomes a zombie.
|
|
@@ -324,7 +326,7 @@ def _worker_main() -> None:
|
|
|
324
326
|
|
|
325
327
|
# V3.3.16: RSS watchdog — V3.4.24: cross-platform via platform_utils.
|
|
326
328
|
rss_mb = get_rss_mb()
|
|
327
|
-
if rss_mb > 0 and rss_mb >
|
|
329
|
+
if rss_mb > 0 and rss_mb > 1500:
|
|
328
330
|
sys.exit(0)
|
|
329
331
|
|
|
330
332
|
|
|
@@ -167,6 +167,15 @@ def run_store(
|
|
|
167
167
|
session_date=parsed_date, speaker_a=speaker,
|
|
168
168
|
)
|
|
169
169
|
|
|
170
|
+
# v3.4.38: Defensive None guard. extract_facts() returns None on transient
|
|
171
|
+
# failures (embedding worker timeout, LLM call fail). Without this guard,
|
|
172
|
+
# line 201's `{f.content for f in facts}` raises 'NoneType' object is not
|
|
173
|
+
# iterable, causing the caller to mark_failed permanently — even though
|
|
174
|
+
# the content is still recoverable. 18 memories were lost to this between
|
|
175
|
+
# April 15-26, 2026.
|
|
176
|
+
if facts is None:
|
|
177
|
+
facts = []
|
|
178
|
+
|
|
170
179
|
# V3.3.11: Also store raw content as a verbatim fact to preserve details
|
|
171
180
|
# that fact extraction may abstract away (dates, names, specifics).
|
|
172
181
|
# This ensures BM25 and semantic search can always find the original text.
|
|
@@ -56,6 +56,15 @@ class SceneBuilder:
|
|
|
56
56
|
# Always compute fact embedding first — needed for comparisons
|
|
57
57
|
fact_emb = self._embedder.embed(new_fact.content)
|
|
58
58
|
|
|
59
|
+
# v3.4.38: Defensive None guard. embedder.embed() returns None when
|
|
60
|
+
# the embedding worker is unavailable (timeout, crash). Without this
|
|
61
|
+
# guard, _cosine(None, theme_emb) → zip(None, ...) → 'NoneType'
|
|
62
|
+
# object is not iterable, propagating up to engine.store() and
|
|
63
|
+
# causing the entire memory to be lost. Better to skip scene
|
|
64
|
+
# assignment than lose the memory.
|
|
65
|
+
if fact_emb is None:
|
|
66
|
+
return self._create_scene(new_fact, profile_id)
|
|
67
|
+
|
|
59
68
|
scenes = self._get_scenes(profile_id)
|
|
60
69
|
if not scenes:
|
|
61
70
|
return self._create_scene(new_fact, profile_id)
|
|
@@ -189,7 +198,12 @@ class SceneBuilder:
|
|
|
189
198
|
)
|
|
190
199
|
|
|
191
200
|
|
|
192
|
-
def _cosine(a: list[float], b: list[float]) -> float:
|
|
201
|
+
def _cosine(a: list[float] | None, b: list[float] | None) -> float:
|
|
202
|
+
# v3.4.38: Defensive None guard — embedder can return None on worker
|
|
203
|
+
# unavailability. Returning 0.0 is correct: zero similarity means no
|
|
204
|
+
# match, which falls back to creating a new scene.
|
|
205
|
+
if a is None or b is None:
|
|
206
|
+
return 0.0
|
|
193
207
|
dot = sum(x * y for x, y in zip(a, b))
|
|
194
208
|
na = sum(x * x for x in a) ** 0.5
|
|
195
209
|
nb = sum(x * x for x in b) ** 0.5
|
|
@@ -51,7 +51,7 @@ _live_rerankers: set[weakref.ref] = set()
|
|
|
51
51
|
|
|
52
52
|
logger = logging.getLogger(__name__)
|
|
53
53
|
|
|
54
|
-
_IDLE_TIMEOUT_SECONDS =
|
|
54
|
+
_IDLE_TIMEOUT_SECONDS = 300 # V3.4.37: 5 min (was 30) — balance cold-start vs RAM.
|
|
55
55
|
# V3.3.12: Configurable via SLM_RERANKER_IDLE_TIMEOUT env var.
|
|
56
56
|
# V3.4.19: Bumped from 120 → 1800 in lock-step with the embedding worker.
|
|
57
57
|
# Set ``SLM_RERANKER_IDLE_TIMEOUT=120`` + ``slm restart`` to revert.
|
|
@@ -192,6 +192,7 @@ class CrossEncoderReranker:
|
|
|
192
192
|
"PYTORCH_ENABLE_MPS_FALLBACK": "1",
|
|
193
193
|
"TOKENIZERS_PARALLELISM": "false",
|
|
194
194
|
"TORCH_DEVICE": "cpu",
|
|
195
|
+
"ORT_DISABLE_COREML": "1",
|
|
195
196
|
}
|
|
196
197
|
from superlocalmemory.core.platform_utils import popen_platform_kwargs
|
|
197
198
|
self._worker_proc = subprocess.Popen(
|
|
@@ -66,6 +66,75 @@ class ObserveRequest(BaseModel):
|
|
|
66
66
|
content: str
|
|
67
67
|
|
|
68
68
|
|
|
69
|
+
# ---------------------------------------------------------------------------
|
|
70
|
+
# V3.4.37: Engine recall adapter — routes QueueConsumer through the daemon's
|
|
71
|
+
# in-process MemoryEngine instead of spawning a recall_worker subprocess.
|
|
72
|
+
# Saves ~800 MB by eliminating the duplicate engine.
|
|
73
|
+
# ---------------------------------------------------------------------------
|
|
74
|
+
|
|
75
|
+
class EngineRecallAdapter:
    """Adapt ``MemoryEngine.recall()`` to ``RecallPoolProtocol`` for QueueConsumer.

    The daemon already has a full MemoryEngine in-process. The QueueConsumer
    previously routed through WorkerPool -> recall_worker subprocess, which
    loaded a SECOND MemoryEngine. This adapter eliminates that duplication by
    calling the in-process engine and flattening its response into plain
    dicts and lists.
    """

    # Upper bound on the characters of fact content echoed back per result.
    _CONTENT_PREVIEW_CHARS = 300

    def __init__(self, engine) -> None:
        """Store the engine reference; no other work is done here."""
        self._engine = engine

    @staticmethod
    def _enum_value(member):
        """Return ``member.value`` for enum-like objects, else ``""``."""
        if member and hasattr(member, "value"):
            return member.value
        return ""

    def _serialize_result(self, item, memory_map: dict) -> dict:
        """Flatten one recall result into a dict of plain values."""
        fact = item.fact
        return {
            "fact_id": fact.fact_id,
            "memory_id": fact.memory_id,
            # A fact without content must not abort the whole recall:
            # slicing None would raise TypeError.
            "content": (fact.content or "")[:self._CONTENT_PREVIEW_CHARS],
            "source_content": memory_map.get(fact.memory_id, ""),
            "score": round(item.score, 4),
            "confidence": round(item.confidence, 4),
            "trust_score": round(item.trust_score, 4),
            "channel_scores": {
                name: round(value, 4)
                for name, value in (item.channel_scores or {}).items()
            },
            "fact_type": self._enum_value(getattr(fact, "fact_type", None)),
            "lifecycle": self._enum_value(getattr(fact, "lifecycle", None)),
            "access_count": getattr(fact, "access_count", 0),
            "evidence_chain": list(getattr(item, "evidence_chain", []) or []),
        }

    def recall(self, query: str, limit: int = 10, session_id: str = "") -> dict:
        """Run a recall on the in-process engine and return a plain dict.

        Args:
            query: Text to recall against.
            limit: Maximum number of results to return; also forwarded to
                the engine.
            session_id: Optional session identifier. An empty string is
                forwarded to the engine as ``None``.

        Returns:
            A dict with ``ok``, ``query``, ``query_type``, ``result_count``,
            ``retrieval_time_ms``, ``channel_weights``, ``total_candidates``
            and ``results`` (one flattened dict per returned fact).

        Exceptions raised by the engine or its database propagate unchanged.
        """
        response = self._engine.recall(
            query, limit=limit, session_id=session_id or None,
        )
        # Slice once and reuse; tolerate a response with no result list.
        top = list((response.results or [])[:limit])

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # batch lookup receives a deterministic id list.
        memory_ids = list(dict.fromkeys(
            item.fact.memory_id for item in top if item.fact.memory_id
        ))
        memory_map = (
            self._engine._db.get_memory_content_batch(memory_ids)
            if memory_ids else {}
        )

        results = [self._serialize_result(item, memory_map) for item in top]
        return {
            "ok": True,
            "query": query,
            "query_type": response.query_type,
            "result_count": len(results),
            "retrieval_time_ms": round(response.retrieval_time_ms, 1),
            "channel_weights": {
                name: round(weight, 3)
                for name, weight in (response.channel_weights or {}).items()
            },
            "total_candidates": getattr(response, "total_candidates", 0),
            "results": results,
        }
|
|
136
|
+
|
|
137
|
+
|
|
69
138
|
# ---------------------------------------------------------------------------
|
|
70
139
|
# v3.4.32: Recall-priority gate for the pending materializer.
|
|
71
140
|
# All /remember writes go to pending.db and return fast; a background
|
|
@@ -79,6 +148,13 @@ from superlocalmemory.core.recall_gate import (
|
|
|
79
148
|
in_flight as _recalls_in_flight,
|
|
80
149
|
)
|
|
81
150
|
|
|
151
|
+
# v3.4.38: Module-level engine reference for the pending materializer.
|
|
152
|
+
# Set by the FastAPI lifespan after engine.initialize(). Was missing before,
|
|
153
|
+
# causing "name '_engine' is not defined" errors that blocked materialization
|
|
154
|
+
# of pending memories — they accumulated forever, only being processed at
|
|
155
|
+
# daemon startup via engine._process_pending_memories().
|
|
156
|
+
_engine = None
|
|
157
|
+
|
|
82
158
|
|
|
83
159
|
# ---------------------------------------------------------------------------
|
|
84
160
|
# Observation debounce buffer (migrated from daemon.py)
|
|
@@ -351,6 +427,9 @@ async def lifespan(application: FastAPI):
|
|
|
351
427
|
|
|
352
428
|
application.state.engine = engine
|
|
353
429
|
application.state.config = config
|
|
430
|
+
# v3.4.38: Wire module-level _engine for the pending materializer.
|
|
431
|
+
global _engine
|
|
432
|
+
_engine = engine
|
|
354
433
|
logger.info("Unified daemon: MemoryEngine initialized (mode=%s)", config.mode.value)
|
|
355
434
|
|
|
356
435
|
# LLD-07 §4 — deferred migrations (e.g. M006 reward column) need to
|
|
@@ -397,9 +476,10 @@ async def lifespan(application: FastAPI):
|
|
|
397
476
|
# Set up observe buffer
|
|
398
477
|
_observe_buffer.set_engine(engine)
|
|
399
478
|
|
|
400
|
-
#
|
|
401
|
-
|
|
402
|
-
|
|
479
|
+
# V3.4.37: Removed WorkerPool.warmup() — the recall_worker subprocess
|
|
480
|
+
# duplicated the daemon's MemoryEngine (800+ MB). QueueConsumer now
|
|
481
|
+
# uses the daemon's engine directly via EngineRecallAdapter.
|
|
482
|
+
# WorkerPool is still available as fallback for dashboard/chat routes.
|
|
403
483
|
|
|
404
484
|
# Force reranker warmup
|
|
405
485
|
retrieval_eng = getattr(engine, '_retrieval_engine', None)
|
|
@@ -422,8 +502,9 @@ async def lifespan(application: FastAPI):
|
|
|
422
502
|
logger.warning("Embedding warmup failed: %s", exc)
|
|
423
503
|
threading.Thread(target=_warmup_embedder, daemon=True, name="embed-warmup").start()
|
|
424
504
|
|
|
425
|
-
# v3.4.
|
|
426
|
-
#
|
|
505
|
+
# v3.4.37: QueueConsumer uses daemon's engine directly via adapter.
|
|
506
|
+
# Previously routed through WorkerPool → recall_worker subprocess,
|
|
507
|
+
# which loaded a duplicate MemoryEngine (~800 MB waste).
|
|
427
508
|
try:
|
|
428
509
|
from pathlib import Path as _QP
|
|
429
510
|
from superlocalmemory.core.queue_consumer import QueueConsumer
|
|
@@ -432,7 +513,7 @@ async def lifespan(application: FastAPI):
|
|
|
432
513
|
_recall_queue = RecallQueue(_queue_db)
|
|
433
514
|
_queue_consumer = QueueConsumer(
|
|
434
515
|
queue=_recall_queue,
|
|
435
|
-
pool=
|
|
516
|
+
pool=EngineRecallAdapter(engine),
|
|
436
517
|
)
|
|
437
518
|
_queue_consumer.start()
|
|
438
519
|
application.state.queue_consumer = _queue_consumer
|
|
@@ -466,9 +547,9 @@ async def lifespan(application: FastAPI):
|
|
|
466
547
|
from superlocalmemory.core.health_monitor import HealthMonitor
|
|
467
548
|
health_config = getattr(config, 'health', None)
|
|
468
549
|
monitor = HealthMonitor(
|
|
469
|
-
global_rss_budget_mb=getattr(health_config, 'global_rss_budget_mb',
|
|
550
|
+
global_rss_budget_mb=getattr(health_config, 'global_rss_budget_mb', 2500) if health_config else 2500,
|
|
470
551
|
heartbeat_timeout_sec=getattr(health_config, 'heartbeat_timeout_sec', 60) if health_config else 60,
|
|
471
|
-
check_interval_sec=getattr(health_config, 'health_check_interval_sec',
|
|
552
|
+
check_interval_sec=getattr(health_config, 'health_check_interval_sec', 15) if health_config else 15,
|
|
472
553
|
enable_structured_logging=getattr(health_config, 'enable_structured_logging', True) if health_config else True,
|
|
473
554
|
)
|
|
474
555
|
monitor.start()
|
|
@@ -1259,11 +1340,11 @@ def _start_memory_watchdog() -> None:
|
|
|
1259
1340
|
"""
|
|
1260
1341
|
import threading
|
|
1261
1342
|
|
|
1262
|
-
MAX_WORKER_MB =
|
|
1343
|
+
MAX_WORKER_MB = 1800 # V3.4.37: 1.8GB — ONNX nomic-embed is ~1.7GB loaded
|
|
1263
1344
|
|
|
1264
1345
|
def watchdog_loop():
|
|
1265
1346
|
while True:
|
|
1266
|
-
time.sleep(
|
|
1347
|
+
time.sleep(15) # V3.4.37: 15s (was 60s) — catch spikes faster
|
|
1267
1348
|
try:
|
|
1268
1349
|
import psutil
|
|
1269
1350
|
parent = psutil.Process(os.getpid())
|
|
@@ -1307,16 +1388,31 @@ def _start_pending_materializer() -> None:
|
|
|
1307
1388
|
from superlocalmemory.cli.pending_store import (
|
|
1308
1389
|
get_pending, mark_done, mark_failed,
|
|
1309
1390
|
)
|
|
1391
|
+
# v3.4.38: log first engine acquisition so we know materializer is alive
|
|
1392
|
+
_engine_logged = False
|
|
1393
|
+
_waiting_logged = False
|
|
1310
1394
|
while not _materializer_stop.is_set():
|
|
1311
1395
|
try:
|
|
1312
|
-
|
|
1396
|
+
# v3.4.38: Read fresh module global on every iteration so we
|
|
1397
|
+
# pick up the engine after lifespan sets it. Use the import
|
|
1398
|
+
# trick to ensure we're reading the live module attribute,
|
|
1399
|
+
# not a stale local reference.
|
|
1400
|
+
import superlocalmemory.server.unified_daemon as _ud
|
|
1401
|
+
engine = _ud._engine
|
|
1313
1402
|
if engine is None:
|
|
1403
|
+
if not _waiting_logged:
|
|
1404
|
+
logger.info("Materializer: waiting for engine to init...")
|
|
1405
|
+
_waiting_logged = True
|
|
1314
1406
|
time.sleep(2.0)
|
|
1315
1407
|
continue
|
|
1408
|
+
if not _engine_logged:
|
|
1409
|
+
logger.info("Materializer: engine acquired, starting drain loop")
|
|
1410
|
+
_engine_logged = True
|
|
1316
1411
|
pending = get_pending(limit=5)
|
|
1317
1412
|
if not pending:
|
|
1318
1413
|
time.sleep(2.0)
|
|
1319
1414
|
continue
|
|
1415
|
+
logger.info("Materializer: processing %d pending memories", len(pending))
|
|
1320
1416
|
for item in pending:
|
|
1321
1417
|
if _materializer_stop.is_set():
|
|
1322
1418
|
break
|