superlocalmemory 3.3.16 → 3.3.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/pyproject.toml +1 -1
- package/src/superlocalmemory/core/config.py +1 -1
- package/src/superlocalmemory/core/embedding_worker.py +50 -21
- package/src/superlocalmemory/core/embeddings.py +20 -2
- package/src/superlocalmemory/core/recall_pipeline.py +24 -8
- package/src/superlocalmemory/core/recall_worker.py +11 -1
- package/src/superlocalmemory/core/reranker_worker.py +15 -3
- package/src/superlocalmemory/retrieval/reranker.py +13 -19
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "superlocalmemory",
|
|
3
|
-
"version": "3.3.
|
|
3
|
+
"version": "3.3.18",
|
|
4
4
|
"description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"ai-memory",
|
package/pyproject.toml
CHANGED
|
@@ -155,7 +155,7 @@ class RetrievalConfig:
|
|
|
155
155
|
# Reranking (V3.3.2: ONNX backend enabled for all modes)
|
|
156
156
|
use_cross_encoder: bool = True
|
|
157
157
|
cross_encoder_model: str = "cross-encoder/ms-marco-MiniLM-L-12-v2"
|
|
158
|
-
cross_encoder_backend: str = "
|
|
158
|
+
cross_encoder_backend: str = "" # "" = PyTorch (~500MB stable), "onnx" = ONNX (leaks on ARM64 CoreML)
|
|
159
159
|
|
|
160
160
|
# Agentic (Mode C only)
|
|
161
161
|
agentic_max_rounds: int = 3
|
|
@@ -35,6 +35,8 @@ os.environ["PYTORCH_MPS_MEM_LIMIT"] = "0"
|
|
|
35
35
|
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
|
|
36
36
|
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
37
37
|
os.environ["TORCH_DEVICE"] = "cpu"
|
|
38
|
+
# V3.3.17: Disable CoreML EP for ONNX Runtime — uses 3-5GB on ARM64 Mac.
|
|
39
|
+
os.environ["ORT_DISABLE_COREML"] = "1"
|
|
38
40
|
|
|
39
41
|
# SIGTERM bridge: Docker/systemd send SIGTERM to stop processes.
|
|
40
42
|
# Without this, the worker ignores SIGTERM and becomes a zombie.
|
|
@@ -65,6 +67,34 @@ def _start_parent_watchdog() -> None:
|
|
|
65
67
|
t.start()
|
|
66
68
|
|
|
67
69
|
|
|
70
|
+
def _load_embedding_model(name: str) -> tuple:
|
|
71
|
+
"""Load embedding model. ONNX first (no memory leak), PyTorch fallback.
|
|
72
|
+
|
|
73
|
+
V3.3.17: PyTorch SentenceTransformer on ARM64 Mac leaks memory —
|
|
74
|
+
grows from 300MB to 17GB after ~200 encode calls. ONNX Runtime
|
|
75
|
+
has no such issue. Same approach as CrossEncoder ONNX migration.
|
|
76
|
+
|
|
77
|
+
Returns (model, backend_name) or (None, "").
|
|
78
|
+
"""
|
|
79
|
+
from sentence_transformers import SentenceTransformer
|
|
80
|
+
|
|
81
|
+
# Tier 1: ONNX (stable memory, ~200MB footprint)
|
|
82
|
+
try:
|
|
83
|
+
m = SentenceTransformer(name, backend="onnx", trust_remote_code=True)
|
|
84
|
+
return m, "onnx"
|
|
85
|
+
except Exception:
|
|
86
|
+
pass
|
|
87
|
+
|
|
88
|
+
# Tier 2: PyTorch CPU (stable at ~1.4GB after 100+ calls, verified)
|
|
89
|
+
try:
|
|
90
|
+
import torch
|
|
91
|
+
with torch.inference_mode():
|
|
92
|
+
m = SentenceTransformer(name, trust_remote_code=True, device="cpu")
|
|
93
|
+
return m, "pytorch"
|
|
94
|
+
except Exception:
|
|
95
|
+
return None, ""
|
|
96
|
+
|
|
97
|
+
|
|
68
98
|
def _worker_main() -> None:
|
|
69
99
|
"""Main loop: read JSON requests from stdin, write responses to stdout."""
|
|
70
100
|
_start_parent_watchdog() # V3.3.7: self-terminate if parent dies
|
|
@@ -97,18 +127,17 @@ def _worker_main() -> None:
|
|
|
97
127
|
if cmd == "load":
|
|
98
128
|
name = req.get("model_name", "nomic-ai/nomic-embed-text-v1.5")
|
|
99
129
|
expected_dim = req.get("dimension", 768)
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
model = SentenceTransformer(name, trust_remote_code=True, device="cpu")
|
|
130
|
+
model, active_backend = _load_embedding_model(name)
|
|
131
|
+
if model is not None:
|
|
103
132
|
dim = model.get_sentence_embedding_dimension()
|
|
104
133
|
if dim != expected_dim:
|
|
105
134
|
_respond({"ok": False, "error": f"Dimension mismatch: {dim} != {expected_dim}"})
|
|
106
135
|
model = None
|
|
107
136
|
continue
|
|
108
137
|
model_name = name
|
|
109
|
-
_respond({"ok": True, "dim": dim, "model": name})
|
|
110
|
-
|
|
111
|
-
_respond({"ok": False, "error":
|
|
138
|
+
_respond({"ok": True, "dim": dim, "model": name, "backend": active_backend})
|
|
139
|
+
else:
|
|
140
|
+
_respond({"ok": False, "error": "Model load failed"})
|
|
112
141
|
continue
|
|
113
142
|
|
|
114
143
|
if cmd == "embed":
|
|
@@ -117,26 +146,16 @@ def _worker_main() -> None:
|
|
|
117
146
|
_respond({"ok": False, "error": "No texts provided"})
|
|
118
147
|
continue
|
|
119
148
|
if model is None:
|
|
120
|
-
# Auto-load if not yet loaded
|
|
121
149
|
name = req.get("model_name", "nomic-ai/nomic-embed-text-v1.5")
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
from sentence_transformers import SentenceTransformer
|
|
125
|
-
model = SentenceTransformer(name, trust_remote_code=True, device="cpu")
|
|
150
|
+
model, active_backend = _load_embedding_model(name)
|
|
151
|
+
if model is not None:
|
|
126
152
|
dim = model.get_sentence_embedding_dimension()
|
|
127
153
|
model_name = name
|
|
128
|
-
|
|
129
|
-
_respond({"ok": False, "error":
|
|
154
|
+
else:
|
|
155
|
+
_respond({"ok": False, "error": "Model load failed"})
|
|
130
156
|
continue
|
|
131
157
|
try:
|
|
132
|
-
|
|
133
|
-
# which causes silent memory leaks over long-running sessions.
|
|
134
|
-
try:
|
|
135
|
-
import torch
|
|
136
|
-
with torch.inference_mode():
|
|
137
|
-
vecs = model.encode(texts, normalize_embeddings=True)
|
|
138
|
-
except ImportError:
|
|
139
|
-
vecs = model.encode(texts, normalize_embeddings=True)
|
|
158
|
+
vecs = model.encode(texts, normalize_embeddings=True)
|
|
140
159
|
if isinstance(vecs, np.ndarray) and vecs.ndim == 2:
|
|
141
160
|
result = [vecs[i].tolist() for i in range(vecs.shape[0])]
|
|
142
161
|
else:
|
|
@@ -144,6 +163,16 @@ def _worker_main() -> None:
|
|
|
144
163
|
_respond({"ok": True, "vectors": result, "dim": dim})
|
|
145
164
|
except Exception as exc:
|
|
146
165
|
_respond({"ok": False, "error": str(exc)})
|
|
166
|
+
|
|
167
|
+
# V3.3.16: RSS watchdog — self-terminate if peak RSS exceeds ~2.5GB (2500 MB).
|
|
168
|
+
# PyTorch on ARM64 Mac never returns memory to OS. After ~200 embeds
|
|
169
|
+
# a worker that started at 300MB grows to 17GB+. Parent auto-respawns
|
|
170
|
+
# a fresh worker on next request (existing mechanism in embeddings.py).
|
|
171
|
+
import resource
|
|
172
|
+
rss_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 / 1024
|
|
173
|
+
if rss_mb > 2500:
|
|
174
|
+
sys.exit(0)
|
|
175
|
+
|
|
147
176
|
continue
|
|
148
177
|
|
|
149
178
|
_respond({"ok": False, "error": f"Unknown command: {cmd}"})
|
|
@@ -207,11 +207,29 @@ class EmbeddingService:
|
|
|
207
207
|
return resp["vectors"]
|
|
208
208
|
except (BrokenPipeError, OSError, json.JSONDecodeError) as exc:
|
|
209
209
|
logger.warning(
|
|
210
|
-
"Embedding worker communication failed: %s.
|
|
211
|
-
"Run 'slm doctor' to check dependencies and Python version.",
|
|
210
|
+
"Embedding worker communication failed: %s — respawning.",
|
|
212
211
|
exc,
|
|
213
212
|
)
|
|
214
213
|
self._kill_worker()
|
|
214
|
+
# V3.3.16: Auto-retry once after worker death (RSS watchdog
|
|
215
|
+
# or crash). Respawn + re-send instead of returning None.
|
|
216
|
+
try:
|
|
217
|
+
self._ensure_worker()
|
|
218
|
+
if self._worker_proc is not None:
|
|
219
|
+
self._worker_proc.stdin.write(req)
|
|
220
|
+
self._worker_proc.stdin.flush()
|
|
221
|
+
resp_line = self._readline_with_timeout(
|
|
222
|
+
self._worker_proc.stdout,
|
|
223
|
+
_SUBPROCESS_RESPONSE_TIMEOUT,
|
|
224
|
+
)
|
|
225
|
+
if resp_line:
|
|
226
|
+
resp = json.loads(resp_line)
|
|
227
|
+
if resp.get("ok"):
|
|
228
|
+
self._reset_idle_timer()
|
|
229
|
+
self._request_count = 1
|
|
230
|
+
return resp["vectors"]
|
|
231
|
+
except Exception:
|
|
232
|
+
self._kill_worker()
|
|
215
233
|
return None
|
|
216
234
|
|
|
217
235
|
@staticmethod
|
|
@@ -263,16 +263,32 @@ def run_recall(
|
|
|
263
263
|
for r in response.results:
|
|
264
264
|
trust_scorer.update_on_access("fact", r.fact.fact_id, profile_id)
|
|
265
265
|
|
|
266
|
-
#
|
|
267
|
-
#
|
|
268
|
-
#
|
|
269
|
-
#
|
|
270
|
-
|
|
271
|
-
|
|
266
|
+
# Fisher Bayesian update on recall — narrows variance on accessed facts
|
|
267
|
+
# so they score higher on subsequent recalls (critical for benchmark: +24pp).
|
|
268
|
+
# V3.3.16: Reuse query embedding from retrieval engine cache instead of
|
|
269
|
+
# calling embedder.embed() again (which was the memory leak source).
|
|
270
|
+
q_var_arr = None
|
|
271
|
+
if embedder and hasattr(retrieval_engine, '_query_embedding_cache'):
|
|
272
|
+
cached_emb = retrieval_engine._query_embedding_cache.get(query)
|
|
273
|
+
if cached_emb is not None:
|
|
274
|
+
import numpy as _np
|
|
275
|
+
_, q_var_list = embedder.compute_fisher_params(cached_emb)
|
|
276
|
+
q_var_arr = _np.array(q_var_list, dtype=_np.float64)
|
|
277
|
+
|
|
272
278
|
for r in response.results:
|
|
273
|
-
|
|
279
|
+
updates: dict[str, object] = {
|
|
274
280
|
"access_count": r.fact.access_count + 1,
|
|
275
|
-
}
|
|
281
|
+
}
|
|
282
|
+
if (q_var_arr is not None
|
|
283
|
+
and r.fact.fisher_variance
|
|
284
|
+
and len(r.fact.fisher_variance) == len(q_var_arr)
|
|
285
|
+
and r.fact.access_count >= 3):
|
|
286
|
+
import numpy as _np
|
|
287
|
+
f_var = _np.array(r.fact.fisher_variance, dtype=_np.float64)
|
|
288
|
+
new_var = 1.0 / (1.0 / _np.maximum(f_var, 0.05) + 1.0 / _np.maximum(q_var_arr, 0.05))
|
|
289
|
+
new_var = _np.clip(new_var, 0.05, 2.0)
|
|
290
|
+
updates["fisher_variance"] = new_var.tolist()
|
|
291
|
+
db.update_fact(r.fact.fact_id, updates)
|
|
276
292
|
|
|
277
293
|
# Post-operation hooks (audit, trust signal, learning)
|
|
278
294
|
hook_ctx["result_count"] = len(response.results)
|
|
@@ -321,6 +321,13 @@ def _worker_main() -> None:
|
|
|
321
321
|
except Exception as exc:
|
|
322
322
|
_respond({"ok": False, "error": str(exc)})
|
|
323
323
|
|
|
324
|
+
# V3.3.16: RSS watchdog — self-terminate if peak RSS exceeds ~2.5GB (2500 MB).
|
|
325
|
+
# Parent auto-respawns a fresh worker on next request.
|
|
326
|
+
import resource
|
|
327
|
+
rss_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 / 1024
|
|
328
|
+
if rss_mb > 2500:
|
|
329
|
+
sys.exit(0)
|
|
330
|
+
|
|
324
331
|
|
|
325
332
|
def _respond(data: dict) -> None:
|
|
326
333
|
sys.stdout.write(json.dumps(data) + "\n")
|
|
@@ -328,4 +335,7 @@ def _respond(data: dict) -> None:
|
|
|
328
335
|
|
|
329
336
|
|
|
330
337
|
if __name__ == "__main__":
|
|
331
|
-
|
|
338
|
+
try:
|
|
339
|
+
_worker_main()
|
|
340
|
+
except KeyboardInterrupt:
|
|
341
|
+
sys.exit(0)
|
|
@@ -40,6 +40,9 @@ os.environ["PYTORCH_MPS_MEM_LIMIT"] = "0"
|
|
|
40
40
|
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
|
|
41
41
|
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
42
42
|
os.environ["TORCH_DEVICE"] = "cpu"
|
|
43
|
+
# V3.3.17: Disable CoreML EP for ONNX Runtime. CoreML compiles execution
|
|
44
|
+
# plans that consume 3-5GB on ARM64 Mac. CPU EP is ~500MB and fast enough.
|
|
45
|
+
os.environ["ORT_DISABLE_COREML"] = "1"
|
|
43
46
|
|
|
44
47
|
# SIGTERM bridge for Docker/systemd
|
|
45
48
|
if sys.platform != "win32":
|
|
@@ -124,10 +127,12 @@ def _worker_main() -> None:
|
|
|
124
127
|
warmup_ok = False
|
|
125
128
|
if model is not None:
|
|
126
129
|
try:
|
|
130
|
+
# Use 60 pairs (realistic batch size) to trigger CoreML
|
|
131
|
+
# compilation for the actual workload. 3 pairs compiled a
|
|
132
|
+
# different execution plan that got recompiled on 60 pairs.
|
|
127
133
|
dummy_pairs = [
|
|
128
|
-
("What
|
|
129
|
-
|
|
130
|
-
("What color is the sky?", "The sky is blue on a clear day."),
|
|
134
|
+
(f"What happened to person {i}?", f"Person {i} went to location {i} and did activity {i} last summer with friends.")
|
|
135
|
+
for i in range(60)
|
|
131
136
|
]
|
|
132
137
|
try:
|
|
133
138
|
import torch
|
|
@@ -174,6 +179,13 @@ def _worker_main() -> None:
|
|
|
174
179
|
})
|
|
175
180
|
except Exception as exc:
|
|
176
181
|
_respond({"ok": False, "error": str(exc)})
|
|
182
|
+
|
|
183
|
+
# V3.3.16: RSS watchdog — same as embedding_worker
|
|
184
|
+
import resource
|
|
185
|
+
rss_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 / 1024
|
|
186
|
+
if rss_mb > 2500:
|
|
187
|
+
sys.exit(0)
|
|
188
|
+
|
|
177
189
|
continue
|
|
178
190
|
|
|
179
191
|
if cmd == "score":
|
|
@@ -94,8 +94,10 @@ class CrossEncoderReranker:
|
|
|
94
94
|
def _start_background_warmup(self) -> None:
|
|
95
95
|
"""Start worker and load model in background thread.
|
|
96
96
|
|
|
97
|
-
|
|
98
|
-
|
|
97
|
+
V3.3.16: Uses _send_request (lock-protected) instead of raw
|
|
98
|
+
stdin/stdout access. Previous code wrote to stdin without the
|
|
99
|
+
lock, creating a race where the warmup's readline thread could
|
|
100
|
+
steal responses meant for _send_request → deadlock → timeout.
|
|
99
101
|
"""
|
|
100
102
|
if self._worker_loading or self._model_loaded:
|
|
101
103
|
return
|
|
@@ -106,26 +108,18 @@ class CrossEncoderReranker:
|
|
|
106
108
|
self._ensure_worker()
|
|
107
109
|
if self._worker_proc is None:
|
|
108
110
|
return
|
|
109
|
-
|
|
110
|
-
req = json.dumps({
|
|
111
|
+
resp = self._send_request({
|
|
111
112
|
"cmd": "load",
|
|
112
113
|
"model_name": self._model_name,
|
|
113
114
|
"backend": self._backend,
|
|
114
|
-
})
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
if resp.get("ok"):
|
|
123
|
-
self._model_loaded = True
|
|
124
|
-
logger.info(
|
|
125
|
-
"Reranker worker warm (backend=%s)",
|
|
126
|
-
resp.get("backend", "?"),
|
|
127
|
-
)
|
|
128
|
-
self._reset_idle_timer()
|
|
115
|
+
}, timeout=_SUBPROCESS_RESPONSE_TIMEOUT)
|
|
116
|
+
if resp and resp.get("ok"):
|
|
117
|
+
self._model_loaded = True
|
|
118
|
+
logger.info(
|
|
119
|
+
"Reranker worker warm (backend=%s, warmup_inference=%s)",
|
|
120
|
+
resp.get("backend", "?"),
|
|
121
|
+
resp.get("warmup_inference", False),
|
|
122
|
+
)
|
|
129
123
|
except Exception as exc:
|
|
130
124
|
logger.debug("Background reranker warmup failed: %s", exc)
|
|
131
125
|
finally:
|