superlocalmemory 3.3.15 → 3.3.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/pyproject.toml +1 -1
- package/src/superlocalmemory/core/embedding_worker.py +10 -0
- package/src/superlocalmemory/core/embeddings.py +20 -2
- package/src/superlocalmemory/core/recall_pipeline.py +46 -19
- package/src/superlocalmemory/core/recall_worker.py +11 -1
- package/src/superlocalmemory/core/reranker_worker.py +30 -0
- package/src/superlocalmemory/retrieval/engine.py +14 -18
- package/src/superlocalmemory/retrieval/reranker.py +17 -22
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "superlocalmemory",
|
|
3
|
-
"version": "3.3.15",
|
|
3
|
+
"version": "3.3.17",
|
|
4
4
|
"description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"ai-memory",
|
package/pyproject.toml
CHANGED
|
@@ -144,6 +144,16 @@ def _worker_main() -> None:
|
|
|
144
144
|
_respond({"ok": True, "vectors": result, "dim": dim})
|
|
145
145
|
except Exception as exc:
|
|
146
146
|
_respond({"ok": False, "error": str(exc)})
|
|
147
|
+
|
|
148
|
+
# V3.3.16: RSS watchdog — self-terminate once peak RSS exceeds the 2500 MB threshold checked below (assumes ru_maxrss is reported in bytes, as on macOS).
|
|
149
|
+
# PyTorch on ARM64 Mac never returns memory to OS. After ~200 embeds
|
|
150
|
+
# a worker that started at 300MB grows to 17GB+. Parent auto-respawns
|
|
151
|
+
# a fresh worker on next request (existing mechanism in embeddings.py).
|
|
152
|
+
import resource
|
|
153
|
+
rss_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 / 1024
|
|
154
|
+
if rss_mb > 2500:
|
|
155
|
+
sys.exit(0)
|
|
156
|
+
|
|
147
157
|
continue
|
|
148
158
|
|
|
149
159
|
_respond({"ok": False, "error": f"Unknown command: {cmd}"})
|
|
@@ -207,11 +207,29 @@ class EmbeddingService:
|
|
|
207
207
|
return resp["vectors"]
|
|
208
208
|
except (BrokenPipeError, OSError, json.JSONDecodeError) as exc:
|
|
209
209
|
logger.warning(
|
|
210
|
-
"Embedding worker communication failed: %s.
|
|
211
|
-
"Run 'slm doctor' to check dependencies and Python version.",
|
|
210
|
+
"Embedding worker communication failed: %s — respawning.",
|
|
212
211
|
exc,
|
|
213
212
|
)
|
|
214
213
|
self._kill_worker()
|
|
214
|
+
# V3.3.16: Auto-retry once after worker death (RSS watchdog
|
|
215
|
+
# or crash). Respawn + re-send instead of returning None.
|
|
216
|
+
try:
|
|
217
|
+
self._ensure_worker()
|
|
218
|
+
if self._worker_proc is not None:
|
|
219
|
+
self._worker_proc.stdin.write(req)
|
|
220
|
+
self._worker_proc.stdin.flush()
|
|
221
|
+
resp_line = self._readline_with_timeout(
|
|
222
|
+
self._worker_proc.stdout,
|
|
223
|
+
_SUBPROCESS_RESPONSE_TIMEOUT,
|
|
224
|
+
)
|
|
225
|
+
if resp_line:
|
|
226
|
+
resp = json.loads(resp_line)
|
|
227
|
+
if resp.get("ok"):
|
|
228
|
+
self._reset_idle_timer()
|
|
229
|
+
self._request_count = 1
|
|
230
|
+
return resp["vectors"]
|
|
231
|
+
except Exception:
|
|
232
|
+
self._kill_worker()
|
|
215
233
|
return None
|
|
216
234
|
|
|
217
235
|
@staticmethod
|
|
@@ -24,6 +24,36 @@ from superlocalmemory.storage.models import Mode, RecallResponse
|
|
|
24
24
|
logger = logging.getLogger(__name__)
|
|
25
25
|
|
|
26
26
|
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
# V3.3.16: Module-level singletons for recall hot-path objects.
|
|
29
|
+
# Prevents creating new BehavioralTracker / ForgettingScheduler per recall
|
|
30
|
+
# (304 recalls = 304 objects that fragment pymalloc arenas → 25GB).
|
|
31
|
+
# ---------------------------------------------------------------------------
|
|
32
|
+
|
|
33
|
+
_behavioral_tracker_cache: dict[int, object] = {}
|
|
34
|
+
_forgetting_scheduler_cache: dict[int, object] = {}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _get_behavioral_tracker(db: Any) -> Any:
|
|
38
|
+
"""Get or create a cached BehavioralTracker for this DB instance."""
|
|
39
|
+
key = id(db)
|
|
40
|
+
if key not in _behavioral_tracker_cache:
|
|
41
|
+
from superlocalmemory.learning.behavioral import BehavioralTracker
|
|
42
|
+
_behavioral_tracker_cache[key] = BehavioralTracker(db)
|
|
43
|
+
return _behavioral_tracker_cache[key]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _get_forgetting_scheduler(db: Any, config: Any) -> Any:
|
|
47
|
+
"""Get or create a cached ForgettingScheduler for this DB instance."""
|
|
48
|
+
key = id(db)
|
|
49
|
+
if key not in _forgetting_scheduler_cache:
|
|
50
|
+
from superlocalmemory.learning.forgetting_scheduler import ForgettingScheduler
|
|
51
|
+
from superlocalmemory.math.ebbinghaus import EbbinghausCurve
|
|
52
|
+
ebbinghaus = EbbinghausCurve(config.forgetting)
|
|
53
|
+
_forgetting_scheduler_cache[key] = ForgettingScheduler(db, ebbinghaus, config.forgetting)
|
|
54
|
+
return _forgetting_scheduler_cache[key]
|
|
55
|
+
|
|
56
|
+
|
|
27
57
|
# ---------------------------------------------------------------------------
|
|
28
58
|
# apply_adaptive_ranking (was MemoryEngine._apply_adaptive_ranking)
|
|
29
59
|
# ---------------------------------------------------------------------------
|
|
@@ -192,11 +222,11 @@ def run_recall(
|
|
|
192
222
|
except Exception as exc:
|
|
193
223
|
logger.debug("Access log batch store failed: %s", exc)
|
|
194
224
|
|
|
195
|
-
# V3.3.
|
|
225
|
+
# V3.3.16: Behavioral tracking + spaced repetition use module-level
|
|
226
|
+
# singletons to avoid creating new objects per recall (was causing
|
|
227
|
+
# object accumulation across 304 benchmark recalls).
|
|
196
228
|
try:
|
|
197
|
-
|
|
198
|
-
_tracker = BehavioralTracker(db)
|
|
199
|
-
_tracker.record_query(
|
|
229
|
+
_get_behavioral_tracker(db).record_query(
|
|
200
230
|
profile_id=profile_id, query=query,
|
|
201
231
|
query_type=response.query_type,
|
|
202
232
|
result_count=len(response.results),
|
|
@@ -204,15 +234,11 @@ def run_recall(
|
|
|
204
234
|
except Exception as exc:
|
|
205
235
|
logger.debug("Behavioral tracking: %s", exc)
|
|
206
236
|
|
|
207
|
-
# V3.3.12: Spaced repetition update on recall (Ebbinghaus on_access_event)
|
|
208
237
|
if response.results:
|
|
209
238
|
try:
|
|
210
|
-
|
|
211
|
-
from superlocalmemory.math.ebbinghaus import EbbinghausCurve
|
|
212
|
-
_ebbinghaus = EbbinghausCurve(config.forgetting)
|
|
213
|
-
_fsched = ForgettingScheduler(db, _ebbinghaus, config.forgetting)
|
|
239
|
+
fsched = _get_forgetting_scheduler(db, config)
|
|
214
240
|
for r in response.results[:10]:
|
|
215
|
-
|
|
241
|
+
fsched.on_access_event(r.fact.fact_id, profile_id)
|
|
216
242
|
except Exception as exc:
|
|
217
243
|
logger.debug("Spaced repetition update: %s", exc)
|
|
218
244
|
|
|
@@ -237,30 +263,31 @@ def run_recall(
|
|
|
237
263
|
for r in response.results:
|
|
238
264
|
trust_scorer.update_on_access("fact", r.fact.fact_id, profile_id)
|
|
239
265
|
|
|
240
|
-
# Fisher Bayesian update on recall
|
|
241
|
-
|
|
266
|
+
# Fisher Bayesian update on recall — narrows variance on accessed facts
|
|
267
|
+
# so they score higher on subsequent recalls (critical for benchmark: +24pp).
|
|
268
|
+
# V3.3.16: Reuse query embedding from retrieval engine cache instead of
|
|
269
|
+
# calling embedder.embed() again (which was the memory leak source).
|
|
242
270
|
q_var_arr = None
|
|
243
|
-
if embedder and
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
271
|
+
if embedder and hasattr(retrieval_engine, '_query_embedding_cache'):
|
|
272
|
+
cached_emb = retrieval_engine._query_embedding_cache.get(query)
|
|
273
|
+
if cached_emb is not None:
|
|
274
|
+
import numpy as _np
|
|
275
|
+
_, q_var_list = embedder.compute_fisher_params(cached_emb)
|
|
276
|
+
q_var_arr = _np.array(q_var_list, dtype=_np.float64)
|
|
247
277
|
|
|
248
278
|
for r in response.results:
|
|
249
279
|
updates: dict[str, object] = {
|
|
250
280
|
"access_count": r.fact.access_count + 1,
|
|
251
281
|
}
|
|
252
|
-
# Bayesian variance narrowing after 3+ accesses
|
|
253
282
|
if (q_var_arr is not None
|
|
254
283
|
and r.fact.fisher_variance
|
|
255
284
|
and len(r.fact.fisher_variance) == len(q_var_arr)
|
|
256
285
|
and r.fact.access_count >= 3):
|
|
257
286
|
import numpy as _np
|
|
258
287
|
f_var = _np.array(r.fact.fisher_variance, dtype=_np.float64)
|
|
259
|
-
# Conjugate Gaussian update: 1/new_var = 1/f_var + 1/q_var
|
|
260
288
|
new_var = 1.0 / (1.0 / _np.maximum(f_var, 0.05) + 1.0 / _np.maximum(q_var_arr, 0.05))
|
|
261
289
|
new_var = _np.clip(new_var, 0.05, 2.0)
|
|
262
290
|
updates["fisher_variance"] = new_var.tolist()
|
|
263
|
-
|
|
264
291
|
db.update_fact(r.fact.fact_id, updates)
|
|
265
292
|
|
|
266
293
|
# Post-operation hooks (audit, trust signal, learning)
|
|
@@ -321,6 +321,13 @@ def _worker_main() -> None:
|
|
|
321
321
|
except Exception as exc:
|
|
322
322
|
_respond({"ok": False, "error": str(exc)})
|
|
323
323
|
|
|
324
|
+
# V3.3.16: RSS watchdog — self-terminate once peak RSS exceeds the 2500 MB threshold checked below (assumes ru_maxrss is reported in bytes, as on macOS).
|
|
325
|
+
# Parent auto-respawns a fresh worker on next request.
|
|
326
|
+
import resource
|
|
327
|
+
rss_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 / 1024
|
|
328
|
+
if rss_mb > 2500:
|
|
329
|
+
sys.exit(0)
|
|
330
|
+
|
|
324
331
|
|
|
325
332
|
def _respond(data: dict) -> None:
|
|
326
333
|
sys.stdout.write(json.dumps(data) + "\n")
|
|
@@ -328,4 +335,7 @@ def _respond(data: dict) -> None:
|
|
|
328
335
|
|
|
329
336
|
|
|
330
337
|
if __name__ == "__main__":
|
|
331
|
-
|
|
338
|
+
try:
|
|
339
|
+
_worker_main()
|
|
340
|
+
except KeyboardInterrupt:
|
|
341
|
+
sys.exit(0)
|
|
@@ -118,10 +118,33 @@ def _worker_main() -> None:
|
|
|
118
118
|
name = req.get("model_name", "cross-encoder/ms-marco-MiniLM-L-12-v2")
|
|
119
119
|
backend = req.get("backend", "onnx")
|
|
120
120
|
model, active_backend, model_name = _load_model(name, backend)
|
|
121
|
+
# V3.3.16: Run real inference to trigger ONNX CoreML JIT compilation.
|
|
122
|
+
# Without this, first real rerank call triggers 30-60s compilation
|
|
123
|
+
# that exceeds the caller's timeout, killing the worker.
|
|
124
|
+
warmup_ok = False
|
|
125
|
+
if model is not None:
|
|
126
|
+
try:
|
|
127
|
+
# Use 60 pairs (realistic batch size) to trigger CoreML
|
|
128
|
+
# compilation for the actual workload. 3 pairs compiled a
|
|
129
|
+
# different execution plan that got recompiled on 60 pairs.
|
|
130
|
+
dummy_pairs = [
|
|
131
|
+
(f"What happened to person {i}?", f"Person {i} went to location {i} and did activity {i} last summer with friends.")
|
|
132
|
+
for i in range(60)
|
|
133
|
+
]
|
|
134
|
+
try:
|
|
135
|
+
import torch
|
|
136
|
+
with torch.inference_mode():
|
|
137
|
+
_scores = model.predict(dummy_pairs)
|
|
138
|
+
except ImportError:
|
|
139
|
+
_scores = model.predict(dummy_pairs)
|
|
140
|
+
warmup_ok = True
|
|
141
|
+
except Exception:
|
|
142
|
+
pass
|
|
121
143
|
_respond({
|
|
122
144
|
"ok": model is not None,
|
|
123
145
|
"backend": active_backend,
|
|
124
146
|
"model": model_name,
|
|
147
|
+
"warmup_inference": warmup_ok,
|
|
125
148
|
})
|
|
126
149
|
continue
|
|
127
150
|
|
|
@@ -153,6 +176,13 @@ def _worker_main() -> None:
|
|
|
153
176
|
})
|
|
154
177
|
except Exception as exc:
|
|
155
178
|
_respond({"ok": False, "error": str(exc)})
|
|
179
|
+
|
|
180
|
+
# V3.3.16: RSS watchdog — same as embedding_worker
|
|
181
|
+
import resource
|
|
182
|
+
rss_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 / 1024
|
|
183
|
+
if rss_mb > 2500:
|
|
184
|
+
sys.exit(0)
|
|
185
|
+
|
|
156
186
|
continue
|
|
157
187
|
|
|
158
188
|
if cmd == "score":
|
|
@@ -338,31 +338,27 @@ class RetrievalEngine:
|
|
|
338
338
|
if not candidates:
|
|
339
339
|
return fused
|
|
340
340
|
|
|
341
|
-
#
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
observation_date=fact.observation_date,
|
|
351
|
-
referenced_date=fact.referenced_date,
|
|
352
|
-
confidence=fact.confidence, importance=fact.importance,
|
|
353
|
-
evidence_count=fact.evidence_count,
|
|
354
|
-
access_count=fact.access_count,
|
|
355
|
-
embedding=fact.embedding, created_at=fact.created_at,
|
|
356
|
-
)
|
|
357
|
-
clean_candidates.append((clean_fact, score))
|
|
341
|
+
# V3.3.16: Strip speaker tags WITHOUT copying full AtomicFact objects.
|
|
342
|
+
# Previously created full copies including 768-dim embeddings (~6KB each),
|
|
343
|
+
# which over 304 recalls caused pymalloc arena fragmentation → 25GB.
|
|
344
|
+
# Now: temporarily patch .content on originals, rerank, then restore.
|
|
345
|
+
originals: list[tuple[AtomicFact, str]] = [] # (fact, original_content)
|
|
346
|
+
for fact, _ in candidates:
|
|
347
|
+
orig = fact.content
|
|
348
|
+
fact.content = re.sub(r'^\[[A-Za-z]+\]:\s*', '', orig)
|
|
349
|
+
originals.append((fact, orig))
|
|
358
350
|
|
|
359
351
|
try:
|
|
360
352
|
scored = self._reranker.rerank( # type: ignore[union-attr]
|
|
361
|
-
query,
|
|
353
|
+
query, candidates, top_k=len(candidates),
|
|
362
354
|
)
|
|
363
355
|
except Exception as exc:
|
|
364
356
|
logger.warning("Cross-encoder rerank failed: %s", exc)
|
|
365
357
|
return fused
|
|
358
|
+
finally:
|
|
359
|
+
# Restore original content (with speaker tags)
|
|
360
|
+
for fact, orig_content in originals:
|
|
361
|
+
fact.content = orig_content
|
|
366
362
|
|
|
367
363
|
score_map = {fact.fact_id: score for fact, score in scored}
|
|
368
364
|
|
|
@@ -94,8 +94,10 @@ class CrossEncoderReranker:
|
|
|
94
94
|
def _start_background_warmup(self) -> None:
|
|
95
95
|
"""Start worker and load model in background thread.
|
|
96
96
|
|
|
97
|
-
|
|
98
|
-
|
|
97
|
+
V3.3.16: Uses _send_request (lock-protected) instead of raw
|
|
98
|
+
stdin/stdout access. Previous code wrote to stdin without the
|
|
99
|
+
lock, creating a race where the warmup's readline thread could
|
|
100
|
+
steal responses meant for _send_request → deadlock → timeout.
|
|
99
101
|
"""
|
|
100
102
|
if self._worker_loading or self._model_loaded:
|
|
101
103
|
return
|
|
@@ -106,26 +108,18 @@ class CrossEncoderReranker:
|
|
|
106
108
|
self._ensure_worker()
|
|
107
109
|
if self._worker_proc is None:
|
|
108
110
|
return
|
|
109
|
-
|
|
110
|
-
req = json.dumps({
|
|
111
|
+
resp = self._send_request({
|
|
111
112
|
"cmd": "load",
|
|
112
113
|
"model_name": self._model_name,
|
|
113
114
|
"backend": self._backend,
|
|
114
|
-
})
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
if resp.get("ok"):
|
|
123
|
-
self._model_loaded = True
|
|
124
|
-
logger.info(
|
|
125
|
-
"Reranker worker warm (backend=%s)",
|
|
126
|
-
resp.get("backend", "?"),
|
|
127
|
-
)
|
|
128
|
-
self._reset_idle_timer()
|
|
115
|
+
}, timeout=_SUBPROCESS_RESPONSE_TIMEOUT)
|
|
116
|
+
if resp and resp.get("ok"):
|
|
117
|
+
self._model_loaded = True
|
|
118
|
+
logger.info(
|
|
119
|
+
"Reranker worker warm (backend=%s, warmup_inference=%s)",
|
|
120
|
+
resp.get("backend", "?"),
|
|
121
|
+
resp.get("warmup_inference", False),
|
|
122
|
+
)
|
|
129
123
|
except Exception as exc:
|
|
130
124
|
logger.debug("Background reranker warmup failed: %s", exc)
|
|
131
125
|
finally:
|
|
@@ -321,13 +315,14 @@ class CrossEncoderReranker:
|
|
|
321
315
|
|
|
322
316
|
documents = [fact.content for fact, _ in candidates]
|
|
323
317
|
|
|
324
|
-
# V3.3.
|
|
325
|
-
#
|
|
318
|
+
# V3.3.16: Timeout 180s — ONNX CoreML compilation can take 30-60s on
|
|
319
|
+
# first inference even after model load. The warmup_inference in the
|
|
320
|
+
# worker should prevent this, but 180s is a safety net.
|
|
326
321
|
resp = self._send_request({
|
|
327
322
|
"cmd": "rerank",
|
|
328
323
|
"query": query,
|
|
329
324
|
"documents": documents,
|
|
330
|
-
}, timeout=
|
|
325
|
+
}, timeout=180.0)
|
|
331
326
|
|
|
332
327
|
if resp is None or not resp.get("ok"):
|
|
333
328
|
# Fallback: return by existing score
|