superlocalmemory 3.3.16 → 3.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "superlocalmemory",
3
- "version": "3.3.16",
3
+ "version": "3.3.17",
4
4
  "description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
5
5
  "keywords": [
6
6
  "ai-memory",
package/pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "superlocalmemory"
3
- version = "3.3.16"
3
+ version = "3.3.17"
4
4
  description = "Information-geometric agent memory with mathematical guarantees"
5
5
  readme = "README.md"
6
6
  license = {text = "MIT"}
@@ -144,6 +144,16 @@ def _worker_main() -> None:
144
144
  _respond({"ok": True, "vectors": result, "dim": dim})
145
145
  except Exception as exc:
146
146
  _respond({"ok": False, "error": str(exc)})
147
+
148
+ # V3.3.16: RSS watchdog — self-terminate if memory exceeds 2500 MB (~2.5GB).
149
+ # PyTorch on ARM64 Mac never returns memory to OS. After ~200 embeds
150
+ # a worker that started at 300MB grows to 17GB+. Parent auto-respawns
151
+ # a fresh worker on next request (existing mechanism in embeddings.py).
152
+ import resource
153
+ rss_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 / 1024
154
+ if rss_mb > 2500:
155
+ sys.exit(0)
156
+
147
157
  continue
148
158
 
149
159
  _respond({"ok": False, "error": f"Unknown command: {cmd}"})
@@ -207,11 +207,29 @@ class EmbeddingService:
207
207
  return resp["vectors"]
208
208
  except (BrokenPipeError, OSError, json.JSONDecodeError) as exc:
209
209
  logger.warning(
210
- "Embedding worker communication failed: %s. "
211
- "Run 'slm doctor' to check dependencies and Python version.",
210
+ "Embedding worker communication failed: %s — respawning.",
212
211
  exc,
213
212
  )
214
213
  self._kill_worker()
214
+ # V3.3.16: Auto-retry once after worker death (RSS watchdog
215
+ # or crash). Respawn + re-send instead of returning None.
216
+ try:
217
+ self._ensure_worker()
218
+ if self._worker_proc is not None:
219
+ self._worker_proc.stdin.write(req)
220
+ self._worker_proc.stdin.flush()
221
+ resp_line = self._readline_with_timeout(
222
+ self._worker_proc.stdout,
223
+ _SUBPROCESS_RESPONSE_TIMEOUT,
224
+ )
225
+ if resp_line:
226
+ resp = json.loads(resp_line)
227
+ if resp.get("ok"):
228
+ self._reset_idle_timer()
229
+ self._request_count = 1
230
+ return resp["vectors"]
231
+ except Exception:
232
+ self._kill_worker()
215
233
  return None
216
234
 
217
235
  @staticmethod
@@ -263,16 +263,32 @@ def run_recall(
263
263
  for r in response.results:
264
264
  trust_scorer.update_on_access("fact", r.fact.fact_id, profile_id)
265
265
 
266
- # V3.3.16: Access count update only — no redundant embedding call.
267
- # Fisher Bayesian variance update moved to store_pipeline (write-time)
268
- # to avoid per-recall memory pressure from numpy array creation.
269
- # Previously: embedder.embed(query) here duplicated the embed call
270
- # already done in retrieval engine, creating 768-dim numpy arrays
271
- # 304 times during benchmark → pymalloc arena fragmentation → 25GB.
266
+ # Fisher Bayesian update on recall narrows variance on accessed facts
267
+ # so they score higher on subsequent recalls (critical for benchmark: +24pp).
268
+ # V3.3.16: Reuse query embedding from retrieval engine cache instead of
269
+ # calling embedder.embed() again (which was the memory leak source).
270
+ q_var_arr = None
271
+ if embedder and hasattr(retrieval_engine, '_query_embedding_cache'):
272
+ cached_emb = retrieval_engine._query_embedding_cache.get(query)
273
+ if cached_emb is not None:
274
+ import numpy as _np
275
+ _, q_var_list = embedder.compute_fisher_params(cached_emb)
276
+ q_var_arr = _np.array(q_var_list, dtype=_np.float64)
277
+
272
278
  for r in response.results:
273
- db.update_fact(r.fact.fact_id, {
279
+ updates: dict[str, object] = {
274
280
  "access_count": r.fact.access_count + 1,
275
- })
281
+ }
282
+ if (q_var_arr is not None
283
+ and r.fact.fisher_variance
284
+ and len(r.fact.fisher_variance) == len(q_var_arr)
285
+ and r.fact.access_count >= 3):
286
+ import numpy as _np
287
+ f_var = _np.array(r.fact.fisher_variance, dtype=_np.float64)
288
+ new_var = 1.0 / (1.0 / _np.maximum(f_var, 0.05) + 1.0 / _np.maximum(q_var_arr, 0.05))
289
+ new_var = _np.clip(new_var, 0.05, 2.0)
290
+ updates["fisher_variance"] = new_var.tolist()
291
+ db.update_fact(r.fact.fact_id, updates)
276
292
 
277
293
  # Post-operation hooks (audit, trust signal, learning)
278
294
  hook_ctx["result_count"] = len(response.results)
@@ -321,6 +321,13 @@ def _worker_main() -> None:
321
321
  except Exception as exc:
322
322
  _respond({"ok": False, "error": str(exc)})
323
323
 
324
+ # V3.3.16: RSS watchdog — self-terminate if memory exceeds 2500 MB (~2.5GB).
325
+ # Parent auto-respawns a fresh worker on next request.
326
+ import resource
327
+ rss_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 / 1024
328
+ if rss_mb > 2500:
329
+ sys.exit(0)
330
+
324
331
 
325
332
  def _respond(data: dict) -> None:
326
333
  sys.stdout.write(json.dumps(data) + "\n")
@@ -328,4 +335,7 @@ def _respond(data: dict) -> None:
328
335
 
329
336
 
330
337
  if __name__ == "__main__":
331
- _worker_main()
338
+ try:
339
+ _worker_main()
340
+ except KeyboardInterrupt:
341
+ sys.exit(0)
@@ -124,10 +124,12 @@ def _worker_main() -> None:
124
124
  warmup_ok = False
125
125
  if model is not None:
126
126
  try:
127
+ # Use 60 pairs (realistic batch size) to trigger CoreML
128
+ # compilation for the actual workload. 3 pairs compiled a
129
+ # different execution plan that got recompiled on 60 pairs.
127
130
  dummy_pairs = [
128
- ("What is the capital of France?", "Paris is the capital of France."),
129
- ("Who wrote Hamlet?", "Shakespeare wrote many plays."),
130
- ("What color is the sky?", "The sky is blue on a clear day."),
131
+ (f"What happened to person {i}?", f"Person {i} went to location {i} and did activity {i} last summer with friends.")
132
+ for i in range(60)
131
133
  ]
132
134
  try:
133
135
  import torch
@@ -174,6 +176,13 @@ def _worker_main() -> None:
174
176
  })
175
177
  except Exception as exc:
176
178
  _respond({"ok": False, "error": str(exc)})
179
+
180
+ # V3.3.16: RSS watchdog — same as embedding_worker
181
+ import resource
182
+ rss_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 / 1024
183
+ if rss_mb > 2500:
184
+ sys.exit(0)
185
+
177
186
  continue
178
187
 
179
188
  if cmd == "score":
@@ -94,8 +94,10 @@ class CrossEncoderReranker:
94
94
  def _start_background_warmup(self) -> None:
95
95
  """Start worker and load model in background thread.
96
96
 
97
- Returns immediately. The worker loads the model in parallel
98
- with the rest of engine initialization and the first recall.
97
+ V3.3.16: Uses _send_request (lock-protected) instead of raw
98
+ stdin/stdout access. Previous code wrote to stdin without the
99
+ lock, creating a race where the warmup's readline thread could
100
+ steal responses meant for _send_request → deadlock → timeout.
99
101
  """
100
102
  if self._worker_loading or self._model_loaded:
101
103
  return
@@ -106,26 +108,18 @@ class CrossEncoderReranker:
106
108
  self._ensure_worker()
107
109
  if self._worker_proc is None:
108
110
  return
109
- # Send load command and wait for response
110
- req = json.dumps({
111
+ resp = self._send_request({
111
112
  "cmd": "load",
112
113
  "model_name": self._model_name,
113
114
  "backend": self._backend,
114
- }) + "\n"
115
- self._worker_proc.stdin.write(req)
116
- self._worker_proc.stdin.flush()
117
- resp_line = self._readline_with_timeout(
118
- self._worker_proc.stdout, _SUBPROCESS_RESPONSE_TIMEOUT,
119
- )
120
- if resp_line:
121
- resp = json.loads(resp_line)
122
- if resp.get("ok"):
123
- self._model_loaded = True
124
- logger.info(
125
- "Reranker worker warm (backend=%s)",
126
- resp.get("backend", "?"),
127
- )
128
- self._reset_idle_timer()
115
+ }, timeout=_SUBPROCESS_RESPONSE_TIMEOUT)
116
+ if resp and resp.get("ok"):
117
+ self._model_loaded = True
118
+ logger.info(
119
+ "Reranker worker warm (backend=%s, warmup_inference=%s)",
120
+ resp.get("backend", "?"),
121
+ resp.get("warmup_inference", False),
122
+ )
129
123
  except Exception as exc:
130
124
  logger.debug("Background reranker warmup failed: %s", exc)
131
125
  finally: