superlocalmemory 3.3.17 → 3.3.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "superlocalmemory",
3
- "version": "3.3.17",
3
+ "version": "3.3.18",
4
4
  "description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
5
5
  "keywords": [
6
6
  "ai-memory",
package/pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "superlocalmemory"
3
- version = "3.3.17"
3
+ version = "3.3.18"
4
4
  description = "Information-geometric agent memory with mathematical guarantees"
5
5
  readme = "README.md"
6
6
  license = {text = "MIT"}
@@ -155,7 +155,7 @@ class RetrievalConfig:
155
155
  # Reranking (V3.3.2: ONNX backend enabled for all modes)
156
156
  use_cross_encoder: bool = True
157
157
  cross_encoder_model: str = "cross-encoder/ms-marco-MiniLM-L-12-v2"
158
- cross_encoder_backend: str = "onnx" # "onnx" (~200MB) or "" (PyTorch, ~1.5GB)
158
+ cross_encoder_backend: str = "" # "" = PyTorch (~500MB stable), "onnx" = ONNX (leaks on ARM64 CoreML)
159
159
 
160
160
  # Agentic (Mode C only)
161
161
  agentic_max_rounds: int = 3
@@ -35,6 +35,8 @@ os.environ["PYTORCH_MPS_MEM_LIMIT"] = "0"
35
35
  os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
36
36
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
37
37
  os.environ["TORCH_DEVICE"] = "cpu"
38
+ # V3.3.17: Disable CoreML EP for ONNX Runtime — uses 3-5GB on ARM64 Mac.
39
+ os.environ["ORT_DISABLE_COREML"] = "1"
38
40
 
39
41
  # SIGTERM bridge: Docker/systemd send SIGTERM to stop processes.
40
42
  # Without this, the worker ignores SIGTERM and becomes a zombie.
@@ -65,6 +67,34 @@ def _start_parent_watchdog() -> None:
65
67
  t.start()
66
68
 
67
69
 
70
def _load_embedding_model(name: str) -> tuple:
    """Load embedding model. ONNX first, PyTorch CPU as fallback.

    V3.3.17: PyTorch SentenceTransformer on ARM64 Mac leaks memory —
    grows from 300MB to 17GB after ~200 encode calls. ONNX Runtime
    has no such issue. Same approach as CrossEncoder ONNX migration.

    Every failure is logged (not silently swallowed) so the reason a
    backend was skipped can be diagnosed. Nothing is written to stdout,
    which the worker loop uses for its JSON responses.

    Args:
        name: Model identifier handed to ``SentenceTransformer``.

    Returns:
        ``(model, backend_name)`` with ``backend_name`` being ``"onnx"`` or
        ``"pytorch"``, or ``(None, "")`` when no backend could be loaded,
        including when ``sentence_transformers`` itself cannot be imported.
    """
    import logging

    log = logging.getLogger(__name__)

    # The import is guarded so that a missing/broken installation yields the
    # documented (None, "") result instead of an exception escaping into the
    # worker loop. Broad catch on purpose: importing heavy ML packages can
    # fail with OSError/RuntimeError as well as ImportError.
    try:
        from sentence_transformers import SentenceTransformer
    except Exception as exc:
        log.warning("sentence_transformers import failed: %s", exc)
        return None, ""

    # NOTE(review): trust_remote_code=True lets the model repository run its
    # own Python code at load time; confirm `name` only ever comes from a
    # trusted source.

    # Tier 1: ONNX (stable memory, ~200MB footprint)
    try:
        onnx_model = SentenceTransformer(
            name, backend="onnx", trust_remote_code=True,
        )
    except Exception as exc:
        log.warning(
            "ONNX backend failed for %r, falling back to PyTorch: %s",
            name, exc,
        )
    else:
        return onnx_model, "onnx"

    # Tier 2: PyTorch CPU (stable at ~1.4GB after 100+ calls, verified)
    try:
        import torch
        with torch.inference_mode():
            torch_model = SentenceTransformer(
                name, trust_remote_code=True, device="cpu",
            )
    except Exception as exc:
        log.warning("PyTorch backend failed for %r: %s", name, exc)
        return None, ""
    return torch_model, "pytorch"
96
+
97
+
68
98
  def _worker_main() -> None:
69
99
  """Main loop: read JSON requests from stdin, write responses to stdout."""
70
100
  _start_parent_watchdog() # V3.3.7: self-terminate if parent dies
@@ -97,18 +127,17 @@ def _worker_main() -> None:
97
127
  if cmd == "load":
98
128
  name = req.get("model_name", "nomic-ai/nomic-embed-text-v1.5")
99
129
  expected_dim = req.get("dimension", 768)
100
- try:
101
- from sentence_transformers import SentenceTransformer
102
- model = SentenceTransformer(name, trust_remote_code=True, device="cpu")
130
+ model, active_backend = _load_embedding_model(name)
131
+ if model is not None:
103
132
  dim = model.get_sentence_embedding_dimension()
104
133
  if dim != expected_dim:
105
134
  _respond({"ok": False, "error": f"Dimension mismatch: {dim} != {expected_dim}"})
106
135
  model = None
107
136
  continue
108
137
  model_name = name
109
- _respond({"ok": True, "dim": dim, "model": name})
110
- except Exception as exc:
111
- _respond({"ok": False, "error": str(exc)})
138
+ _respond({"ok": True, "dim": dim, "model": name, "backend": active_backend})
139
+ else:
140
+ _respond({"ok": False, "error": "Model load failed"})
112
141
  continue
113
142
 
114
143
  if cmd == "embed":
@@ -117,26 +146,16 @@ def _worker_main() -> None:
117
146
  _respond({"ok": False, "error": "No texts provided"})
118
147
  continue
119
148
  if model is None:
120
- # Auto-load if not yet loaded
121
149
  name = req.get("model_name", "nomic-ai/nomic-embed-text-v1.5")
122
- expected_dim = req.get("dimension", 768)
123
- try:
124
- from sentence_transformers import SentenceTransformer
125
- model = SentenceTransformer(name, trust_remote_code=True, device="cpu")
150
+ model, active_backend = _load_embedding_model(name)
151
+ if model is not None:
126
152
  dim = model.get_sentence_embedding_dimension()
127
153
  model_name = name
128
- except Exception as exc:
129
- _respond({"ok": False, "error": f"Model load failed: {exc}"})
154
+ else:
155
+ _respond({"ok": False, "error": "Model load failed"})
130
156
  continue
131
157
  try:
132
- # torch.inference_mode prevents autograd graph accumulation
133
- # which causes silent memory leaks over long-running sessions.
134
- try:
135
- import torch
136
- with torch.inference_mode():
137
- vecs = model.encode(texts, normalize_embeddings=True)
138
- except ImportError:
139
- vecs = model.encode(texts, normalize_embeddings=True)
158
+ vecs = model.encode(texts, normalize_embeddings=True)
140
159
  if isinstance(vecs, np.ndarray) and vecs.ndim == 2:
141
160
  result = [vecs[i].tolist() for i in range(vecs.shape[0])]
142
161
  else:
@@ -40,6 +40,9 @@ os.environ["PYTORCH_MPS_MEM_LIMIT"] = "0"
40
40
  os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
41
41
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
42
42
  os.environ["TORCH_DEVICE"] = "cpu"
43
+ # V3.3.17: Disable CoreML EP for ONNX Runtime. CoreML compiles execution
44
+ # plans that consume 3-5GB on ARM64 Mac. CPU EP is ~500MB and fast enough.
45
+ os.environ["ORT_DISABLE_COREML"] = "1"
43
46
 
44
47
  # SIGTERM bridge for Docker/systemd
45
48
  if sys.platform != "win32":