superlocalmemory 3.3.1 → 3.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -16,6 +16,18 @@ SuperLocalMemory V3 - Intelligent local memory system for AI coding assistants.
16
16
 
17
17
  ---
18
18
 
19
+ ## [3.3.3] - 2026-04-01 — Langevin Awakening
20
+
21
+ ### Fixed
22
+ - **Langevin dynamics now active** — positions were never initialized at store time, causing the entire Langevin lifecycle system to be inert (0 positioned facts). New facts now receive near-origin positions (Strategy A).
23
+ - **Backfill for existing facts** — maintenance now initializes unpositioned facts using metadata-aware equilibrium seeding (Strategy B) followed by 50-step burn-in (Strategy C). Old, rarely-accessed facts land in their correct lifecycle zones immediately.
24
+
25
+ ### Improved
26
+ - Maintenance returns `langevin_backfilled` count for observability
27
+ - Health check now reports positioned facts accurately after backfill
28
+
29
+ ---
30
+
19
31
  ## [3.3.0] - 2026-03-31 — The Living Brain
20
32
 
21
33
  ### New Features
@@ -5,7 +5,8 @@
5
5
  "args": [
6
6
  "mcp"
7
7
  ],
8
- "description": "SuperLocalMemory V3 - 100% local memory system"
8
+ "description": "SuperLocalMemory V3 - 100% local memory system",
9
+ "type": "stdio"
9
10
  }
10
11
  }
11
12
  }
@@ -5,7 +5,8 @@
5
5
  "args": [
6
6
  "mcp"
7
7
  ],
8
- "description": "SuperLocalMemory V3 - 100% local memory system for AI assistants"
8
+ "description": "SuperLocalMemory V3 - 100% local memory system for AI assistants",
9
+ "type": "stdio"
9
10
  }
10
11
  }
11
12
  }
@@ -5,7 +5,8 @@
5
5
  "args": [
6
6
  "mcp"
7
7
  ],
8
- "description": "SuperLocalMemory V3 - Local memory system with mathematical foundations and knowledge graphs"
8
+ "description": "SuperLocalMemory V3 - Local memory system with mathematical foundations and knowledge graphs",
9
+ "type": "stdio"
9
10
  }
10
11
  }
11
12
  }
@@ -5,7 +5,8 @@
5
5
  "args": [
6
6
  "mcp"
7
7
  ],
8
- "description": "SuperLocalMemory V3"
8
+ "description": "SuperLocalMemory V3",
9
+ "type": "stdio"
9
10
  }
10
11
  }
11
12
  }
@@ -5,7 +5,8 @@
5
5
  "args": [
6
6
  "mcp"
7
7
  ],
8
- "description": "SuperLocalMemory V3"
8
+ "description": "SuperLocalMemory V3",
9
+ "type": "stdio"
9
10
  }
10
11
  }
11
12
  }
@@ -4,6 +4,7 @@
4
4
  "args": [
5
5
  "mcp"
6
6
  ],
7
- "env": {}
7
+ "env": {},
8
+ "type": "stdio"
8
9
  }
9
10
  }
@@ -5,7 +5,8 @@
5
5
  "args": [
6
6
  "mcp"
7
7
  ],
8
- "description": "SuperLocalMemory V3 - 100% local memory system for AI assistants"
8
+ "description": "SuperLocalMemory V3 - 100% local memory system for AI assistants",
9
+ "type": "stdio"
9
10
  }
10
11
  }
11
12
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "superlocalmemory",
3
- "version": "3.3.1",
3
+ "version": "3.3.3",
4
4
  "description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
5
5
  "keywords": [
6
6
  "ai-memory",
package/pyproject.toml CHANGED
@@ -1,10 +1,10 @@
1
1
  [project]
2
2
  name = "superlocalmemory"
3
- version = "3.3.1"
3
+ version = "3.3.3"
4
4
  description = "Information-geometric agent memory with mathematical guarantees"
5
5
  readme = "README.md"
6
6
  license = {text = "MIT"}
7
- requires-python = ">=3.11"
7
+ requires-python = ">=3.11,<3.15"
8
8
  authors = [
9
9
  {name = "Varun Pratap Bhardwaj", email = "admin@superlocalmemory.com"},
10
10
  ]
@@ -48,11 +48,13 @@ dependencies = [
48
48
 
49
49
  [project.optional-dependencies]
50
50
  search = [
51
- "sentence-transformers>=2.5.0,<4.0.0",
51
+ "sentence-transformers>=4.0.0",
52
+ "sentence-transformers[onnx]>=4.0.0",
52
53
  "einops>=0.8.2",
53
54
  "torch>=2.2.0",
54
55
  "scikit-learn>=1.3.0,<2.0.0",
55
56
  "geoopt>=0.5.0",
57
+ "onnxruntime>=1.17.0",
56
58
  ]
57
59
  ui = [
58
60
  "fastapi[all]>=0.135.1",
@@ -72,6 +74,7 @@ full = [
72
74
  dev = [
73
75
  "pytest>=8.0",
74
76
  "pytest-cov>=4.1",
77
+ "sqlite-vec>=0.1.6",
75
78
  ]
76
79
 
77
80
  [project.urls]
@@ -112,20 +112,27 @@ if (pipInstall(coreDeps, 'core')) {
112
112
  console.log(' Run manually: pip install ' + coreDeps.join(' '));
113
113
  }
114
114
 
115
- // Search dependencies (IMPORTANT — enables semantic search, 4-channel retrieval)
116
- const searchDeps = ['sentence-transformers>=2.5.0', 'einops>=0.7.0', 'geoopt>=0.5.0'];
115
+ // Search + ONNX reranking (V3.3.2 — enables 6-channel retrieval + cross-encoder)
116
+ const searchDeps = [
117
+ 'sentence-transformers[onnx]>=4.0.0',
118
+ 'einops>=0.7.0', 'geoopt>=0.5.0',
119
+ 'onnxruntime>=1.17.0',
120
+ ];
117
121
 
118
- console.log('\nInstalling semantic search engine (downloads ~500MB on first use)...');
122
+ console.log('\nInstalling semantic search + ONNX reranking engine...');
123
+ console.log(' (sentence-transformers 4+, ONNX Runtime, Fisher-Rao geometry)');
119
124
  if (pipInstall(searchDeps, 'search')) {
120
- console.log('✓ Semantic search engine installed (sentence-transformers + einops + Fisher-Rao)');
125
+ console.log('✓ Search engine installed (sentence-transformers + ONNX + Fisher-Rao)');
126
+ console.log(' Cross-encoder reranking enabled for ALL modes (+30pp quality)');
121
127
  console.log('');
122
- console.log(' Note: The embedding model (nomic-ai/nomic-embed-text-v1.5, ~500MB)');
123
- console.log(' will download automatically on first use (slm remember / slm recall).');
128
+ console.log(' Models auto-download on first use:');
129
+ console.log(' - Embedding: nomic-ai/nomic-embed-text-v1.5 (~500MB)');
130
+ console.log(' - Reranker: cross-encoder/ms-marco-MiniLM-L-6-v2 (~90MB)');
124
131
  console.log(' To pre-download now, run: slm warmup');
125
132
  } else {
126
- console.log('⚠ Semantic search installation failed (BM25 keyword search still works).');
127
- console.log(' For full 4-channel retrieval, run:');
128
- console.log(' pip install sentence-transformers einops geoopt');
133
+ console.log('⚠ Search engine installation failed (BM25 keyword search still works).');
134
+ console.log(' For full 6-channel retrieval + reranking, run:');
135
+ console.log(' pip install "sentence-transformers[onnx]>=4.0.0" einops geoopt onnxruntime');
129
136
  }
130
137
 
131
138
  // Dashboard dependencies (IMPORTANT — enables web dashboard + MCP server)
@@ -993,35 +993,64 @@ def cmd_dashboard(args: Namespace) -> None:
993
993
  print("Or install manually: pip install 'fastapi[all]' uvicorn")
994
994
  sys.exit(1)
995
995
 
996
+ import os
997
+ import signal
996
998
  import socket
997
999
 
998
1000
  port = getattr(args, "port", 8765)
999
1001
 
1000
- def _find_port(preferred: int) -> int:
1001
- for p in [preferred] + list(range(preferred + 1, preferred + 20)):
1002
- try:
1003
- with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
1004
- s.bind(("127.0.0.1", p))
1005
- return p
1006
- except OSError:
1007
- continue
1008
- return preferred
1002
+ def _kill_existing_on_port(target_port: int) -> None:
1003
+ """Kill any existing SLM dashboard on the target port.
1009
1004
 
1010
- ui_port = _find_port(port)
1011
- if ui_port != port:
1012
- print(f" Port {port} in use using {ui_port} instead")
1005
+ V3.3.2: ONE port, no auto-increment. If port is busy with
1006
+ another SLM instance, kill it. If busy with a non-SLM process,
1007
+ warn and exit — never silently shift to a different port.
1008
+ """
1009
+ if sys.platform == "win32":
1010
+ return # Windows: user must close manually
1011
+ try:
1012
+ import subprocess
1013
+ result = subprocess.run(
1014
+ ["lsof", "-ti", f":{target_port}"],
1015
+ capture_output=True, text=True, timeout=5,
1016
+ )
1017
+ if result.returncode == 0 and result.stdout.strip():
1018
+ pids = result.stdout.strip().split("\n")
1019
+ for pid_str in pids:
1020
+ pid = int(pid_str.strip())
1021
+ if pid == os.getpid():
1022
+ continue
1023
+ # Check if it's an SLM/Python process
1024
+ ps_result = subprocess.run(
1025
+ ["ps", "-p", str(pid), "-o", "command="],
1026
+ capture_output=True, text=True, timeout=5,
1027
+ )
1028
+ cmd = ps_result.stdout.strip().lower()
1029
+ if "superlocalmemory" in cmd or "slm" in cmd or "uvicorn" in cmd:
1030
+ os.kill(pid, signal.SIGTERM)
1031
+ print(f" Stopped previous dashboard (PID {pid})")
1032
+ import time
1033
+ time.sleep(1)
1034
+ except Exception:
1035
+ pass # Best-effort
1036
+
1037
+ _kill_existing_on_port(port)
1038
+
1039
+ # Brief wait for port to fully release after killing old process
1040
+ import time
1041
+ time.sleep(1)
1013
1042
 
1014
1043
  print("=" * 60)
1015
1044
  print(" SuperLocalMemory V3 — Web Dashboard")
1016
1045
  print("=" * 60)
1017
- print(f" Dashboard: http://localhost:{ui_port}")
1018
- print(f" API Docs: http://localhost:{ui_port}/api/docs")
1046
+ print(f" Dashboard: http://localhost:{port}")
1047
+ print(f" API Docs: http://localhost:{port}/api/docs")
1019
1048
  print(" Press Ctrl+C to stop\n")
1020
1049
 
1021
1050
  from superlocalmemory.server.ui import create_app
1022
1051
 
1023
1052
  app = create_app()
1024
- uvicorn.run(app, host="127.0.0.1", port=ui_port, log_level="info")
1053
+ uvicorn.run(app, host="127.0.0.1", port=port, log_level="info")
1025
1054
 
1026
1055
 
1027
1056
  # -- Profiles (supports --json) -------------------------------------------
@@ -152,9 +152,10 @@ class RetrievalConfig:
152
152
  entity_graph_max_hops: int = 3
153
153
  temporal_proximity_days: int = 30
154
154
 
155
- # Reranking
155
+ # Reranking (V3.3.2: ONNX backend enabled for all modes)
156
156
  use_cross_encoder: bool = True
157
- cross_encoder_model: str = "BAAI/bge-reranker-v2-m3"
157
+ cross_encoder_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
158
+ cross_encoder_backend: str = "onnx" # "onnx" (~200MB) or "" (PyTorch, ~1.5GB)
158
159
 
159
160
  # Agentic (Mode C only)
160
161
  agentic_max_rounds: int = 3
@@ -611,6 +612,15 @@ class SLMConfig:
611
612
 
612
613
  rt = data.get("retrieval", {})
613
614
  if rt:
615
+ # V3.3.2 migration: auto-enable ONNX cross-encoder.
616
+ # Pre-3.3.2 configs had use_cross_encoder=False because the
617
+ # PyTorch cross-encoder used ~1.5GB RAM. With ONNX backend
618
+ # (~200MB), it's now safe for all modes. Detect old configs
619
+ # by the absence of cross_encoder_backend field.
620
+ if "cross_encoder_backend" not in rt:
621
+ rt["use_cross_encoder"] = True
622
+ rt["cross_encoder_model"] = "cross-encoder/ms-marco-MiniLM-L-6-v2"
623
+ rt["cross_encoder_backend"] = "onnx"
614
624
  config.retrieval = RetrievalConfig(**{
615
625
  k: v for k, v in rt.items()
616
626
  if k in RetrievalConfig.__dataclass_fields__
@@ -650,6 +660,8 @@ class SLMConfig:
650
660
  },
651
661
  "retrieval": {
652
662
  "use_cross_encoder": self.retrieval.use_cross_encoder,
663
+ "cross_encoder_model": self.retrieval.cross_encoder_model,
664
+ "cross_encoder_backend": self.retrieval.cross_encoder_backend,
653
665
  },
654
666
  }
655
667
 
@@ -725,8 +737,8 @@ class SLMConfig:
725
737
  ),
726
738
  llm=LLMConfig(), # No LLM
727
739
  retrieval=RetrievalConfig(
728
- # Mode A: no cross-encoder (saves ~1.5GB PyTorch RAM)
729
- use_cross_encoder=False,
740
+ # V3.3.2: ONNX cross-encoder enabled for all modes (~200MB)
741
+ use_cross_encoder=True,
730
742
  ),
731
743
  math=MathConfig(
732
744
  sheaf_contradiction_threshold=0.45, # 768d threshold
@@ -750,8 +762,8 @@ class SLMConfig:
750
762
  api_key=llm_api_key or "",
751
763
  ),
752
764
  retrieval=RetrievalConfig(
753
- # Mode B: no cross-encoder (saves ~1.5GB PyTorch RAM)
754
- use_cross_encoder=False,
765
+ # V3.3.2: ONNX cross-encoder enabled for all modes (~200MB)
766
+ use_cross_encoder=True,
755
767
  ),
756
768
  )
757
769
 
@@ -23,6 +23,7 @@ Part of Qualixar | Author: Varun Pratap Bhardwaj
23
23
  from __future__ import annotations
24
24
 
25
25
  import json
26
+ import signal
26
27
  import sys
27
28
  import os
28
29
 
@@ -34,6 +35,11 @@ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
34
35
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
35
36
  os.environ["TORCH_DEVICE"] = "cpu"
36
37
 
38
+ # SIGTERM bridge: Docker/systemd send SIGTERM to stop processes.
39
+ # Without this, the worker ignores SIGTERM and becomes a zombie.
40
+ if sys.platform != "win32":
41
+ signal.signal(signal.SIGTERM, lambda *_: sys.exit(0))
42
+
37
43
 
38
44
  def _worker_main() -> None:
39
45
  """Main loop: read JSON requests from stdin, write responses to stdout."""
@@ -97,7 +103,14 @@ def _worker_main() -> None:
97
103
  _respond({"ok": False, "error": f"Model load failed: {exc}"})
98
104
  continue
99
105
  try:
100
- vecs = model.encode(texts, normalize_embeddings=True)
106
+ # torch.inference_mode prevents autograd graph accumulation
107
+ # which causes silent memory leaks over long-running sessions.
108
+ try:
109
+ import torch
110
+ with torch.inference_mode():
111
+ vecs = model.encode(texts, normalize_embeddings=True)
112
+ except ImportError:
113
+ vecs = model.encode(texts, normalize_embeddings=True)
101
114
  if isinstance(vecs, np.ndarray) and vecs.ndim == 2:
102
115
  result = [vecs[i].tolist() for i in range(vecs.shape[0])]
103
116
  else:
@@ -45,7 +45,8 @@ class DimensionMismatchError(RuntimeError):
45
45
 
46
46
 
47
47
  _IDLE_TIMEOUT_SECONDS = 120 # 2 minutes — kill worker after idle
48
- _SUBPROCESS_RESPONSE_TIMEOUT = 60 # seconds max wait for worker response
48
+ _SUBPROCESS_RESPONSE_TIMEOUT = 120 # V3.3.2: 120s for ONNX cold start
49
+ _WORKER_RECYCLE_AFTER = 1000 # Recycle worker after N requests (C++ fragmentation prevention)
49
50
 
50
51
 
51
52
  class EmbeddingService:
@@ -66,6 +67,7 @@ class EmbeddingService:
66
67
  self._last_used: float = 0.0
67
68
  self._idle_timer: threading.Timer | None = None
68
69
  self._worker_ready = False
70
+ self._request_count: int = 0
69
71
 
70
72
  @property
71
73
  def is_available(self) -> bool:
@@ -144,6 +146,13 @@ class EmbeddingService:
144
146
  never hangs indefinitely on cold model loads or network issues.
145
147
  """
146
148
  with self._lock:
149
+ # Worker recycling: restart after N requests to prevent
150
+ # C++ allocator fragmentation over long-running sessions.
151
+ if self._request_count >= _WORKER_RECYCLE_AFTER and self._worker_proc is not None:
152
+ logger.info("Recycling embedding worker after %d requests", self._request_count)
153
+ self._kill_worker()
154
+ self._request_count = 0
155
+
147
156
  self._ensure_worker()
148
157
  if self._worker_proc is None:
149
158
  return None
@@ -176,6 +185,7 @@ class EmbeddingService:
176
185
  logger.warning("Worker error: %s", resp.get("error"))
177
186
  return None
178
187
  self._reset_idle_timer()
188
+ self._request_count += 1
179
189
  return resp["vectors"]
180
190
  except (BrokenPipeError, OSError, json.JSONDecodeError) as exc:
181
191
  logger.warning(
@@ -235,6 +245,7 @@ class EmbeddingService:
235
245
  text=True,
236
246
  bufsize=1,
237
247
  env=env,
248
+ start_new_session=True, # Prevent terminal signals bleeding to worker
238
249
  )
239
250
  logger.info("Embedding worker spawned (PID %d)", self._worker_proc.pid)
240
251
  self._worker_ready = True
@@ -437,7 +437,10 @@ def init_retrieval(
437
437
 
438
438
  reranker = None
439
439
  if config.retrieval.use_cross_encoder:
440
- reranker = CrossEncoderReranker(config.retrieval.cross_encoder_model)
440
+ reranker = CrossEncoderReranker(
441
+ config.retrieval.cross_encoder_model,
442
+ backend=config.retrieval.cross_encoder_backend,
443
+ )
441
444
 
442
445
  profile_ch = ProfileChannel(db)
443
446
  bridge = BridgeDiscovery(db)
@@ -6,6 +6,7 @@
6
6
 
7
7
  Periodic batch processing for mathematical layers:
8
8
  1. Langevin batch_step on all active facts (self-organization)
9
+ 1a. Backfill: seed uninitialized facts with metadata-aware positions (B+C)
9
10
  2. Sheaf batch consistency check on recent facts
10
11
  3. Fisher adaptive temperature recalculation
11
12
 
@@ -18,15 +19,72 @@ License: MIT
18
19
  from __future__ import annotations
19
20
 
20
21
  import logging
22
+ import math as _math
21
23
  from datetime import UTC, datetime, timedelta
22
24
  from typing import TYPE_CHECKING
23
25
 
26
+ import numpy as np
27
+
24
28
  if TYPE_CHECKING:
25
29
  from superlocalmemory.core.config import SLMConfig
26
30
  from superlocalmemory.storage.database import DatabaseManager
27
31
 
28
32
  logger = logging.getLogger(__name__)
29
33
 
34
+ # Backfill constants
35
+ _BACKFILL_BURN_IN_STEPS = 50
36
+ _LANGEVIN_DIM = 8
37
+ _MAX_NORM = 0.99
38
+
39
+
40
+ def _compute_equilibrium_radius(
41
+ access_count: int,
42
+ age_days: float,
43
+ importance: float,
44
+ temperature: float = 0.3,
45
+ dim: int = 8,
46
+ ) -> float:
47
+ """Compute metadata-aware equilibrium radius (Strategy B).
48
+
49
+ Uses the Langevin potential coefficients to estimate where a fact
50
+ would settle if it had been in the dynamics from the start.
51
+
52
+ r_eq ≈ sqrt(T * dim / (2 * effective_alpha))
53
+ """
54
+ alpha, beta, gamma, delta = 3.0, 0.8, 0.005, 0.5
55
+ effective_alpha = (
56
+ alpha
57
+ + beta * _math.log(access_count + 1) / 10.0
58
+ - gamma * min(age_days, 365.0) / 365.0
59
+ + delta * importance
60
+ )
61
+ effective_alpha = max(0.1, effective_alpha)
62
+ r_eq = _math.sqrt(temperature * dim / (2.0 * effective_alpha))
63
+ return min(r_eq, _MAX_NORM * 0.95)
64
+
65
+
66
+ def _seed_langevin_position(
67
+ access_count: int,
68
+ age_days: float,
69
+ importance: float,
70
+ temperature: float = 0.3,
71
+ dim: int = 8,
72
+ ) -> list[float]:
73
+ """Create a metadata-aware initial position (Strategy B).
74
+
75
+ Places the fact at the equilibrium radius with a random direction.
76
+ """
77
+ r_eq = _compute_equilibrium_radius(
78
+ access_count, age_days, importance, temperature, dim,
79
+ )
80
+ rng = np.random.default_rng()
81
+ direction = rng.standard_normal(dim)
82
+ norm = float(np.linalg.norm(direction))
83
+ if norm < 1e-8:
84
+ direction = np.ones(dim)
85
+ norm = float(np.linalg.norm(direction))
86
+ return (direction / norm * r_eq).tolist()
87
+
30
88
 
31
89
  def run_maintenance(
32
90
  db: DatabaseManager,
@@ -44,6 +102,7 @@ def run_maintenance(
44
102
  Dict of counts: langevin_updated, sheaf_checked, etc.
45
103
  """
46
104
  counts: dict[str, int] = {
105
+ "langevin_backfilled": 0,
47
106
  "langevin_updated": 0,
48
107
  "fisher_coupled": 0,
49
108
  "sheaf_checked": 0,
@@ -53,13 +112,60 @@ def run_maintenance(
53
112
  if not facts:
54
113
  return counts
55
114
 
56
- # 1. Langevin batch step
115
+ # 1a. Backfill: seed uninitialized facts with metadata-aware positions (B+C)
116
+ if config.math.langevin_persist_positions:
117
+ try:
118
+ from superlocalmemory.math.langevin import LangevinDynamics
119
+
120
+ ld = LangevinDynamics(
121
+ dim=_LANGEVIN_DIM,
122
+ dt=config.math.langevin_dt,
123
+ temperature=config.math.langevin_temperature,
124
+ )
125
+
126
+ backfilled = 0
127
+ for f in facts:
128
+ if f.langevin_position is not None:
129
+ continue
130
+ created = datetime.fromisoformat(
131
+ f.created_at.replace("Z", "+00:00")
132
+ ) if f.created_at else datetime.now(UTC)
133
+ age_days = max(
134
+ 0.0,
135
+ (datetime.now(UTC) - created).total_seconds() / 86400.0,
136
+ )
137
+ # Strategy B: metadata-aware seed position
138
+ position = _seed_langevin_position(
139
+ f.access_count, age_days, f.importance,
140
+ config.math.langevin_temperature, _LANGEVIN_DIM,
141
+ )
142
+ # Strategy C: burn-in from the seeded position
143
+ for step_i in range(_BACKFILL_BURN_IN_STEPS):
144
+ position, _ = ld.step(
145
+ position, f.access_count, age_days, f.importance,
146
+ )
147
+ weight = ld.compute_lifecycle_weight(position)
148
+ lifecycle = ld.get_lifecycle_state(weight).value
149
+ db.update_fact(f.fact_id, {
150
+ "langevin_position": position,
151
+ "lifecycle": lifecycle,
152
+ })
153
+ f.langevin_position = position # update in-memory for step 1b
154
+ backfilled += 1
155
+
156
+ counts["langevin_backfilled"] = backfilled
157
+ if backfilled:
158
+ logger.info("Langevin backfill: %d facts initialized", backfilled)
159
+ except Exception as exc:
160
+ logger.warning("Langevin backfill failed: %s", exc)
161
+
162
+ # 1b. Langevin batch step on all positioned facts
57
163
  if config.math.langevin_persist_positions:
58
164
  try:
59
165
  from superlocalmemory.math.langevin import LangevinDynamics
60
166
 
61
167
  ld = LangevinDynamics(
62
- dim=8,
168
+ dim=_LANGEVIN_DIM,
63
169
  dt=config.math.langevin_dt,
64
170
  temperature=config.math.langevin_temperature,
65
171
  )
@@ -165,8 +271,8 @@ def run_maintenance(
165
271
  logger.warning("Sheaf maintenance failed: %s", exc)
166
272
 
167
273
  logger.info(
168
- "Maintenance complete: %d Langevin, %d Fisher-coupled, %d Sheaf",
169
- counts["langevin_updated"], counts["fisher_coupled"],
170
- counts["sheaf_checked"],
274
+ "Maintenance complete: %d backfilled, %d Langevin, %d Fisher-coupled, %d Sheaf",
275
+ counts["langevin_backfilled"], counts["langevin_updated"],
276
+ counts["fisher_coupled"], counts["sheaf_checked"],
171
277
  )
172
278
  return counts
@@ -68,7 +68,7 @@ MODE_A = ModeCapabilities(
68
68
  description=(
69
69
  "Local Guardian — Zero LLM, zero cloud. "
70
70
  "Uses nomic-embed-text-v1.5 encoder (768d, 8K context) for embeddings. "
71
- "spaCy + rules for extraction. Cross-encoder for reranking. "
71
+ "spaCy + rules for extraction. ONNX cross-encoder reranking (~200MB). "
72
72
  "Full EU AI Act compliance. Target: 65%+"
73
73
  ),
74
74
  )
@@ -89,6 +89,7 @@ MODE_B = ModeCapabilities(
89
89
  description=(
90
90
  "Smart Local — Local Ollama LLM (Phi-3, Llama 3.2). "
91
91
  "LLM-quality extraction and classification, fully local. "
92
+ "ONNX cross-encoder reranking (~200MB). "
92
93
  "No cloud, no data export. EU AI Act compliant. Target: 75-80%"
93
94
  ),
94
95
  )
@@ -18,6 +18,7 @@ from __future__ import annotations
18
18
 
19
19
  import json
20
20
  import os
21
+ import signal
21
22
  import sys
22
23
 
23
24
  # Force CPU BEFORE any torch import
@@ -28,6 +29,11 @@ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
28
29
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
29
30
  os.environ["TORCH_DEVICE"] = "cpu"
30
31
 
32
+ # SIGTERM bridge: Docker/systemd send SIGTERM to stop processes.
33
+ # Without this, the worker ignores SIGTERM and becomes a zombie.
34
+ if sys.platform != "win32":
35
+ signal.signal(signal.SIGTERM, lambda *_: sys.exit(0))
36
+
31
37
  _engine = None
32
38
 
33
39
 
@@ -223,14 +229,14 @@ def _worker_main() -> None:
223
229
  continue
224
230
 
225
231
  if cmd == "warmup":
226
- # Pre-load engine + all models (embedding, reranker, BM25, LLM)
227
- # Called at dashboard/MCP startup so first real request is fast.
228
- # A dummy recall triggers lazy-loaded components (cross-encoder, BM25 index).
232
+ # Pre-load engine + database + embeddings only.
233
+ # V3.3.2: Do NOT run a dummy recall — it triggers the ONNX
234
+ # cross-encoder export (~30s) which combined with engine init
235
+ # exceeds the worker timeout. The cross-encoder loads lazily
236
+ # in a background thread on the first real recall instead.
229
237
  try:
230
238
  engine = _get_engine()
231
239
  fact_count = engine._db.get_fact_count(engine._profile_id) if engine._db else 0
232
- if fact_count > 0:
233
- engine.recall("warmup", limit=1)
234
240
  _respond({"ok": True, "message": "Engine warm", "facts": fact_count})
235
241
  except Exception as exc:
236
242
  _respond({"ok": False, "error": f"Warmup failed: {exc}"})
@@ -25,6 +25,25 @@ from superlocalmemory.storage.models import (
25
25
 
26
26
  logger = logging.getLogger(__name__)
27
27
 
28
+ # Langevin initialization radius for new facts (ACTIVE zone < 0.3)
29
+ _INIT_LANGEVIN_RADIUS = 0.05
30
+
31
+
32
+ def _init_langevin_position(dim: int = 8) -> list[float]:
33
+ """Initialize Langevin position near origin for a new fact.
34
+
35
+ Small random perturbation ensures each fact gets a unique position
36
+ while staying deep in the ACTIVE zone (radius < 0.3).
37
+ """
38
+ import numpy as np
39
+ rng = np.random.default_rng()
40
+ direction = rng.standard_normal(dim)
41
+ norm = float(np.linalg.norm(direction))
42
+ if norm < 1e-8:
43
+ direction = np.ones(dim)
44
+ norm = float(np.linalg.norm(direction))
45
+ return (direction / norm * _INIT_LANGEVIN_RADIUS).tolist()
46
+
28
47
 
29
48
  # ---------------------------------------------------------------------------
30
49
  # enrich_fact (was MemoryEngine._enrich_fact)
@@ -59,6 +78,10 @@ def enrich_fact(
59
78
  emotion = tag_emotion(fact.content)
60
79
  signal = infer_signal(fact.content)
61
80
 
81
+ # Strategy A: initialize Langevin position near origin (ACTIVE zone).
82
+ # New facts start as ACTIVE; dynamics will evolve them based on access patterns.
83
+ langevin_pos = _init_langevin_position(dim=8)
84
+
62
85
  return AtomicFact(
63
86
  fact_id=fact.fact_id, memory_id=record.memory_id,
64
87
  profile_id=profile_id, content=fact.content,
@@ -73,6 +96,7 @@ def enrich_fact(
73
96
  evidence_count=fact.evidence_count,
74
97
  source_turn_ids=fact.source_turn_ids, session_id=record.session_id,
75
98
  embedding=embedding, fisher_mean=fisher_mean, fisher_variance=fisher_variance,
99
+ langevin_position=langevin_pos,
76
100
  emotional_valence=emotion.valence, emotional_arousal=emotion.arousal,
77
101
  signal_type=signal, created_at=fact.created_at,
78
102
  )
@@ -29,8 +29,9 @@ import time
29
29
  logger = logging.getLogger(__name__)
30
30
 
31
31
  _IDLE_TIMEOUT = 120 # 2 min — kill worker after idle
32
- _REQUEST_TIMEOUT = 60 # 60 sec max per request
33
- _WARMUP_TIMEOUT = 120 # 2 min — first cold start loads PyTorch + models
32
+ _REQUEST_TIMEOUT = 120 # 120 sec per request (V3.3.2: ONNX cold start can take 30-60s)
33
+ _WARMUP_TIMEOUT = 180 # 3 min — first cold start: engine + ONNX export + models
34
+ _WORKER_RECYCLE_AFTER = 1000 # Recycle worker after N requests (C++ fragmentation prevention)
34
35
 
35
36
 
36
37
  class WorkerPool:
@@ -49,6 +50,7 @@ class WorkerPool:
49
50
  self._proc: subprocess.Popen | None = None
50
51
  self._idle_timer: threading.Timer | None = None
51
52
  self._last_used: float = 0.0
53
+ self._request_count: int = 0
52
54
 
53
55
  @classmethod
54
56
  def shared(cls) -> WorkerPool:
@@ -146,6 +148,13 @@ class WorkerPool:
146
148
  def _send_with_timeout(self, request: dict, timeout: float) -> dict:
147
149
  """Send request with configurable timeout. Thread-safe."""
148
150
  with self._lock:
151
+ # Worker recycling: restart after N requests to prevent
152
+ # C++ allocator fragmentation over long-running sessions.
153
+ if self._request_count >= _WORKER_RECYCLE_AFTER and self._proc is not None:
154
+ logger.info("Recycling recall worker after %d requests", self._request_count)
155
+ self._kill()
156
+ self._request_count = 0
157
+
149
158
  self._ensure_worker()
150
159
  if self._proc is None:
151
160
  return {"ok": False, "error": "Worker failed to start"}
@@ -168,6 +177,7 @@ class WorkerPool:
168
177
  return {"ok": False, "error": "Worker died"}
169
178
 
170
179
  self._reset_idle_timer()
180
+ self._request_count += 1
171
181
  return json.loads(resp_line)
172
182
 
173
183
  except (BrokenPipeError, OSError, json.JSONDecodeError) as exc:
@@ -227,6 +237,7 @@ class WorkerPool:
227
237
  text=True,
228
238
  bufsize=1,
229
239
  env=env,
240
+ start_new_session=True, # Prevent terminal signals bleeding to worker
230
241
  )
231
242
  logger.info("Recall worker spawned (PID %d)", self._proc.pid)
232
243
  except Exception as exc:
@@ -194,6 +194,7 @@ class IDEConnector:
194
194
  data["mcpServers"] = {}
195
195
 
196
196
  data["mcpServers"]["superlocalmemory"] = {
197
+ "type": "stdio",
197
198
  "command": "slm",
198
199
  "args": ["mcp"],
199
200
  "enabled": True,
@@ -14,6 +14,9 @@ License: MIT
14
14
  from __future__ import annotations
15
15
 
16
16
  import logging
17
+ import platform
18
+ import struct
19
+ import sys
17
20
  import threading
18
21
  from typing import Any
19
22
 
@@ -22,56 +25,151 @@ from superlocalmemory.storage.models import AtomicFact
22
25
  logger = logging.getLogger(__name__)
23
26
 
24
27
 
28
+ def _detect_onnx_variant() -> str:
29
+ """Auto-detect the best ONNX model variant for the current platform.
30
+
31
+ Returns the file_name parameter for CrossEncoder model_kwargs.
32
+ Platform detection:
33
+ - macOS ARM64 (Apple Silicon): qint8_arm64
34
+ - x86_64 / amd64, 64-bit (AVX2 support is assumed, not probed): quint8_avx2
35
+ - Everything else: default model.onnx (float32, works everywhere)
36
+ """
37
+ arch = platform.machine().lower()
38
+ is_64bit = struct.calcsize("P") * 8 == 64
39
+
40
+ if sys.platform == "darwin" and arch in ("arm64", "aarch64"):
41
+ return "onnx/model_qint8_arm64.onnx"
42
+
43
+ if arch in ("x86_64", "amd64") and is_64bit:
44
+ return "onnx/model_quint8_avx2.onnx"
45
+
46
+ return "onnx/model.onnx"
47
+
48
+
25
49
  class CrossEncoderReranker:
26
50
  """Rerank candidate facts using a local cross-encoder model.
27
51
 
52
+ V3.3.2: Uses ONNX backend by default (~200MB) instead of full PyTorch
53
+ (~1.5GB). Three-tier fallback: ONNX → PyTorch → no reranking.
54
+ Auto-detects the optimal quantized ONNX variant per platform.
55
+
28
56
  When the model is unavailable (missing package, download failure,
29
57
  offline environment), falls back to returning candidates in their
30
58
  original score order — never crashes.
31
59
 
32
60
  Args:
33
61
  model_name: HuggingFace cross-encoder model identifier.
62
+ backend: Inference backend. "onnx" for ONNX Runtime (light),
63
+ "" for PyTorch (heavy). Default: "onnx".
34
64
  """
35
65
 
36
66
  def __init__(
37
67
  self,
38
- model_name: str = "BAAI/bge-reranker-v2-m3",
68
+ model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2",
69
+ backend: str = "onnx",
39
70
  ) -> None:
40
71
  self._model_name = model_name
72
+ self._backend = backend
41
73
  self._model: Any = None
42
74
  self._loaded = False
75
+ self._loading = False # True while background load is in progress
76
+ self._active_backend: str = ""
43
77
  self._lock = threading.Lock()
44
78
 
45
79
  # ------------------------------------------------------------------
46
- # Lazy loading
80
+ # Lazy loading (non-blocking)
47
81
  # ------------------------------------------------------------------
48
82
 
49
83
  def _ensure_model(self) -> None:
50
- """Load cross-encoder on first use (thread-safe)."""
84
+ """Trigger model load in background (non-blocking).
85
+
86
+ On first call, starts loading in a background thread and returns
87
+ immediately. The model becomes available for subsequent calls
88
+ once loading completes. This prevents the 30s ONNX cold start
89
+ from blocking the first recall request.
90
+
91
+ Three-tier fallback:
92
+ 1. ONNX backend with platform-optimal quantization — ~100-200MB RAM
93
+ 2. PyTorch backend (requires torch) — ~1.5GB RAM
94
+ 3. No model (graceful degradation) — 0 RAM
95
+ """
51
96
  if self._loaded:
52
97
  return
53
98
 
54
99
  with self._lock:
55
- if self._loaded:
56
- return # Double-check after acquiring lock
57
- try:
58
- from sentence_transformers import CrossEncoder
59
-
60
- self._model = CrossEncoder(self._model_name)
100
+ if self._loaded or self._loading:
101
+ return
102
+ self._loading = True
103
+
104
+ # Load in background thread so first recall isn't blocked
105
+ loader = threading.Thread(
106
+ target=self._load_model, daemon=True, name="ce-loader",
107
+ )
108
+ loader.start()
109
+
110
+ def _load_model(self) -> None:
111
+ """Actually load the model (runs in background thread)."""
112
+ try:
113
+ from sentence_transformers import CrossEncoder
114
+
115
+ if self._backend == "onnx":
116
+ try:
117
+ onnx_file = _detect_onnx_variant()
118
+ model = CrossEncoder(
119
+ self._model_name,
120
+ backend="onnx",
121
+ model_kwargs={"file_name": onnx_file},
122
+ )
123
+ self._model = model
124
+ self._active_backend = "onnx"
125
+ logger.info(
126
+ "Cross-encoder loaded (ONNX %s): %s",
127
+ onnx_file, self._model_name,
128
+ )
129
+ except Exception as onnx_exc:
130
+ logger.info(
131
+ "ONNX backend unavailable (%s), falling back to PyTorch",
132
+ onnx_exc,
133
+ )
134
+ model = CrossEncoder(self._model_name)
135
+ self._model = model
136
+ self._active_backend = "pytorch"
137
+ logger.info(
138
+ "Cross-encoder loaded (PyTorch fallback): %s",
139
+ self._model_name,
140
+ )
141
+ else:
142
+ model = CrossEncoder(self._model_name)
143
+ self._model = model
144
+ self._active_backend = "pytorch"
61
145
  logger.info("Cross-encoder loaded: %s", self._model_name)
62
- except ImportError:
63
- logger.warning(
64
- "sentence-transformers not installed; "
65
- "cross-encoder reranking disabled"
66
- )
67
- except OSError as exc:
68
- logger.warning(
69
- "Failed to load cross-encoder %s: %s",
70
- self._model_name,
71
- exc,
72
- )
73
- finally:
74
- self._loaded = True
146
+ except ImportError:
147
+ logger.warning(
148
+ "sentence-transformers not installed; "
149
+ "cross-encoder reranking disabled"
150
+ )
151
+ except OSError as exc:
152
+ logger.warning(
153
+ "Failed to load cross-encoder %s: %s",
154
+ self._model_name,
155
+ exc,
156
+ )
157
+ finally:
158
+ self._loaded = True
159
+ self._loading = False
160
+
161
+ def _ensure_model_blocking(self) -> None:
162
+ """Load model synchronously (blocks until ready).
163
+
164
+ Used by warmup and is_available where we need the model NOW.
165
+ """
166
+ if self._loaded:
167
+ return
168
+ with self._lock:
169
+ if self._loaded:
170
+ return
171
+ self._loading = True
172
+ self._load_model()
75
173
 
76
174
  # ------------------------------------------------------------------
77
175
  # Public API
@@ -104,10 +202,13 @@ class CrossEncoderReranker:
104
202
  if not candidates:
105
203
  return []
106
204
 
205
+ # Non-blocking: trigger background load if not yet started
107
206
  self._ensure_model()
108
207
 
109
208
  if self._model is None:
110
- # Fallback: keep existing score order
209
+ # Model not loaded yet (still loading in background or failed).
210
+ # Graceful fallback: return candidates sorted by existing score.
211
+ # Next recall will use the model once it's ready.
111
212
  sorted_cands = sorted(
112
213
  candidates, key=lambda x: x[1], reverse=True
113
214
  )
@@ -150,5 +251,5 @@ class CrossEncoderReranker:
150
251
  @property
151
252
  def is_available(self) -> bool:
152
253
  """Whether the cross-encoder model is loaded and ready."""
153
- self._ensure_model()
254
+ self._ensure_model_blocking()
154
255
  return self._model is not None