npm - superlocalmemory - Versions diffs - 3.3.1 → 3.3.3 - Mend

superlocalmemory 3.3.1 → 3.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

package/CHANGELOG.md +12 -0
package/ide/configs/antigravity-mcp.json +2 -1
package/ide/configs/claude-desktop-mcp.json +2 -1
package/ide/configs/cursor-mcp.json +2 -1
package/ide/configs/gemini-cli-mcp.json +2 -1
package/ide/configs/jetbrains-mcp.json +2 -1
package/ide/configs/perplexity-mcp.json +2 -1
package/ide/configs/windsurf-mcp.json +2 -1
package/package.json +1 -1
package/pyproject.toml +6 -3
package/scripts/postinstall.js +16 -9
package/src/superlocalmemory/cli/commands.py +44 -15
package/src/superlocalmemory/core/config.py +18 -6
package/src/superlocalmemory/core/embedding_worker.py +14 -1
package/src/superlocalmemory/core/embeddings.py +12 -1
package/src/superlocalmemory/core/engine_wiring.py +4 -1
package/src/superlocalmemory/core/maintenance.py +111 -5
package/src/superlocalmemory/core/modes.py +2 -1
package/src/superlocalmemory/core/recall_worker.py +11 -5
package/src/superlocalmemory/core/store_pipeline.py +24 -0
package/src/superlocalmemory/core/worker_pool.py +13 -2
package/src/superlocalmemory/hooks/ide_connector.py +1 -0
package/src/superlocalmemory/retrieval/reranker.py +125 -24

package/CHANGELOG.md CHANGED Viewed

@@ -16,6 +16,18 @@ SuperLocalMemory V3 - Intelligent local memory system for AI coding assistants.
 ---
+## [3.3.3] - 2026-04-01 — Langevin Awakening
+### Fixed
+- **Langevin dynamics now active** — positions were never initialized at store time, causing the entire Langevin lifecycle system to be inert (0 positioned facts). New facts now receive near-origin positions (Strategy A).
+- **Backfill for existing facts** — maintenance now initializes unpositioned facts using metadata-aware equilibrium seeding (Strategy B) followed by 50-step burn-in (Strategy C). Old, rarely-accessed facts land in their correct lifecycle zones immediately.
+### Improved
+- Maintenance returns `langevin_backfilled` count for observability
+- Health check now reports positioned facts accurately after backfill
+---
 ## [3.3.0] - 2026-03-31 — The Living Brain
 ### New Features

package/ide/configs/antigravity-mcp.json CHANGED Viewed

@@ -5,7 +5,8 @@
       "args": [
         "mcp"
       ],
-      "description": "SuperLocalMemory V3 - 100% local memory system"
+      "description": "SuperLocalMemory V3 - 100% local memory system",
+      "type": "stdio"
     }
   }
 }

package/ide/configs/claude-desktop-mcp.json CHANGED Viewed

@@ -5,7 +5,8 @@
       "args": [
         "mcp"
       ],
-      "description": "SuperLocalMemory V3 - 100% local memory system for AI assistants"
+      "description": "SuperLocalMemory V3 - 100% local memory system for AI assistants",
+      "type": "stdio"
     }
   }
 }

package/ide/configs/cursor-mcp.json CHANGED Viewed

@@ -5,7 +5,8 @@
       "args": [
         "mcp"
       ],
-      "description": "SuperLocalMemory V3 - Local memory system with mathematical foundations and knowledge graphs"
+      "description": "SuperLocalMemory V3 - Local memory system with mathematical foundations and knowledge graphs",
+      "type": "stdio"
     }
   }
 }

package/ide/configs/gemini-cli-mcp.json CHANGED Viewed

@@ -5,7 +5,8 @@
       "args": [
         "mcp"
       ],
-      "description": "SuperLocalMemory V3"
+      "description": "SuperLocalMemory V3",
+      "type": "stdio"
     }
   }
 }

package/ide/configs/jetbrains-mcp.json CHANGED Viewed

@@ -5,7 +5,8 @@
       "args": [
         "mcp"
       ],
-      "description": "SuperLocalMemory V3"
+      "description": "SuperLocalMemory V3",
+      "type": "stdio"
     }
   }
 }

package/ide/configs/perplexity-mcp.json CHANGED Viewed

@@ -4,6 +4,7 @@
     "args": [
       "mcp"
     ],
-    "env": {}
+    "env": {},
+    "type": "stdio"
   }
 }

package/ide/configs/windsurf-mcp.json CHANGED Viewed

@@ -5,7 +5,8 @@
       "args": [
         "mcp"
       ],
-      "description": "SuperLocalMemory V3 - 100% local memory system for AI assistants"
+      "description": "SuperLocalMemory V3 - 100% local memory system for AI assistants",
+      "type": "stdio"
     }
   }
 }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "superlocalmemory",
-  "version": "3.3.1",
+  "version": "3.3.3",
   "description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
   "keywords": [
     "ai-memory",

package/pyproject.toml CHANGED Viewed

@@ -1,10 +1,10 @@
 [project]
 name = "superlocalmemory"
-version = "3.3.1"
+version = "3.3.3"
 description = "Information-geometric agent memory with mathematical guarantees"
 readme = "README.md"
 license = {text = "MIT"}
-requires-python = ">=3.11"
+requires-python = ">=3.11,<3.15"
 authors = [
     {name = "Varun Pratap Bhardwaj", email = "admin@superlocalmemory.com"},
 ]
@@ -48,11 +48,13 @@ dependencies = [
 [project.optional-dependencies]
 search = [
-    "sentence-transformers>=2.5.0,<4.0.0",
+    "sentence-transformers>=4.0.0",
+    "sentence-transformers[onnx]>=4.0.0",
     "einops>=0.8.2",
     "torch>=2.2.0",
     "scikit-learn>=1.3.0,<2.0.0",
     "geoopt>=0.5.0",
+    "onnxruntime>=1.17.0",
 ]
 ui = [
     "fastapi[all]>=0.135.1",
@@ -72,6 +74,7 @@ full = [
 dev = [
     "pytest>=8.0",
     "pytest-cov>=4.1",
+    "sqlite-vec>=0.1.6",
 ]
 [project.urls]

package/scripts/postinstall.js CHANGED Viewed

@@ -112,20 +112,27 @@ if (pipInstall(coreDeps, 'core')) {
     console.log('  Run manually: pip install ' + coreDeps.join(' '));
 }
-// Search dependencies (IMPORTANT — enables semantic search, 4-channel retrieval)
-const searchDeps = ['sentence-transformers>=2.5.0', 'einops>=0.7.0', 'geoopt>=0.5.0'];
+// Search + ONNX reranking (V3.3.2 — enables 6-channel retrieval + cross-encoder)
+const searchDeps = [
+    'sentence-transformers[onnx]>=4.0.0',
+    'einops>=0.7.0', 'geoopt>=0.5.0',
+    'onnxruntime>=1.17.0',
+];
-console.log('\nInstalling semantic search engine (downloads ~500MB on first use)...');
+console.log('\nInstalling semantic search + ONNX reranking engine...');
+console.log('  (sentence-transformers 4+, ONNX Runtime, Fisher-Rao geometry)');
 if (pipInstall(searchDeps, 'search')) {
-    console.log('✓ Semantic search engine installed (sentence-transformers + einops + Fisher-Rao)');
+    console.log('✓ Search engine installed (sentence-transformers + ONNX + Fisher-Rao)');
+    console.log('  Cross-encoder reranking enabled for ALL modes (+30pp quality)');
     console.log('');
-    console.log('  Note: The embedding model (nomic-ai/nomic-embed-text-v1.5, ~500MB)');
-    console.log('  will download automatically on first use (slm remember / slm recall).');
+    console.log('  Models auto-download on first use:');
+    console.log('    - Embedding: nomic-ai/nomic-embed-text-v1.5 (~500MB)');
+    console.log('    - Reranker: cross-encoder/ms-marco-MiniLM-L-6-v2 (~90MB)');
     console.log('  To pre-download now, run: slm warmup');
 } else {
-    console.log('⚠ Semantic search installation failed (BM25 keyword search still works).');
-    console.log('  For full 4-channel retrieval, run:');
-    console.log('  pip install sentence-transformers einops geoopt');
+    console.log('⚠ Search engine installation failed (BM25 keyword search still works).');
+    console.log('  For full 6-channel retrieval + reranking, run:');
+    console.log('  pip install "sentence-transformers[onnx]>=4.0.0" einops geoopt onnxruntime');
 }
 // Dashboard dependencies (IMPORTANT — enables web dashboard + MCP server)

package/src/superlocalmemory/cli/commands.py CHANGED Viewed

@@ -993,35 +993,64 @@ def cmd_dashboard(args: Namespace) -> None:
         print("Or install manually: pip install 'fastapi[all]' uvicorn")
         sys.exit(1)
+    import os
+    import signal
     import socket
     port = getattr(args, "port", 8765)
-    def _find_port(preferred: int) -> int:
-        for p in [preferred] + list(range(preferred + 1, preferred + 20)):
-            try:
-                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-                    s.bind(("127.0.0.1", p))
-                    return p
-            except OSError:
-                continue
-        return preferred
+    def _kill_existing_on_port(target_port: int) -> None:
+        """Kill any existing SLM dashboard on the target port.
-    ui_port = _find_port(port)
-    if ui_port != port:
-        print(f"  Port {port} in use — using {ui_port} instead")
+        V3.3.2: ONE port, no auto-increment. If port is busy with
+        another SLM instance, kill it. If busy with a non-SLM process,
+        warn and exit — never silently shift to a different port.
+        """
+        if sys.platform == "win32":
+            return  # Windows: user must close manually
+        try:
+            import subprocess
+            result = subprocess.run(
+                ["lsof", "-ti", f":{target_port}"],
+                capture_output=True, text=True, timeout=5,
+            )
+            if result.returncode == 0 and result.stdout.strip():
+                pids = result.stdout.strip().split("\n")
+                for pid_str in pids:
+                    pid = int(pid_str.strip())
+                    if pid == os.getpid():
+                        continue
+                    # Check if it's an SLM/Python process
+                    ps_result = subprocess.run(
+                        ["ps", "-p", str(pid), "-o", "command="],
+                        capture_output=True, text=True, timeout=5,
+                    )
+                    cmd = ps_result.stdout.strip().lower()
+                    if "superlocalmemory" in cmd or "slm" in cmd or "uvicorn" in cmd:
+                        os.kill(pid, signal.SIGTERM)
+                        print(f"  Stopped previous dashboard (PID {pid})")
+                        import time
+                        time.sleep(1)
+        except Exception:
+            pass  # Best-effort
+    _kill_existing_on_port(port)
+    # Brief wait for port to fully release after killing old process
+    import time
+    time.sleep(1)
     print("=" * 60)
     print("  SuperLocalMemory V3 — Web Dashboard")
     print("=" * 60)
-    print(f"  Dashboard:  http://localhost:{ui_port}")
-    print(f"  API Docs:   http://localhost:{ui_port}/api/docs")
+    print(f"  Dashboard:  http://localhost:{port}")
+    print(f"  API Docs:   http://localhost:{port}/api/docs")
     print("  Press Ctrl+C to stop\n")
     from superlocalmemory.server.ui import create_app
     app = create_app()
-    uvicorn.run(app, host="127.0.0.1", port=ui_port, log_level="info")
+    uvicorn.run(app, host="127.0.0.1", port=port, log_level="info")
 # -- Profiles (supports --json) -------------------------------------------

package/src/superlocalmemory/core/config.py CHANGED Viewed

@@ -152,9 +152,10 @@ class RetrievalConfig:
     entity_graph_max_hops: int = 3
     temporal_proximity_days: int = 30
-    # Reranking
+    # Reranking (V3.3.2: ONNX backend enabled for all modes)
     use_cross_encoder: bool = True
-    cross_encoder_model: str = "BAAI/bge-reranker-v2-m3"
+    cross_encoder_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
+    cross_encoder_backend: str = "onnx"  # "onnx" (~200MB) or "" (PyTorch, ~1.5GB)
     # Agentic (Mode C only)
     agentic_max_rounds: int = 3
@@ -611,6 +612,15 @@ class SLMConfig:
         rt = data.get("retrieval", {})
         if rt:
+            # V3.3.2 migration: auto-enable ONNX cross-encoder.
+            # Pre-3.3.2 configs had use_cross_encoder=False because the
+            # PyTorch cross-encoder used ~1.5GB RAM. With ONNX backend
+            # (~200MB), it's now safe for all modes. Detect old configs
+            # by the absence of cross_encoder_backend field.
+            if "cross_encoder_backend" not in rt:
+                rt["use_cross_encoder"] = True
+                rt["cross_encoder_model"] = "cross-encoder/ms-marco-MiniLM-L-6-v2"
+                rt["cross_encoder_backend"] = "onnx"
             config.retrieval = RetrievalConfig(**{
                 k: v for k, v in rt.items()
                 if k in RetrievalConfig.__dataclass_fields__
@@ -650,6 +660,8 @@ class SLMConfig:
             },
             "retrieval": {
                 "use_cross_encoder": self.retrieval.use_cross_encoder,
+                "cross_encoder_model": self.retrieval.cross_encoder_model,
+                "cross_encoder_backend": self.retrieval.cross_encoder_backend,
             },
         }
@@ -725,8 +737,8 @@ class SLMConfig:
                 ),
                 llm=LLMConfig(),  # No LLM
                 retrieval=RetrievalConfig(
-                    # Mode A: no cross-encoder (saves ~1.5GB PyTorch RAM)
-                    use_cross_encoder=False,
+                    # V3.3.2: ONNX cross-encoder enabled for all modes (~200MB)
+                    use_cross_encoder=True,
                 ),
                 math=MathConfig(
                     sheaf_contradiction_threshold=0.45,  # 768d threshold
@@ -750,8 +762,8 @@ class SLMConfig:
                     api_key=llm_api_key or "",
                 ),
                 retrieval=RetrievalConfig(
-                    # Mode B: no cross-encoder (saves ~1.5GB PyTorch RAM)
-                    use_cross_encoder=False,
+                    # V3.3.2: ONNX cross-encoder enabled for all modes (~200MB)
+                    use_cross_encoder=True,
                 ),
             )

package/src/superlocalmemory/core/embedding_worker.py CHANGED Viewed

@@ -23,6 +23,7 @@ Part of Qualixar | Author: Varun Pratap Bhardwaj
 from __future__ import annotations
 import json
+import signal
 import sys
 import os
@@ -34,6 +35,11 @@ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 os.environ["TORCH_DEVICE"] = "cpu"
+# SIGTERM bridge: Docker/systemd send SIGTERM to stop processes.
+# Without this, the worker ignores SIGTERM and becomes a zombie.
+if sys.platform != "win32":
+    signal.signal(signal.SIGTERM, lambda *_: sys.exit(0))
 def _worker_main() -> None:
     """Main loop: read JSON requests from stdin, write responses to stdout."""
@@ -97,7 +103,14 @@ def _worker_main() -> None:
                     _respond({"ok": False, "error": f"Model load failed: {exc}"})
                     continue
             try:
-                vecs = model.encode(texts, normalize_embeddings=True)
+                # torch.inference_mode prevents autograd graph accumulation
+                # which causes silent memory leaks over long-running sessions.
+                try:
+                    import torch
+                    with torch.inference_mode():
+                        vecs = model.encode(texts, normalize_embeddings=True)
+                except ImportError:
+                    vecs = model.encode(texts, normalize_embeddings=True)
                 if isinstance(vecs, np.ndarray) and vecs.ndim == 2:
                     result = [vecs[i].tolist() for i in range(vecs.shape[0])]
                 else:

package/src/superlocalmemory/core/embeddings.py CHANGED Viewed

@@ -45,7 +45,8 @@ class DimensionMismatchError(RuntimeError):
 _IDLE_TIMEOUT_SECONDS = 120  # 2 minutes — kill worker after idle
-_SUBPROCESS_RESPONSE_TIMEOUT = 60  # seconds — max wait for worker response
+_SUBPROCESS_RESPONSE_TIMEOUT = 120  # V3.3.2: 120s for ONNX cold start
+_WORKER_RECYCLE_AFTER = 1000  # Recycle worker after N requests (C++ fragmentation prevention)
 class EmbeddingService:
@@ -66,6 +67,7 @@ class EmbeddingService:
         self._last_used: float = 0.0
         self._idle_timer: threading.Timer | None = None
         self._worker_ready = False
+        self._request_count: int = 0
     @property
     def is_available(self) -> bool:
@@ -144,6 +146,13 @@ class EmbeddingService:
         never hangs indefinitely on cold model loads or network issues.
         """
         with self._lock:
+            # Worker recycling: restart after N requests to prevent
+            # C++ allocator fragmentation over long-running sessions.
+            if self._request_count >= _WORKER_RECYCLE_AFTER and self._worker_proc is not None:
+                logger.info("Recycling embedding worker after %d requests", self._request_count)
+                self._kill_worker()
+                self._request_count = 0
             self._ensure_worker()
             if self._worker_proc is None:
                 return None
@@ -176,6 +185,7 @@ class EmbeddingService:
                     logger.warning("Worker error: %s", resp.get("error"))
                     return None
                 self._reset_idle_timer()
+                self._request_count += 1
                 return resp["vectors"]
             except (BrokenPipeError, OSError, json.JSONDecodeError) as exc:
                 logger.warning(
@@ -235,6 +245,7 @@ class EmbeddingService:
                 text=True,
                 bufsize=1,
                 env=env,
+                start_new_session=True,  # Prevent terminal signals bleeding to worker
             )
             logger.info("Embedding worker spawned (PID %d)", self._worker_proc.pid)
             self._worker_ready = True

package/src/superlocalmemory/core/engine_wiring.py CHANGED Viewed

@@ -437,7 +437,10 @@ def init_retrieval(
     reranker = None
     if config.retrieval.use_cross_encoder:
-        reranker = CrossEncoderReranker(config.retrieval.cross_encoder_model)
+        reranker = CrossEncoderReranker(
+            config.retrieval.cross_encoder_model,
+            backend=config.retrieval.cross_encoder_backend,
+        )
     profile_ch = ProfileChannel(db)
     bridge = BridgeDiscovery(db)

package/src/superlocalmemory/core/maintenance.py CHANGED Viewed

@@ -6,6 +6,7 @@
 Periodic batch processing for mathematical layers:
 1. Langevin batch_step on all active facts (self-organization)
+   1a. Backfill: seed uninitialized facts with metadata-aware positions (B+C)
 2. Sheaf batch consistency check on recent facts
 3. Fisher adaptive temperature recalculation
@@ -18,15 +19,72 @@ License: MIT
 from __future__ import annotations
 import logging
+import math as _math
 from datetime import UTC, datetime, timedelta
 from typing import TYPE_CHECKING
+import numpy as np
 if TYPE_CHECKING:
     from superlocalmemory.core.config import SLMConfig
     from superlocalmemory.storage.database import DatabaseManager
 logger = logging.getLogger(__name__)
+# Backfill constants
+_BACKFILL_BURN_IN_STEPS = 50
+_LANGEVIN_DIM = 8
+_MAX_NORM = 0.99
+def _compute_equilibrium_radius(
+    access_count: int,
+    age_days: float,
+    importance: float,
+    temperature: float = 0.3,
+    dim: int = 8,
+) -> float:
+    """Compute metadata-aware equilibrium radius (Strategy B).
+    Uses the Langevin potential coefficients to estimate where a fact
+    would settle if it had been in the dynamics from the start.
+    r_eq ≈ sqrt(T * dim / (2 * effective_alpha))
+    """
+    alpha, beta, gamma, delta = 3.0, 0.8, 0.005, 0.5
+    effective_alpha = (
+        alpha
+        + beta * _math.log(access_count + 1) / 10.0
+        - gamma * min(age_days, 365.0) / 365.0
+        + delta * importance
+    )
+    effective_alpha = max(0.1, effective_alpha)
+    r_eq = _math.sqrt(temperature * dim / (2.0 * effective_alpha))
+    return min(r_eq, _MAX_NORM * 0.95)
+def _seed_langevin_position(
+    access_count: int,
+    age_days: float,
+    importance: float,
+    temperature: float = 0.3,
+    dim: int = 8,
+) -> list[float]:
+    """Create a metadata-aware initial position (Strategy B).
+    Places the fact at the equilibrium radius with a random direction.
+    """
+    r_eq = _compute_equilibrium_radius(
+        access_count, age_days, importance, temperature, dim,
+    )
+    rng = np.random.default_rng()
+    direction = rng.standard_normal(dim)
+    norm = float(np.linalg.norm(direction))
+    if norm < 1e-8:
+        direction = np.ones(dim)
+        norm = float(np.linalg.norm(direction))
+    return (direction / norm * r_eq).tolist()
 def run_maintenance(
     db: DatabaseManager,
@@ -44,6 +102,7 @@ def run_maintenance(
         Dict of counts: langevin_updated, sheaf_checked, etc.
     """
     counts: dict[str, int] = {
+        "langevin_backfilled": 0,
         "langevin_updated": 0,
         "fisher_coupled": 0,
         "sheaf_checked": 0,
@@ -53,13 +112,60 @@ def run_maintenance(
     if not facts:
         return counts
-    # 1. Langevin batch step
+    # 1a. Backfill: seed uninitialized facts with metadata-aware positions (B+C)
+    if config.math.langevin_persist_positions:
+        try:
+            from superlocalmemory.math.langevin import LangevinDynamics
+            ld = LangevinDynamics(
+                dim=_LANGEVIN_DIM,
+                dt=config.math.langevin_dt,
+                temperature=config.math.langevin_temperature,
+            )
+            backfilled = 0
+            for f in facts:
+                if f.langevin_position is not None:
+                    continue
+                created = datetime.fromisoformat(
+                    f.created_at.replace("Z", "+00:00")
+                ) if f.created_at else datetime.now(UTC)
+                age_days = max(
+                    0.0,
+                    (datetime.now(UTC) - created).total_seconds() / 86400.0,
+                )
+                # Strategy B: metadata-aware seed position
+                position = _seed_langevin_position(
+                    f.access_count, age_days, f.importance,
+                    config.math.langevin_temperature, _LANGEVIN_DIM,
+                )
+                # Strategy C: burn-in from the seeded position
+                for step_i in range(_BACKFILL_BURN_IN_STEPS):
+                    position, _ = ld.step(
+                        position, f.access_count, age_days, f.importance,
+                    )
+                weight = ld.compute_lifecycle_weight(position)
+                lifecycle = ld.get_lifecycle_state(weight).value
+                db.update_fact(f.fact_id, {
+                    "langevin_position": position,
+                    "lifecycle": lifecycle,
+                })
+                f.langevin_position = position  # update in-memory for step 1b
+                backfilled += 1
+            counts["langevin_backfilled"] = backfilled
+            if backfilled:
+                logger.info("Langevin backfill: %d facts initialized", backfilled)
+        except Exception as exc:
+            logger.warning("Langevin backfill failed: %s", exc)
+    # 1b. Langevin batch step on all positioned facts
     if config.math.langevin_persist_positions:
         try:
             from superlocalmemory.math.langevin import LangevinDynamics
             ld = LangevinDynamics(
-                dim=8,
+                dim=_LANGEVIN_DIM,
                 dt=config.math.langevin_dt,
                 temperature=config.math.langevin_temperature,
             )
@@ -165,8 +271,8 @@ def run_maintenance(
             logger.warning("Sheaf maintenance failed: %s", exc)
     logger.info(
-        "Maintenance complete: %d Langevin, %d Fisher-coupled, %d Sheaf",
-        counts["langevin_updated"], counts["fisher_coupled"],
-        counts["sheaf_checked"],
+        "Maintenance complete: %d backfilled, %d Langevin, %d Fisher-coupled, %d Sheaf",
+        counts["langevin_backfilled"], counts["langevin_updated"],
+        counts["fisher_coupled"], counts["sheaf_checked"],
     )
     return counts

package/src/superlocalmemory/core/modes.py CHANGED Viewed

@@ -68,7 +68,7 @@ MODE_A = ModeCapabilities(
     description=(
         "Local Guardian — Zero LLM, zero cloud. "
         "Uses nomic-embed-text-v1.5 encoder (768d, 8K context) for embeddings. "
-        "spaCy + rules for extraction. Cross-encoder for reranking. "
+        "spaCy + rules for extraction. ONNX cross-encoder reranking (~200MB). "
         "Full EU AI Act compliance. Target: 65%+"
     ),
 )
@@ -89,6 +89,7 @@ MODE_B = ModeCapabilities(
     description=(
         "Smart Local — Local Ollama LLM (Phi-3, Llama 3.2). "
         "LLM-quality extraction and classification, fully local. "
+        "ONNX cross-encoder reranking (~200MB). "
         "No cloud, no data export. EU AI Act compliant. Target: 75-80%"
     ),
 )

package/src/superlocalmemory/core/recall_worker.py CHANGED Viewed

@@ -18,6 +18,7 @@ from __future__ import annotations
 import json
 import os
+import signal
 import sys
 # Force CPU BEFORE any torch import
@@ -28,6 +29,11 @@ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 os.environ["TORCH_DEVICE"] = "cpu"
+# SIGTERM bridge: Docker/systemd send SIGTERM to stop processes.
+# Without this, the worker ignores SIGTERM and becomes a zombie.
+if sys.platform != "win32":
+    signal.signal(signal.SIGTERM, lambda *_: sys.exit(0))
 _engine = None
@@ -223,14 +229,14 @@ def _worker_main() -> None:
             continue
         if cmd == "warmup":
-            # Pre-load engine + all models (embedding, reranker, BM25, LLM)
-            # Called at dashboard/MCP startup so first real request is fast.
-            # A dummy recall triggers lazy-loaded components (cross-encoder, BM25 index).
+            # Pre-load engine + database + embeddings only.
+            # V3.3.2: Do NOT run a dummy recall — it triggers the ONNX
+            # cross-encoder export (~30s) which combined with engine init
+            # exceeds the worker timeout. The cross-encoder loads lazily
+            # in a background thread on the first real recall instead.
             try:
                 engine = _get_engine()
                 fact_count = engine._db.get_fact_count(engine._profile_id) if engine._db else 0
-                if fact_count > 0:
-                    engine.recall("warmup", limit=1)
                 _respond({"ok": True, "message": "Engine warm", "facts": fact_count})
             except Exception as exc:
                 _respond({"ok": False, "error": f"Warmup failed: {exc}"})

package/src/superlocalmemory/core/store_pipeline.py CHANGED Viewed

@@ -25,6 +25,25 @@ from superlocalmemory.storage.models import (
 logger = logging.getLogger(__name__)
+# Langevin initialization radius for new facts (ACTIVE zone < 0.3)
+_INIT_LANGEVIN_RADIUS = 0.05
+def _init_langevin_position(dim: int = 8) -> list[float]:
+    """Initialize Langevin position near origin for a new fact.
+    Small random perturbation ensures each fact gets a unique position
+    while staying deep in the ACTIVE zone (radius < 0.3).
+    """
+    import numpy as np
+    rng = np.random.default_rng()
+    direction = rng.standard_normal(dim)
+    norm = float(np.linalg.norm(direction))
+    if norm < 1e-8:
+        direction = np.ones(dim)
+        norm = float(np.linalg.norm(direction))
+    return (direction / norm * _INIT_LANGEVIN_RADIUS).tolist()
 # ---------------------------------------------------------------------------
 # enrich_fact  (was MemoryEngine._enrich_fact)
@@ -59,6 +78,10 @@ def enrich_fact(
     emotion = tag_emotion(fact.content)
     signal = infer_signal(fact.content)
+    # Strategy A: initialize Langevin position near origin (ACTIVE zone).
+    # New facts start as ACTIVE; dynamics will evolve them based on access patterns.
+    langevin_pos = _init_langevin_position(dim=8)
     return AtomicFact(
         fact_id=fact.fact_id, memory_id=record.memory_id,
         profile_id=profile_id, content=fact.content,
@@ -73,6 +96,7 @@ def enrich_fact(
         evidence_count=fact.evidence_count,
         source_turn_ids=fact.source_turn_ids, session_id=record.session_id,
         embedding=embedding, fisher_mean=fisher_mean, fisher_variance=fisher_variance,
+        langevin_position=langevin_pos,
         emotional_valence=emotion.valence, emotional_arousal=emotion.arousal,
         signal_type=signal, created_at=fact.created_at,
     )

package/src/superlocalmemory/core/worker_pool.py CHANGED Viewed

@@ -29,8 +29,9 @@ import time
 logger = logging.getLogger(__name__)
 _IDLE_TIMEOUT = 120   # 2 min — kill worker after idle
-_REQUEST_TIMEOUT = 60  # 60 sec max per request
-_WARMUP_TIMEOUT = 120  # 2 min — first cold start loads PyTorch + models
+_REQUEST_TIMEOUT = 120  # 120 sec per request (V3.3.2: ONNX cold start can take 30-60s)
+_WARMUP_TIMEOUT = 180  # 3 min — first cold start: engine + ONNX export + models
+_WORKER_RECYCLE_AFTER = 1000  # Recycle worker after N requests (C++ fragmentation prevention)
 class WorkerPool:
@@ -49,6 +50,7 @@ class WorkerPool:
         self._proc: subprocess.Popen | None = None
         self._idle_timer: threading.Timer | None = None
         self._last_used: float = 0.0
+        self._request_count: int = 0
     @classmethod
     def shared(cls) -> WorkerPool:
@@ -146,6 +148,13 @@ class WorkerPool:
     def _send_with_timeout(self, request: dict, timeout: float) -> dict:
         """Send request with configurable timeout. Thread-safe."""
         with self._lock:
+            # Worker recycling: restart after N requests to prevent
+            # C++ allocator fragmentation over long-running sessions.
+            if self._request_count >= _WORKER_RECYCLE_AFTER and self._proc is not None:
+                logger.info("Recycling recall worker after %d requests", self._request_count)
+                self._kill()
+                self._request_count = 0
             self._ensure_worker()
             if self._proc is None:
                 return {"ok": False, "error": "Worker failed to start"}
@@ -168,6 +177,7 @@ class WorkerPool:
                     return {"ok": False, "error": "Worker died"}
                 self._reset_idle_timer()
+                self._request_count += 1
                 return json.loads(resp_line)
             except (BrokenPipeError, OSError, json.JSONDecodeError) as exc:
@@ -227,6 +237,7 @@ class WorkerPool:
                 text=True,
                 bufsize=1,
                 env=env,
+                start_new_session=True,  # Prevent terminal signals bleeding to worker
             )
             logger.info("Recall worker spawned (PID %d)", self._proc.pid)
         except Exception as exc:

package/src/superlocalmemory/hooks/ide_connector.py CHANGED Viewed

@@ -194,6 +194,7 @@ class IDEConnector:
             data["mcpServers"] = {}
         data["mcpServers"]["superlocalmemory"] = {
+            "type": "stdio",
             "command": "slm",
             "args": ["mcp"],
             "enabled": True,

package/src/superlocalmemory/retrieval/reranker.py CHANGED Viewed

@@ -14,6 +14,9 @@ License: MIT
 from __future__ import annotations
 import logging
+import platform
+import struct
+import sys
 import threading
 from typing import Any
@@ -22,56 +25,151 @@ from superlocalmemory.storage.models import AtomicFact
 logger = logging.getLogger(__name__)
+def _detect_onnx_variant() -> str:
+    """Auto-detect the best ONNX model variant for the current platform.
+    Returns the file_name parameter for CrossEncoder model_kwargs.
+    Platform detection:
+    - macOS ARM64 (Apple Silicon): qint8_arm64
+    - x86_64 with AVX2: quint8_avx2
+    - Everything else: default model.onnx (float32, works everywhere)
+    """
+    arch = platform.machine().lower()
+    is_64bit = struct.calcsize("P") * 8 == 64
+    if sys.platform == "darwin" and arch in ("arm64", "aarch64"):
+        return "onnx/model_qint8_arm64.onnx"
+    if arch in ("x86_64", "amd64") and is_64bit:
+        return "onnx/model_quint8_avx2.onnx"
+    return "onnx/model.onnx"
 class CrossEncoderReranker:
     """Rerank candidate facts using a local cross-encoder model.
+    V3.3.2: Uses ONNX backend by default (~200MB) instead of full PyTorch
+    (~1.5GB). Three-tier fallback: ONNX → PyTorch → no reranking.
+    Auto-detects the optimal quantized ONNX variant per platform.
     When the model is unavailable (missing package, download failure,
     offline environment), falls back to returning candidates in their
     original score order — never crashes.
     Args:
         model_name: HuggingFace cross-encoder model identifier.
+        backend: Inference backend. "onnx" for ONNX Runtime (light),
+            "" for PyTorch (heavy). Default: "onnx".
     """
     def __init__(
         self,
-        model_name: str = "BAAI/bge-reranker-v2-m3",
+        model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2",
+        backend: str = "onnx",
     ) -> None:
         self._model_name = model_name
+        self._backend = backend
         self._model: Any = None
         self._loaded = False
+        self._loading = False  # True while background load is in progress
+        self._active_backend: str = ""
         self._lock = threading.Lock()
     # ------------------------------------------------------------------
-    # Lazy loading
+    # Lazy loading (non-blocking)
     # ------------------------------------------------------------------
     def _ensure_model(self) -> None:
-        """Load cross-encoder on first use (thread-safe)."""
+        """Trigger model load in background (non-blocking).
+        On first call, starts loading in a background thread and returns
+        immediately. The model becomes available for subsequent calls
+        once loading completes. This prevents the 30s ONNX cold start
+        from blocking the first recall request.
+        Three-tier fallback:
+        1. ONNX backend with platform-optimal quantization — ~100-200MB RAM
+        2. PyTorch backend (requires torch) — ~1.5GB RAM
+        3. No model (graceful degradation) — 0 RAM
+        """
         if self._loaded:
             return
         with self._lock:
-            if self._loaded:
-                return  # Double-check after acquiring lock
-            try:
-                from sentence_transformers import CrossEncoder
-                self._model = CrossEncoder(self._model_name)
+            if self._loaded or self._loading:
+                return
+            self._loading = True
+        # Load in background thread so first recall isn't blocked
+        loader = threading.Thread(
+            target=self._load_model, daemon=True, name="ce-loader",
+        )
+        loader.start()
+    def _load_model(self) -> None:
+        """Actually load the model (runs in background thread)."""
+        try:
+            from sentence_transformers import CrossEncoder
+            if self._backend == "onnx":
+                try:
+                    onnx_file = _detect_onnx_variant()
+                    model = CrossEncoder(
+                        self._model_name,
+                        backend="onnx",
+                        model_kwargs={"file_name": onnx_file},
+                    )
+                    self._model = model
+                    self._active_backend = "onnx"
+                    logger.info(
+                        "Cross-encoder loaded (ONNX %s): %s",
+                        onnx_file, self._model_name,
+                    )
+                except Exception as onnx_exc:
+                    logger.info(
+                        "ONNX backend unavailable (%s), falling back to PyTorch",
+                        onnx_exc,
+                    )
+                    model = CrossEncoder(self._model_name)
+                    self._model = model
+                    self._active_backend = "pytorch"
+                    logger.info(
+                        "Cross-encoder loaded (PyTorch fallback): %s",
+                        self._model_name,
+                    )
+            else:
+                model = CrossEncoder(self._model_name)
+                self._model = model
+                self._active_backend = "pytorch"
                 logger.info("Cross-encoder loaded: %s", self._model_name)
-            except ImportError:
-                logger.warning(
-                    "sentence-transformers not installed; "
-                    "cross-encoder reranking disabled"
-                )
-            except OSError as exc:
-                logger.warning(
-                    "Failed to load cross-encoder %s: %s",
-                    self._model_name,
-                    exc,
-                )
-            finally:
-                self._loaded = True
+        except ImportError:
+            logger.warning(
+                "sentence-transformers not installed; "
+                "cross-encoder reranking disabled"
+            )
+        except OSError as exc:
+            logger.warning(
+                "Failed to load cross-encoder %s: %s",
+                self._model_name,
+                exc,
+            )
+        finally:
+            self._loaded = True
+            self._loading = False
+    def _ensure_model_blocking(self) -> None:
+        """Load model synchronously (blocks until ready).
+        Used by warmup and is_available where we need the model NOW.
+        """
+        if self._loaded:
+            return
+        with self._lock:
+            if self._loaded:
+                return
+            self._loading = True
+        self._load_model()
     # ------------------------------------------------------------------
     # Public API
@@ -104,10 +202,13 @@ class CrossEncoderReranker:
         if not candidates:
             return []
+        # Non-blocking: trigger background load if not yet started
         self._ensure_model()
         if self._model is None:
-            # Fallback: keep existing score order
+            # Model not loaded yet (still loading in background or failed).
+            # Graceful fallback: return candidates sorted by existing score.
+            # Next recall will use the model once it's ready.
             sorted_cands = sorted(
                 candidates, key=lambda x: x[1], reverse=True
             )
@@ -150,5 +251,5 @@ class CrossEncoderReranker:
     @property
     def is_available(self) -> bool:
         """Whether the cross-encoder model is loaded and ready."""
-        self._ensure_model()
+        self._ensure_model_blocking()
         return self._model is not None