npm - superlocalmemory - Versions diffs - 3.3.1 → 3.3.2 - Mend

superlocalmemory 3.3.1 → 3.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

package/ide/configs/antigravity-mcp.json +2 -1
package/ide/configs/claude-desktop-mcp.json +2 -1
package/ide/configs/cursor-mcp.json +2 -1
package/ide/configs/gemini-cli-mcp.json +2 -1
package/ide/configs/jetbrains-mcp.json +2 -1
package/ide/configs/perplexity-mcp.json +2 -1
package/ide/configs/windsurf-mcp.json +2 -1
package/package.json +1 -1
package/pyproject.toml +6 -3
package/scripts/postinstall.js +16 -9
package/src/superlocalmemory/cli/commands.py +44 -15
package/src/superlocalmemory/core/config.py +18 -6
package/src/superlocalmemory/core/embedding_worker.py +14 -1
package/src/superlocalmemory/core/embeddings.py +12 -1
package/src/superlocalmemory/core/engine_wiring.py +4 -1
package/src/superlocalmemory/core/modes.py +2 -1
package/src/superlocalmemory/core/recall_worker.py +11 -5
package/src/superlocalmemory/core/worker_pool.py +13 -2
package/src/superlocalmemory/hooks/ide_connector.py +1 -0
package/src/superlocalmemory/retrieval/reranker.py +125 -24

package/ide/configs/antigravity-mcp.json CHANGED Viewed

@@ -5,7 +5,8 @@
       "args": [
         "mcp"
       ],
-      "description": "SuperLocalMemory V3 - 100% local memory system"
+      "description": "SuperLocalMemory V3 - 100% local memory system",
+      "type": "stdio"
     }
   }
 }

package/ide/configs/claude-desktop-mcp.json CHANGED Viewed

@@ -5,7 +5,8 @@
       "args": [
         "mcp"
       ],
-      "description": "SuperLocalMemory V3 - 100% local memory system for AI assistants"
+      "description": "SuperLocalMemory V3 - 100% local memory system for AI assistants",
+      "type": "stdio"
     }
   }
 }

package/ide/configs/cursor-mcp.json CHANGED Viewed

@@ -5,7 +5,8 @@
       "args": [
         "mcp"
       ],
-      "description": "SuperLocalMemory V3 - Local memory system with mathematical foundations and knowledge graphs"
+      "description": "SuperLocalMemory V3 - Local memory system with mathematical foundations and knowledge graphs",
+      "type": "stdio"
     }
   }
 }

package/ide/configs/gemini-cli-mcp.json CHANGED Viewed

@@ -5,7 +5,8 @@
       "args": [
         "mcp"
       ],
-      "description": "SuperLocalMemory V3"
+      "description": "SuperLocalMemory V3",
+      "type": "stdio"
     }
   }
 }

package/ide/configs/jetbrains-mcp.json CHANGED Viewed

@@ -5,7 +5,8 @@
       "args": [
         "mcp"
       ],
-      "description": "SuperLocalMemory V3"
+      "description": "SuperLocalMemory V3",
+      "type": "stdio"
     }
   }
 }

package/ide/configs/perplexity-mcp.json CHANGED Viewed

@@ -4,6 +4,7 @@
     "args": [
       "mcp"
     ],
-    "env": {}
+    "env": {},
+    "type": "stdio"
   }
 }

package/ide/configs/windsurf-mcp.json CHANGED Viewed

@@ -5,7 +5,8 @@
       "args": [
         "mcp"
       ],
-      "description": "SuperLocalMemory V3 - 100% local memory system for AI assistants"
+      "description": "SuperLocalMemory V3 - 100% local memory system for AI assistants",
+      "type": "stdio"
     }
   }
 }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "superlocalmemory",
-  "version": "3.3.1",
+  "version": "3.3.2",
   "description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
   "keywords": [
     "ai-memory",

package/pyproject.toml CHANGED Viewed

@@ -1,10 +1,10 @@
 [project]
 name = "superlocalmemory"
-version = "3.3.1"
+version = "3.3.2"
 description = "Information-geometric agent memory with mathematical guarantees"
 readme = "README.md"
 license = {text = "MIT"}
-requires-python = ">=3.11"
+requires-python = ">=3.11,<3.15"
 authors = [
     {name = "Varun Pratap Bhardwaj", email = "admin@superlocalmemory.com"},
 ]
@@ -48,11 +48,13 @@ dependencies = [
 [project.optional-dependencies]
 search = [
-    "sentence-transformers>=2.5.0,<4.0.0",
+    "sentence-transformers>=4.0.0",
+    "sentence-transformers[onnx]>=4.0.0",
     "einops>=0.8.2",
     "torch>=2.2.0",
     "scikit-learn>=1.3.0,<2.0.0",
     "geoopt>=0.5.0",
+    "onnxruntime>=1.17.0",
 ]
 ui = [
     "fastapi[all]>=0.135.1",
@@ -72,6 +74,7 @@ full = [
 dev = [
     "pytest>=8.0",
     "pytest-cov>=4.1",
+    "sqlite-vec>=0.1.6",
 ]
 [project.urls]

package/scripts/postinstall.js CHANGED Viewed

@@ -112,20 +112,27 @@ if (pipInstall(coreDeps, 'core')) {
     console.log('  Run manually: pip install ' + coreDeps.join(' '));
 }
-// Search dependencies (IMPORTANT — enables semantic search, 4-channel retrieval)
-const searchDeps = ['sentence-transformers>=2.5.0', 'einops>=0.7.0', 'geoopt>=0.5.0'];
+// Search + ONNX reranking (V3.3.2 — enables 6-channel retrieval + cross-encoder)
+const searchDeps = [
+    'sentence-transformers[onnx]>=4.0.0',
+    'einops>=0.7.0', 'geoopt>=0.5.0',
+    'onnxruntime>=1.17.0',
+];
-console.log('\nInstalling semantic search engine (downloads ~500MB on first use)...');
+console.log('\nInstalling semantic search + ONNX reranking engine...');
+console.log('  (sentence-transformers 4+, ONNX Runtime, Fisher-Rao geometry)');
 if (pipInstall(searchDeps, 'search')) {
-    console.log('✓ Semantic search engine installed (sentence-transformers + einops + Fisher-Rao)');
+    console.log('✓ Search engine installed (sentence-transformers + ONNX + Fisher-Rao)');
+    console.log('  Cross-encoder reranking enabled for ALL modes (+30pp quality)');
     console.log('');
-    console.log('  Note: The embedding model (nomic-ai/nomic-embed-text-v1.5, ~500MB)');
-    console.log('  will download automatically on first use (slm remember / slm recall).');
+    console.log('  Models auto-download on first use:');
+    console.log('    - Embedding: nomic-ai/nomic-embed-text-v1.5 (~500MB)');
+    console.log('    - Reranker: cross-encoder/ms-marco-MiniLM-L-6-v2 (~90MB)');
     console.log('  To pre-download now, run: slm warmup');
 } else {
-    console.log('⚠ Semantic search installation failed (BM25 keyword search still works).');
-    console.log('  For full 4-channel retrieval, run:');
-    console.log('  pip install sentence-transformers einops geoopt');
+    console.log('⚠ Search engine installation failed (BM25 keyword search still works).');
+    console.log('  For full 6-channel retrieval + reranking, run:');
+    console.log('  pip install "sentence-transformers[onnx]>=4.0.0" einops geoopt onnxruntime');
 }
 // Dashboard dependencies (IMPORTANT — enables web dashboard + MCP server)

package/src/superlocalmemory/cli/commands.py CHANGED Viewed

@@ -993,35 +993,64 @@ def cmd_dashboard(args: Namespace) -> None:
         print("Or install manually: pip install 'fastapi[all]' uvicorn")
         sys.exit(1)
+    import os
+    import signal
     import socket
     port = getattr(args, "port", 8765)
-    def _find_port(preferred: int) -> int:
-        for p in [preferred] + list(range(preferred + 1, preferred + 20)):
-            try:
-                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-                    s.bind(("127.0.0.1", p))
-                    return p
-            except OSError:
-                continue
-        return preferred
+    def _kill_existing_on_port(target_port: int) -> None:
+        """Kill any existing SLM dashboard on the target port.
-    ui_port = _find_port(port)
-    if ui_port != port:
-        print(f"  Port {port} in use — using {ui_port} instead")
+        V3.3.2: ONE port, no auto-increment. If port is busy with
+        another SLM instance, kill it. If busy with a non-SLM process,
+        warn and exit — never silently shift to a different port.
+        """
+        if sys.platform == "win32":
+            return  # Windows: user must close manually
+        try:
+            import subprocess
+            result = subprocess.run(
+                ["lsof", "-ti", f":{target_port}"],
+                capture_output=True, text=True, timeout=5,
+            )
+            if result.returncode == 0 and result.stdout.strip():
+                pids = result.stdout.strip().split("\n")
+                for pid_str in pids:
+                    pid = int(pid_str.strip())
+                    if pid == os.getpid():
+                        continue
+                    # Check if it's an SLM/Python process
+                    ps_result = subprocess.run(
+                        ["ps", "-p", str(pid), "-o", "command="],
+                        capture_output=True, text=True, timeout=5,
+                    )
+                    cmd = ps_result.stdout.strip().lower()
+                    if "superlocalmemory" in cmd or "slm" in cmd or "uvicorn" in cmd:
+                        os.kill(pid, signal.SIGTERM)
+                        print(f"  Stopped previous dashboard (PID {pid})")
+                        import time
+                        time.sleep(1)
+        except Exception:
+            pass  # Best-effort
+    _kill_existing_on_port(port)
+    # Brief wait for port to fully release after killing old process
+    import time
+    time.sleep(1)
     print("=" * 60)
     print("  SuperLocalMemory V3 — Web Dashboard")
     print("=" * 60)
-    print(f"  Dashboard:  http://localhost:{ui_port}")
-    print(f"  API Docs:   http://localhost:{ui_port}/api/docs")
+    print(f"  Dashboard:  http://localhost:{port}")
+    print(f"  API Docs:   http://localhost:{port}/api/docs")
     print("  Press Ctrl+C to stop\n")
     from superlocalmemory.server.ui import create_app
     app = create_app()
-    uvicorn.run(app, host="127.0.0.1", port=ui_port, log_level="info")
+    uvicorn.run(app, host="127.0.0.1", port=port, log_level="info")
 # -- Profiles (supports --json) -------------------------------------------

package/src/superlocalmemory/core/config.py CHANGED Viewed

@@ -152,9 +152,10 @@ class RetrievalConfig:
     entity_graph_max_hops: int = 3
     temporal_proximity_days: int = 30
-    # Reranking
+    # Reranking (V3.3.2: ONNX backend enabled for all modes)
     use_cross_encoder: bool = True
-    cross_encoder_model: str = "BAAI/bge-reranker-v2-m3"
+    cross_encoder_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
+    cross_encoder_backend: str = "onnx"  # "onnx" (~200MB) or "" (PyTorch, ~1.5GB)
     # Agentic (Mode C only)
     agentic_max_rounds: int = 3
@@ -611,6 +612,15 @@ class SLMConfig:
         rt = data.get("retrieval", {})
         if rt:
+            # V3.3.2 migration: auto-enable ONNX cross-encoder.
+            # Pre-3.3.2 configs had use_cross_encoder=False because the
+            # PyTorch cross-encoder used ~1.5GB RAM. With ONNX backend
+            # (~200MB), it's now safe for all modes. Detect old configs
+            # by the absence of cross_encoder_backend field.
+            if "cross_encoder_backend" not in rt:
+                rt["use_cross_encoder"] = True
+                rt["cross_encoder_model"] = "cross-encoder/ms-marco-MiniLM-L-6-v2"
+                rt["cross_encoder_backend"] = "onnx"
             config.retrieval = RetrievalConfig(**{
                 k: v for k, v in rt.items()
                 if k in RetrievalConfig.__dataclass_fields__
@@ -650,6 +660,8 @@ class SLMConfig:
             },
             "retrieval": {
                 "use_cross_encoder": self.retrieval.use_cross_encoder,
+                "cross_encoder_model": self.retrieval.cross_encoder_model,
+                "cross_encoder_backend": self.retrieval.cross_encoder_backend,
             },
         }
@@ -725,8 +737,8 @@ class SLMConfig:
                 ),
                 llm=LLMConfig(),  # No LLM
                 retrieval=RetrievalConfig(
-                    # Mode A: no cross-encoder (saves ~1.5GB PyTorch RAM)
-                    use_cross_encoder=False,
+                    # V3.3.2: ONNX cross-encoder enabled for all modes (~200MB)
+                    use_cross_encoder=True,
                 ),
                 math=MathConfig(
                     sheaf_contradiction_threshold=0.45,  # 768d threshold
@@ -750,8 +762,8 @@ class SLMConfig:
                     api_key=llm_api_key or "",
                 ),
                 retrieval=RetrievalConfig(
-                    # Mode B: no cross-encoder (saves ~1.5GB PyTorch RAM)
-                    use_cross_encoder=False,
+                    # V3.3.2: ONNX cross-encoder enabled for all modes (~200MB)
+                    use_cross_encoder=True,
                 ),
             )

package/src/superlocalmemory/core/embedding_worker.py CHANGED Viewed

@@ -23,6 +23,7 @@ Part of Qualixar | Author: Varun Pratap Bhardwaj
 from __future__ import annotations
 import json
+import signal
 import sys
 import os
@@ -34,6 +35,11 @@ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 os.environ["TORCH_DEVICE"] = "cpu"
+# SIGTERM bridge: Docker/systemd send SIGTERM to stop processes.
+# Without this, the worker ignores SIGTERM and becomes a zombie.
+if sys.platform != "win32":
+    signal.signal(signal.SIGTERM, lambda *_: sys.exit(0))
 def _worker_main() -> None:
     """Main loop: read JSON requests from stdin, write responses to stdout."""
@@ -97,7 +103,14 @@ def _worker_main() -> None:
                     _respond({"ok": False, "error": f"Model load failed: {exc}"})
                     continue
             try:
-                vecs = model.encode(texts, normalize_embeddings=True)
+                # torch.inference_mode prevents autograd graph accumulation
+                # which causes silent memory leaks over long-running sessions.
+                try:
+                    import torch
+                    with torch.inference_mode():
+                        vecs = model.encode(texts, normalize_embeddings=True)
+                except ImportError:
+                    vecs = model.encode(texts, normalize_embeddings=True)
                 if isinstance(vecs, np.ndarray) and vecs.ndim == 2:
                     result = [vecs[i].tolist() for i in range(vecs.shape[0])]
                 else:

package/src/superlocalmemory/core/embeddings.py CHANGED Viewed

@@ -45,7 +45,8 @@ class DimensionMismatchError(RuntimeError):
 _IDLE_TIMEOUT_SECONDS = 120  # 2 minutes — kill worker after idle
-_SUBPROCESS_RESPONSE_TIMEOUT = 60  # seconds — max wait for worker response
+_SUBPROCESS_RESPONSE_TIMEOUT = 120  # V3.3.2: 120s for ONNX cold start
+_WORKER_RECYCLE_AFTER = 1000  # Recycle worker after N requests (C++ fragmentation prevention)
 class EmbeddingService:
@@ -66,6 +67,7 @@ class EmbeddingService:
         self._last_used: float = 0.0
         self._idle_timer: threading.Timer | None = None
         self._worker_ready = False
+        self._request_count: int = 0
     @property
     def is_available(self) -> bool:
@@ -144,6 +146,13 @@ class EmbeddingService:
         never hangs indefinitely on cold model loads or network issues.
         """
         with self._lock:
+            # Worker recycling: restart after N requests to prevent
+            # C++ allocator fragmentation over long-running sessions.
+            if self._request_count >= _WORKER_RECYCLE_AFTER and self._worker_proc is not None:
+                logger.info("Recycling embedding worker after %d requests", self._request_count)
+                self._kill_worker()
+                self._request_count = 0
             self._ensure_worker()
             if self._worker_proc is None:
                 return None
@@ -176,6 +185,7 @@ class EmbeddingService:
                     logger.warning("Worker error: %s", resp.get("error"))
                     return None
                 self._reset_idle_timer()
+                self._request_count += 1
                 return resp["vectors"]
             except (BrokenPipeError, OSError, json.JSONDecodeError) as exc:
                 logger.warning(
@@ -235,6 +245,7 @@ class EmbeddingService:
                 text=True,
                 bufsize=1,
                 env=env,
+                start_new_session=True,  # Prevent terminal signals bleeding to worker
             )
             logger.info("Embedding worker spawned (PID %d)", self._worker_proc.pid)
             self._worker_ready = True

package/src/superlocalmemory/core/engine_wiring.py CHANGED Viewed

@@ -437,7 +437,10 @@ def init_retrieval(
     reranker = None
     if config.retrieval.use_cross_encoder:
-        reranker = CrossEncoderReranker(config.retrieval.cross_encoder_model)
+        reranker = CrossEncoderReranker(
+            config.retrieval.cross_encoder_model,
+            backend=config.retrieval.cross_encoder_backend,
+        )
     profile_ch = ProfileChannel(db)
     bridge = BridgeDiscovery(db)

package/src/superlocalmemory/core/modes.py CHANGED Viewed

@@ -68,7 +68,7 @@ MODE_A = ModeCapabilities(
     description=(
         "Local Guardian — Zero LLM, zero cloud. "
         "Uses nomic-embed-text-v1.5 encoder (768d, 8K context) for embeddings. "
-        "spaCy + rules for extraction. Cross-encoder for reranking. "
+        "spaCy + rules for extraction. ONNX cross-encoder reranking (~200MB). "
         "Full EU AI Act compliance. Target: 65%+"
     ),
 )
@@ -89,6 +89,7 @@ MODE_B = ModeCapabilities(
     description=(
         "Smart Local — Local Ollama LLM (Phi-3, Llama 3.2). "
         "LLM-quality extraction and classification, fully local. "
+        "ONNX cross-encoder reranking (~200MB). "
         "No cloud, no data export. EU AI Act compliant. Target: 75-80%"
     ),
 )

package/src/superlocalmemory/core/recall_worker.py CHANGED Viewed

@@ -18,6 +18,7 @@ from __future__ import annotations
 import json
 import os
+import signal
 import sys
 # Force CPU BEFORE any torch import
@@ -28,6 +29,11 @@ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 os.environ["TORCH_DEVICE"] = "cpu"
+# SIGTERM bridge: Docker/systemd send SIGTERM to stop processes.
+# Without this, the worker ignores SIGTERM and becomes a zombie.
+if sys.platform != "win32":
+    signal.signal(signal.SIGTERM, lambda *_: sys.exit(0))
 _engine = None
@@ -223,14 +229,14 @@ def _worker_main() -> None:
             continue
         if cmd == "warmup":
-            # Pre-load engine + all models (embedding, reranker, BM25, LLM)
-            # Called at dashboard/MCP startup so first real request is fast.
-            # A dummy recall triggers lazy-loaded components (cross-encoder, BM25 index).
+            # Pre-load engine + database + embeddings only.
+            # V3.3.2: Do NOT run a dummy recall — it triggers the ONNX
+            # cross-encoder export (~30s) which combined with engine init
+            # exceeds the worker timeout. The cross-encoder loads lazily
+            # in a background thread on the first real recall instead.
             try:
                 engine = _get_engine()
                 fact_count = engine._db.get_fact_count(engine._profile_id) if engine._db else 0
-                if fact_count > 0:
-                    engine.recall("warmup", limit=1)
                 _respond({"ok": True, "message": "Engine warm", "facts": fact_count})
             except Exception as exc:
                 _respond({"ok": False, "error": f"Warmup failed: {exc}"})

package/src/superlocalmemory/core/worker_pool.py CHANGED Viewed

@@ -29,8 +29,9 @@ import time
 logger = logging.getLogger(__name__)
 _IDLE_TIMEOUT = 120   # 2 min — kill worker after idle
-_REQUEST_TIMEOUT = 60  # 60 sec max per request
-_WARMUP_TIMEOUT = 120  # 2 min — first cold start loads PyTorch + models
+_REQUEST_TIMEOUT = 120  # 120 sec per request (V3.3.2: ONNX cold start can take 30-60s)
+_WARMUP_TIMEOUT = 180  # 3 min — first cold start: engine + ONNX export + models
+_WORKER_RECYCLE_AFTER = 1000  # Recycle worker after N requests (C++ fragmentation prevention)
 class WorkerPool:
@@ -49,6 +50,7 @@ class WorkerPool:
         self._proc: subprocess.Popen | None = None
         self._idle_timer: threading.Timer | None = None
         self._last_used: float = 0.0
+        self._request_count: int = 0
     @classmethod
     def shared(cls) -> WorkerPool:
@@ -146,6 +148,13 @@ class WorkerPool:
     def _send_with_timeout(self, request: dict, timeout: float) -> dict:
         """Send request with configurable timeout. Thread-safe."""
         with self._lock:
+            # Worker recycling: restart after N requests to prevent
+            # C++ allocator fragmentation over long-running sessions.
+            if self._request_count >= _WORKER_RECYCLE_AFTER and self._proc is not None:
+                logger.info("Recycling recall worker after %d requests", self._request_count)
+                self._kill()
+                self._request_count = 0
             self._ensure_worker()
             if self._proc is None:
                 return {"ok": False, "error": "Worker failed to start"}
@@ -168,6 +177,7 @@ class WorkerPool:
                     return {"ok": False, "error": "Worker died"}
                 self._reset_idle_timer()
+                self._request_count += 1
                 return json.loads(resp_line)
             except (BrokenPipeError, OSError, json.JSONDecodeError) as exc:
@@ -227,6 +237,7 @@ class WorkerPool:
                 text=True,
                 bufsize=1,
                 env=env,
+                start_new_session=True,  # Prevent terminal signals bleeding to worker
             )
             logger.info("Recall worker spawned (PID %d)", self._proc.pid)
         except Exception as exc:

package/src/superlocalmemory/hooks/ide_connector.py CHANGED Viewed

@@ -194,6 +194,7 @@ class IDEConnector:
             data["mcpServers"] = {}
         data["mcpServers"]["superlocalmemory"] = {
+            "type": "stdio",
             "command": "slm",
             "args": ["mcp"],
             "enabled": True,

package/src/superlocalmemory/retrieval/reranker.py CHANGED Viewed

@@ -14,6 +14,9 @@ License: MIT
 from __future__ import annotations
 import logging
+import platform
+import struct
+import sys
 import threading
 from typing import Any
@@ -22,56 +25,151 @@ from superlocalmemory.storage.models import AtomicFact
 logger = logging.getLogger(__name__)
+def _detect_onnx_variant() -> str:
+    """Auto-detect the best ONNX model variant for the current platform.
+    Returns the file_name parameter for CrossEncoder model_kwargs.
+    Platform detection:
+    - macOS ARM64 (Apple Silicon): qint8_arm64
+    - x86_64 with AVX2: quint8_avx2
+    - Everything else: default model.onnx (float32, works everywhere)
+    """
+    arch = platform.machine().lower()
+    is_64bit = struct.calcsize("P") * 8 == 64
+    if sys.platform == "darwin" and arch in ("arm64", "aarch64"):
+        return "onnx/model_qint8_arm64.onnx"
+    if arch in ("x86_64", "amd64") and is_64bit:
+        return "onnx/model_quint8_avx2.onnx"
+    return "onnx/model.onnx"
 class CrossEncoderReranker:
     """Rerank candidate facts using a local cross-encoder model.
+    V3.3.2: Uses ONNX backend by default (~200MB) instead of full PyTorch
+    (~1.5GB). Three-tier fallback: ONNX → PyTorch → no reranking.
+    Auto-detects the optimal quantized ONNX variant per platform.
     When the model is unavailable (missing package, download failure,
     offline environment), falls back to returning candidates in their
     original score order — never crashes.
     Args:
         model_name: HuggingFace cross-encoder model identifier.
+        backend: Inference backend. "onnx" for ONNX Runtime (light),
+            "" for PyTorch (heavy). Default: "onnx".
     """
     def __init__(
         self,
-        model_name: str = "BAAI/bge-reranker-v2-m3",
+        model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2",
+        backend: str = "onnx",
     ) -> None:
         self._model_name = model_name
+        self._backend = backend
         self._model: Any = None
         self._loaded = False
+        self._loading = False  # True while background load is in progress
+        self._active_backend: str = ""
         self._lock = threading.Lock()
     # ------------------------------------------------------------------
-    # Lazy loading
+    # Lazy loading (non-blocking)
     # ------------------------------------------------------------------
     def _ensure_model(self) -> None:
-        """Load cross-encoder on first use (thread-safe)."""
+        """Trigger model load in background (non-blocking).
+        On first call, starts loading in a background thread and returns
+        immediately. The model becomes available for subsequent calls
+        once loading completes. This prevents the 30s ONNX cold start
+        from blocking the first recall request.
+        Three-tier fallback:
+        1. ONNX backend with platform-optimal quantization — ~100-200MB RAM
+        2. PyTorch backend (requires torch) — ~1.5GB RAM
+        3. No model (graceful degradation) — 0 RAM
+        """
         if self._loaded:
             return
         with self._lock:
-            if self._loaded:
-                return  # Double-check after acquiring lock
-            try:
-                from sentence_transformers import CrossEncoder
-                self._model = CrossEncoder(self._model_name)
+            if self._loaded or self._loading:
+                return
+            self._loading = True
+        # Load in background thread so first recall isn't blocked
+        loader = threading.Thread(
+            target=self._load_model, daemon=True, name="ce-loader",
+        )
+        loader.start()
+    def _load_model(self) -> None:
+        """Actually load the model (runs in background thread)."""
+        try:
+            from sentence_transformers import CrossEncoder
+            if self._backend == "onnx":
+                try:
+                    onnx_file = _detect_onnx_variant()
+                    model = CrossEncoder(
+                        self._model_name,
+                        backend="onnx",
+                        model_kwargs={"file_name": onnx_file},
+                    )
+                    self._model = model
+                    self._active_backend = "onnx"
+                    logger.info(
+                        "Cross-encoder loaded (ONNX %s): %s",
+                        onnx_file, self._model_name,
+                    )
+                except Exception as onnx_exc:
+                    logger.info(
+                        "ONNX backend unavailable (%s), falling back to PyTorch",
+                        onnx_exc,
+                    )
+                    model = CrossEncoder(self._model_name)
+                    self._model = model
+                    self._active_backend = "pytorch"
+                    logger.info(
+                        "Cross-encoder loaded (PyTorch fallback): %s",
+                        self._model_name,
+                    )
+            else:
+                model = CrossEncoder(self._model_name)
+                self._model = model
+                self._active_backend = "pytorch"
                 logger.info("Cross-encoder loaded: %s", self._model_name)
-            except ImportError:
-                logger.warning(
-                    "sentence-transformers not installed; "
-                    "cross-encoder reranking disabled"
-                )
-            except OSError as exc:
-                logger.warning(
-                    "Failed to load cross-encoder %s: %s",
-                    self._model_name,
-                    exc,
-                )
-            finally:
-                self._loaded = True
+        except ImportError:
+            logger.warning(
+                "sentence-transformers not installed; "
+                "cross-encoder reranking disabled"
+            )
+        except OSError as exc:
+            logger.warning(
+                "Failed to load cross-encoder %s: %s",
+                self._model_name,
+                exc,
+            )
+        finally:
+            self._loaded = True
+            self._loading = False
+    def _ensure_model_blocking(self) -> None:
+        """Load model synchronously (blocks until ready).
+        Used by warmup and is_available where we need the model NOW.
+        """
+        if self._loaded:
+            return
+        with self._lock:
+            if self._loaded:
+                return
+            self._loading = True
+        self._load_model()
     # ------------------------------------------------------------------
     # Public API
@@ -104,10 +202,13 @@ class CrossEncoderReranker:
         if not candidates:
             return []
+        # Non-blocking: trigger background load if not yet started
         self._ensure_model()
         if self._model is None:
-            # Fallback: keep existing score order
+            # Model not loaded yet (still loading in background or failed).
+            # Graceful fallback: return candidates sorted by existing score.
+            # Next recall will use the model once it's ready.
             sorted_cands = sorted(
                 candidates, key=lambda x: x[1], reverse=True
             )
@@ -150,5 +251,5 @@ class CrossEncoderReranker:
     @property
     def is_available(self) -> bool:
         """Whether the cross-encoder model is loaded and ready."""
-        self._ensure_model()
+        self._ensure_model_blocking()
         return self._model is not None