npm - superlocalmemory - Versions diffs - 3.3.3 → 3.3.4 - Mend

superlocalmemory 3.3.3 → 3.3.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (13)

package/package.json +1 -1
package/pyproject.toml +5 -1
package/src/superlocalmemory/cli/commands.py +21 -4
package/src/superlocalmemory/cli/main.py +6 -0
package/src/superlocalmemory/core/config.py +12 -9
package/src/superlocalmemory/core/worker_pool.py +9 -2
package/src/superlocalmemory/encoding/cognitive_consolidator.py +19 -1
package/src/superlocalmemory/encoding/emotional.py +5 -2
package/src/superlocalmemory/encoding/entity_resolver.py +1 -1
package/src/superlocalmemory/math/polar_quant.py +3 -1
package/src/superlocalmemory/retrieval/engine.py +36 -8
package/src/superlocalmemory/retrieval/reranker.py +240 -163
package/src/superlocalmemory/storage/embedding_migrator.py +4 -3

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "superlocalmemory",
-  "version": "3.3.3",
+  "version": "3.3.4",
   "description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
   "keywords": [
     "ai-memory",

package/pyproject.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "superlocalmemory"
-version = "3.3.3"
+version = "3.3.4"
 description = "Information-geometric agent memory with mathematical guarantees"
 readme = "README.md"
 license = {text = "MIT"}
@@ -98,6 +98,10 @@ testpaths = ["tests"]
 pythonpath = ["src"]
 markers = [
     "slow: marks tests as slow (deselect with '-m \"not slow\"')",
+    "ollama: marks tests that require a running Ollama instance",
+]
+filterwarnings = [
+    "ignore::DeprecationWarning:vaderSentiment",
 ]
 [tool.coverage.run]

package/src/superlocalmemory/cli/commands.py CHANGED Viewed

@@ -113,6 +113,10 @@ def cmd_mode(args: Namespace) -> None:
         if (config.embedding.provider != updated.embedding.provider
                 or config.embedding.model_name != updated.embedding.model_name):
             print("  ⚠ Embedding model changed. Re-indexing will run on next recall.")
+        # V3.3.4: Warn if Mode C lacks cloud API key
+        if args.value == "c" and not updated.llm.api_key:
+            print("  ⚠ Mode C requires a cloud API key. Run: slm provider set")
     else:
         print(f"Current mode: {config.mode.value.upper()}")
@@ -356,12 +360,20 @@ def cmd_forget(args: Namespace) -> None:
             sys.exit(1)
         raise
+    dry_run = getattr(args, 'dry_run', False)
     if use_json:
         from superlocalmemory.cli.json_output import json_print
         if not matches:
             json_print("forget", data={"matched_count": 0, "deleted_count": 0, "matches": []})
             return
         match_items = [{"fact_id": f.fact_id, "content": f.content[:120]} for f in matches[:20]]
+        if dry_run:
+            json_print("forget", data={
+                "matched_count": len(matches), "deleted_count": 0,
+                "dry_run": True, "matches": match_items,
+            })
+            return
         if getattr(args, 'yes', False):
             for f in matches:
                 engine._db.delete_fact(f.fact_id)
@@ -387,6 +399,9 @@ def cmd_forget(args: Namespace) -> None:
     print(f"Found {len(matches)} matching memories:")
     for f in matches[:10]:
         print(f"  - {f.fact_id[:8]}... {f.content[:80]}")
+    if dry_run:
+        print(f"(dry run — {len(matches)} would be deleted)")
+        return
     if getattr(args, 'yes', False):
         for f in matches:
             engine._db.delete_fact(f.fact_id)
@@ -861,7 +876,8 @@ def cmd_trace(args: Namespace) -> None:
     try:
         config = SLMConfig.load()
         engine = MemoryEngine(config)
-        response = engine.recall(args.query, limit=5)
+        limit = getattr(args, 'limit', 10)
+        response = engine.recall(args.query, limit=limit)
     except Exception as exc:
         if use_json:
             from superlocalmemory.cli.json_output import json_print
@@ -1435,6 +1451,7 @@ def cmd_consolidate(args: Namespace) -> None:
     use_json = getattr(args, "json", False)
     cognitive = getattr(args, "cognitive", False)
+    dry_run = getattr(args, "dry_run", False)
     profile = getattr(args, "profile", "")
     if not cognitive:
@@ -1460,7 +1477,7 @@ def cmd_consolidate(args: Namespace) -> None:
         )
         consolidator = CognitiveConsolidator(db=engine._db)
-        result = consolidator.run_pipeline(pid)
+        result = consolidator.run_pipeline(pid, dry_run=dry_run)
     except Exception as exc:
         if use_json:
             from superlocalmemory.cli.json_output import json_print
@@ -1473,7 +1490,7 @@ def cmd_consolidate(args: Namespace) -> None:
     if use_json:
         from superlocalmemory.cli.json_output import json_print
         json_print("consolidate", data={
-            "clusters_found": result.clusters_found,
+            "clusters_processed": result.clusters_processed,
             "blocks_created": result.blocks_created,
             "facts_archived": result.facts_archived,
             "compression_ratio": round(result.compression_ratio, 3),
@@ -1484,7 +1501,7 @@ def cmd_consolidate(args: Namespace) -> None:
         return
     print("CCQ Cognitive Consolidation")
-    print(f"  Clusters found:     {result.clusters_found}")
+    print(f"  Clusters processed: {result.clusters_processed}")
     print(f"  Blocks created:     {result.blocks_created}")
     print(f"  Facts archived:     {result.facts_archived}")
     print(f"  Compression ratio:  {result.compression_ratio:.3f}")

package/src/superlocalmemory/cli/main.py CHANGED Viewed

@@ -123,6 +123,7 @@ def main() -> None:
     forget_p = sub.add_parser("forget", help="Delete memories matching a query (fuzzy)")
     forget_p.add_argument("query", help="Query to match for deletion")
+    forget_p.add_argument("--dry-run", action="store_true", default=False, help="Preview matches without deleting")
     forget_p.add_argument("--yes", "-y", action="store_true", help="Skip confirmation prompt")
     forget_p.add_argument("--json", action="store_true", help="Output structured JSON (agent-native)")
@@ -151,6 +152,7 @@ def main() -> None:
     trace_p = sub.add_parser("trace", help="Recall with per-channel score breakdown")
     trace_p.add_argument("query", help="Search query")
+    trace_p.add_argument("--limit", type=int, default=10, help="Max results (default 10)")
     trace_p.add_argument("--json", action="store_true", help="Output structured JSON (agent-native)")
     # -- Diagnostics (continued) ----------------------------------------
@@ -217,6 +219,10 @@ def main() -> None:
         "--cognitive", action="store_true",
         help="Run CCQ cognitive consolidation",
     )
+    consolidate_p.add_argument(
+        "--dry-run", action="store_true", default=False,
+        help="Preview without applying",
+    )
     consolidate_p.add_argument("--profile", default="", help="Target profile")
     consolidate_p.add_argument("--json", action="store_true", help="Output structured JSON (agent-native)")

package/src/superlocalmemory/core/config.py CHANGED Viewed

@@ -612,15 +612,15 @@ class SLMConfig:
         rt = data.get("retrieval", {})
         if rt:
-            # V3.3.2 migration: auto-enable ONNX cross-encoder.
-            # Pre-3.3.2 configs had use_cross_encoder=False because the
-            # PyTorch cross-encoder used ~1.5GB RAM. With ONNX backend
-            # (~200MB), it's now safe for all modes. Detect old configs
-            # by the absence of cross_encoder_backend field.
+            # V3.3.2 migration: add ONNX cross-encoder backend field.
+            # Pre-3.3.2 configs lacked cross_encoder_backend. Add it,
+            # but NEVER override an explicit use_cross_encoder setting.
+            # The user's explicit choice always wins.
             if "cross_encoder_backend" not in rt:
-                rt["use_cross_encoder"] = True
-                rt["cross_encoder_model"] = "cross-encoder/ms-marco-MiniLM-L-6-v2"
+                rt.setdefault("cross_encoder_model", "cross-encoder/ms-marco-MiniLM-L-6-v2")
                 rt["cross_encoder_backend"] = "onnx"
+                # Only auto-enable if user didn't explicitly set the field
+                rt.setdefault("use_cross_encoder", True)
             config.retrieval = RetrievalConfig(**{
                 k: v for k, v in rt.items()
                 if k in RetrievalConfig.__dataclass_fields__
@@ -768,6 +768,9 @@ class SLMConfig:
             )
         # Mode C — FULL POWER, UNRESTRICTED
+        # Don't carry over local-only providers (ollama) to cloud mode
+        c_provider = llm_provider if llm_provider not in ("ollama", "") else "openrouter"
+        c_model = llm_model if llm_provider not in ("ollama", "") else "anthropic/claude-sonnet-4"
         return cls(
             mode=mode,
             base_dir=_base,
@@ -779,8 +782,8 @@ class SLMConfig:
                 deployment_name=embedding_deployment,
             ),
             llm=LLMConfig(
-                provider=llm_provider or "azure",
-                model=llm_model or "gpt-4.1-mini",
+                provider=c_provider,
+                model=c_model,
                 api_key=llm_api_key,
                 api_base=llm_api_base,
             ),

package/src/superlocalmemory/core/worker_pool.py CHANGED Viewed

@@ -142,8 +142,15 @@ class WorkerPool:
     # ------------------------------------------------------------------
     def _send(self, request: dict) -> dict:
-        """Send request to worker and get response. Thread-safe."""
-        return self._send_with_timeout(request, timeout=_REQUEST_TIMEOUT)
+        """Send request to worker and get response. Thread-safe.
+        Auto-retries once on worker death (idle timeout, crash).
+        """
+        resp = self._send_with_timeout(request, timeout=_REQUEST_TIMEOUT)
+        if not resp.get("ok") and "Worker" in resp.get("error", ""):
+            logger.info("Auto-restarting worker after failure, retrying request")
+            resp = self._send_with_timeout(request, timeout=_REQUEST_TIMEOUT)
+        return resp
     def _send_with_timeout(self, request: dict, timeout: float) -> dict:
         """Send request with configurable timeout. Thread-safe."""

package/src/superlocalmemory/encoding/cognitive_consolidator.py CHANGED Viewed

@@ -214,11 +214,17 @@ class CognitiveConsolidator:
     # Public API
     # ------------------------------------------------------------------
-    def run_pipeline(self, profile_id: str) -> CCQPipelineResult:
+    def run_pipeline(
+        self, profile_id: str, dry_run: bool = False,
+    ) -> CCQPipelineResult:
         """Execute the full 6-step CCQ pipeline.
         Per-cluster error isolation: one cluster failure does NOT
         abort the pipeline (HR-07).
+        Args:
+            profile_id: Target profile.
+            dry_run: If True, identify clusters but don't apply changes.
         """
         # Step 1: Identify candidates
         candidates = self._step1_identify(profile_id)
@@ -230,6 +236,18 @@ class CognitiveConsolidator:
         if not clusters:
             return self._empty_result()
+        if dry_run:
+            return CCQPipelineResult(
+                clusters_processed=len(clusters),
+                blocks_created=0,
+                facts_archived=len(candidates),
+                total_bytes_before=0,
+                total_bytes_after=0,
+                compression_ratio=0.0,
+                audit_entries=(),
+                errors=(),
+            )
         # Process each cluster
         blocks_created = 0
         facts_archived = 0

package/src/superlocalmemory/encoding/emotional.py CHANGED Viewed

@@ -30,8 +30,11 @@ def _get_vader():
     if _vader_analyzer is not None:
         return _vader_analyzer
     try:
-        from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
-        _vader_analyzer = SentimentIntensityAnalyzer()
+        import warnings
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", category=DeprecationWarning, module="vaderSentiment")
+            from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+            _vader_analyzer = SentimentIntensityAnalyzer()
     except ImportError:
         logger.warning("vaderSentiment not installed — emotional tagging disabled")
         _vader_analyzer = None

package/src/superlocalmemory/encoding/entity_resolver.py CHANGED Viewed

@@ -498,7 +498,7 @@ class EntityResolver:
                 max_tokens=256,
                 temperature=0.0,
             )
-            match = re.search(r"\{.*\}", response, re.DOTALL)
+            match = re.search(r"\{[^}]*\}", response)
             if not match:
                 return {}

package/src/superlocalmemory/math/polar_quant.py CHANGED Viewed

@@ -103,7 +103,9 @@ class PolarQuantEncoder:
         """
         path_str = self._config.rotation_matrix_path
         if not path_str:
-            path_str = str(Path.home() / ".superlocalmemory" / "polar_rotation.npy")
+            path_str = str(
+                Path.home() / ".superlocalmemory" / f"polar_rotation_{self._d}.npy",
+            )
         path = Path(path_str)

package/src/superlocalmemory/retrieval/engine.py CHANGED Viewed

@@ -83,6 +83,10 @@ class RetrievalEngine:
         self._bridge = bridge_discovery
         self._trust_scorer = trust_scorer
+        # V3.3.4: LRU cache for query embeddings (avoids redundant Ollama API calls)
+        self._query_embedding_cache: dict[str, list[float]] = {}
+        self._cache_max_size = 64
         # V3.2: ChannelRegistry for self-registration (Phase 0.5)
         from superlocalmemory.retrieval.channel_registry import ChannelRegistry
         self._registry = ChannelRegistry()
@@ -189,6 +193,21 @@ class RetrievalEngine:
     # -- Channel execution --------------------------------------------------
+    def _embed_query(self, query: str) -> list[float] | None:
+        """Embed query with LRU cache. Avoids redundant Ollama/API calls."""
+        if self._embedder is None:
+            return None
+        cached = self._query_embedding_cache.get(query)
+        if cached is not None:
+            return cached
+        emb = self._embedder.embed(query)
+        # Evict oldest if cache full
+        if len(self._query_embedding_cache) >= self._cache_max_size:
+            oldest = next(iter(self._query_embedding_cache))
+            del self._query_embedding_cache[oldest]
+        self._query_embedding_cache[query] = emb
+        return emb
     def _run_channels(
         self, query: str, profile_id: str, strat: QueryStrategy,
     ) -> dict[str, list[tuple[str, float]]]:
@@ -197,9 +216,20 @@ class RetrievalEngine:
         # Skip channels listed in disabled_channels (ablation support)
         disabled = set(self._config.disabled_channels)
-        if self._semantic is not None and self._embedder is not None and "semantic" not in disabled:
+        # V3.3.4: Embed query ONCE, reuse for semantic + hopfield channels
+        q_emb: list[float] | None = None
+        needs_embedding = (
+            (self._semantic is not None and "semantic" not in disabled)
+            or (self._hopfield is not None and "hopfield" not in disabled)
+        )
+        if needs_embedding:
+            try:
+                q_emb = self._embed_query(query)
+            except Exception as exc:
+                logger.warning("Query embedding failed: %s", exc)
+        if self._semantic is not None and q_emb is not None and "semantic" not in disabled:
             try:
-                q_emb = self._embedder.embed(query)
                 r = self._semantic.search(q_emb, profile_id, self._config.semantic_top_k)
                 if r:
                     out["semantic"] = r
@@ -231,13 +261,11 @@ class RetrievalEngine:
                 logger.warning("Temporal channel: %s", exc)
         # Phase G: Hopfield channel (6th) — energy-based pattern completion
-        if self._hopfield is not None and "hopfield" not in disabled:
+        if self._hopfield is not None and q_emb is not None and "hopfield" not in disabled:
             try:
-                q_emb = self._embedder.embed(query) if self._embedder else None
-                if q_emb is not None:
-                    r = self._hopfield.search(q_emb, profile_id, self._config.hopfield_top_k)
-                    if r:
-                        out["hopfield"] = r
+                r = self._hopfield.search(q_emb, profile_id, self._config.hopfield_top_k)
+                if r:
+                    out["hopfield"] = r
             except Exception as exc:
                 logger.warning("Hopfield channel: %s", exc)

package/src/superlocalmemory/retrieval/reranker.py CHANGED Viewed

@@ -2,10 +2,13 @@
 # Licensed under the MIT License - see LICENSE file
 # Part of SuperLocalMemory V3 | https://qualixar.com | https://varunpratap.com
-"""SuperLocalMemory V3 — Cross-Encoder Reranker.
+"""SuperLocalMemory V3 — Cross-Encoder Reranker (Subprocess-Isolated).
-Scores (query, fact) pairs through a cross-encoder in a single forward
-pass. Lazy model loading, thread-safe via lock.
+V3.3.3: All PyTorch/ONNX model work runs in a SEPARATE subprocess.
+The main process (dashboard, MCP, CLI) NEVER imports torch and stays
+at ~60 MB. Same isolation pattern as EmbeddingService.
+The worker subprocess auto-kills after 2 minutes idle.
 Part of Qualixar | Author: Varun Pratap Bhardwaj
 License: MIT
@@ -13,49 +16,33 @@ License: MIT
 from __future__ import annotations
+import json
 import logging
-import platform
-import struct
+import os
+import subprocess
 import sys
 import threading
+import time
 from typing import Any
 from superlocalmemory.storage.models import AtomicFact
 logger = logging.getLogger(__name__)
-def _detect_onnx_variant() -> str:
-    """Auto-detect the best ONNX model variant for the current platform.
-    Returns the file_name parameter for CrossEncoder model_kwargs.
-    Platform detection:
-    - macOS ARM64 (Apple Silicon): qint8_arm64
-    - x86_64 with AVX2: quint8_avx2
-    - Everything else: default model.onnx (float32, works everywhere)
-    """
-    arch = platform.machine().lower()
-    is_64bit = struct.calcsize("P") * 8 == 64
-    if sys.platform == "darwin" and arch in ("arm64", "aarch64"):
-        return "onnx/model_qint8_arm64.onnx"
-    if arch in ("x86_64", "amd64") and is_64bit:
-        return "onnx/model_quint8_avx2.onnx"
-    return "onnx/model.onnx"
+_IDLE_TIMEOUT_SECONDS = 120  # 2 min → kill worker
+_SUBPROCESS_RESPONSE_TIMEOUT = 120  # 120s for ONNX cold start
+_WORKER_RECYCLE_AFTER = 500  # Recycle after N requests
 class CrossEncoderReranker:
     """Rerank candidate facts using a local cross-encoder model.
-    V3.3.2: Uses ONNX backend by default (~200MB) instead of full PyTorch
-    (~1.5GB). Three-tier fallback: ONNX → PyTorch → no reranking.
-    Auto-detects the optimal quantized ONNX variant per platform.
+    V3.3.3: SUBPROCESS-ISOLATED. The main process never imports
+    sentence_transformers or torch. All model work runs in a child
+    process via JSON over stdin/stdout.
-    When the model is unavailable (missing package, download failure,
-    offline environment), falls back to returning candidates in their
-    original score order — never crashes.
+    Non-blocking first-use: triggers background worker spawn, returns
+    fallback scores until worker is ready.
     Args:
         model_name: HuggingFace cross-encoder model identifier.
@@ -70,106 +57,207 @@ class CrossEncoderReranker:
     ) -> None:
         self._model_name = model_name
         self._backend = backend
-        self._model: Any = None
-        self._loaded = False
-        self._loading = False  # True while background load is in progress
-        self._active_backend: str = ""
+        self._worker_proc: subprocess.Popen | None = None
+        self._model_loaded = False  # True once worker confirms model is ready
+        self._worker_loading = False  # True while background warmup in progress
         self._lock = threading.Lock()
+        self._idle_timer: threading.Timer | None = None
+        self._request_count: int = 0
+        # Start background warmup immediately — worker loads model
+        # while the rest of init continues. First recall gets instant
+        # fallback; second recall uses the warm model.
+        self._start_background_warmup()
     # ------------------------------------------------------------------
-    # Lazy loading (non-blocking)
+    # Background warmup (non-blocking model load)
     # ------------------------------------------------------------------
-    def _ensure_model(self) -> None:
-        """Trigger model load in background (non-blocking).
-        On first call, starts loading in a background thread and returns
-        immediately. The model becomes available for subsequent calls
-        once loading completes. This prevents the 30s ONNX cold start
-        from blocking the first recall request.
+    def _start_background_warmup(self) -> None:
+        """Start worker and load model in background thread.
-        Three-tier fallback:
-        1. ONNX backend with platform-optimal quantization — ~100-200MB RAM
-        2. PyTorch backend (requires torch) — ~1.5GB RAM
-        3. No model (graceful degradation) — 0 RAM
+        Returns immediately. The worker loads the model in parallel
+        with the rest of engine initialization and the first recall.
         """
-        if self._loaded:
+        if self._worker_loading or self._model_loaded:
             return
+        self._worker_loading = True
+        def _warmup() -> None:
+            try:
+                self._ensure_worker()
+                if self._worker_proc is None:
+                    return
+                # Send load command and wait for response
+                req = json.dumps({
+                    "cmd": "load",
+                    "model_name": self._model_name,
+                    "backend": self._backend,
+                }) + "\n"
+                self._worker_proc.stdin.write(req)
+                self._worker_proc.stdin.flush()
+                resp_line = self._readline_with_timeout(
+                    self._worker_proc.stdout, _SUBPROCESS_RESPONSE_TIMEOUT,
+                )
+                if resp_line:
+                    resp = json.loads(resp_line)
+                    if resp.get("ok"):
+                        self._model_loaded = True
+                        logger.info(
+                            "Reranker worker warm (backend=%s)",
+                            resp.get("backend", "?"),
+                        )
+                        self._reset_idle_timer()
+            except Exception as exc:
+                logger.debug("Background reranker warmup failed: %s", exc)
+            finally:
+                self._worker_loading = False
+        t = threading.Thread(target=_warmup, daemon=True, name="ce-warmup")
+        t.start()
-        with self._lock:
-            if self._loaded or self._loading:
-                return
-            self._loading = True
+    # ------------------------------------------------------------------
+    # Worker management (mirrors EmbeddingService pattern)
+    # ------------------------------------------------------------------
-        # Load in background thread so first recall isn't blocked
-        loader = threading.Thread(
-            target=self._load_model, daemon=True, name="ce-loader",
-        )
-        loader.start()
+    def _ensure_worker(self) -> None:
+        """Spawn worker subprocess if not running. Non-blocking."""
+        if self._worker_proc is not None and self._worker_proc.poll() is None:
+            return
+        self._worker_proc = None
+        self._worker_ready = False
-    def _load_model(self) -> None:
-        """Actually load the model (runs in background thread)."""
+        worker_module = "superlocalmemory.core.reranker_worker"
         try:
-            from sentence_transformers import CrossEncoder
-            if self._backend == "onnx":
-                try:
-                    onnx_file = _detect_onnx_variant()
-                    model = CrossEncoder(
-                        self._model_name,
-                        backend="onnx",
-                        model_kwargs={"file_name": onnx_file},
-                    )
-                    self._model = model
-                    self._active_backend = "onnx"
-                    logger.info(
-                        "Cross-encoder loaded (ONNX %s): %s",
-                        onnx_file, self._model_name,
-                    )
-                except Exception as onnx_exc:
-                    logger.info(
-                        "ONNX backend unavailable (%s), falling back to PyTorch",
-                        onnx_exc,
-                    )
-                    model = CrossEncoder(self._model_name)
-                    self._model = model
-                    self._active_backend = "pytorch"
-                    logger.info(
-                        "Cross-encoder loaded (PyTorch fallback): %s",
-                        self._model_name,
-                    )
-            else:
-                model = CrossEncoder(self._model_name)
-                self._model = model
-                self._active_backend = "pytorch"
-                logger.info("Cross-encoder loaded: %s", self._model_name)
-        except ImportError:
-            logger.warning(
-                "sentence-transformers not installed; "
-                "cross-encoder reranking disabled"
+            env = {
+                **os.environ,
+                "CUDA_VISIBLE_DEVICES": "",
+                "PYTORCH_MPS_HIGH_WATERMARK_RATIO": "0.0",
+                "PYTORCH_MPS_MEM_LIMIT": "0",
+                "PYTORCH_ENABLE_MPS_FALLBACK": "1",
+                "TOKENIZERS_PARALLELISM": "false",
+                "TORCH_DEVICE": "cpu",
+            }
+            self._worker_proc = subprocess.Popen(
+                [sys.executable, "-m", worker_module],
+                stdin=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.DEVNULL,
+                text=True,
+                bufsize=1,
+                env=env,
+                start_new_session=True,
             )
-        except OSError as exc:
-            logger.warning(
-                "Failed to load cross-encoder %s: %s",
-                self._model_name,
-                exc,
+            logger.info(
+                "Reranker worker spawned (PID %d)", self._worker_proc.pid,
             )
-        finally:
-            self._loaded = True
-            self._loading = False
+            self._worker_ready = True
+        except Exception as exc:
+            logger.warning("Failed to spawn reranker worker: %s", exc)
+            self._worker_proc = None
-    def _ensure_model_blocking(self) -> None:
-        """Load model synchronously (blocks until ready).
+    def _send_request(self, req: dict, timeout: float | None = None) -> dict | None:
+        """Send JSON request to worker, get response. Thread-safe.
-        Used by warmup and is_available where we need the model NOW.
+        Uses a short timeout (10s) for rerank requests since the model
+        should already be loaded by the background warmup. Uses the full
+        timeout only for explicit load/ping commands.
         """
-        if self._loaded:
-            return
+        effective_timeout = timeout or _SUBPROCESS_RESPONSE_TIMEOUT
         with self._lock:
-            if self._loaded:
-                return
-            self._loading = True
-        self._load_model()
+            if self._request_count >= _WORKER_RECYCLE_AFTER and self._worker_proc is not None:
+                logger.info("Recycling reranker worker after %d requests", self._request_count)
+                self._kill_worker()
+                self._model_loaded = False
+                self._request_count = 0
+            # Ensure worker is alive (re-spawn if crashed)
+            if self._worker_proc is None or self._worker_proc.poll() is not None:
+                self._ensure_worker()
+            if self._worker_proc is None:
+                return None
+            try:
+                msg = json.dumps(req) + "\n"
+                self._worker_proc.stdin.write(msg)
+                self._worker_proc.stdin.flush()
+                resp_line = self._readline_with_timeout(
+                    self._worker_proc.stdout,
+                    effective_timeout,
+                )
+                if not resp_line:
+                    logger.warning("Reranker worker timed out after %ds", effective_timeout)
+                    self._kill_worker()
+                    self._model_loaded = False
+                    return None
+                resp = json.loads(resp_line)
+                self._reset_idle_timer()
+                self._request_count += 1
+                return resp
+            except (BrokenPipeError, OSError, json.JSONDecodeError) as exc:
+                logger.warning("Reranker worker communication failed: %s", exc)
+                self._kill_worker()
+                self._model_loaded = False
+                return None
+    @staticmethod
+    def _readline_with_timeout(stream: Any, timeout_seconds: float) -> str:
+        """Read a line from stream with timeout. Returns '' on timeout."""
+        result_container: list[str] = []
+        error_container: list[Exception] = []
+        def _read() -> None:
+            try:
+                result_container.append(stream.readline())
+            except Exception as exc:
+                error_container.append(exc)
+        reader = threading.Thread(target=_read, daemon=True)
+        reader.start()
+        reader.join(timeout=timeout_seconds)
+        if reader.is_alive():
+            return ""
+        if error_container:
+            raise error_container[0]
+        return result_container[0] if result_container else ""
+    def _kill_worker(self) -> None:
+        """Terminate worker subprocess."""
+        if self._idle_timer is not None:
+            self._idle_timer.cancel()
+            self._idle_timer = None
+        if self._worker_proc is not None:
+            try:
+                self._worker_proc.stdin.write('{"cmd":"quit"}\n')
+                self._worker_proc.stdin.flush()
+                self._worker_proc.wait(timeout=3)
+            except Exception:
+                try:
+                    self._worker_proc.kill()
+                except Exception:
+                    pass
+            self._worker_proc = None
+            self._worker_ready = False
+    def _reset_idle_timer(self) -> None:
+        """Reset idle timer — kills worker after 2 min inactivity."""
+        if self._idle_timer is not None:
+            self._idle_timer.cancel()
+        self._idle_timer = threading.Timer(
+            _IDLE_TIMEOUT_SECONDS, self.unload,
+        )
+        self._idle_timer.daemon = True
+        self._idle_timer.start()
+    def unload(self) -> None:
+        """Kill the worker subprocess to free all memory."""
+        with self._lock:
+            self._kill_worker()
+            logger.info("CrossEncoderReranker: worker killed (idle timeout)")
     # ------------------------------------------------------------------
     # Public API
@@ -183,73 +271,62 @@ class CrossEncoderReranker:
     ) -> list[tuple[AtomicFact, float]]:
         """Rerank candidates by cross-encoder relevance.
-        Each (query, fact.content) pair is scored in a single forward
-        pass. Results are returned sorted by cross-encoder score.
-        When the model is unavailable, returns candidates sorted by
-        their existing score (graceful fallback).
-        Args:
-            query: User query text.
-            candidates: List of (AtomicFact, score) tuples from the
-                fusion stage.
-            top_k: Maximum results to return.
-        Returns:
-            Top-k (AtomicFact, cross_encoder_score) tuples, sorted
-            descending by cross-encoder score.
+        NON-BLOCKING: If the worker is still loading the model
+        (background warmup), the candidates are returned immediately,
+        sorted by their existing score. Once the worker is warm,
+        subsequent calls use the cross-encoder. This means the first
+        CLI call gets instant results (without reranking), while MCP
+        gets reranked results (the worker stays warm between calls).
         """
         if not candidates:
             return []
-        # Non-blocking: trigger background load if not yet started
-        self._ensure_model()
-        if self._model is None:
-            # Model not loaded yet (still loading in background or failed).
-            # Graceful fallback: return candidates sorted by existing score.
-            # Next recall will use the model once it's ready.
-            sorted_cands = sorted(
-                candidates, key=lambda x: x[1], reverse=True
-            )
+        # Non-blocking: if model isn't loaded yet, return fallback
+        if not self._model_loaded:
+            sorted_cands = sorted(candidates, key=lambda x: x[1], reverse=True)
             return sorted_cands[:top_k]
-        # Build (query, document) pairs for batch scoring
-        pairs: list[tuple[str, str]] = [
-            (query, fact.content) for fact, _ in candidates
-        ]
+        documents = [fact.content for fact, _ in candidates]
+        # Short timeout (10s) — the model should already be loaded by warmup.
+        # If the worker crashed or is still loading, fall back immediately.
+        resp = self._send_request({
+            "cmd": "rerank",
+            "query": query,
+            "documents": documents,
+        }, timeout=10.0)
-        scores = self._model.predict(pairs)
+        if resp is None or not resp.get("ok"):
+            # Fallback: return by existing score
+            sorted_cands = sorted(candidates, key=lambda x: x[1], reverse=True)
+            return sorted_cands[:top_k]
+        scores = resp["scores"]
         scored: list[tuple[AtomicFact, float]] = [
             (fact, float(score))
             for (fact, _), score in zip(candidates, scores)
         ]
         scored.sort(key=lambda x: x[1], reverse=True)
         return scored[:top_k]
     def score_pair(self, query: str, document: str) -> float:
-        """Score a single (query, document) pair.
-        Args:
-            query: Query text.
-            document: Document text.
-        Returns:
-            Relevance score (higher = more relevant). 0.0 if model
-            is unavailable.
-        """
-        self._ensure_model()
-        if self._model is None:
+        """Score a single (query, document) pair."""
+        resp = self._send_request({
+            "cmd": "score",
+            "query": query,
+            "document": document,
+            "model_name": self._model_name,
+            "backend": self._backend,
+        })
+        if resp is None or not resp.get("ok"):
             return 0.0
-        scores = self._model.predict([(query, document)])
-        return float(scores[0])
+        return float(resp.get("score", 0.0))
     @property
     def is_available(self) -> bool:
-        """Whether the cross-encoder model is loaded and ready."""
-        self._ensure_model_blocking()
-        return self._model is not None
+        """Whether the cross-encoder worker can be spawned and answers a ping."""
+        resp = self._send_request({"cmd": "ping"})
+        if resp is None:
+            return False
+        return resp.get("ok", False)

package/src/superlocalmemory/storage/embedding_migrator.py CHANGED Viewed

@@ -36,11 +36,12 @@ _REINDEX_BATCH_SIZE = 50
 def _model_signature(config: SLMConfig) -> str:
     """Derive a deterministic signature from the active embedding config.
-    The signature combines provider + model_name + dimension so that
-    any change in embedding source is detected.
+    V3.3.4: Only model_name + dimension matter. Provider (sentence-transformers
+    vs ollama) doesn't change the embedding space when the model is the same.
+    This prevents spurious re-indexing when switching Mode A ↔ B.
     """
     emb = config.embedding
-    return f"{emb.provider}::{emb.model_name}::{emb.dimension}"
+    return f"{emb.model_name}::{emb.dimension}"
 def _read_stored_signature(config_dir: Path) -> str: