superlocalmemory 3.3.3 → 3.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "superlocalmemory",
3
- "version": "3.3.3",
3
+ "version": "3.3.4",
4
4
  "description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
5
5
  "keywords": [
6
6
  "ai-memory",
package/pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "superlocalmemory"
3
- version = "3.3.3"
3
+ version = "3.3.4"
4
4
  description = "Information-geometric agent memory with mathematical guarantees"
5
5
  readme = "README.md"
6
6
  license = {text = "MIT"}
@@ -98,6 +98,10 @@ testpaths = ["tests"]
98
98
  pythonpath = ["src"]
99
99
  markers = [
100
100
  "slow: marks tests as slow (deselect with '-m \"not slow\"')",
101
+ "ollama: marks tests that require a running Ollama instance",
102
+ ]
103
+ filterwarnings = [
104
+ "ignore::DeprecationWarning:vaderSentiment",
101
105
  ]
102
106
 
103
107
  [tool.coverage.run]
@@ -113,6 +113,10 @@ def cmd_mode(args: Namespace) -> None:
113
113
  if (config.embedding.provider != updated.embedding.provider
114
114
  or config.embedding.model_name != updated.embedding.model_name):
115
115
  print(" ⚠ Embedding model changed. Re-indexing will run on next recall.")
116
+
117
+ # V3.3.4: Warn if Mode C lacks cloud API key
118
+ if args.value == "c" and not updated.llm.api_key:
119
+ print(" ⚠ Mode C requires a cloud API key. Run: slm provider set")
116
120
  else:
117
121
  print(f"Current mode: {config.mode.value.upper()}")
118
122
 
@@ -356,12 +360,20 @@ def cmd_forget(args: Namespace) -> None:
356
360
  sys.exit(1)
357
361
  raise
358
362
 
363
+ dry_run = getattr(args, 'dry_run', False)
364
+
359
365
  if use_json:
360
366
  from superlocalmemory.cli.json_output import json_print
361
367
  if not matches:
362
368
  json_print("forget", data={"matched_count": 0, "deleted_count": 0, "matches": []})
363
369
  return
364
370
  match_items = [{"fact_id": f.fact_id, "content": f.content[:120]} for f in matches[:20]]
371
+ if dry_run:
372
+ json_print("forget", data={
373
+ "matched_count": len(matches), "deleted_count": 0,
374
+ "dry_run": True, "matches": match_items,
375
+ })
376
+ return
365
377
  if getattr(args, 'yes', False):
366
378
  for f in matches:
367
379
  engine._db.delete_fact(f.fact_id)
@@ -387,6 +399,9 @@ def cmd_forget(args: Namespace) -> None:
387
399
  print(f"Found {len(matches)} matching memories:")
388
400
  for f in matches[:10]:
389
401
  print(f" - {f.fact_id[:8]}... {f.content[:80]}")
402
+ if dry_run:
403
+ print(f"(dry run — {len(matches)} would be deleted)")
404
+ return
390
405
  if getattr(args, 'yes', False):
391
406
  for f in matches:
392
407
  engine._db.delete_fact(f.fact_id)
@@ -861,7 +876,8 @@ def cmd_trace(args: Namespace) -> None:
861
876
  try:
862
877
  config = SLMConfig.load()
863
878
  engine = MemoryEngine(config)
864
- response = engine.recall(args.query, limit=5)
879
+ limit = getattr(args, 'limit', 10)
880
+ response = engine.recall(args.query, limit=limit)
865
881
  except Exception as exc:
866
882
  if use_json:
867
883
  from superlocalmemory.cli.json_output import json_print
@@ -1435,6 +1451,7 @@ def cmd_consolidate(args: Namespace) -> None:
1435
1451
 
1436
1452
  use_json = getattr(args, "json", False)
1437
1453
  cognitive = getattr(args, "cognitive", False)
1454
+ dry_run = getattr(args, "dry_run", False)
1438
1455
  profile = getattr(args, "profile", "")
1439
1456
 
1440
1457
  if not cognitive:
@@ -1460,7 +1477,7 @@ def cmd_consolidate(args: Namespace) -> None:
1460
1477
  )
1461
1478
 
1462
1479
  consolidator = CognitiveConsolidator(db=engine._db)
1463
- result = consolidator.run_pipeline(pid)
1480
+ result = consolidator.run_pipeline(pid, dry_run=dry_run)
1464
1481
  except Exception as exc:
1465
1482
  if use_json:
1466
1483
  from superlocalmemory.cli.json_output import json_print
@@ -1473,7 +1490,7 @@ def cmd_consolidate(args: Namespace) -> None:
1473
1490
  if use_json:
1474
1491
  from superlocalmemory.cli.json_output import json_print
1475
1492
  json_print("consolidate", data={
1476
- "clusters_found": result.clusters_found,
1493
+ "clusters_processed": result.clusters_processed,
1477
1494
  "blocks_created": result.blocks_created,
1478
1495
  "facts_archived": result.facts_archived,
1479
1496
  "compression_ratio": round(result.compression_ratio, 3),
@@ -1484,7 +1501,7 @@ def cmd_consolidate(args: Namespace) -> None:
1484
1501
  return
1485
1502
 
1486
1503
  print("CCQ Cognitive Consolidation")
1487
- print(f" Clusters found: {result.clusters_found}")
1504
+ print(f" Clusters processed: {result.clusters_processed}")
1488
1505
  print(f" Blocks created: {result.blocks_created}")
1489
1506
  print(f" Facts archived: {result.facts_archived}")
1490
1507
  print(f" Compression ratio: {result.compression_ratio:.3f}")
@@ -123,6 +123,7 @@ def main() -> None:
123
123
 
124
124
  forget_p = sub.add_parser("forget", help="Delete memories matching a query (fuzzy)")
125
125
  forget_p.add_argument("query", help="Query to match for deletion")
126
+ forget_p.add_argument("--dry-run", action="store_true", default=False, help="Preview matches without deleting")
126
127
  forget_p.add_argument("--yes", "-y", action="store_true", help="Skip confirmation prompt")
127
128
  forget_p.add_argument("--json", action="store_true", help="Output structured JSON (agent-native)")
128
129
 
@@ -151,6 +152,7 @@ def main() -> None:
151
152
 
152
153
  trace_p = sub.add_parser("trace", help="Recall with per-channel score breakdown")
153
154
  trace_p.add_argument("query", help="Search query")
155
+ trace_p.add_argument("--limit", type=int, default=10, help="Max results (default 10)")
154
156
  trace_p.add_argument("--json", action="store_true", help="Output structured JSON (agent-native)")
155
157
 
156
158
  # -- Diagnostics (continued) ----------------------------------------
@@ -217,6 +219,10 @@ def main() -> None:
217
219
  "--cognitive", action="store_true",
218
220
  help="Run CCQ cognitive consolidation",
219
221
  )
222
+ consolidate_p.add_argument(
223
+ "--dry-run", action="store_true", default=False,
224
+ help="Preview without applying",
225
+ )
220
226
  consolidate_p.add_argument("--profile", default="", help="Target profile")
221
227
  consolidate_p.add_argument("--json", action="store_true", help="Output structured JSON (agent-native)")
222
228
 
@@ -612,15 +612,15 @@ class SLMConfig:
612
612
 
613
613
  rt = data.get("retrieval", {})
614
614
  if rt:
615
- # V3.3.2 migration: auto-enable ONNX cross-encoder.
616
- # Pre-3.3.2 configs had use_cross_encoder=False because the
617
- # PyTorch cross-encoder used ~1.5GB RAM. With ONNX backend
618
- # (~200MB), it's now safe for all modes. Detect old configs
619
- # by the absence of cross_encoder_backend field.
615
+ # V3.3.2 migration: add ONNX cross-encoder backend field.
616
+ # Pre-3.3.2 configs lacked cross_encoder_backend. Add it,
617
+ # but NEVER override an explicit use_cross_encoder setting.
618
+ # The user's explicit choice always wins.
620
619
  if "cross_encoder_backend" not in rt:
621
- rt["use_cross_encoder"] = True
622
- rt["cross_encoder_model"] = "cross-encoder/ms-marco-MiniLM-L-6-v2"
620
+ rt.setdefault("cross_encoder_model", "cross-encoder/ms-marco-MiniLM-L-6-v2")
623
621
  rt["cross_encoder_backend"] = "onnx"
622
+ # Only auto-enable if user didn't explicitly set the field
623
+ rt.setdefault("use_cross_encoder", True)
624
624
  config.retrieval = RetrievalConfig(**{
625
625
  k: v for k, v in rt.items()
626
626
  if k in RetrievalConfig.__dataclass_fields__
@@ -768,6 +768,9 @@ class SLMConfig:
768
768
  )
769
769
 
770
770
  # Mode C — FULL POWER, UNRESTRICTED
771
+ # Don't carry over local-only providers (ollama) to cloud mode
772
+ c_provider = llm_provider if llm_provider not in ("ollama", "") else "openrouter"
773
+ c_model = llm_model if llm_provider not in ("ollama", "") else "anthropic/claude-sonnet-4"
771
774
  return cls(
772
775
  mode=mode,
773
776
  base_dir=_base,
@@ -779,8 +782,8 @@ class SLMConfig:
779
782
  deployment_name=embedding_deployment,
780
783
  ),
781
784
  llm=LLMConfig(
782
- provider=llm_provider or "azure",
783
- model=llm_model or "gpt-4.1-mini",
785
+ provider=c_provider,
786
+ model=c_model,
784
787
  api_key=llm_api_key,
785
788
  api_base=llm_api_base,
786
789
  ),
@@ -142,8 +142,15 @@ class WorkerPool:
142
142
  # ------------------------------------------------------------------
143
143
 
144
144
  def _send(self, request: dict) -> dict:
145
- """Send request to worker and get response. Thread-safe."""
146
- return self._send_with_timeout(request, timeout=_REQUEST_TIMEOUT)
145
+ """Send request to worker and get response. Thread-safe.
146
+
147
+ Auto-retries once on worker death (idle timeout, crash).
148
+ """
149
+ resp = self._send_with_timeout(request, timeout=_REQUEST_TIMEOUT)
150
+ if not resp.get("ok") and "Worker" in resp.get("error", ""):
151
+ logger.info("Auto-restarting worker after failure, retrying request")
152
+ resp = self._send_with_timeout(request, timeout=_REQUEST_TIMEOUT)
153
+ return resp
147
154
 
148
155
  def _send_with_timeout(self, request: dict, timeout: float) -> dict:
149
156
  """Send request with configurable timeout. Thread-safe."""
@@ -214,11 +214,17 @@ class CognitiveConsolidator:
214
214
  # Public API
215
215
  # ------------------------------------------------------------------
216
216
 
217
- def run_pipeline(self, profile_id: str) -> CCQPipelineResult:
217
+ def run_pipeline(
218
+ self, profile_id: str, dry_run: bool = False,
219
+ ) -> CCQPipelineResult:
218
220
  """Execute the full 6-step CCQ pipeline.
219
221
 
220
222
  Per-cluster error isolation: one cluster failure does NOT
221
223
  abort the pipeline (HR-07).
224
+
225
+ Args:
226
+ profile_id: Target profile.
227
+ dry_run: If True, identify clusters but don't apply changes.
222
228
  """
223
229
  # Step 1: Identify candidates
224
230
  candidates = self._step1_identify(profile_id)
@@ -230,6 +236,18 @@ class CognitiveConsolidator:
230
236
  if not clusters:
231
237
  return self._empty_result()
232
238
 
239
+ if dry_run:
240
+ return CCQPipelineResult(
241
+ clusters_processed=len(clusters),
242
+ blocks_created=0,
243
+ facts_archived=len(candidates),
244
+ total_bytes_before=0,
245
+ total_bytes_after=0,
246
+ compression_ratio=0.0,
247
+ audit_entries=(),
248
+ errors=(),
249
+ )
250
+
233
251
  # Process each cluster
234
252
  blocks_created = 0
235
253
  facts_archived = 0
@@ -30,8 +30,11 @@ def _get_vader():
30
30
  if _vader_analyzer is not None:
31
31
  return _vader_analyzer
32
32
  try:
33
- from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
34
- _vader_analyzer = SentimentIntensityAnalyzer()
33
+ import warnings
34
+ with warnings.catch_warnings():
35
+ warnings.filterwarnings("ignore", category=DeprecationWarning, module="vaderSentiment")
36
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
37
+ _vader_analyzer = SentimentIntensityAnalyzer()
35
38
  except ImportError:
36
39
  logger.warning("vaderSentiment not installed — emotional tagging disabled")
37
40
  _vader_analyzer = None
@@ -498,7 +498,7 @@ class EntityResolver:
498
498
  max_tokens=256,
499
499
  temperature=0.0,
500
500
  )
501
- match = re.search(r"\{.*\}", response, re.DOTALL)
501
+ match = re.search(r"\{[^}]*\}", response)
502
502
  if not match:
503
503
  return {}
504
504
 
@@ -103,7 +103,9 @@ class PolarQuantEncoder:
103
103
  """
104
104
  path_str = self._config.rotation_matrix_path
105
105
  if not path_str:
106
- path_str = str(Path.home() / ".superlocalmemory" / "polar_rotation.npy")
106
+ path_str = str(
107
+ Path.home() / ".superlocalmemory" / f"polar_rotation_{self._d}.npy",
108
+ )
107
109
 
108
110
  path = Path(path_str)
109
111
 
@@ -83,6 +83,10 @@ class RetrievalEngine:
83
83
  self._bridge = bridge_discovery
84
84
  self._trust_scorer = trust_scorer
85
85
 
86
+ # V3.3.4: LRU cache for query embeddings (avoids redundant Ollama API calls)
87
+ self._query_embedding_cache: dict[str, list[float]] = {}
88
+ self._cache_max_size = 64
89
+
86
90
  # V3.2: ChannelRegistry for self-registration (Phase 0.5)
87
91
  from superlocalmemory.retrieval.channel_registry import ChannelRegistry
88
92
  self._registry = ChannelRegistry()
@@ -189,6 +193,21 @@ class RetrievalEngine:
189
193
 
190
194
  # -- Channel execution --------------------------------------------------
191
195
 
196
+ def _embed_query(self, query: str) -> list[float] | None:
197
+ """Embed query with LRU cache. Avoids redundant Ollama/API calls."""
198
+ if self._embedder is None:
199
+ return None
200
+ cached = self._query_embedding_cache.get(query)
201
+ if cached is not None:
202
+ return cached
203
+ emb = self._embedder.embed(query)
204
+ # Evict oldest if cache full
205
+ if len(self._query_embedding_cache) >= self._cache_max_size:
206
+ oldest = next(iter(self._query_embedding_cache))
207
+ del self._query_embedding_cache[oldest]
208
+ self._query_embedding_cache[query] = emb
209
+ return emb
210
+
192
211
  def _run_channels(
193
212
  self, query: str, profile_id: str, strat: QueryStrategy,
194
213
  ) -> dict[str, list[tuple[str, float]]]:
@@ -197,9 +216,20 @@ class RetrievalEngine:
197
216
  # Skip channels listed in disabled_channels (ablation support)
198
217
  disabled = set(self._config.disabled_channels)
199
218
 
200
- if self._semantic is not None and self._embedder is not None and "semantic" not in disabled:
219
+ # V3.3.4: Embed query ONCE, reuse for semantic + hopfield channels
220
+ q_emb: list[float] | None = None
221
+ needs_embedding = (
222
+ (self._semantic is not None and "semantic" not in disabled)
223
+ or (self._hopfield is not None and "hopfield" not in disabled)
224
+ )
225
+ if needs_embedding:
226
+ try:
227
+ q_emb = self._embed_query(query)
228
+ except Exception as exc:
229
+ logger.warning("Query embedding failed: %s", exc)
230
+
231
+ if self._semantic is not None and q_emb is not None and "semantic" not in disabled:
201
232
  try:
202
- q_emb = self._embedder.embed(query)
203
233
  r = self._semantic.search(q_emb, profile_id, self._config.semantic_top_k)
204
234
  if r:
205
235
  out["semantic"] = r
@@ -231,13 +261,11 @@ class RetrievalEngine:
231
261
  logger.warning("Temporal channel: %s", exc)
232
262
 
233
263
  # Phase G: Hopfield channel (6th) — energy-based pattern completion
234
- if self._hopfield is not None and "hopfield" not in disabled:
264
+ if self._hopfield is not None and q_emb is not None and "hopfield" not in disabled:
235
265
  try:
236
- q_emb = self._embedder.embed(query) if self._embedder else None
237
- if q_emb is not None:
238
- r = self._hopfield.search(q_emb, profile_id, self._config.hopfield_top_k)
239
- if r:
240
- out["hopfield"] = r
266
+ r = self._hopfield.search(q_emb, profile_id, self._config.hopfield_top_k)
267
+ if r:
268
+ out["hopfield"] = r
241
269
  except Exception as exc:
242
270
  logger.warning("Hopfield channel: %s", exc)
243
271
 
@@ -2,10 +2,13 @@
2
2
  # Licensed under the MIT License - see LICENSE file
3
3
  # Part of SuperLocalMemory V3 | https://qualixar.com | https://varunpratap.com
4
4
 
5
- """SuperLocalMemory V3 — Cross-Encoder Reranker.
5
+ """SuperLocalMemory V3 — Cross-Encoder Reranker (Subprocess-Isolated).
6
6
 
7
- Scores (query, fact) pairs through a cross-encoder in a single forward
8
- pass. Lazy model loading, thread-safe via lock.
7
+ V3.3.3: All PyTorch/ONNX model work runs in a SEPARATE subprocess.
8
+ The main process (dashboard, MCP, CLI) NEVER imports torch and stays
9
+ at ~60 MB. Same isolation pattern as EmbeddingService.
10
+
11
+ The worker subprocess auto-kills after 2 minutes idle.
9
12
 
10
13
  Part of Qualixar | Author: Varun Pratap Bhardwaj
11
14
  License: MIT
@@ -13,49 +16,33 @@ License: MIT
13
16
 
14
17
  from __future__ import annotations
15
18
 
19
+ import json
16
20
  import logging
17
- import platform
18
- import struct
21
+ import os
22
+ import subprocess
19
23
  import sys
20
24
  import threading
25
+ import time
21
26
  from typing import Any
22
27
 
23
28
  from superlocalmemory.storage.models import AtomicFact
24
29
 
25
30
  logger = logging.getLogger(__name__)
26
31
 
27
-
28
- def _detect_onnx_variant() -> str:
29
- """Auto-detect the best ONNX model variant for the current platform.
30
-
31
- Returns the file_name parameter for CrossEncoder model_kwargs.
32
- Platform detection:
33
- - macOS ARM64 (Apple Silicon): qint8_arm64
34
- - x86_64 with AVX2: quint8_avx2
35
- - Everything else: default model.onnx (float32, works everywhere)
36
- """
37
- arch = platform.machine().lower()
38
- is_64bit = struct.calcsize("P") * 8 == 64
39
-
40
- if sys.platform == "darwin" and arch in ("arm64", "aarch64"):
41
- return "onnx/model_qint8_arm64.onnx"
42
-
43
- if arch in ("x86_64", "amd64") and is_64bit:
44
- return "onnx/model_quint8_avx2.onnx"
45
-
46
- return "onnx/model.onnx"
32
+ _IDLE_TIMEOUT_SECONDS = 120 # 2 min → kill worker
33
+ _SUBPROCESS_RESPONSE_TIMEOUT = 120 # 120s for ONNX cold start
34
+ _WORKER_RECYCLE_AFTER = 500 # Recycle after N requests
47
35
 
48
36
 
49
37
  class CrossEncoderReranker:
50
38
  """Rerank candidate facts using a local cross-encoder model.
51
39
 
52
- V3.3.2: Uses ONNX backend by default (~200MB) instead of full PyTorch
53
- (~1.5GB). Three-tier fallback: ONNX → PyTorch → no reranking.
54
- Auto-detects the optimal quantized ONNX variant per platform.
40
+ V3.3.3: SUBPROCESS-ISOLATED. The main process never imports
41
+ sentence_transformers or torch. All model work runs in a child
42
+ process via JSON over stdin/stdout.
55
43
 
56
- When the model is unavailable (missing package, download failure,
57
- offline environment), falls back to returning candidates in their
58
- original score order — never crashes.
44
+ Non-blocking first-use: triggers background worker spawn, returns
45
+ fallback scores until worker is ready.
59
46
 
60
47
  Args:
61
48
  model_name: HuggingFace cross-encoder model identifier.
@@ -70,106 +57,207 @@ class CrossEncoderReranker:
70
57
  ) -> None:
71
58
  self._model_name = model_name
72
59
  self._backend = backend
73
- self._model: Any = None
74
- self._loaded = False
75
- self._loading = False # True while background load is in progress
76
- self._active_backend: str = ""
60
+ self._worker_proc: subprocess.Popen | None = None
61
+ self._model_loaded = False # True once worker confirms model is ready
62
+ self._worker_loading = False # True while background warmup in progress
77
63
  self._lock = threading.Lock()
64
+ self._idle_timer: threading.Timer | None = None
65
+ self._request_count: int = 0
66
+
67
+ # Start background warmup immediately — worker loads model
68
+ # while the rest of init continues. First recall gets instant
69
+ # fallback; second recall uses the warm model.
70
+ self._start_background_warmup()
78
71
 
79
72
  # ------------------------------------------------------------------
80
- # Lazy loading (non-blocking)
73
+ # Background warmup (non-blocking model load)
81
74
  # ------------------------------------------------------------------
82
75
 
83
- def _ensure_model(self) -> None:
84
- """Trigger model load in background (non-blocking).
85
-
86
- On first call, starts loading in a background thread and returns
87
- immediately. The model becomes available for subsequent calls
88
- once loading completes. This prevents the 30s ONNX cold start
89
- from blocking the first recall request.
76
+ def _start_background_warmup(self) -> None:
77
+ """Start worker and load model in background thread.
90
78
 
91
- Three-tier fallback:
92
- 1. ONNX backend with platform-optimal quantization — ~100-200MB RAM
93
- 2. PyTorch backend (requires torch) — ~1.5GB RAM
94
- 3. No model (graceful degradation) — 0 RAM
79
+ Returns immediately. The worker loads the model in parallel
80
+ with the rest of engine initialization and the first recall.
95
81
  """
96
- if self._loaded:
82
+ if self._worker_loading or self._model_loaded:
97
83
  return
84
+ self._worker_loading = True
85
+
86
+ def _warmup() -> None:
87
+ try:
88
+ self._ensure_worker()
89
+ if self._worker_proc is None:
90
+ return
91
+ # Send load command and wait for response
92
+ req = json.dumps({
93
+ "cmd": "load",
94
+ "model_name": self._model_name,
95
+ "backend": self._backend,
96
+ }) + "\n"
97
+ self._worker_proc.stdin.write(req)
98
+ self._worker_proc.stdin.flush()
99
+ resp_line = self._readline_with_timeout(
100
+ self._worker_proc.stdout, _SUBPROCESS_RESPONSE_TIMEOUT,
101
+ )
102
+ if resp_line:
103
+ resp = json.loads(resp_line)
104
+ if resp.get("ok"):
105
+ self._model_loaded = True
106
+ logger.info(
107
+ "Reranker worker warm (backend=%s)",
108
+ resp.get("backend", "?"),
109
+ )
110
+ self._reset_idle_timer()
111
+ except Exception as exc:
112
+ logger.debug("Background reranker warmup failed: %s", exc)
113
+ finally:
114
+ self._worker_loading = False
115
+
116
+ t = threading.Thread(target=_warmup, daemon=True, name="ce-warmup")
117
+ t.start()
98
118
 
99
- with self._lock:
100
- if self._loaded or self._loading:
101
- return
102
- self._loading = True
119
+ # ------------------------------------------------------------------
120
+ # Worker management (mirrors EmbeddingService pattern)
121
+ # ------------------------------------------------------------------
103
122
 
104
- # Load in background thread so first recall isn't blocked
105
- loader = threading.Thread(
106
- target=self._load_model, daemon=True, name="ce-loader",
107
- )
108
- loader.start()
123
+ def _ensure_worker(self) -> None:
124
+ """Spawn worker subprocess if not running. Non-blocking."""
125
+ if self._worker_proc is not None and self._worker_proc.poll() is None:
126
+ return
127
+ self._worker_proc = None
128
+ self._worker_ready = False
109
129
 
110
- def _load_model(self) -> None:
111
- """Actually load the model (runs in background thread)."""
130
+ worker_module = "superlocalmemory.core.reranker_worker"
112
131
  try:
113
- from sentence_transformers import CrossEncoder
114
-
115
- if self._backend == "onnx":
116
- try:
117
- onnx_file = _detect_onnx_variant()
118
- model = CrossEncoder(
119
- self._model_name,
120
- backend="onnx",
121
- model_kwargs={"file_name": onnx_file},
122
- )
123
- self._model = model
124
- self._active_backend = "onnx"
125
- logger.info(
126
- "Cross-encoder loaded (ONNX %s): %s",
127
- onnx_file, self._model_name,
128
- )
129
- except Exception as onnx_exc:
130
- logger.info(
131
- "ONNX backend unavailable (%s), falling back to PyTorch",
132
- onnx_exc,
133
- )
134
- model = CrossEncoder(self._model_name)
135
- self._model = model
136
- self._active_backend = "pytorch"
137
- logger.info(
138
- "Cross-encoder loaded (PyTorch fallback): %s",
139
- self._model_name,
140
- )
141
- else:
142
- model = CrossEncoder(self._model_name)
143
- self._model = model
144
- self._active_backend = "pytorch"
145
- logger.info("Cross-encoder loaded: %s", self._model_name)
146
- except ImportError:
147
- logger.warning(
148
- "sentence-transformers not installed; "
149
- "cross-encoder reranking disabled"
132
+ env = {
133
+ **os.environ,
134
+ "CUDA_VISIBLE_DEVICES": "",
135
+ "PYTORCH_MPS_HIGH_WATERMARK_RATIO": "0.0",
136
+ "PYTORCH_MPS_MEM_LIMIT": "0",
137
+ "PYTORCH_ENABLE_MPS_FALLBACK": "1",
138
+ "TOKENIZERS_PARALLELISM": "false",
139
+ "TORCH_DEVICE": "cpu",
140
+ }
141
+ self._worker_proc = subprocess.Popen(
142
+ [sys.executable, "-m", worker_module],
143
+ stdin=subprocess.PIPE,
144
+ stdout=subprocess.PIPE,
145
+ stderr=subprocess.DEVNULL,
146
+ text=True,
147
+ bufsize=1,
148
+ env=env,
149
+ start_new_session=True,
150
150
  )
151
- except OSError as exc:
152
- logger.warning(
153
- "Failed to load cross-encoder %s: %s",
154
- self._model_name,
155
- exc,
151
+ logger.info(
152
+ "Reranker worker spawned (PID %d)", self._worker_proc.pid,
156
153
  )
157
- finally:
158
- self._loaded = True
159
- self._loading = False
154
+ self._worker_ready = True
155
+ except Exception as exc:
156
+ logger.warning("Failed to spawn reranker worker: %s", exc)
157
+ self._worker_proc = None
160
158
 
161
- def _ensure_model_blocking(self) -> None:
162
- """Load model synchronously (blocks until ready).
159
+ def _send_request(self, req: dict, timeout: float | None = None) -> dict | None:
160
+ """Send JSON request to worker, get response. Thread-safe.
163
161
 
164
- Used by warmup and is_available where we need the model NOW.
162
+ Uses a short timeout (10s) for rerank requests since the model
163
+ should already be loaded by the background warmup. Uses the full
164
+ timeout only for explicit load/ping commands.
165
165
  """
166
- if self._loaded:
167
- return
166
+ effective_timeout = timeout or _SUBPROCESS_RESPONSE_TIMEOUT
167
+
168
168
  with self._lock:
169
- if self._loaded:
170
- return
171
- self._loading = True
172
- self._load_model()
169
+ if self._request_count >= _WORKER_RECYCLE_AFTER and self._worker_proc is not None:
170
+ logger.info("Recycling reranker worker after %d requests", self._request_count)
171
+ self._kill_worker()
172
+ self._model_loaded = False
173
+ self._request_count = 0
174
+
175
+ # Ensure worker is alive (re-spawn if crashed)
176
+ if self._worker_proc is None or self._worker_proc.poll() is not None:
177
+ self._ensure_worker()
178
+ if self._worker_proc is None:
179
+ return None
180
+
181
+ try:
182
+ msg = json.dumps(req) + "\n"
183
+ self._worker_proc.stdin.write(msg)
184
+ self._worker_proc.stdin.flush()
185
+
186
+ resp_line = self._readline_with_timeout(
187
+ self._worker_proc.stdout,
188
+ effective_timeout,
189
+ )
190
+ if not resp_line:
191
+ logger.warning("Reranker worker timed out after %ds", effective_timeout)
192
+ self._kill_worker()
193
+ self._model_loaded = False
194
+ return None
195
+
196
+ resp = json.loads(resp_line)
197
+ self._reset_idle_timer()
198
+ self._request_count += 1
199
+ return resp
200
+ except (BrokenPipeError, OSError, json.JSONDecodeError) as exc:
201
+ logger.warning("Reranker worker communication failed: %s", exc)
202
+ self._kill_worker()
203
+ self._model_loaded = False
204
+ return None
205
+
206
+ @staticmethod
207
+ def _readline_with_timeout(stream: Any, timeout_seconds: float) -> str:
208
+ """Read a line from stream with timeout. Returns '' on timeout."""
209
+ result_container: list[str] = []
210
+ error_container: list[Exception] = []
211
+
212
+ def _read() -> None:
213
+ try:
214
+ result_container.append(stream.readline())
215
+ except Exception as exc:
216
+ error_container.append(exc)
217
+
218
+ reader = threading.Thread(target=_read, daemon=True)
219
+ reader.start()
220
+ reader.join(timeout=timeout_seconds)
221
+
222
+ if reader.is_alive():
223
+ return ""
224
+ if error_container:
225
+ raise error_container[0]
226
+ return result_container[0] if result_container else ""
227
+
228
+ def _kill_worker(self) -> None:
229
+ """Terminate worker subprocess."""
230
+ if self._idle_timer is not None:
231
+ self._idle_timer.cancel()
232
+ self._idle_timer = None
233
+ if self._worker_proc is not None:
234
+ try:
235
+ self._worker_proc.stdin.write('{"cmd":"quit"}\n')
236
+ self._worker_proc.stdin.flush()
237
+ self._worker_proc.wait(timeout=3)
238
+ except Exception:
239
+ try:
240
+ self._worker_proc.kill()
241
+ except Exception:
242
+ pass
243
+ self._worker_proc = None
244
+ self._worker_ready = False
245
+
246
+ def _reset_idle_timer(self) -> None:
247
+ """Reset idle timer — kills worker after 2 min inactivity."""
248
+ if self._idle_timer is not None:
249
+ self._idle_timer.cancel()
250
+ self._idle_timer = threading.Timer(
251
+ _IDLE_TIMEOUT_SECONDS, self.unload,
252
+ )
253
+ self._idle_timer.daemon = True
254
+ self._idle_timer.start()
255
+
256
+ def unload(self) -> None:
257
+ """Kill the worker subprocess to free all memory."""
258
+ with self._lock:
259
+ self._kill_worker()
260
+ logger.info("CrossEncoderReranker: worker killed (idle timeout)")
173
261
 
174
262
  # ------------------------------------------------------------------
175
263
  # Public API
@@ -183,73 +271,62 @@ class CrossEncoderReranker:
183
271
  ) -> list[tuple[AtomicFact, float]]:
184
272
  """Rerank candidates by cross-encoder relevance.
185
273
 
186
- Each (query, fact.content) pair is scored in a single forward
187
- pass. Results are returned sorted by cross-encoder score.
188
-
189
- When the model is unavailable, returns candidates sorted by
190
- their existing score (graceful fallback).
191
-
192
- Args:
193
- query: User query text.
194
- candidates: List of (AtomicFact, score) tuples from the
195
- fusion stage.
196
- top_k: Maximum results to return.
197
-
198
- Returns:
199
- Top-k (AtomicFact, cross_encoder_score) tuples, sorted
200
- descending by cross-encoder score.
274
+ NON-BLOCKING: If the worker is still loading the model
275
+ (background warmup), returns candidates by existing score
276
+ immediately. Once the worker is warm, subsequent calls use
277
+ the cross-encoder. This means CLI first-call gets instant
278
+ results (without reranking), and MCP gets reranked results
279
+ (worker stays warm between calls).
201
280
  """
202
281
  if not candidates:
203
282
  return []
204
283
 
205
- # Non-blocking: trigger background load if not yet started
206
- self._ensure_model()
207
-
208
- if self._model is None:
209
- # Model not loaded yet (still loading in background or failed).
210
- # Graceful fallback: return candidates sorted by existing score.
211
- # Next recall will use the model once it's ready.
212
- sorted_cands = sorted(
213
- candidates, key=lambda x: x[1], reverse=True
214
- )
284
+ # Non-blocking: if model isn't loaded yet, return fallback
285
+ if not self._model_loaded:
286
+ sorted_cands = sorted(candidates, key=lambda x: x[1], reverse=True)
215
287
  return sorted_cands[:top_k]
216
288
 
217
- # Build (query, document) pairs for batch scoring
218
- pairs: list[tuple[str, str]] = [
219
- (query, fact.content) for fact, _ in candidates
220
- ]
289
+ documents = [fact.content for fact, _ in candidates]
290
+
291
+ # Short timeout (10s) — model should already be loaded by warmup.
292
+ # If worker crashed or is still loading, fallback immediately.
293
+ resp = self._send_request({
294
+ "cmd": "rerank",
295
+ "query": query,
296
+ "documents": documents,
297
+ }, timeout=10.0)
221
298
 
222
- scores = self._model.predict(pairs)
299
+ if resp is None or not resp.get("ok"):
300
+ # Fallback: return by existing score
301
+ sorted_cands = sorted(candidates, key=lambda x: x[1], reverse=True)
302
+ return sorted_cands[:top_k]
223
303
 
304
+ scores = resp["scores"]
224
305
  scored: list[tuple[AtomicFact, float]] = [
225
306
  (fact, float(score))
226
307
  for (fact, _), score in zip(candidates, scores)
227
308
  ]
228
-
229
309
  scored.sort(key=lambda x: x[1], reverse=True)
230
310
  return scored[:top_k]
231
311
 
232
312
  def score_pair(self, query: str, document: str) -> float:
233
- """Score a single (query, document) pair.
234
-
235
- Args:
236
- query: Query text.
237
- document: Document text.
238
-
239
- Returns:
240
- Relevance score (higher = more relevant). 0.0 if model
241
- is unavailable.
242
- """
243
- self._ensure_model()
244
-
245
- if self._model is None:
313
+ """Score a single (query, document) pair."""
314
+ resp = self._send_request({
315
+ "cmd": "score",
316
+ "query": query,
317
+ "document": document,
318
+ "model_name": self._model_name,
319
+ "backend": self._backend,
320
+ })
321
+
322
+ if resp is None or not resp.get("ok"):
246
323
  return 0.0
247
-
248
- scores = self._model.predict([(query, document)])
249
- return float(scores[0])
324
+ return float(resp.get("score", 0.0))
250
325
 
251
326
  @property
252
327
  def is_available(self) -> bool:
253
- """Whether the cross-encoder model is loaded and ready."""
254
- self._ensure_model_blocking()
255
- return self._model is not None
328
+ """Whether the cross-encoder worker can be spawned."""
329
+ resp = self._send_request({"cmd": "ping"})
330
+ if resp is None:
331
+ return False
332
+ return resp.get("ok", False)
@@ -36,11 +36,12 @@ _REINDEX_BATCH_SIZE = 50
36
36
  def _model_signature(config: SLMConfig) -> str:
37
37
  """Derive a deterministic signature from the active embedding config.
38
38
 
39
- The signature combines provider + model_name + dimension so that
40
- any change in embedding source is detected.
39
+ V3.3.4: Only model_name + dimension matter. Provider (sentence-transformers
40
+ vs ollama) doesn't change the embedding space when the model is the same.
41
+ This prevents spurious re-indexing when switching Mode A ↔ B.
41
42
  """
42
43
  emb = config.embedding
43
- return f"{emb.provider}::{emb.model_name}::{emb.dimension}"
44
+ return f"{emb.model_name}::{emb.dimension}"
44
45
 
45
46
 
46
47
  def _read_stored_signature(config_dir: Path) -> str: