superlocalmemory 3.0.17 → 3.0.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,267 @@
1
+ # Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
2
+ # Licensed under the MIT License - see LICENSE file
3
+ # Part of SuperLocalMemory V3 | https://qualixar.com | https://varunpratap.com
4
+
5
+ """Subprocess recall worker — runs the full recall pipeline in isolation.
6
+
7
+ The dashboard/MCP main process NEVER imports torch, numpy, or the engine.
8
+ All heavy work (engine init, embedding, retrieval, reranking) happens here.
9
+
10
+ Protocol (JSON over stdin/stdout):
11
+ Request: {"cmd": "recall", "query": "...", "limit": 10}
12
+ Response: {"ok": true, "results": [...], "query_type": "...", ...}
13
+
14
+ Part of Qualixar | Author: Varun Pratap Bhardwaj
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import json
20
+ import os
21
+ import sys
22
+
23
# Force CPU BEFORE any torch import — these variables are only honored if they
# are set before torch/transformers are first imported, which happens lazily
# inside _get_engine(), so setting them at module load time is sufficient.
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # hide all NVIDIA GPUs from torch
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"  # no Apple MPS memory reservation
os.environ["PYTORCH_MPS_MEM_LIMIT"] = "0"
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TORCH_DEVICE"] = "cpu"

# Lazily-initialized MemoryEngine singleton; created on first request by
# _get_engine() so importing this module stays cheap.
_engine = None
32
+
33
+
34
def _get_engine():
    """Return the lazily-created singleton MemoryEngine.

    The heavy imports (config, engine, and everything they pull in) are
    deferred to the first call so the worker starts fast and the parent
    process never pays for them.
    """
    global _engine
    if _engine is not None:
        return _engine
    from superlocalmemory.core.config import SLMConfig
    from superlocalmemory.core.engine import MemoryEngine
    _engine = MemoryEngine(SLMConfig.load())
    _engine.initialize()
    return _engine
43
+
44
+
45
def _handle_recall(query: str, limit: int) -> dict:
    """Run the recall pipeline and serialize results into plain JSON types.

    Scores are rounded to 4 decimals and fact content truncated to 300
    chars to keep the stdout protocol payload small.
    """
    engine = _get_engine()
    response = engine.recall(query, limit=limit)
    top_hits = response.results[:limit]

    # One batched DB round-trip for the original memory text of every hit.
    wanted_ids = {hit.fact.memory_id for hit in top_hits if hit.fact.memory_id}
    source_map = engine._db.get_memory_content_batch(list(wanted_ids)) if wanted_ids else {}

    serialized = [
        {
            "fact_id": hit.fact.fact_id,
            "memory_id": hit.fact.memory_id,
            "content": hit.fact.content[:300],
            "source_content": source_map.get(hit.fact.memory_id, ""),
            "score": round(hit.score, 4),
            "confidence": round(hit.confidence, 4),
            "trust_score": round(hit.trust_score, 4),
            "channel_scores": {
                name: round(value, 4)
                for name, value in (hit.channel_scores or {}).items()
            },
        }
        for hit in top_hits
    ]
    return {
        "ok": True,
        "query": query,
        "query_type": response.query_type,
        "result_count": len(serialized),
        "retrieval_time_ms": round(response.retrieval_time_ms, 1),
        "results": serialized,
    }
75
+
76
+
77
def _handle_store(content: str, metadata: dict) -> dict:
    """Store content via the engine, then best-effort attach a summary.

    The summary step is non-critical: any failure is swallowed because the
    store itself has already succeeded by that point.
    """
    engine = _get_engine()
    session_id = metadata.pop("session_id", "")
    fact_ids = engine.store(content, session_id=session_id, metadata=metadata)

    if fact_ids:
        try:
            from superlocalmemory.core.summarizer import Summarizer
            summary = Summarizer(engine._config).summarize_cluster([{"content": content}])
            if summary:
                # Resolve the memory_id that owns the first stored fact.
                rows = engine._db.execute(
                    "SELECT memory_id FROM atomic_facts WHERE fact_id = ? LIMIT 1",
                    (fact_ids[0],),
                )
                if rows:
                    engine._db.update_memory_summary(dict(rows[0])["memory_id"], summary)
        except Exception:
            pass  # Summary is non-critical

    return {"ok": True, "fact_ids": fact_ids, "count": len(fact_ids)}
101
+
102
+
103
def _handle_get_memory_facts(memory_id: str) -> dict:
    """Return the original memory text plus its extracted atomic facts."""
    engine = _get_engine()
    profile = engine.profile_id

    # Original memory content (batch API reused for a single id).
    original = engine._db.get_memory_content_batch([memory_id]).get(memory_id, "")

    fact_list = [
        {
            "fact_id": fact.fact_id,
            "content": fact.content,
            "fact_type": fact.fact_type.value if hasattr(fact.fact_type, 'value') else str(fact.fact_type),
            "confidence": round(fact.confidence, 3),
            "created_at": fact.created_at,
        }
        for fact in engine._db.get_facts_by_memory_id(memory_id, profile)
    ]
    return {
        "ok": True,
        "memory_id": memory_id,
        "original_content": original,
        "facts": fact_list,
        "fact_count": len(fact_list),
    }
127
+
128
+
129
def _handle_delete_memory(fact_id: str, agent_id: str = "system") -> dict:
    """Delete a specific atomic fact by ID with audit logging.

    Returns {"ok": False, ...} when the fact does not exist in the active
    profile; otherwise deletes it and records an audit-log entry.
    """
    engine = _get_engine()
    rows = engine._db.execute(
        "SELECT content FROM atomic_facts WHERE fact_id = ? AND profile_id = ? LIMIT 1",
        (fact_id, engine.profile_id),
    )
    if not rows:
        return {"ok": False, "error": f"Memory {fact_id} not found"}

    preview = dict(rows[0]).get("content", "")[:80]
    engine._db.delete_fact(fact_id)

    # Audit trail: who deleted what (truncated id + content preview).
    import logging as _logging
    _logging.getLogger("superlocalmemory.audit").info(
        "DELETE fact_id=%s by agent=%s content=%s", fact_id[:16], agent_id, preview,
    )
    return {"ok": True, "deleted": fact_id, "content_preview": preview}
147
+
148
+
149
def _handle_update_memory(fact_id: str, content: str, agent_id: str = "system") -> dict:
    """Update content of a specific atomic fact with audit logging.

    Returns {"ok": False, ...} when the fact does not exist in the active
    profile; otherwise rewrites its content and records an audit entry
    containing both old and new previews.
    """
    engine = _get_engine()
    rows = engine._db.execute(
        "SELECT content FROM atomic_facts WHERE fact_id = ? AND profile_id = ? LIMIT 1",
        (fact_id, engine.profile_id),
    )
    if not rows:
        return {"ok": False, "error": f"Memory {fact_id} not found"}

    previous = dict(rows[0]).get("content", "")[:80]
    engine._db.execute(
        "UPDATE atomic_facts SET content = ? WHERE fact_id = ?",
        (content, fact_id),
    )

    import logging as _logging
    _logging.getLogger("superlocalmemory.audit").info(
        "UPDATE fact_id=%s by agent=%s old=%s new=%s",
        fact_id[:16], agent_id, previous, content[:80],
    )
    return {"ok": True, "fact_id": fact_id, "content": content}
170
+
171
+
172
def _handle_summarize(texts: list[str], mode: str) -> dict:
    """Generate a summary from texts (heuristic in Mode A, LLM in B/C).

    NOTE(review): ``mode`` is accepted for protocol compatibility but is not
    used here — the Summarizer reads the mode from the engine config.
    """
    from superlocalmemory.core.summarizer import Summarizer
    engine = _get_engine()
    payload = [{"content": text} for text in texts]
    summary = Summarizer(engine._config).summarize_cluster(payload)
    return {"ok": True, "summary": summary}
181
+
182
+
183
def _handle_synthesize(query: str, facts: list[dict]) -> dict:
    """Generate a synthesized answer from the query plus retrieved facts."""
    from superlocalmemory.core.summarizer import Summarizer
    engine = _get_engine()
    answer = Summarizer(engine._config).synthesize_answer(query, facts)
    return {"ok": True, "synthesis": answer}
190
+
191
+
192
def _handle_status() -> dict:
    """Report engine mode, active profile, and stored fact count."""
    engine = _get_engine()
    profile = engine.profile_id
    return {
        "ok": True,
        "mode": engine._config.mode.value,
        "profile": profile,
        "fact_count": engine._db.get_fact_count(profile),
    }
202
+
203
+
204
def _worker_main() -> None:
    """Main loop: read JSON requests from stdin, write one JSON response per line.

    Protocol control commands ("quit", "ping") are handled inline; everything
    else routes through a dispatch table. Handler exceptions and unknown
    commands produce {"ok": false, ...} responses instead of crashing.
    """
    dispatch = {
        "recall": lambda r: _handle_recall(r.get("query", ""), r.get("limit", 10)),
        "store": lambda r: _handle_store(r.get("content", ""), r.get("metadata", {})),
        "delete_memory": lambda r: _handle_delete_memory(
            r.get("fact_id", ""), r.get("agent_id", "system"),
        ),
        "update_memory": lambda r: _handle_update_memory(
            r.get("fact_id", ""), r.get("content", ""), r.get("agent_id", "system"),
        ),
        "get_memory_facts": lambda r: _handle_get_memory_facts(r.get("memory_id", "")),
        "summarize": lambda r: _handle_summarize(r.get("texts", []), r.get("mode", "a")),
        "synthesize": lambda r: _handle_synthesize(r.get("query", ""), r.get("facts", [])),
        "status": lambda r: _handle_status(),
    }

    for raw in sys.stdin:
        raw = raw.strip()
        if not raw:
            continue
        try:
            req = json.loads(raw)
        except json.JSONDecodeError:
            _respond({"ok": False, "error": "Invalid JSON"})
            continue

        cmd = req.get("cmd", "")
        if cmd == "quit":
            break
        if cmd == "ping":
            _respond({"ok": True})
            continue

        handler = dispatch.get(cmd)
        if handler is None:
            _respond({"ok": False, "error": f"Unknown command: {cmd}"})
            continue
        try:
            _respond(handler(req))
        except Exception as exc:
            _respond({"ok": False, "error": str(exc)})
259
+
260
+
261
+ def _respond(data: dict) -> None:
262
+ sys.stdout.write(json.dumps(data) + "\n")
263
+ sys.stdout.flush()
264
+
265
+
266
+ if __name__ == "__main__":
267
+ _worker_main()
@@ -0,0 +1,182 @@
1
+ # Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
2
+ # Licensed under the MIT License - see LICENSE file
3
+ # Part of SuperLocalMemory V3 | https://qualixar.com | https://varunpratap.com
4
+
5
+ """Summarizer — Mode A heuristic + Mode B Ollama + Mode C OpenRouter.
6
+
7
+ Generates cluster summaries and search synthesis. All LLM failures
8
+ fall back to heuristic silently — never crashes the caller.
9
+
10
+ Part of Qualixar | Author: Varun Pratap Bhardwaj
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import logging
16
+ import os
17
+ import re
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
class Summarizer:
    """Generate summaries using a heuristic (Mode A) or an LLM (Modes B/C).

    All LLM failures fall back silently: cluster summaries degrade to the
    heuristic, synthesis degrades to an empty string — this class never
    raises into the caller.
    """

    def __init__(self, config) -> None:
        """Store config and normalize the mode to a plain string ('a'/'b'/'c')."""
        self._config = config
        self._mode = config.mode.value if hasattr(config.mode, 'value') else str(config.mode)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def summarize_cluster(self, members: list[dict]) -> str:
        """Generate a human-readable cluster summary.

        Args:
            members: List of dicts with 'content' key.

        Returns:
            Summary string (2-3 sentences); "Empty cluster." when no
            member has content.
        """
        texts = [m.get("content", "") for m in members if m.get("content")]
        if not texts:
            return "Empty cluster."
        if self._mode in ("b", "c") and self._has_llm():
            try:
                prompt = self._cluster_prompt(texts[:10])
                return self._call_llm(prompt, max_tokens=150)
            except Exception as exc:
                logger.warning("LLM cluster summary failed, using heuristic: %s", exc)
        return self._heuristic_summary(texts[:5])

    def synthesize_answer(self, query: str, facts: list[dict]) -> str:
        """Generate a synthesized answer from query + retrieved facts.

        Returns empty string in Mode A (no LLM available) or on any
        LLM failure.
        """
        if self._mode == "a" or not self._has_llm():
            return ""
        texts = [f.get("content", "") for f in facts if f.get("content")]
        if not texts:
            return ""
        try:
            prompt = self._synthesis_prompt(query, texts[:8])
            return self._call_llm(prompt, max_tokens=250)
        except Exception as exc:
            logger.warning("LLM synthesis failed: %s", exc)
            return ""

    # ------------------------------------------------------------------
    # Heuristic (Mode A — always available)
    # ------------------------------------------------------------------

    def _heuristic_summary(self, texts: list[str]) -> str:
        """First sentence from the top-3 texts, deduplicated, joined, capped at 300 chars."""
        sentences = []
        for text in texts[:3]:
            first = self._first_sentence(text)
            if first and first not in sentences:
                sentences.append(first)
        return " ".join(sentences)[:300] if sentences else "No summary available."

    @staticmethod
    def _first_sentence(text: str) -> str:
        """Extract the first sentence (up to ., ! or ?), else the first 100 chars.

        BUGFIX: the terminator may now be followed by whitespace OR
        end-of-string. The previous pattern required trailing whitespace,
        so a text whose only sentence ended exactly at end-of-string fell
        through to the 100-char truncation even when it had a terminator.
        """
        text = text.strip()
        match = re.match(r'^(.+?[.!?])(?:\s|$)', text)
        if match:
            return match.group(1).strip()
        return text[:100].strip()

    # ------------------------------------------------------------------
    # LLM calls (Mode B/C)
    # ------------------------------------------------------------------

    def _has_llm(self) -> bool:
        """Check if an LLM backend is available for the current mode."""
        if self._mode == "b":
            return True  # Ollama assumed running locally
        if self._mode == "c":
            # OpenRouter needs an API key from env or config.
            return bool(
                os.environ.get("OPENROUTER_API_KEY")
                or getattr(self._config.llm, 'api_key', None)
            )
        return False

    def _call_llm(self, prompt: str, max_tokens: int = 200) -> str:
        """Route to Ollama (Mode B) or OpenRouter (Mode C)."""
        if self._mode == "b":
            return self._call_ollama(prompt, max_tokens)
        return self._call_openrouter(prompt, max_tokens)

    def _call_ollama(self, prompt: str, max_tokens: int = 200) -> str:
        """Call local Ollama for summary generation.

        Raises on HTTP/connection errors; callers catch and fall back.
        """
        import httpx
        model = getattr(self._config.llm, 'model', None) or "llama3.1:8b"
        with httpx.Client(timeout=httpx.Timeout(20.0)) as client:
            resp = client.post("http://localhost:11434/api/generate", json={
                "model": model,
                "prompt": prompt,
                "stream": False,
                "options": {"num_predict": max_tokens, "temperature": 0.3},
            })
            resp.raise_for_status()
            return resp.json().get("response", "").strip()

    def _call_openrouter(self, prompt: str, max_tokens: int = 200) -> str:
        """Call the OpenRouter API for summary generation.

        Raises RuntimeError when no API key is configured; HTTP errors
        propagate to the caller's fallback handling.
        """
        import httpx
        api_key = (
            os.environ.get("OPENROUTER_API_KEY")
            or getattr(self._config.llm, 'api_key', None)
        )
        if not api_key:
            raise RuntimeError("No OpenRouter API key")
        model = (
            getattr(self._config.llm, 'model', None)
            or "meta-llama/llama-3.1-8b-instruct:free"
        )
        with httpx.Client(timeout=httpx.Timeout(20.0)) as client:
            resp = client.post(
                "https://openrouter.ai/api/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": model,
                    "messages": [{"role": "user", "content": prompt}],
                    "max_tokens": max_tokens,
                    "temperature": 0.3,
                },
            )
            resp.raise_for_status()
            choices = resp.json().get("choices", [])
            if choices:
                return choices[0].get("message", {}).get("content", "").strip()
            return ""

    # ------------------------------------------------------------------
    # Prompt templates
    # ------------------------------------------------------------------

    @staticmethod
    def _cluster_prompt(texts: list[str]) -> str:
        """Numbered-list prompt asking for a 2-3 sentence cluster summary."""
        numbered = "\n".join(f"{i+1}. {t[:200]}" for i, t in enumerate(texts))
        return (
            "Summarize the following related memories in 2-3 concise sentences. "
            "Focus on the common theme and key facts.\n\n"
            f"Memories:\n{numbered}\n\n"
            "Summary:"
        )

    @staticmethod
    def _synthesis_prompt(query: str, texts: list[str]) -> str:
        """Bulleted-evidence prompt asking for a concise answer to the query."""
        numbered = "\n".join(f"- {t[:200]}" for t in texts)
        return (
            f"Based on these stored memories, answer the question concisely.\n\n"
            f"Question: {query}\n\n"
            f"Relevant memories:\n{numbered}\n\n"
            "Answer (2-3 sentences):"
        )
@@ -0,0 +1,217 @@
1
+ # Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
2
+ # Licensed under the MIT License - see LICENSE file
3
+ # Part of SuperLocalMemory V3 | https://qualixar.com | https://varunpratap.com
4
+
5
+ """Recall worker pool — manages subprocess lifecycle for all callers.
6
+
7
+ Single shared worker process handles requests from dashboard, MCP, CLI.
8
+ Serializes concurrent requests via a threading lock (one at a time to
9
+ avoid interleaved stdout). Worker auto-kills after idle timeout.
10
+
11
+ Usage:
12
+ pool = WorkerPool.shared()
13
+ result = pool.recall("what is X?", limit=10)
14
+ result = pool.store("some content", metadata={})
15
+
16
+ Part of Qualixar | Author: Varun Pratap Bhardwaj
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import json
22
+ import logging
23
+ import os
24
+ import subprocess
25
+ import sys
26
+ import threading
27
+ import time
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
_IDLE_TIMEOUT = 120  # seconds of inactivity before the worker is killed to free memory
_REQUEST_TIMEOUT = 60  # max seconds a single request may take before the worker is presumed hung
33
+
34
+
35
class WorkerPool:
    """Manages a single recall_worker subprocess with idle auto-kill.

    Thread-safe: concurrent callers are serialized via lock.
    The worker subprocess holds all heavy memory (PyTorch, engine).
    The calling process stays at ~60 MB.
    """

    # Process-wide singleton and the lock guarding its lazy creation.
    _instance: WorkerPool | None = None
    _instance_lock = threading.Lock()

    def __init__(self) -> None:
        # Serializes all worker I/O: one in-flight request at a time so the
        # worker's stdout lines are never interleaved between callers.
        self._lock = threading.Lock()
        # Live worker subprocess, or None when not running.
        self._proc: subprocess.Popen | None = None
        # Timer that kills the worker after _IDLE_TIMEOUT of inactivity.
        self._idle_timer: threading.Timer | None = None
        # Timestamp of the most recent successful request (informational only).
        self._last_used: float = 0.0

    @classmethod
    def shared(cls) -> WorkerPool:
        """Get or create the singleton worker pool."""
        # Double-checked locking: the fast path skips the lock once created.
        if cls._instance is None:
            with cls._instance_lock:
                if cls._instance is None:
                    cls._instance = cls()
        return cls._instance

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def recall(self, query: str, limit: int = 10) -> dict:
        """Run recall in worker subprocess. Returns result dict."""
        return self._send({"cmd": "recall", "query": query, "limit": limit})

    def store(self, content: str, metadata: dict | None = None) -> dict:
        """Run store in worker subprocess. Returns result dict."""
        return self._send({
            "cmd": "store", "content": content,
            "metadata": metadata or {},
        })

    def delete_memory(self, fact_id: str, agent_id: str = "system") -> dict:
        """Delete a specific memory by fact_id. Logged for audit."""
        return self._send({"cmd": "delete_memory", "fact_id": fact_id, "agent_id": agent_id})

    def update_memory(self, fact_id: str, content: str, agent_id: str = "system") -> dict:
        """Update content of a specific memory. Logged for audit."""
        return self._send({"cmd": "update_memory", "fact_id": fact_id, "content": content, "agent_id": agent_id})

    def get_memory_facts(self, memory_id: str) -> dict:
        """Get original memory text + child atomic facts."""
        return self._send({"cmd": "get_memory_facts", "memory_id": memory_id})

    def summarize(self, texts: list[str]) -> dict:
        """Generate summary from texts (heuristic in A, LLM in B/C)."""
        return self._send({"cmd": "summarize", "texts": texts})

    def synthesize(self, query: str, facts: list[dict]) -> dict:
        """Generate synthesized answer from query + facts."""
        return self._send({"cmd": "synthesize", "query": query, "facts": facts})

    def status(self) -> dict:
        """Get engine status from worker."""
        return self._send({"cmd": "status"})

    def shutdown(self) -> None:
        """Gracefully kill the worker."""
        with self._lock:
            self._kill()

    @property
    def worker_pid(self) -> int | None:
        """PID of the worker process, or None if not running.

        NOTE(review): reads self._proc without holding self._lock, so the
        value can be stale under a concurrent shutdown — acceptable for
        informational use only.
        """
        if self._proc and self._proc.poll() is None:
            return self._proc.pid
        return None

    # ------------------------------------------------------------------
    # Internals
    # ------------------------------------------------------------------

    def _send(self, request: dict) -> dict:
        """Send request to worker and get response. Thread-safe.

        Any failure (timeout, dead worker, broken pipe, bad JSON) kills the
        worker so the next call starts from a clean process, and returns an
        {"ok": False, ...} dict instead of raising.
        """
        with self._lock:
            self._ensure_worker()
            if self._proc is None:
                return {"ok": False, "error": "Worker failed to start"}

            req_line = json.dumps(request) + "\n"
            try:
                self._proc.stdin.write(req_line)
                self._proc.stdin.flush()

                # Wait for readable stdout with a hard deadline so a hung
                # worker can never block the caller forever.
                # NOTE(review): selecting on a pipe works on POSIX; Windows
                # select() only supports sockets — confirm platform support.
                import selectors
                sel = selectors.DefaultSelector()
                sel.register(self._proc.stdout, selectors.EVENT_READ)
                ready = sel.select(timeout=_REQUEST_TIMEOUT)
                sel.close()

                if not ready:
                    logger.error("Worker timed out after %ds", _REQUEST_TIMEOUT)
                    self._kill()
                    return {"ok": False, "error": "Worker timed out"}

                resp_line = self._proc.stdout.readline()
                if not resp_line:
                    # EOF on stdout means the worker process exited.
                    logger.warning("Worker returned empty, restarting")
                    self._kill()
                    return {"ok": False, "error": "Worker died"}

                self._reset_idle_timer()
                return json.loads(resp_line)

            except (BrokenPipeError, OSError, json.JSONDecodeError) as exc:
                logger.warning("Worker communication failed: %s", exc)
                self._kill()
                return {"ok": False, "error": str(exc)}

    def _ensure_worker(self) -> None:
        """Spawn worker if not running. Caller must hold self._lock."""
        if self._proc is not None and self._proc.poll() is None:
            return
        self._proc = None
        try:
            # Mirror the worker module's own CPU-forcing env so torch never
            # touches CUDA or Apple MPS in the subprocess.
            env = {
                **os.environ,
                "CUDA_VISIBLE_DEVICES": "",
                "PYTORCH_MPS_HIGH_WATERMARK_RATIO": "0.0",
                "PYTORCH_MPS_MEM_LIMIT": "0",
                "PYTORCH_ENABLE_MPS_FALLBACK": "1",
                "TOKENIZERS_PARALLELISM": "false",
                "TORCH_DEVICE": "cpu",
            }
            self._proc = subprocess.Popen(
                [sys.executable, "-m", "superlocalmemory.core.recall_worker"],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.DEVNULL,
                text=True,
                bufsize=1,  # line-buffered: matches the line-oriented protocol
                env=env,
            )
            logger.info("Recall worker spawned (PID %d)", self._proc.pid)
        except Exception as exc:
            logger.error("Failed to spawn recall worker: %s", exc)
            self._proc = None

    def _kill(self) -> None:
        """Terminate worker. ALL memory freed to OS. Caller must hold self._lock."""
        if self._idle_timer is not None:
            self._idle_timer.cancel()
            self._idle_timer = None
        if self._proc is not None:
            pid = self._proc.pid
            try:
                # Ask politely first so the worker can exit its loop cleanly...
                self._proc.stdin.write('{"cmd":"quit"}\n')
                self._proc.stdin.flush()
                self._proc.wait(timeout=3)
            except Exception:
                # ...then force-kill if it is hung or the pipes are broken.
                try:
                    self._proc.kill()
                    self._proc.wait(timeout=2)
                except Exception:
                    pass
            self._proc = None
            logger.info("Recall worker killed (PID %s)", pid)

    def _reset_idle_timer(self) -> None:
        """(Re)arm the idle timer: kill worker after _IDLE_TIMEOUT s of no requests."""
        if self._idle_timer is not None:
            self._idle_timer.cancel()
        self._idle_timer = threading.Timer(_IDLE_TIMEOUT, self._idle_kill)
        self._idle_timer.daemon = True  # never block interpreter exit
        self._idle_timer.start()
        self._last_used = time.time()

    def _idle_kill(self) -> None:
        """Called by idle timer — kill worker to free memory."""
        with self._lock:
            if self._proc is not None:
                logger.info("Idle timeout — killing recall worker")
                self._kill()
@@ -11,6 +11,15 @@ Part of Qualixar | Author: Varun Pratap Bhardwaj
11
11
 
12
12
  from __future__ import annotations
13
13
 
14
+ # CRITICAL: Set BEFORE any torch/transformers import to prevent Metal/MPS
15
+ # GPU memory reservation on Apple Silicon.
16
+ import os as _os
17
+ _os.environ.setdefault('PYTORCH_MPS_HIGH_WATERMARK_RATIO', '0.0')
18
+ _os.environ.setdefault('PYTORCH_MPS_MEM_LIMIT', '0')
19
+ _os.environ.setdefault('PYTORCH_ENABLE_MPS_FALLBACK', '1')
20
+ _os.environ.setdefault('TOKENIZERS_PARALLELISM', 'false')
21
+ _os.environ.setdefault('TORCH_DEVICE', 'cpu')
22
+
14
23
  import logging
15
24
  import sys
16
25