superlocalmemory 3.0.17 → 3.0.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,267 @@
1
+ # Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
2
+ # Licensed under the MIT License - see LICENSE file
3
+ # Part of SuperLocalMemory V3 | https://qualixar.com | https://varunpratap.com
4
+
5
+ """Subprocess recall worker — runs the full recall pipeline in isolation.
6
+
7
+ The dashboard/MCP main process NEVER imports torch, numpy, or the engine.
8
+ All heavy work (engine init, embedding, retrieval, reranking) happens here.
9
+
10
+ Protocol (JSON over stdin/stdout):
11
+ Request: {"cmd": "recall", "query": "...", "limit": 10}
12
+ Response: {"ok": true, "results": [...], "query_type": "...", ...}
13
+
14
+ Part of Qualixar | Author: Varun Pratap Bhardwaj
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import json
20
+ import os
21
+ import sys
22
+
23
# Force CPU BEFORE any torch import — these variables are only honored if they
# are set before torch/transformers are first imported, which happens lazily
# inside _get_engine(), so setting them at module load time is sufficient.
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # hide all NVIDIA GPUs from torch
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"  # no Apple MPS memory reservation
os.environ["PYTORCH_MPS_MEM_LIMIT"] = "0"
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TORCH_DEVICE"] = "cpu"

# Lazily-initialized MemoryEngine singleton; created on first request by
# _get_engine() so importing this module stays cheap.
_engine = None
32
+
33
+
34
def _get_engine():
    """Return the lazily-created singleton MemoryEngine.

    The heavy imports (config, engine, and everything they pull in) are
    deferred to the first call so the worker starts fast and the parent
    process never pays for them.
    """
    global _engine
    if _engine is not None:
        return _engine
    from superlocalmemory.core.config import SLMConfig
    from superlocalmemory.core.engine import MemoryEngine
    _engine = MemoryEngine(SLMConfig.load())
    _engine.initialize()
    return _engine
43
+
44
+
45
def _handle_recall(query: str, limit: int) -> dict:
    """Run the recall pipeline and serialize results into plain JSON types.

    Scores are rounded to 4 decimals and fact content truncated to 300
    chars to keep the stdout protocol payload small.
    """
    engine = _get_engine()
    response = engine.recall(query, limit=limit)
    top_hits = response.results[:limit]

    # One batched DB round-trip for the original memory text of every hit.
    wanted_ids = {hit.fact.memory_id for hit in top_hits if hit.fact.memory_id}
    source_map = engine._db.get_memory_content_batch(list(wanted_ids)) if wanted_ids else {}

    serialized = [
        {
            "fact_id": hit.fact.fact_id,
            "memory_id": hit.fact.memory_id,
            "content": hit.fact.content[:300],
            "source_content": source_map.get(hit.fact.memory_id, ""),
            "score": round(hit.score, 4),
            "confidence": round(hit.confidence, 4),
            "trust_score": round(hit.trust_score, 4),
            "channel_scores": {
                name: round(value, 4)
                for name, value in (hit.channel_scores or {}).items()
            },
        }
        for hit in top_hits
    ]
    return {
        "ok": True,
        "query": query,
        "query_type": response.query_type,
        "result_count": len(serialized),
        "retrieval_time_ms": round(response.retrieval_time_ms, 1),
        "results": serialized,
    }
75
+
76
+
77
def _handle_store(content: str, metadata: dict) -> dict:
    """Store content via the engine, then best-effort attach a summary.

    The summary step is non-critical: any failure is swallowed because the
    store itself has already succeeded by that point.
    """
    engine = _get_engine()
    session_id = metadata.pop("session_id", "")
    fact_ids = engine.store(content, session_id=session_id, metadata=metadata)

    if fact_ids:
        try:
            from superlocalmemory.core.summarizer import Summarizer
            summary = Summarizer(engine._config).summarize_cluster([{"content": content}])
            if summary:
                # Resolve the memory_id that owns the first stored fact.
                rows = engine._db.execute(
                    "SELECT memory_id FROM atomic_facts WHERE fact_id = ? LIMIT 1",
                    (fact_ids[0],),
                )
                if rows:
                    engine._db.update_memory_summary(dict(rows[0])["memory_id"], summary)
        except Exception:
            pass  # Summary is non-critical

    return {"ok": True, "fact_ids": fact_ids, "count": len(fact_ids)}
101
+
102
+
103
def _handle_get_memory_facts(memory_id: str) -> dict:
    """Return the original memory text plus its extracted atomic facts."""
    engine = _get_engine()
    profile = engine.profile_id

    # Original memory content (batch API reused for a single id).
    original = engine._db.get_memory_content_batch([memory_id]).get(memory_id, "")

    fact_list = [
        {
            "fact_id": fact.fact_id,
            "content": fact.content,
            "fact_type": fact.fact_type.value if hasattr(fact.fact_type, 'value') else str(fact.fact_type),
            "confidence": round(fact.confidence, 3),
            "created_at": fact.created_at,
        }
        for fact in engine._db.get_facts_by_memory_id(memory_id, profile)
    ]
    return {
        "ok": True,
        "memory_id": memory_id,
        "original_content": original,
        "facts": fact_list,
        "fact_count": len(fact_list),
    }
127
+
128
+
129
def _handle_delete_memory(fact_id: str, agent_id: str = "system") -> dict:
    """Delete a specific atomic fact by ID with audit logging.

    Returns {"ok": False, ...} when the fact does not exist in the active
    profile; otherwise deletes it and records an audit-log entry.
    """
    engine = _get_engine()
    rows = engine._db.execute(
        "SELECT content FROM atomic_facts WHERE fact_id = ? AND profile_id = ? LIMIT 1",
        (fact_id, engine.profile_id),
    )
    if not rows:
        return {"ok": False, "error": f"Memory {fact_id} not found"}

    preview = dict(rows[0]).get("content", "")[:80]
    engine._db.delete_fact(fact_id)

    # Audit trail: who deleted what (truncated id + content preview).
    import logging as _logging
    _logging.getLogger("superlocalmemory.audit").info(
        "DELETE fact_id=%s by agent=%s content=%s", fact_id[:16], agent_id, preview,
    )
    return {"ok": True, "deleted": fact_id, "content_preview": preview}
147
+
148
+
149
def _handle_update_memory(fact_id: str, content: str, agent_id: str = "system") -> dict:
    """Update content of a specific atomic fact with audit logging.

    Returns {"ok": False, ...} when the fact does not exist in the active
    profile; otherwise rewrites its content and records an audit entry
    containing both old and new previews.
    """
    engine = _get_engine()
    rows = engine._db.execute(
        "SELECT content FROM atomic_facts WHERE fact_id = ? AND profile_id = ? LIMIT 1",
        (fact_id, engine.profile_id),
    )
    if not rows:
        return {"ok": False, "error": f"Memory {fact_id} not found"}

    previous = dict(rows[0]).get("content", "")[:80]
    engine._db.execute(
        "UPDATE atomic_facts SET content = ? WHERE fact_id = ?",
        (content, fact_id),
    )

    import logging as _logging
    _logging.getLogger("superlocalmemory.audit").info(
        "UPDATE fact_id=%s by agent=%s old=%s new=%s",
        fact_id[:16], agent_id, previous, content[:80],
    )
    return {"ok": True, "fact_id": fact_id, "content": content}
170
+
171
+
172
def _handle_summarize(texts: list[str], mode: str) -> dict:
    """Generate a summary from texts (heuristic in Mode A, LLM in B/C).

    NOTE(review): ``mode`` is accepted for protocol compatibility but is not
    used here — the Summarizer reads the mode from the engine config.
    """
    from superlocalmemory.core.summarizer import Summarizer
    engine = _get_engine()
    payload = [{"content": text} for text in texts]
    summary = Summarizer(engine._config).summarize_cluster(payload)
    return {"ok": True, "summary": summary}
181
+
182
+
183
def _handle_synthesize(query: str, facts: list[dict]) -> dict:
    """Generate a synthesized answer from the query plus retrieved facts."""
    from superlocalmemory.core.summarizer import Summarizer
    engine = _get_engine()
    answer = Summarizer(engine._config).synthesize_answer(query, facts)
    return {"ok": True, "synthesis": answer}
190
+
191
+
192
def _handle_status() -> dict:
    """Report engine mode, active profile, and stored fact count."""
    engine = _get_engine()
    profile = engine.profile_id
    return {
        "ok": True,
        "mode": engine._config.mode.value,
        "profile": profile,
        "fact_count": engine._db.get_fact_count(profile),
    }
202
+
203
+
204
def _worker_main() -> None:
    """Main loop: read JSON requests from stdin, write one JSON response per line.

    Protocol control commands ("quit", "ping") are handled inline; everything
    else routes through a dispatch table. Handler exceptions and unknown
    commands produce {"ok": false, ...} responses instead of crashing.
    """
    dispatch = {
        "recall": lambda r: _handle_recall(r.get("query", ""), r.get("limit", 10)),
        "store": lambda r: _handle_store(r.get("content", ""), r.get("metadata", {})),
        "delete_memory": lambda r: _handle_delete_memory(
            r.get("fact_id", ""), r.get("agent_id", "system"),
        ),
        "update_memory": lambda r: _handle_update_memory(
            r.get("fact_id", ""), r.get("content", ""), r.get("agent_id", "system"),
        ),
        "get_memory_facts": lambda r: _handle_get_memory_facts(r.get("memory_id", "")),
        "summarize": lambda r: _handle_summarize(r.get("texts", []), r.get("mode", "a")),
        "synthesize": lambda r: _handle_synthesize(r.get("query", ""), r.get("facts", [])),
        "status": lambda r: _handle_status(),
    }

    for raw in sys.stdin:
        raw = raw.strip()
        if not raw:
            continue
        try:
            req = json.loads(raw)
        except json.JSONDecodeError:
            _respond({"ok": False, "error": "Invalid JSON"})
            continue

        cmd = req.get("cmd", "")
        if cmd == "quit":
            break
        if cmd == "ping":
            _respond({"ok": True})
            continue

        handler = dispatch.get(cmd)
        if handler is None:
            _respond({"ok": False, "error": f"Unknown command: {cmd}"})
            continue
        try:
            _respond(handler(req))
        except Exception as exc:
            _respond({"ok": False, "error": str(exc)})
259
+
260
+
261
+ def _respond(data: dict) -> None:
262
+ sys.stdout.write(json.dumps(data) + "\n")
263
+ sys.stdout.flush()
264
+
265
+
266
+ if __name__ == "__main__":
267
+ _worker_main()
@@ -0,0 +1,182 @@
1
+ # Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
2
+ # Licensed under the MIT License - see LICENSE file
3
+ # Part of SuperLocalMemory V3 | https://qualixar.com | https://varunpratap.com
4
+
5
+ """Summarizer — Mode A heuristic + Mode B Ollama + Mode C OpenRouter.
6
+
7
+ Generates cluster summaries and search synthesis. All LLM failures
8
+ fall back to heuristic silently — never crashes the caller.
9
+
10
+ Part of Qualixar | Author: Varun Pratap Bhardwaj
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import logging
16
+ import os
17
+ import re
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
class Summarizer:
    """Generate summaries using a heuristic (Mode A) or an LLM (Modes B/C).

    All LLM failures fall back silently: cluster summaries degrade to the
    heuristic, synthesis degrades to an empty string — this class never
    raises into the caller.
    """

    def __init__(self, config) -> None:
        """Store config and normalize the mode to a plain string ('a'/'b'/'c')."""
        self._config = config
        self._mode = config.mode.value if hasattr(config.mode, 'value') else str(config.mode)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def summarize_cluster(self, members: list[dict]) -> str:
        """Generate a human-readable cluster summary.

        Args:
            members: List of dicts with 'content' key.

        Returns:
            Summary string (2-3 sentences); "Empty cluster." when no
            member has content.
        """
        texts = [m.get("content", "") for m in members if m.get("content")]
        if not texts:
            return "Empty cluster."
        if self._mode in ("b", "c") and self._has_llm():
            try:
                prompt = self._cluster_prompt(texts[:10])
                return self._call_llm(prompt, max_tokens=150)
            except Exception as exc:
                logger.warning("LLM cluster summary failed, using heuristic: %s", exc)
        return self._heuristic_summary(texts[:5])

    def synthesize_answer(self, query: str, facts: list[dict]) -> str:
        """Generate a synthesized answer from query + retrieved facts.

        Returns empty string in Mode A (no LLM available) or on any
        LLM failure.
        """
        if self._mode == "a" or not self._has_llm():
            return ""
        texts = [f.get("content", "") for f in facts if f.get("content")]
        if not texts:
            return ""
        try:
            prompt = self._synthesis_prompt(query, texts[:8])
            return self._call_llm(prompt, max_tokens=250)
        except Exception as exc:
            logger.warning("LLM synthesis failed: %s", exc)
            return ""

    # ------------------------------------------------------------------
    # Heuristic (Mode A — always available)
    # ------------------------------------------------------------------

    def _heuristic_summary(self, texts: list[str]) -> str:
        """First sentence from the top-3 texts, deduplicated, joined, capped at 300 chars."""
        sentences = []
        for text in texts[:3]:
            first = self._first_sentence(text)
            if first and first not in sentences:
                sentences.append(first)
        return " ".join(sentences)[:300] if sentences else "No summary available."

    @staticmethod
    def _first_sentence(text: str) -> str:
        """Extract the first sentence (up to ., ! or ?), else the first 100 chars.

        BUGFIX: the terminator may now be followed by whitespace OR
        end-of-string. The previous pattern required trailing whitespace,
        so a text whose only sentence ended exactly at end-of-string fell
        through to the 100-char truncation even when it had a terminator.
        """
        text = text.strip()
        match = re.match(r'^(.+?[.!?])(?:\s|$)', text)
        if match:
            return match.group(1).strip()
        return text[:100].strip()

    # ------------------------------------------------------------------
    # LLM calls (Mode B/C)
    # ------------------------------------------------------------------

    def _has_llm(self) -> bool:
        """Check if an LLM backend is available for the current mode."""
        if self._mode == "b":
            return True  # Ollama assumed running locally
        if self._mode == "c":
            # OpenRouter needs an API key from env or config.
            return bool(
                os.environ.get("OPENROUTER_API_KEY")
                or getattr(self._config.llm, 'api_key', None)
            )
        return False

    def _call_llm(self, prompt: str, max_tokens: int = 200) -> str:
        """Route to Ollama (Mode B) or OpenRouter (Mode C)."""
        if self._mode == "b":
            return self._call_ollama(prompt, max_tokens)
        return self._call_openrouter(prompt, max_tokens)

    def _call_ollama(self, prompt: str, max_tokens: int = 200) -> str:
        """Call local Ollama for summary generation.

        Raises on HTTP/connection errors; callers catch and fall back.
        """
        import httpx
        model = getattr(self._config.llm, 'model', None) or "llama3.1:8b"
        with httpx.Client(timeout=httpx.Timeout(20.0)) as client:
            resp = client.post("http://localhost:11434/api/generate", json={
                "model": model,
                "prompt": prompt,
                "stream": False,
                "options": {"num_predict": max_tokens, "temperature": 0.3},
            })
            resp.raise_for_status()
            return resp.json().get("response", "").strip()

    def _call_openrouter(self, prompt: str, max_tokens: int = 200) -> str:
        """Call the OpenRouter API for summary generation.

        Raises RuntimeError when no API key is configured; HTTP errors
        propagate to the caller's fallback handling.
        """
        import httpx
        api_key = (
            os.environ.get("OPENROUTER_API_KEY")
            or getattr(self._config.llm, 'api_key', None)
        )
        if not api_key:
            raise RuntimeError("No OpenRouter API key")
        model = (
            getattr(self._config.llm, 'model', None)
            or "meta-llama/llama-3.1-8b-instruct:free"
        )
        with httpx.Client(timeout=httpx.Timeout(20.0)) as client:
            resp = client.post(
                "https://openrouter.ai/api/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": model,
                    "messages": [{"role": "user", "content": prompt}],
                    "max_tokens": max_tokens,
                    "temperature": 0.3,
                },
            )
            resp.raise_for_status()
            choices = resp.json().get("choices", [])
            if choices:
                return choices[0].get("message", {}).get("content", "").strip()
            return ""

    # ------------------------------------------------------------------
    # Prompt templates
    # ------------------------------------------------------------------

    @staticmethod
    def _cluster_prompt(texts: list[str]) -> str:
        """Numbered-list prompt asking for a 2-3 sentence cluster summary."""
        numbered = "\n".join(f"{i+1}. {t[:200]}" for i, t in enumerate(texts))
        return (
            "Summarize the following related memories in 2-3 concise sentences. "
            "Focus on the common theme and key facts.\n\n"
            f"Memories:\n{numbered}\n\n"
            "Summary:"
        )

    @staticmethod
    def _synthesis_prompt(query: str, texts: list[str]) -> str:
        """Bulleted-evidence prompt asking for a concise answer to the query."""
        numbered = "\n".join(f"- {t[:200]}" for t in texts)
        return (
            f"Based on these stored memories, answer the question concisely.\n\n"
            f"Question: {query}\n\n"
            f"Relevant memories:\n{numbered}\n\n"
            "Answer (2-3 sentences):"
        )
@@ -0,0 +1,217 @@
1
+ # Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
2
+ # Licensed under the MIT License - see LICENSE file
3
+ # Part of SuperLocalMemory V3 | https://qualixar.com | https://varunpratap.com
4
+
5
+ """Recall worker pool — manages subprocess lifecycle for all callers.
6
+
7
+ Single shared worker process handles requests from dashboard, MCP, CLI.
8
+ Serializes concurrent requests via a threading lock (one at a time to
9
+ avoid interleaved stdout). Worker auto-kills after idle timeout.
10
+
11
+ Usage:
12
+ pool = WorkerPool.shared()
13
+ result = pool.recall("what is X?", limit=10)
14
+ result = pool.store("some content", metadata={})
15
+
16
+ Part of Qualixar | Author: Varun Pratap Bhardwaj
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import json
22
+ import logging
23
+ import os
24
+ import subprocess
25
+ import sys
26
+ import threading
27
+ import time
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
_IDLE_TIMEOUT = 120  # seconds of inactivity before the worker is killed to free memory
_REQUEST_TIMEOUT = 60  # max seconds a single request may take before the worker is presumed hung
33
+
34
+
35
class WorkerPool:
    """Manages a single recall_worker subprocess with idle auto-kill.

    Thread-safe: concurrent callers are serialized via lock.
    The worker subprocess holds all heavy memory (PyTorch, engine).
    The calling process stays at ~60 MB.
    """

    # Process-wide singleton and the lock guarding its lazy creation.
    _instance: WorkerPool | None = None
    _instance_lock = threading.Lock()

    def __init__(self) -> None:
        # Serializes all worker I/O: one in-flight request at a time so the
        # worker's stdout lines are never interleaved between callers.
        self._lock = threading.Lock()
        # Live worker subprocess, or None when not running.
        self._proc: subprocess.Popen | None = None
        # Timer that kills the worker after _IDLE_TIMEOUT of inactivity.
        self._idle_timer: threading.Timer | None = None
        # Timestamp of the most recent successful request (informational only).
        self._last_used: float = 0.0

    @classmethod
    def shared(cls) -> WorkerPool:
        """Get or create the singleton worker pool."""
        # Double-checked locking: the fast path skips the lock once created.
        if cls._instance is None:
            with cls._instance_lock:
                if cls._instance is None:
                    cls._instance = cls()
        return cls._instance

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def recall(self, query: str, limit: int = 10) -> dict:
        """Run recall in worker subprocess. Returns result dict."""
        return self._send({"cmd": "recall", "query": query, "limit": limit})

    def store(self, content: str, metadata: dict | None = None) -> dict:
        """Run store in worker subprocess. Returns result dict."""
        return self._send({
            "cmd": "store", "content": content,
            "metadata": metadata or {},
        })

    def delete_memory(self, fact_id: str, agent_id: str = "system") -> dict:
        """Delete a specific memory by fact_id. Logged for audit."""
        return self._send({"cmd": "delete_memory", "fact_id": fact_id, "agent_id": agent_id})

    def update_memory(self, fact_id: str, content: str, agent_id: str = "system") -> dict:
        """Update content of a specific memory. Logged for audit."""
        return self._send({"cmd": "update_memory", "fact_id": fact_id, "content": content, "agent_id": agent_id})

    def get_memory_facts(self, memory_id: str) -> dict:
        """Get original memory text + child atomic facts."""
        return self._send({"cmd": "get_memory_facts", "memory_id": memory_id})

    def summarize(self, texts: list[str]) -> dict:
        """Generate summary from texts (heuristic in A, LLM in B/C)."""
        return self._send({"cmd": "summarize", "texts": texts})

    def synthesize(self, query: str, facts: list[dict]) -> dict:
        """Generate synthesized answer from query + facts."""
        return self._send({"cmd": "synthesize", "query": query, "facts": facts})

    def status(self) -> dict:
        """Get engine status from worker."""
        return self._send({"cmd": "status"})

    def shutdown(self) -> None:
        """Gracefully kill the worker."""
        with self._lock:
            self._kill()

    @property
    def worker_pid(self) -> int | None:
        """PID of the worker process, or None if not running.

        NOTE(review): reads self._proc without holding self._lock, so the
        value can be stale under a concurrent shutdown — acceptable for
        informational use only.
        """
        if self._proc and self._proc.poll() is None:
            return self._proc.pid
        return None

    # ------------------------------------------------------------------
    # Internals
    # ------------------------------------------------------------------

    def _send(self, request: dict) -> dict:
        """Send request to worker and get response. Thread-safe.

        Any failure (timeout, dead worker, broken pipe, bad JSON) kills the
        worker so the next call starts from a clean process, and returns an
        {"ok": False, ...} dict instead of raising.
        """
        with self._lock:
            self._ensure_worker()
            if self._proc is None:
                return {"ok": False, "error": "Worker failed to start"}

            req_line = json.dumps(request) + "\n"
            try:
                self._proc.stdin.write(req_line)
                self._proc.stdin.flush()

                # Wait for readable stdout with a hard deadline so a hung
                # worker can never block the caller forever.
                # NOTE(review): selecting on a pipe works on POSIX; Windows
                # select() only supports sockets — confirm platform support.
                import selectors
                sel = selectors.DefaultSelector()
                sel.register(self._proc.stdout, selectors.EVENT_READ)
                ready = sel.select(timeout=_REQUEST_TIMEOUT)
                sel.close()

                if not ready:
                    logger.error("Worker timed out after %ds", _REQUEST_TIMEOUT)
                    self._kill()
                    return {"ok": False, "error": "Worker timed out"}

                resp_line = self._proc.stdout.readline()
                if not resp_line:
                    # EOF on stdout means the worker process exited.
                    logger.warning("Worker returned empty, restarting")
                    self._kill()
                    return {"ok": False, "error": "Worker died"}

                self._reset_idle_timer()
                return json.loads(resp_line)

            except (BrokenPipeError, OSError, json.JSONDecodeError) as exc:
                logger.warning("Worker communication failed: %s", exc)
                self._kill()
                return {"ok": False, "error": str(exc)}

    def _ensure_worker(self) -> None:
        """Spawn worker if not running. Caller must hold self._lock."""
        if self._proc is not None and self._proc.poll() is None:
            return
        self._proc = None
        try:
            # Mirror the worker module's own CPU-forcing env so torch never
            # touches CUDA or Apple MPS in the subprocess.
            env = {
                **os.environ,
                "CUDA_VISIBLE_DEVICES": "",
                "PYTORCH_MPS_HIGH_WATERMARK_RATIO": "0.0",
                "PYTORCH_MPS_MEM_LIMIT": "0",
                "PYTORCH_ENABLE_MPS_FALLBACK": "1",
                "TOKENIZERS_PARALLELISM": "false",
                "TORCH_DEVICE": "cpu",
            }
            self._proc = subprocess.Popen(
                [sys.executable, "-m", "superlocalmemory.core.recall_worker"],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.DEVNULL,
                text=True,
                bufsize=1,  # line-buffered: matches the line-oriented protocol
                env=env,
            )
            logger.info("Recall worker spawned (PID %d)", self._proc.pid)
        except Exception as exc:
            logger.error("Failed to spawn recall worker: %s", exc)
            self._proc = None

    def _kill(self) -> None:
        """Terminate worker. ALL memory freed to OS. Caller must hold self._lock."""
        if self._idle_timer is not None:
            self._idle_timer.cancel()
            self._idle_timer = None
        if self._proc is not None:
            pid = self._proc.pid
            try:
                # Ask politely first so the worker can exit its loop cleanly...
                self._proc.stdin.write('{"cmd":"quit"}\n')
                self._proc.stdin.flush()
                self._proc.wait(timeout=3)
            except Exception:
                # ...then force-kill if it is hung or the pipes are broken.
                try:
                    self._proc.kill()
                    self._proc.wait(timeout=2)
                except Exception:
                    pass
            self._proc = None
            logger.info("Recall worker killed (PID %s)", pid)

    def _reset_idle_timer(self) -> None:
        """(Re)arm the idle timer: kill worker after _IDLE_TIMEOUT s of no requests."""
        if self._idle_timer is not None:
            self._idle_timer.cancel()
        self._idle_timer = threading.Timer(_IDLE_TIMEOUT, self._idle_kill)
        self._idle_timer.daemon = True  # never block interpreter exit
        self._idle_timer.start()
        self._last_used = time.time()

    def _idle_kill(self) -> None:
        """Called by idle timer — kill worker to free memory."""
        with self._lock:
            if self._proc is not None:
                logger.info("Idle timeout — killing recall worker")
                self._kill()
@@ -11,6 +11,15 @@ Part of Qualixar | Author: Varun Pratap Bhardwaj
11
11
 
12
12
  from __future__ import annotations
13
13
 
14
+ # CRITICAL: Set BEFORE any torch/transformers import to prevent Metal/MPS
15
+ # GPU memory reservation on Apple Silicon.
16
+ import os as _os
17
+ _os.environ.setdefault('PYTORCH_MPS_HIGH_WATERMARK_RATIO', '0.0')
18
+ _os.environ.setdefault('PYTORCH_MPS_MEM_LIMIT', '0')
19
+ _os.environ.setdefault('PYTORCH_ENABLE_MPS_FALLBACK', '1')
20
+ _os.environ.setdefault('TOKENIZERS_PARALLELISM', 'false')
21
+ _os.environ.setdefault('TORCH_DEVICE', 'cpu')
22
+
14
23
  import logging
15
24
  import sys
16
25