superlocalmemory 3.3.1 → 3.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ide/configs/antigravity-mcp.json +2 -1
- package/ide/configs/claude-desktop-mcp.json +2 -1
- package/ide/configs/cursor-mcp.json +2 -1
- package/ide/configs/gemini-cli-mcp.json +2 -1
- package/ide/configs/jetbrains-mcp.json +2 -1
- package/ide/configs/perplexity-mcp.json +2 -1
- package/ide/configs/windsurf-mcp.json +2 -1
- package/package.json +1 -1
- package/pyproject.toml +6 -3
- package/scripts/postinstall.js +16 -9
- package/src/superlocalmemory/cli/commands.py +44 -15
- package/src/superlocalmemory/core/config.py +18 -6
- package/src/superlocalmemory/core/embedding_worker.py +14 -1
- package/src/superlocalmemory/core/embeddings.py +12 -1
- package/src/superlocalmemory/core/engine_wiring.py +4 -1
- package/src/superlocalmemory/core/modes.py +2 -1
- package/src/superlocalmemory/core/recall_worker.py +11 -5
- package/src/superlocalmemory/core/worker_pool.py +13 -2
- package/src/superlocalmemory/hooks/ide_connector.py +1 -0
- package/src/superlocalmemory/retrieval/reranker.py +125 -24
|
@@ -5,7 +5,8 @@
|
|
|
5
5
|
"args": [
|
|
6
6
|
"mcp"
|
|
7
7
|
],
|
|
8
|
-
"description": "SuperLocalMemory V3 - Local memory system with mathematical foundations and knowledge graphs"
|
|
8
|
+
"description": "SuperLocalMemory V3 - Local memory system with mathematical foundations and knowledge graphs",
|
|
9
|
+
"type": "stdio"
|
|
9
10
|
}
|
|
10
11
|
}
|
|
11
12
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "superlocalmemory",
|
|
3
|
-
"version": "3.3.1",
|
|
3
|
+
"version": "3.3.2",
|
|
4
4
|
"description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"ai-memory",
|
package/pyproject.toml
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "superlocalmemory"
|
|
3
|
-
version = "3.3.1"
|
|
3
|
+
version = "3.3.2"
|
|
4
4
|
description = "Information-geometric agent memory with mathematical guarantees"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = {text = "MIT"}
|
|
7
|
-
requires-python = ">=3.11"
|
|
7
|
+
requires-python = ">=3.11,<3.15"
|
|
8
8
|
authors = [
|
|
9
9
|
{name = "Varun Pratap Bhardwaj", email = "admin@superlocalmemory.com"},
|
|
10
10
|
]
|
|
@@ -48,11 +48,13 @@ dependencies = [
|
|
|
48
48
|
|
|
49
49
|
[project.optional-dependencies]
|
|
50
50
|
search = [
|
|
51
|
-
"sentence-transformers>=
|
|
51
|
+
"sentence-transformers>=4.0.0",
|
|
52
|
+
"sentence-transformers[onnx]>=4.0.0",
|
|
52
53
|
"einops>=0.8.2",
|
|
53
54
|
"torch>=2.2.0",
|
|
54
55
|
"scikit-learn>=1.3.0,<2.0.0",
|
|
55
56
|
"geoopt>=0.5.0",
|
|
57
|
+
"onnxruntime>=1.17.0",
|
|
56
58
|
]
|
|
57
59
|
ui = [
|
|
58
60
|
"fastapi[all]>=0.135.1",
|
|
@@ -72,6 +74,7 @@ full = [
|
|
|
72
74
|
dev = [
|
|
73
75
|
"pytest>=8.0",
|
|
74
76
|
"pytest-cov>=4.1",
|
|
77
|
+
"sqlite-vec>=0.1.6",
|
|
75
78
|
]
|
|
76
79
|
|
|
77
80
|
[project.urls]
|
package/scripts/postinstall.js
CHANGED
|
@@ -112,20 +112,27 @@ if (pipInstall(coreDeps, 'core')) {
|
|
|
112
112
|
console.log(' Run manually: pip install ' + coreDeps.join(' '));
|
|
113
113
|
}
|
|
114
114
|
|
|
115
|
-
// Search
|
|
116
|
-
const searchDeps = [
|
|
115
|
+
// Search + ONNX reranking (V3.3.2 — enables 6-channel retrieval + cross-encoder)
|
|
116
|
+
const searchDeps = [
|
|
117
|
+
'sentence-transformers[onnx]>=4.0.0',
|
|
118
|
+
'einops>=0.7.0', 'geoopt>=0.5.0',
|
|
119
|
+
'onnxruntime>=1.17.0',
|
|
120
|
+
];
|
|
117
121
|
|
|
118
|
-
console.log('\nInstalling semantic search
|
|
122
|
+
console.log('\nInstalling semantic search + ONNX reranking engine...');
|
|
123
|
+
console.log(' (sentence-transformers 4+, ONNX Runtime, Fisher-Rao geometry)');
|
|
119
124
|
if (pipInstall(searchDeps, 'search')) {
|
|
120
|
-
console.log('✓
|
|
125
|
+
console.log('✓ Search engine installed (sentence-transformers + ONNX + Fisher-Rao)');
|
|
126
|
+
console.log(' Cross-encoder reranking enabled for ALL modes (+30pp quality)');
|
|
121
127
|
console.log('');
|
|
122
|
-
console.log('
|
|
123
|
-
console.log('
|
|
128
|
+
console.log(' Models auto-download on first use:');
|
|
129
|
+
console.log(' - Embedding: nomic-ai/nomic-embed-text-v1.5 (~500MB)');
|
|
130
|
+
console.log(' - Reranker: cross-encoder/ms-marco-MiniLM-L-6-v2 (~90MB)');
|
|
124
131
|
console.log(' To pre-download now, run: slm warmup');
|
|
125
132
|
} else {
|
|
126
|
-
console.log('⚠
|
|
127
|
-
console.log(' For full
|
|
128
|
-
console.log(' pip install sentence-transformers einops geoopt');
|
|
133
|
+
console.log('⚠ Search engine installation failed (BM25 keyword search still works).');
|
|
134
|
+
console.log(' For full 6-channel retrieval + reranking, run:');
|
|
135
|
+
console.log(' pip install "sentence-transformers[onnx]>=4.0.0" einops geoopt onnxruntime');
|
|
129
136
|
}
|
|
130
137
|
|
|
131
138
|
// Dashboard dependencies (IMPORTANT — enables web dashboard + MCP server)
|
|
@@ -993,35 +993,64 @@ def cmd_dashboard(args: Namespace) -> None:
|
|
|
993
993
|
print("Or install manually: pip install 'fastapi[all]' uvicorn")
|
|
994
994
|
sys.exit(1)
|
|
995
995
|
|
|
996
|
+
import os
|
|
997
|
+
import signal
|
|
996
998
|
import socket
|
|
997
999
|
|
|
998
1000
|
port = getattr(args, "port", 8765)
|
|
999
1001
|
|
|
1000
|
-
def
|
|
1001
|
-
|
|
1002
|
-
try:
|
|
1003
|
-
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
|
1004
|
-
s.bind(("127.0.0.1", p))
|
|
1005
|
-
return p
|
|
1006
|
-
except OSError:
|
|
1007
|
-
continue
|
|
1008
|
-
return preferred
|
|
1002
|
+
def _kill_existing_on_port(target_port: int) -> None:
|
|
1003
|
+
"""Kill any existing SLM dashboard on the target port.
|
|
1009
1004
|
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1005
|
+
V3.3.2: ONE port, no auto-increment. If port is busy with
|
|
1006
|
+
another SLM instance, kill it. If busy with a non-SLM process,
|
|
1007
|
+
warn and exit — never silently shift to a different port.
|
|
1008
|
+
"""
|
|
1009
|
+
if sys.platform == "win32":
|
|
1010
|
+
return # Windows: user must close manually
|
|
1011
|
+
try:
|
|
1012
|
+
import subprocess
|
|
1013
|
+
result = subprocess.run(
|
|
1014
|
+
["lsof", "-ti", f":{target_port}"],
|
|
1015
|
+
capture_output=True, text=True, timeout=5,
|
|
1016
|
+
)
|
|
1017
|
+
if result.returncode == 0 and result.stdout.strip():
|
|
1018
|
+
pids = result.stdout.strip().split("\n")
|
|
1019
|
+
for pid_str in pids:
|
|
1020
|
+
pid = int(pid_str.strip())
|
|
1021
|
+
if pid == os.getpid():
|
|
1022
|
+
continue
|
|
1023
|
+
# Check if it's an SLM/Python process
|
|
1024
|
+
ps_result = subprocess.run(
|
|
1025
|
+
["ps", "-p", str(pid), "-o", "command="],
|
|
1026
|
+
capture_output=True, text=True, timeout=5,
|
|
1027
|
+
)
|
|
1028
|
+
cmd = ps_result.stdout.strip().lower()
|
|
1029
|
+
if "superlocalmemory" in cmd or "slm" in cmd or "uvicorn" in cmd:
|
|
1030
|
+
os.kill(pid, signal.SIGTERM)
|
|
1031
|
+
print(f" Stopped previous dashboard (PID {pid})")
|
|
1032
|
+
import time
|
|
1033
|
+
time.sleep(1)
|
|
1034
|
+
except Exception:
|
|
1035
|
+
pass # Best-effort
|
|
1036
|
+
|
|
1037
|
+
_kill_existing_on_port(port)
|
|
1038
|
+
|
|
1039
|
+
# Brief wait for port to fully release after killing old process
|
|
1040
|
+
import time
|
|
1041
|
+
time.sleep(1)
|
|
1013
1042
|
|
|
1014
1043
|
print("=" * 60)
|
|
1015
1044
|
print(" SuperLocalMemory V3 — Web Dashboard")
|
|
1016
1045
|
print("=" * 60)
|
|
1017
|
-
print(f" Dashboard: http://localhost:{
|
|
1018
|
-
print(f" API Docs: http://localhost:{
|
|
1046
|
+
print(f" Dashboard: http://localhost:{port}")
|
|
1047
|
+
print(f" API Docs: http://localhost:{port}/api/docs")
|
|
1019
1048
|
print(" Press Ctrl+C to stop\n")
|
|
1020
1049
|
|
|
1021
1050
|
from superlocalmemory.server.ui import create_app
|
|
1022
1051
|
|
|
1023
1052
|
app = create_app()
|
|
1024
|
-
uvicorn.run(app, host="127.0.0.1", port=
|
|
1053
|
+
uvicorn.run(app, host="127.0.0.1", port=port, log_level="info")
|
|
1025
1054
|
|
|
1026
1055
|
|
|
1027
1056
|
# -- Profiles (supports --json) -------------------------------------------
|
|
@@ -152,9 +152,10 @@ class RetrievalConfig:
|
|
|
152
152
|
entity_graph_max_hops: int = 3
|
|
153
153
|
temporal_proximity_days: int = 30
|
|
154
154
|
|
|
155
|
-
# Reranking
|
|
155
|
+
# Reranking (V3.3.2: ONNX backend enabled for all modes)
|
|
156
156
|
use_cross_encoder: bool = True
|
|
157
|
-
cross_encoder_model: str = "
|
|
157
|
+
cross_encoder_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
|
158
|
+
cross_encoder_backend: str = "onnx" # "onnx" (~200MB) or "" (PyTorch, ~1.5GB)
|
|
158
159
|
|
|
159
160
|
# Agentic (Mode C only)
|
|
160
161
|
agentic_max_rounds: int = 3
|
|
@@ -611,6 +612,15 @@ class SLMConfig:
|
|
|
611
612
|
|
|
612
613
|
rt = data.get("retrieval", {})
|
|
613
614
|
if rt:
|
|
615
|
+
# V3.3.2 migration: auto-enable ONNX cross-encoder.
|
|
616
|
+
# Pre-3.3.2 configs had use_cross_encoder=False because the
|
|
617
|
+
# PyTorch cross-encoder used ~1.5GB RAM. With ONNX backend
|
|
618
|
+
# (~200MB), it's now safe for all modes. Detect old configs
|
|
619
|
+
# by the absence of cross_encoder_backend field.
|
|
620
|
+
if "cross_encoder_backend" not in rt:
|
|
621
|
+
rt["use_cross_encoder"] = True
|
|
622
|
+
rt["cross_encoder_model"] = "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
|
623
|
+
rt["cross_encoder_backend"] = "onnx"
|
|
614
624
|
config.retrieval = RetrievalConfig(**{
|
|
615
625
|
k: v for k, v in rt.items()
|
|
616
626
|
if k in RetrievalConfig.__dataclass_fields__
|
|
@@ -650,6 +660,8 @@ class SLMConfig:
|
|
|
650
660
|
},
|
|
651
661
|
"retrieval": {
|
|
652
662
|
"use_cross_encoder": self.retrieval.use_cross_encoder,
|
|
663
|
+
"cross_encoder_model": self.retrieval.cross_encoder_model,
|
|
664
|
+
"cross_encoder_backend": self.retrieval.cross_encoder_backend,
|
|
653
665
|
},
|
|
654
666
|
}
|
|
655
667
|
|
|
@@ -725,8 +737,8 @@ class SLMConfig:
|
|
|
725
737
|
),
|
|
726
738
|
llm=LLMConfig(), # No LLM
|
|
727
739
|
retrieval=RetrievalConfig(
|
|
728
|
-
#
|
|
729
|
-
use_cross_encoder=
|
|
740
|
+
# V3.3.2: ONNX cross-encoder enabled for all modes (~200MB)
|
|
741
|
+
use_cross_encoder=True,
|
|
730
742
|
),
|
|
731
743
|
math=MathConfig(
|
|
732
744
|
sheaf_contradiction_threshold=0.45, # 768d threshold
|
|
@@ -750,8 +762,8 @@ class SLMConfig:
|
|
|
750
762
|
api_key=llm_api_key or "",
|
|
751
763
|
),
|
|
752
764
|
retrieval=RetrievalConfig(
|
|
753
|
-
#
|
|
754
|
-
use_cross_encoder=
|
|
765
|
+
# V3.3.2: ONNX cross-encoder enabled for all modes (~200MB)
|
|
766
|
+
use_cross_encoder=True,
|
|
755
767
|
),
|
|
756
768
|
)
|
|
757
769
|
|
|
@@ -23,6 +23,7 @@ Part of Qualixar | Author: Varun Pratap Bhardwaj
|
|
|
23
23
|
from __future__ import annotations
|
|
24
24
|
|
|
25
25
|
import json
|
|
26
|
+
import signal
|
|
26
27
|
import sys
|
|
27
28
|
import os
|
|
28
29
|
|
|
@@ -34,6 +35,11 @@ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
|
|
|
34
35
|
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
35
36
|
os.environ["TORCH_DEVICE"] = "cpu"
|
|
36
37
|
|
|
38
|
+
# SIGTERM bridge: Docker/systemd send SIGTERM to stop processes.
|
|
39
|
+
# Without this, the worker ignores SIGTERM and becomes a zombie.
|
|
40
|
+
if sys.platform != "win32":
|
|
41
|
+
signal.signal(signal.SIGTERM, lambda *_: sys.exit(0))
|
|
42
|
+
|
|
37
43
|
|
|
38
44
|
def _worker_main() -> None:
|
|
39
45
|
"""Main loop: read JSON requests from stdin, write responses to stdout."""
|
|
@@ -97,7 +103,14 @@ def _worker_main() -> None:
|
|
|
97
103
|
_respond({"ok": False, "error": f"Model load failed: {exc}"})
|
|
98
104
|
continue
|
|
99
105
|
try:
|
|
100
|
-
|
|
106
|
+
# torch.inference_mode prevents autograd graph accumulation
|
|
107
|
+
# which causes silent memory leaks over long-running sessions.
|
|
108
|
+
try:
|
|
109
|
+
import torch
|
|
110
|
+
with torch.inference_mode():
|
|
111
|
+
vecs = model.encode(texts, normalize_embeddings=True)
|
|
112
|
+
except ImportError:
|
|
113
|
+
vecs = model.encode(texts, normalize_embeddings=True)
|
|
101
114
|
if isinstance(vecs, np.ndarray) and vecs.ndim == 2:
|
|
102
115
|
result = [vecs[i].tolist() for i in range(vecs.shape[0])]
|
|
103
116
|
else:
|
|
@@ -45,7 +45,8 @@ class DimensionMismatchError(RuntimeError):
|
|
|
45
45
|
|
|
46
46
|
|
|
47
47
|
_IDLE_TIMEOUT_SECONDS = 120 # 2 minutes — kill worker after idle
|
|
48
|
-
_SUBPROCESS_RESPONSE_TIMEOUT =
|
|
48
|
+
_SUBPROCESS_RESPONSE_TIMEOUT = 120 # V3.3.2: 120s for ONNX cold start
|
|
49
|
+
_WORKER_RECYCLE_AFTER = 1000 # Recycle worker after N requests (C++ fragmentation prevention)
|
|
49
50
|
|
|
50
51
|
|
|
51
52
|
class EmbeddingService:
|
|
@@ -66,6 +67,7 @@ class EmbeddingService:
|
|
|
66
67
|
self._last_used: float = 0.0
|
|
67
68
|
self._idle_timer: threading.Timer | None = None
|
|
68
69
|
self._worker_ready = False
|
|
70
|
+
self._request_count: int = 0
|
|
69
71
|
|
|
70
72
|
@property
|
|
71
73
|
def is_available(self) -> bool:
|
|
@@ -144,6 +146,13 @@ class EmbeddingService:
|
|
|
144
146
|
never hangs indefinitely on cold model loads or network issues.
|
|
145
147
|
"""
|
|
146
148
|
with self._lock:
|
|
149
|
+
# Worker recycling: restart after N requests to prevent
|
|
150
|
+
# C++ allocator fragmentation over long-running sessions.
|
|
151
|
+
if self._request_count >= _WORKER_RECYCLE_AFTER and self._worker_proc is not None:
|
|
152
|
+
logger.info("Recycling embedding worker after %d requests", self._request_count)
|
|
153
|
+
self._kill_worker()
|
|
154
|
+
self._request_count = 0
|
|
155
|
+
|
|
147
156
|
self._ensure_worker()
|
|
148
157
|
if self._worker_proc is None:
|
|
149
158
|
return None
|
|
@@ -176,6 +185,7 @@ class EmbeddingService:
|
|
|
176
185
|
logger.warning("Worker error: %s", resp.get("error"))
|
|
177
186
|
return None
|
|
178
187
|
self._reset_idle_timer()
|
|
188
|
+
self._request_count += 1
|
|
179
189
|
return resp["vectors"]
|
|
180
190
|
except (BrokenPipeError, OSError, json.JSONDecodeError) as exc:
|
|
181
191
|
logger.warning(
|
|
@@ -235,6 +245,7 @@ class EmbeddingService:
|
|
|
235
245
|
text=True,
|
|
236
246
|
bufsize=1,
|
|
237
247
|
env=env,
|
|
248
|
+
start_new_session=True, # Prevent terminal signals bleeding to worker
|
|
238
249
|
)
|
|
239
250
|
logger.info("Embedding worker spawned (PID %d)", self._worker_proc.pid)
|
|
240
251
|
self._worker_ready = True
|
|
@@ -437,7 +437,10 @@ def init_retrieval(
|
|
|
437
437
|
|
|
438
438
|
reranker = None
|
|
439
439
|
if config.retrieval.use_cross_encoder:
|
|
440
|
-
reranker = CrossEncoderReranker(
|
|
440
|
+
reranker = CrossEncoderReranker(
|
|
441
|
+
config.retrieval.cross_encoder_model,
|
|
442
|
+
backend=config.retrieval.cross_encoder_backend,
|
|
443
|
+
)
|
|
441
444
|
|
|
442
445
|
profile_ch = ProfileChannel(db)
|
|
443
446
|
bridge = BridgeDiscovery(db)
|
|
@@ -68,7 +68,7 @@ MODE_A = ModeCapabilities(
|
|
|
68
68
|
description=(
|
|
69
69
|
"Local Guardian — Zero LLM, zero cloud. "
|
|
70
70
|
"Uses nomic-embed-text-v1.5 encoder (768d, 8K context) for embeddings. "
|
|
71
|
-
"spaCy + rules for extraction.
|
|
71
|
+
"spaCy + rules for extraction. ONNX cross-encoder reranking (~200MB). "
|
|
72
72
|
"Full EU AI Act compliance. Target: 65%+"
|
|
73
73
|
),
|
|
74
74
|
)
|
|
@@ -89,6 +89,7 @@ MODE_B = ModeCapabilities(
|
|
|
89
89
|
description=(
|
|
90
90
|
"Smart Local — Local Ollama LLM (Phi-3, Llama 3.2). "
|
|
91
91
|
"LLM-quality extraction and classification, fully local. "
|
|
92
|
+
"ONNX cross-encoder reranking (~200MB). "
|
|
92
93
|
"No cloud, no data export. EU AI Act compliant. Target: 75-80%"
|
|
93
94
|
),
|
|
94
95
|
)
|
|
@@ -18,6 +18,7 @@ from __future__ import annotations
|
|
|
18
18
|
|
|
19
19
|
import json
|
|
20
20
|
import os
|
|
21
|
+
import signal
|
|
21
22
|
import sys
|
|
22
23
|
|
|
23
24
|
# Force CPU BEFORE any torch import
|
|
@@ -28,6 +29,11 @@ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
|
|
|
28
29
|
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
29
30
|
os.environ["TORCH_DEVICE"] = "cpu"
|
|
30
31
|
|
|
32
|
+
# SIGTERM bridge: Docker/systemd send SIGTERM to stop processes.
|
|
33
|
+
# Without this, the worker ignores SIGTERM and becomes a zombie.
|
|
34
|
+
if sys.platform != "win32":
|
|
35
|
+
signal.signal(signal.SIGTERM, lambda *_: sys.exit(0))
|
|
36
|
+
|
|
31
37
|
_engine = None
|
|
32
38
|
|
|
33
39
|
|
|
@@ -223,14 +229,14 @@ def _worker_main() -> None:
|
|
|
223
229
|
continue
|
|
224
230
|
|
|
225
231
|
if cmd == "warmup":
|
|
226
|
-
# Pre-load engine +
|
|
227
|
-
#
|
|
228
|
-
#
|
|
232
|
+
# Pre-load engine + database + embeddings only.
|
|
233
|
+
# V3.3.2: Do NOT run a dummy recall — it triggers the ONNX
|
|
234
|
+
# cross-encoder export (~30s) which combined with engine init
|
|
235
|
+
# exceeds the worker timeout. The cross-encoder loads lazily
|
|
236
|
+
# in a background thread on the first real recall instead.
|
|
229
237
|
try:
|
|
230
238
|
engine = _get_engine()
|
|
231
239
|
fact_count = engine._db.get_fact_count(engine._profile_id) if engine._db else 0
|
|
232
|
-
if fact_count > 0:
|
|
233
|
-
engine.recall("warmup", limit=1)
|
|
234
240
|
_respond({"ok": True, "message": "Engine warm", "facts": fact_count})
|
|
235
241
|
except Exception as exc:
|
|
236
242
|
_respond({"ok": False, "error": f"Warmup failed: {exc}"})
|
|
@@ -29,8 +29,9 @@ import time
|
|
|
29
29
|
logger = logging.getLogger(__name__)
|
|
30
30
|
|
|
31
31
|
_IDLE_TIMEOUT = 120 # 2 min — kill worker after idle
|
|
32
|
-
_REQUEST_TIMEOUT =
|
|
33
|
-
_WARMUP_TIMEOUT =
|
|
32
|
+
_REQUEST_TIMEOUT = 120 # 120 sec per request (V3.3.2: ONNX cold start can take 30-60s)
|
|
33
|
+
_WARMUP_TIMEOUT = 180 # 3 min — first cold start: engine + ONNX export + models
|
|
34
|
+
_WORKER_RECYCLE_AFTER = 1000 # Recycle worker after N requests (C++ fragmentation prevention)
|
|
34
35
|
|
|
35
36
|
|
|
36
37
|
class WorkerPool:
|
|
@@ -49,6 +50,7 @@ class WorkerPool:
|
|
|
49
50
|
self._proc: subprocess.Popen | None = None
|
|
50
51
|
self._idle_timer: threading.Timer | None = None
|
|
51
52
|
self._last_used: float = 0.0
|
|
53
|
+
self._request_count: int = 0
|
|
52
54
|
|
|
53
55
|
@classmethod
|
|
54
56
|
def shared(cls) -> WorkerPool:
|
|
@@ -146,6 +148,13 @@ class WorkerPool:
|
|
|
146
148
|
def _send_with_timeout(self, request: dict, timeout: float) -> dict:
|
|
147
149
|
"""Send request with configurable timeout. Thread-safe."""
|
|
148
150
|
with self._lock:
|
|
151
|
+
# Worker recycling: restart after N requests to prevent
|
|
152
|
+
# C++ allocator fragmentation over long-running sessions.
|
|
153
|
+
if self._request_count >= _WORKER_RECYCLE_AFTER and self._proc is not None:
|
|
154
|
+
logger.info("Recycling recall worker after %d requests", self._request_count)
|
|
155
|
+
self._kill()
|
|
156
|
+
self._request_count = 0
|
|
157
|
+
|
|
149
158
|
self._ensure_worker()
|
|
150
159
|
if self._proc is None:
|
|
151
160
|
return {"ok": False, "error": "Worker failed to start"}
|
|
@@ -168,6 +177,7 @@ class WorkerPool:
|
|
|
168
177
|
return {"ok": False, "error": "Worker died"}
|
|
169
178
|
|
|
170
179
|
self._reset_idle_timer()
|
|
180
|
+
self._request_count += 1
|
|
171
181
|
return json.loads(resp_line)
|
|
172
182
|
|
|
173
183
|
except (BrokenPipeError, OSError, json.JSONDecodeError) as exc:
|
|
@@ -227,6 +237,7 @@ class WorkerPool:
|
|
|
227
237
|
text=True,
|
|
228
238
|
bufsize=1,
|
|
229
239
|
env=env,
|
|
240
|
+
start_new_session=True, # Prevent terminal signals bleeding to worker
|
|
230
241
|
)
|
|
231
242
|
logger.info("Recall worker spawned (PID %d)", self._proc.pid)
|
|
232
243
|
except Exception as exc:
|
|
@@ -14,6 +14,9 @@ License: MIT
|
|
|
14
14
|
from __future__ import annotations
|
|
15
15
|
|
|
16
16
|
import logging
|
|
17
|
+
import platform
|
|
18
|
+
import struct
|
|
19
|
+
import sys
|
|
17
20
|
import threading
|
|
18
21
|
from typing import Any
|
|
19
22
|
|
|
@@ -22,56 +25,151 @@ from superlocalmemory.storage.models import AtomicFact
|
|
|
22
25
|
logger = logging.getLogger(__name__)
|
|
23
26
|
|
|
24
27
|
|
|
28
|
+
def _detect_onnx_variant() -> str:
|
|
29
|
+
"""Auto-detect the best ONNX model variant for the current platform.
|
|
30
|
+
|
|
31
|
+
Returns the file_name parameter for CrossEncoder model_kwargs.
|
|
32
|
+
Platform detection:
|
|
33
|
+
- macOS ARM64 (Apple Silicon): qint8_arm64
|
|
34
|
+
- x86_64 with AVX2: quint8_avx2
|
|
35
|
+
- Everything else: default model.onnx (float32, works everywhere)
|
|
36
|
+
"""
|
|
37
|
+
arch = platform.machine().lower()
|
|
38
|
+
is_64bit = struct.calcsize("P") * 8 == 64
|
|
39
|
+
|
|
40
|
+
if sys.platform == "darwin" and arch in ("arm64", "aarch64"):
|
|
41
|
+
return "onnx/model_qint8_arm64.onnx"
|
|
42
|
+
|
|
43
|
+
if arch in ("x86_64", "amd64") and is_64bit:
|
|
44
|
+
return "onnx/model_quint8_avx2.onnx"
|
|
45
|
+
|
|
46
|
+
return "onnx/model.onnx"
|
|
47
|
+
|
|
48
|
+
|
|
25
49
|
class CrossEncoderReranker:
|
|
26
50
|
"""Rerank candidate facts using a local cross-encoder model.
|
|
27
51
|
|
|
52
|
+
V3.3.2: Uses ONNX backend by default (~200MB) instead of full PyTorch
|
|
53
|
+
(~1.5GB). Three-tier fallback: ONNX → PyTorch → no reranking.
|
|
54
|
+
Auto-detects the optimal quantized ONNX variant per platform.
|
|
55
|
+
|
|
28
56
|
When the model is unavailable (missing package, download failure,
|
|
29
57
|
offline environment), falls back to returning candidates in their
|
|
30
58
|
original score order — never crashes.
|
|
31
59
|
|
|
32
60
|
Args:
|
|
33
61
|
model_name: HuggingFace cross-encoder model identifier.
|
|
62
|
+
backend: Inference backend. "onnx" for ONNX Runtime (light),
|
|
63
|
+
"" for PyTorch (heavy). Default: "onnx".
|
|
34
64
|
"""
|
|
35
65
|
|
|
36
66
|
def __init__(
|
|
37
67
|
self,
|
|
38
|
-
model_name: str = "
|
|
68
|
+
model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2",
|
|
69
|
+
backend: str = "onnx",
|
|
39
70
|
) -> None:
|
|
40
71
|
self._model_name = model_name
|
|
72
|
+
self._backend = backend
|
|
41
73
|
self._model: Any = None
|
|
42
74
|
self._loaded = False
|
|
75
|
+
self._loading = False # True while background load is in progress
|
|
76
|
+
self._active_backend: str = ""
|
|
43
77
|
self._lock = threading.Lock()
|
|
44
78
|
|
|
45
79
|
# ------------------------------------------------------------------
|
|
46
|
-
# Lazy loading
|
|
80
|
+
# Lazy loading (non-blocking)
|
|
47
81
|
# ------------------------------------------------------------------
|
|
48
82
|
|
|
49
83
|
def _ensure_model(self) -> None:
|
|
50
|
-
"""
|
|
84
|
+
"""Trigger model load in background (non-blocking).
|
|
85
|
+
|
|
86
|
+
On first call, starts loading in a background thread and returns
|
|
87
|
+
immediately. The model becomes available for subsequent calls
|
|
88
|
+
once loading completes. This prevents the 30s ONNX cold start
|
|
89
|
+
from blocking the first recall request.
|
|
90
|
+
|
|
91
|
+
Three-tier fallback:
|
|
92
|
+
1. ONNX backend with platform-optimal quantization — ~100-200MB RAM
|
|
93
|
+
2. PyTorch backend (requires torch) — ~1.5GB RAM
|
|
94
|
+
3. No model (graceful degradation) — 0 RAM
|
|
95
|
+
"""
|
|
51
96
|
if self._loaded:
|
|
52
97
|
return
|
|
53
98
|
|
|
54
99
|
with self._lock:
|
|
55
|
-
if self._loaded:
|
|
56
|
-
return
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
100
|
+
if self._loaded or self._loading:
|
|
101
|
+
return
|
|
102
|
+
self._loading = True
|
|
103
|
+
|
|
104
|
+
# Load in background thread so first recall isn't blocked
|
|
105
|
+
loader = threading.Thread(
|
|
106
|
+
target=self._load_model, daemon=True, name="ce-loader",
|
|
107
|
+
)
|
|
108
|
+
loader.start()
|
|
109
|
+
|
|
110
|
+
def _load_model(self) -> None:
|
|
111
|
+
"""Actually load the model (runs in background thread)."""
|
|
112
|
+
try:
|
|
113
|
+
from sentence_transformers import CrossEncoder
|
|
114
|
+
|
|
115
|
+
if self._backend == "onnx":
|
|
116
|
+
try:
|
|
117
|
+
onnx_file = _detect_onnx_variant()
|
|
118
|
+
model = CrossEncoder(
|
|
119
|
+
self._model_name,
|
|
120
|
+
backend="onnx",
|
|
121
|
+
model_kwargs={"file_name": onnx_file},
|
|
122
|
+
)
|
|
123
|
+
self._model = model
|
|
124
|
+
self._active_backend = "onnx"
|
|
125
|
+
logger.info(
|
|
126
|
+
"Cross-encoder loaded (ONNX %s): %s",
|
|
127
|
+
onnx_file, self._model_name,
|
|
128
|
+
)
|
|
129
|
+
except Exception as onnx_exc:
|
|
130
|
+
logger.info(
|
|
131
|
+
"ONNX backend unavailable (%s), falling back to PyTorch",
|
|
132
|
+
onnx_exc,
|
|
133
|
+
)
|
|
134
|
+
model = CrossEncoder(self._model_name)
|
|
135
|
+
self._model = model
|
|
136
|
+
self._active_backend = "pytorch"
|
|
137
|
+
logger.info(
|
|
138
|
+
"Cross-encoder loaded (PyTorch fallback): %s",
|
|
139
|
+
self._model_name,
|
|
140
|
+
)
|
|
141
|
+
else:
|
|
142
|
+
model = CrossEncoder(self._model_name)
|
|
143
|
+
self._model = model
|
|
144
|
+
self._active_backend = "pytorch"
|
|
61
145
|
logger.info("Cross-encoder loaded: %s", self._model_name)
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
146
|
+
except ImportError:
|
|
147
|
+
logger.warning(
|
|
148
|
+
"sentence-transformers not installed; "
|
|
149
|
+
"cross-encoder reranking disabled"
|
|
150
|
+
)
|
|
151
|
+
except OSError as exc:
|
|
152
|
+
logger.warning(
|
|
153
|
+
"Failed to load cross-encoder %s: %s",
|
|
154
|
+
self._model_name,
|
|
155
|
+
exc,
|
|
156
|
+
)
|
|
157
|
+
finally:
|
|
158
|
+
self._loaded = True
|
|
159
|
+
self._loading = False
|
|
160
|
+
|
|
161
|
+
def _ensure_model_blocking(self) -> None:
|
|
162
|
+
"""Load model synchronously (blocks until ready).
|
|
163
|
+
|
|
164
|
+
Used by warmup and is_available where we need the model NOW.
|
|
165
|
+
"""
|
|
166
|
+
if self._loaded:
|
|
167
|
+
return
|
|
168
|
+
with self._lock:
|
|
169
|
+
if self._loaded:
|
|
170
|
+
return
|
|
171
|
+
self._loading = True
|
|
172
|
+
self._load_model()
|
|
75
173
|
|
|
76
174
|
# ------------------------------------------------------------------
|
|
77
175
|
# Public API
|
|
@@ -104,10 +202,13 @@ class CrossEncoderReranker:
|
|
|
104
202
|
if not candidates:
|
|
105
203
|
return []
|
|
106
204
|
|
|
205
|
+
# Non-blocking: trigger background load if not yet started
|
|
107
206
|
self._ensure_model()
|
|
108
207
|
|
|
109
208
|
if self._model is None:
|
|
110
|
-
#
|
|
209
|
+
# Model not loaded yet (still loading in background or failed).
|
|
210
|
+
# Graceful fallback: return candidates sorted by existing score.
|
|
211
|
+
# Next recall will use the model once it's ready.
|
|
111
212
|
sorted_cands = sorted(
|
|
112
213
|
candidates, key=lambda x: x[1], reverse=True
|
|
113
214
|
)
|
|
@@ -150,5 +251,5 @@ class CrossEncoderReranker:
|
|
|
150
251
|
@property
|
|
151
252
|
def is_available(self) -> bool:
|
|
152
253
|
"""Whether the cross-encoder model is loaded and ready."""
|
|
153
|
-
self.
|
|
254
|
+
self._ensure_model_blocking()
|
|
154
255
|
return self._model is not None
|