superlocalmemory 3.3.6 → 3.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/package.json +1 -1
- package/pyproject.toml +1 -1
- package/src/superlocalmemory/core/config.py +2 -1
- package/src/superlocalmemory/core/embedding_worker.py +27 -1
- package/src/superlocalmemory/core/embeddings.py +39 -0
- package/src/superlocalmemory/core/recall_worker.py +26 -0
- package/src/superlocalmemory/math/polar_quant.py +57 -23
- package/src/superlocalmemory/math/turbo_quant.py +308 -0
- package/src/superlocalmemory/retrieval/reranker.py +39 -0
package/README.md
CHANGED
|
@@ -3,7 +3,8 @@
|
|
|
3
3
|
</p>
|
|
4
4
|
|
|
5
5
|
<h1 align="center">SuperLocalMemory V3.3</h1>
|
|
6
|
-
<p align="center"><strong>
|
|
6
|
+
<p align="center"><strong>Every other AI forgets. Yours won't.</strong><br/><em>Infinite memory for Claude Code, Cursor, Windsurf & 17+ AI tools.</em></p>
|
|
7
|
+
<p align="center"><code>v3.3.6</code> — Install once. Every session remembers the last. Automatically.</p>
|
|
7
8
|
|
|
8
9
|
<p align="center">
|
|
9
10
|
<code>+16pp vs Mem0 (zero cloud)</code> · <code>85% Open-Domain (best of any system)</code> · <code>EU AI Act Ready</code>
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "superlocalmemory",
|
|
3
|
-
"version": "3.3.6",
|
|
3
|
+
"version": "3.3.8",
|
|
4
4
|
"description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"ai-memory",
|
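package/pyproject.toml
CHANGED
|
[hunk not captured in this extract — file list reports +1 -1]
|
package/src/superlocalmemory/core/config.py
CHANGED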
|
@@ -310,6 +310,7 @@ class PolarQuantConfig:
|
|
|
310
310
|
dimension: int = 768
|
|
311
311
|
rotation_matrix_path: str = "" # empty = ~/.superlocalmemory/polar_rotation.npy
|
|
312
312
|
seed: int = 42 # reproducible rotation matrix
|
|
313
|
+
codebook_method: str = "turbo" # "turbo" (default) or "polar_legacy"
|
|
313
314
|
|
|
314
315
|
|
|
315
316
|
@dataclass(frozen=True)
|
|
@@ -338,7 +339,7 @@ class QuantizationConfig:
|
|
|
338
339
|
eap_enabled: bool = True
|
|
339
340
|
keep_float32_backup: bool = True
|
|
340
341
|
auto_compact_interval_hours: int = 6
|
|
341
|
-
polar_search_penalty: float = 0.95
|
|
342
|
+
polar_search_penalty: float = 0.97 # V3.3.8: 0.95→0.97, TurboQuant has lower MSE
|
|
342
343
|
|
|
343
344
|
|
|
344
345
|
@dataclass(frozen=True)
|
|
package/src/superlocalmemory/core/embedding_worker.py
CHANGED
|
@@ -23,9 +23,10 @@ Part of Qualixar | Author: Varun Pratap Bhardwaj
|
|
|
23
23
|
from __future__ import annotations
|
|
24
24
|
|
|
25
25
|
import json
|
|
26
|
+
import os
|
|
26
27
|
import signal
|
|
27
28
|
import sys
|
|
28
|
-
import
|
|
29
|
+
import threading
|
|
29
30
|
|
|
30
31
|
# Force CPU BEFORE any torch import
|
|
31
32
|
os.environ["CUDA_VISIBLE_DEVICES"] = ""
|
|
@@ -41,8 +42,33 @@ if sys.platform != "win32":
|
|
|
41
42
|
signal.signal(signal.SIGTERM, lambda *_: sys.exit(0))
|
|
42
43
|
|
|
43
44
|
|
|
45
|
+
def _start_parent_watchdog() -> None:
    """Monitor parent process — self-terminate if parent dies.

    Prevents orphaned workers that consume 500-800 MB each when the parent
    process crashes, is killed, or exits without cleanup.

    V3.3.7: Added after incident where orphaned workers consumed 33 GB.

    The parent is never signalled:

    * POSIX: an orphan is re-parented, so ``os.getppid()`` stops returning
      the PID recorded at start-up. This is immune to PID reuse and to
      ``EPERM`` (which ``os.kill(pid, 0)`` raises for a *living* process
      owned by another user).
    * Windows: ``os.kill(pid, 0)`` is not a probe there (signal 0 is
      ``CTRL_C_EVENT``), so the thread waits on a process handle instead.

    If the parent had already exited before this function runs, the PID
    recorded here is the adoptive parent's and no exit is triggered.
    """
    parent_pid = os.getppid()

    def _watch_posix() -> None:
        import time

        while True:
            time.sleep(5)
            if os.getppid() != parent_pid:
                # Re-parented: the original parent is gone. Hard exit so no
                # cleanup handler can block in the worker.
                os._exit(0)

    def _watch_windows() -> None:
        import ctypes
        from ctypes import wintypes

        kernel32 = ctypes.WinDLL("kernel32", use_last_error=True)
        kernel32.OpenProcess.argtypes = (wintypes.DWORD, wintypes.BOOL, wintypes.DWORD)
        kernel32.OpenProcess.restype = wintypes.HANDLE
        kernel32.WaitForSingleObject.argtypes = (wintypes.HANDLE, wintypes.DWORD)
        kernel32.WaitForSingleObject.restype = wintypes.DWORD

        handle = kernel32.OpenProcess(0x00100000, False, parent_pid)  # SYNCHRONIZE
        if not handle:
            # ERROR_INVALID_PARAMETER (87): no such process -> parent is gone.
            if ctypes.get_last_error() == 87:
                os._exit(0)
            # Any other failure (e.g. access denied): the parent cannot be
            # observed, so leave the worker running rather than guess.
            return
        # Blocks until the parent process object is signalled (i.e. it exited).
        if kernel32.WaitForSingleObject(handle, 0xFFFFFFFF) == 0:  # INFINITE / WAIT_OBJECT_0
            os._exit(0)

    target = _watch_windows if sys.platform == "win32" else _watch_posix
    t = threading.Thread(target=target, daemon=True, name="parent-watchdog")
    t.start()
|
|
66
|
+
|
|
67
|
+
|
|
44
68
|
def _worker_main() -> None:
|
|
45
69
|
"""Main loop: read JSON requests from stdin, write responses to stdout."""
|
|
70
|
+
_start_parent_watchdog() # V3.3.7: self-terminate if parent dies
|
|
71
|
+
|
|
46
72
|
import numpy as np
|
|
47
73
|
|
|
48
74
|
model = None
|
|
package/src/superlocalmemory/core/embeddings.py
CHANGED
|
@@ -15,6 +15,7 @@ Part of Qualixar | Author: Varun Pratap Bhardwaj
|
|
|
15
15
|
|
|
16
16
|
from __future__ import annotations
|
|
17
17
|
|
|
18
|
+
import atexit
|
|
18
19
|
import json
|
|
19
20
|
import logging
|
|
20
21
|
import os
|
|
@@ -22,11 +23,15 @@ import subprocess
|
|
|
22
23
|
import sys
|
|
23
24
|
import threading
|
|
24
25
|
import time
|
|
26
|
+
import weakref
|
|
25
27
|
from pathlib import Path
|
|
26
28
|
from typing import TYPE_CHECKING
|
|
27
29
|
|
|
28
30
|
import numpy as np
|
|
29
31
|
|
|
32
|
+
# Track all live embedding services for atexit cleanup
|
|
33
|
+
_live_embedding_services: set[weakref.ref] = set()
|
|
34
|
+
|
|
30
35
|
if TYPE_CHECKING:
|
|
31
36
|
from numpy.typing import NDArray
|
|
32
37
|
|
|
@@ -69,6 +74,17 @@ class EmbeddingService:
|
|
|
69
74
|
self._worker_ready = False
|
|
70
75
|
self._request_count: int = 0
|
|
71
76
|
|
|
77
|
+
# Register for atexit cleanup (prevent orphaned workers)
|
|
78
|
+
ref = weakref.ref(self, _live_embedding_services.discard)
|
|
79
|
+
_live_embedding_services.add(ref)
|
|
80
|
+
|
|
81
|
+
def __del__(self) -> None:
    """Best-effort finalizer: stop the worker subprocess owned by this service."""
    try:
        self._kill_worker()
    except Exception:
        # A finalizer must never propagate; the object may be half torn down.
        pass
|
|
87
|
+
|
|
72
88
|
@property
|
|
73
89
|
def is_available(self) -> bool:
|
|
74
90
|
"""Check if embedding service can produce embeddings."""
|
|
@@ -338,3 +354,26 @@ class EmbeddingService:
|
|
|
338
354
|
raise DimensionMismatchError(
|
|
339
355
|
f"Embedding dimension {actual} != expected {self._config.dimension}"
|
|
340
356
|
)
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
# ---------------------------------------------------------------------------
|
|
360
|
+
# Module-level atexit: kill ALL embedding workers on process exit
|
|
361
|
+
# ---------------------------------------------------------------------------
|
|
362
|
+
|
|
363
|
+
def _cleanup_all_embedding_services() -> None:
    """Kill all embedding worker subprocesses on interpreter exit.

    Prevents orphaned 500-800 MB sentence-transformer workers surviving
    after parent exits (especially during test runs with parallel agents).
    """
    # Iterate over a snapshot: the registry can shrink while services die.
    for weak in tuple(_live_embedding_services):
        service = weak()
        if service is None:
            continue  # already garbage-collected
        try:
            service._kill_worker()
        except Exception:
            # Exit-time cleanup is best effort; keep going with the rest.
            pass
    _live_embedding_services.clear()
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
atexit.register(_cleanup_all_embedding_services)
|
|
package/src/superlocalmemory/core/recall_worker.py
CHANGED
|
@@ -20,6 +20,7 @@ import json
|
|
|
20
20
|
import os
|
|
21
21
|
import signal
|
|
22
22
|
import sys
|
|
23
|
+
import threading
|
|
23
24
|
|
|
24
25
|
# Force CPU BEFORE any torch import
|
|
25
26
|
os.environ["CUDA_VISIBLE_DEVICES"] = ""
|
|
@@ -34,6 +35,29 @@ os.environ["TORCH_DEVICE"] = "cpu"
|
|
|
34
35
|
if sys.platform != "win32":
|
|
35
36
|
signal.signal(signal.SIGTERM, lambda *_: sys.exit(0))
|
|
36
37
|
|
|
38
|
+
|
|
39
|
+
def _start_parent_watchdog() -> None:
    """Monitor parent process — self-terminate if parent dies.

    Prevents orphaned workers that consume 500+ MB each when the parent
    process crashes, is killed, or exits without cleanup.

    V3.3.7: Added after incident where orphaned workers consumed 33 GB.

    The parent is never signalled:

    * POSIX: an orphan is re-parented, so ``os.getppid()`` stops returning
      the PID recorded at start-up. This is immune to PID reuse and to
      ``EPERM`` (which ``os.kill(pid, 0)`` raises for a *living* process
      owned by another user).
    * Windows: ``os.kill(pid, 0)`` is not a probe there (signal 0 is
      ``CTRL_C_EVENT``), so the thread waits on a process handle instead.

    If the parent had already exited before this function runs, the PID
    recorded here is the adoptive parent's and no exit is triggered.
    """
    parent_pid = os.getppid()

    def _watch_posix() -> None:
        import time

        while True:
            time.sleep(5)
            if os.getppid() != parent_pid:
                # Re-parented: the original parent is gone. Hard exit so no
                # cleanup handler can block in the worker.
                os._exit(0)

    def _watch_windows() -> None:
        import ctypes
        from ctypes import wintypes

        kernel32 = ctypes.WinDLL("kernel32", use_last_error=True)
        kernel32.OpenProcess.argtypes = (wintypes.DWORD, wintypes.BOOL, wintypes.DWORD)
        kernel32.OpenProcess.restype = wintypes.HANDLE
        kernel32.WaitForSingleObject.argtypes = (wintypes.HANDLE, wintypes.DWORD)
        kernel32.WaitForSingleObject.restype = wintypes.DWORD

        handle = kernel32.OpenProcess(0x00100000, False, parent_pid)  # SYNCHRONIZE
        if not handle:
            # ERROR_INVALID_PARAMETER (87): no such process -> parent is gone.
            if ctypes.get_last_error() == 87:
                os._exit(0)
            # Any other failure (e.g. access denied): the parent cannot be
            # observed, so leave the worker running rather than guess.
            return
        # Blocks until the parent process object is signalled (i.e. it exited).
        if kernel32.WaitForSingleObject(handle, 0xFFFFFFFF) == 0:  # INFINITE / WAIT_OBJECT_0
            os._exit(0)

    target = _watch_windows if sys.platform == "win32" else _watch_posix
    t = threading.Thread(target=target, daemon=True, name="parent-watchdog")
    t.start()
|
|
60
|
+
|
|
37
61
|
_engine = None
|
|
38
62
|
|
|
39
63
|
|
|
@@ -209,6 +233,8 @@ def _handle_status() -> dict:
|
|
|
209
233
|
|
|
210
234
|
def _worker_main() -> None:
|
|
211
235
|
"""Main loop: read JSON requests from stdin, write responses to stdout."""
|
|
236
|
+
_start_parent_watchdog() # V3.3.7: self-terminate if parent dies
|
|
237
|
+
|
|
212
238
|
for line in sys.stdin:
|
|
213
239
|
line = line.strip()
|
|
214
240
|
if not line:
|
|
package/src/superlocalmemory/math/polar_quant.py
CHANGED
|
@@ -83,13 +83,23 @@ class PolarQuantEncoder:
|
|
|
83
83
|
HR-09: Angle indices as uint8, packed into bytes.
|
|
84
84
|
"""
|
|
85
85
|
|
|
86
|
-
__slots__ = ("_config", "_d", "_S", "_codebooks")
|
|
86
|
+
__slots__ = ("_config", "_d", "_S", "_codebooks", "_turbo", "_use_turbo")
|
|
87
87
|
|
|
88
88
|
def __init__(self, config: PolarQuantConfig) -> None:
    """Bind the configuration and pick the encoding backend.

    ``config.codebook_method`` selects the backend; when the attribute is
    absent it is treated as ``"turbo"``. Any value other than ``"turbo"``
    selects the legacy PolarQuant path.
    """
    self._config = config
    self._d = config.dimension

    wants_turbo = getattr(config, "codebook_method", "turbo") == "turbo"
    if wants_turbo:
        from superlocalmemory.math.turbo_quant import TurboQuantEncoder

        self._turbo = TurboQuantEncoder(config)
        # Both decode paths use the TurboQuant encoder's rotation matrix.
        # NOTE(review): legacy BLOBs only round-trip if this matrix equals
        # the one that encoded them — confirm against the rotation loader.
        self._S = self._turbo._S
    else:
        self._turbo = None
        self._S = self._load_or_create_rotation_matrix()

    # Uniform codebooks are always built; the legacy decode path needs them.
    self._codebooks = self._generate_uniform_codebooks()
    self._use_turbo = wants_turbo
|
|
93
103
|
|
|
94
104
|
# -- Rotation matrix (HR-01, HR-02) ------------------------------------
|
|
95
105
|
|
|
@@ -156,14 +166,14 @@ class PolarQuantEncoder:
|
|
|
156
166
|
# -- Encode ------------------------------------------------------------
|
|
157
167
|
|
|
158
168
|
def encode(self, embedding: NDArray, bit_width: int = 4) -> QuantizedEmbedding:
|
|
159
|
-
"""Encode a float32 embedding into quantized
|
|
169
|
+
"""Encode a float32 embedding into quantized representation.
|
|
160
170
|
|
|
161
171
|
Args:
|
|
162
172
|
embedding: 1-D float vector of dimension self._d.
|
|
163
173
|
bit_width: 2, 4, or 8.
|
|
164
174
|
|
|
165
175
|
Returns:
|
|
166
|
-
QuantizedEmbedding with packed
|
|
176
|
+
QuantizedEmbedding with packed indices.
|
|
167
177
|
|
|
168
178
|
Raises:
|
|
169
179
|
ValueError: Invalid bit_width or dimension mismatch.
|
|
@@ -177,13 +187,25 @@ class PolarQuantEncoder:
|
|
|
177
187
|
f"shape mismatch: expected ({self._d},), got {embedding.shape}"
|
|
178
188
|
)
|
|
179
189
|
|
|
180
|
-
#
|
|
181
|
-
|
|
190
|
+
# V3.3.8: TurboQuant path (default)
|
|
191
|
+
if self._use_turbo:
|
|
192
|
+
result = self._turbo.encode(embedding, bit_width)
|
|
193
|
+
return QuantizedEmbedding(
|
|
194
|
+
fact_id="",
|
|
195
|
+
radius=result.radius,
|
|
196
|
+
angle_indices=result.indices,
|
|
197
|
+
bit_width=result.bit_width,
|
|
198
|
+
qjl_bits=None,
|
|
199
|
+
)
|
|
182
200
|
|
|
183
|
-
#
|
|
201
|
+
# Legacy PolarQuant path
|
|
202
|
+
return self._encode_polar(embedding, bit_width)
|
|
203
|
+
|
|
204
|
+
def _encode_polar(self, embedding: NDArray, bit_width: int) -> QuantizedEmbedding:
|
|
205
|
+
"""Legacy PolarQuant encode (polar coordinate transform)."""
|
|
206
|
+
v_rot = self._S @ embedding
|
|
184
207
|
r = float(np.linalg.norm(v_rot))
|
|
185
208
|
|
|
186
|
-
# Degenerate zero vector
|
|
187
209
|
if r < 1e-12:
|
|
188
210
|
zero_angles = np.zeros(self._d - 1, dtype=np.uint8)
|
|
189
211
|
if bit_width == 8:
|
|
@@ -200,17 +222,11 @@ class PolarQuantEncoder:
|
|
|
200
222
|
qjl_bits=None,
|
|
201
223
|
)
|
|
202
224
|
|
|
203
|
-
# Step 3: Normalize
|
|
204
225
|
v_unit = v_rot / r
|
|
205
|
-
|
|
206
|
-
# Step 4: Cartesian to polar angles
|
|
207
226
|
angles = _cartesian_to_polar_angles(v_unit)
|
|
208
|
-
|
|
209
|
-
# Step 5: Quantize angles using codebook
|
|
210
227
|
cb = self._codebooks[bit_width]
|
|
211
228
|
indices = np.digitize(angles, cb["boundaries"][1:-1]).astype(np.uint8)
|
|
212
229
|
|
|
213
|
-
# Step 6: Pack into bytes
|
|
214
230
|
if bit_width == 8:
|
|
215
231
|
packed = indices.tobytes()
|
|
216
232
|
elif bit_width == 4:
|
|
@@ -228,18 +244,43 @@ class PolarQuantEncoder:
|
|
|
228
244
|
|
|
229
245
|
# -- Decode ------------------------------------------------------------
|
|
230
246
|
|
|
247
|
+
# TQ magic prefix for format detection (HR-MIG-02)
|
|
248
|
+
_TQ_MAGIC = b"\x54\x51"
|
|
249
|
+
|
|
231
250
|
def decode(self, qe: QuantizedEmbedding) -> NDArray:
    """Decode a QuantizedEmbedding back to float64 vector.

    V3.3.8: BLOBs that start with the two bytes "TQ" (0x54, 0x51) are
    handed to the TurboQuant decoder; everything else is treated as a
    legacy PolarQuant BLOB.

    Args:
        qe: Quantized embedding produced by encode().

    Returns:
        Reconstructed vector of dimension self._d.
    """
    # NOTE(review): detection sniffs payload bytes only; a legacy BLOB whose
    # first two bytes happen to be 0x54 0x51 would be routed to TurboQuant.
    has_tq_prefix = qe.angle_indices[:2] == self._TQ_MAGIC
    handler = self._decode_turbo if has_tq_prefix else self._decode_polar
    return handler(qe)
|
|
266
|
+
|
|
267
|
+
def _decode_turbo(self, qe: QuantizedEmbedding) -> NDArray:
    """Decode TurboQuant-encoded BLOB (has TQ prefix)."""
    from superlocalmemory.math.turbo_quant import TurboQuantEncoder, TurboQuantResult

    encoder = self._turbo
    if encoder is None:
        # Instance was built for the legacy path: create the TurboQuant
        # encoder on first use and keep it for later calls.
        encoder = self._turbo = TurboQuantEncoder(self._config)

    wrapped = TurboQuantResult(
        radius=qe.radius,
        indices=qe.angle_indices,
        bit_width=qe.bit_width,
    )
    return encoder.decode(wrapped)
|
|
279
|
+
|
|
280
|
+
def _decode_polar(self, qe: QuantizedEmbedding) -> NDArray:
|
|
281
|
+
"""Decode legacy PolarQuant BLOB (no TQ prefix)."""
|
|
240
282
|
n_angles = self._d - 1
|
|
241
283
|
|
|
242
|
-
# Step 1: Unpack angle indices
|
|
243
284
|
if qe.bit_width == 8:
|
|
244
285
|
indices = np.frombuffer(qe.angle_indices, dtype=np.uint8).copy()
|
|
245
286
|
elif qe.bit_width == 4:
|
|
@@ -247,19 +288,12 @@ class PolarQuantEncoder:
|
|
|
247
288
|
else:
|
|
248
289
|
indices = self.unpack_2bit(qe.angle_indices, n_angles)
|
|
249
290
|
|
|
250
|
-
# Step 2: Dequantize -- map indices to centroid angles
|
|
251
291
|
centroids = self._codebooks[qe.bit_width]["centroids"]
|
|
252
|
-
# Clip indices to valid range
|
|
253
292
|
indices = np.clip(indices, 0, len(centroids) - 1)
|
|
254
293
|
angles = centroids[indices]
|
|
255
294
|
|
|
256
|
-
# Step 3: Polar to Cartesian
|
|
257
295
|
v_unit = _polar_to_cartesian(angles, self._d)
|
|
258
|
-
|
|
259
|
-
# Step 4: Scale by radius
|
|
260
296
|
v_rot = v_unit * qe.radius
|
|
261
|
-
|
|
262
|
-
# Step 5: Inverse rotation (S is orthogonal, so S^T = S^{-1})
|
|
263
297
|
v_orig = self._S.T @ v_rot
|
|
264
298
|
|
|
265
299
|
return v_orig
|
|
package/src/superlocalmemory/math/turbo_quant.py
ADDED
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
# Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
|
|
2
|
+
# Licensed under the MIT License - see LICENSE file
|
|
3
|
+
# Part of SuperLocalMemory V3
|
|
4
|
+
|
|
5
|
+
"""TurboQuant embedding quantization (ICLR 2026).
|
|
6
|
+
|
|
7
|
+
Per-coordinate Lloyd-Max scalar quantization after random orthogonal rotation.
|
|
8
|
+
D_mse <= sqrt(3*pi/2) / 4^b. No scipy (HR-SCIPY-01). 2-byte "TQ" prefix on
|
|
9
|
+
all BLOBs (HR-MIG-02). Bit-widths: 2, 4, 8 only (HR-3BIT-01).
|
|
10
|
+
|
|
11
|
+
References: TurboQuant (arXiv 2504.19874), PolarQuant (arXiv 2502.02617).
|
|
12
|
+
Part of Qualixar | Author: Varun Pratap Bhardwaj | License: MIT
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import logging
|
|
18
|
+
import math
|
|
19
|
+
import shutil
|
|
20
|
+
from dataclasses import dataclass
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
import numpy as np
|
|
24
|
+
from numpy.typing import NDArray
|
|
25
|
+
|
|
26
|
+
from superlocalmemory.core.config import PolarQuantConfig
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
TQ_MAGIC = b"\x54\x51" # 2-byte prefix for TurboQuant BLOBs (HR-MIG-02)
|
|
31
|
+
SUPPORTED_BIT_WIDTHS: frozenset[int] = frozenset({2, 4, 8})
|
|
32
|
+
|
|
33
|
+
# ---------------------------------------------------------------------------
|
|
34
|
+
# Data types
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass(frozen=True)
class TurboQuantResult:
    """Frozen value object holding one TurboQuant-encoded embedding."""

    # Norm of the rotated vector; the encoder rounds it through float16.
    radius: float
    # TQ_MAGIC + packed codebook indices
    indices: bytes
    # Bits used per codebook index.
    bit_width: int
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# ---------------------------------------------------------------------------
|
|
48
|
+
# Lloyd-Max codebook (HR-SCIPY-01: math.erf + math.exp only)
|
|
49
|
+
# ---------------------------------------------------------------------------
|
|
50
|
+
|
|
51
|
+
_SQRT_2PI = math.sqrt(2.0 * math.pi)
|
|
52
|
+
_SQRT_2 = math.sqrt(2.0)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _std_normal_pdf(x: float) -> float:
    """Return the standard normal density at ``x``."""
    exponent = -0.5 * x * x
    return math.exp(exponent) / _SQRT_2PI
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _std_normal_cdf(x: float) -> float:
    """Return the standard normal cumulative probability at ``x`` (via ``math.erf``)."""
    scaled = x / _SQRT_2
    return 0.5 * (1.0 + math.erf(scaled))
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _compute_lloyd_max_gaussian(
    sigma: float, n_levels: int, max_iter: int = 100, tol: float = 1e-10,
) -> NDArray:
    """Lloyd-Max optimal codebook for N(0, sigma^2). Deterministic (HR-CB-01).

    The support is truncated to [-5*sigma, 5*sigma] and split into
    ``n_levels`` cells. Each iteration moves every centroid to the
    conditional mean of its cell, then moves each interior boundary to the
    midpoint of its neighbouring centroids. The outer boundaries stay fixed.

    Args:
        sigma: Standard deviation of the source; must be positive and finite.
        n_levels: Number of reconstruction levels; must be >= 1.
        max_iter: Upper bound on Lloyd iterations.
        tol: Stop once no centroid moved by more than this.

    Returns:
        Ascending array of ``n_levels`` centroids.

    Raises:
        ValueError: ``sigma`` is not positive/finite or ``n_levels`` < 1.
    """
    # Fail early: sigma == 0 would divide by zero below, and n_levels < 1
    # would only surface as an opaque numpy error.
    if not (math.isfinite(sigma) and sigma > 0.0):
        raise ValueError(f"sigma must be a positive finite number, got {sigma}")
    if n_levels < 1:
        raise ValueError(f"n_levels must be >= 1, got {n_levels}")

    lo, hi = -5.0 * sigma, 5.0 * sigma
    boundaries = np.linspace(lo, hi, n_levels + 1)
    centroids = 0.5 * (boundaries[:-1] + boundaries[1:])

    for _ in range(max_iter):
        previous = centroids.copy()
        for k in range(n_levels):
            a_k = float(boundaries[k]) / sigma
            b_k = float(boundaries[k + 1]) / sigma
            mass = _std_normal_cdf(b_k) - _std_normal_cdf(a_k)
            # Cells with negligible probability keep their previous centroid.
            if mass > 1e-15:
                centroids[k] = sigma * (_std_normal_pdf(a_k) - _std_normal_pdf(b_k)) / mass
        # Interior boundaries only; boundaries[0] and boundaries[-1] stay put.
        boundaries[1:-1] = 0.5 * (centroids[:-1] + centroids[1:])
        if float(np.max(np.abs(centroids - previous))) < tol:
            break

    return np.sort(centroids)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# ---------------------------------------------------------------------------
|
|
90
|
+
# Bit packing
|
|
91
|
+
# ---------------------------------------------------------------------------
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _pack_8bit(indices: NDArray) -> bytes:
    """Serialize indices as one unsigned byte each."""
    as_uint8 = indices.astype(np.uint8)
    return as_uint8.tobytes()
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _unpack_8bit(data: bytes, length: int) -> NDArray:
    """Read ``length`` one-byte indices from ``data``.

    Args:
        data: Packed payload (one byte per index); extra bytes are ignored.
        length: Number of indices to return.

    Returns:
        A writable uint8 array of exactly ``length`` indices.

    Raises:
        ValueError: ``length`` is negative or ``data`` holds fewer than
            ``length`` bytes (previously a short array was returned silently).
    """
    if length < 0:
        raise ValueError(f"length must be >= 0, got {length}")
    if len(data) < length:
        raise ValueError(f"8-bit payload too short: need {length} bytes, got {len(data)}")
    return np.frombuffer(data, dtype=np.uint8)[:length].copy()
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _pack_4bit(indices: NDArray) -> bytes:
    """Pack indices two per byte, first index in the high nibble.

    Values are clipped to 0..15 and an odd count is padded with one zero.
    """
    count = len(indices)
    nibbles = np.zeros(count + count % 2, dtype=np.uint8)
    nibbles[:count] = np.clip(indices, 0, 15)
    high = nibbles[0::2]
    low = nibbles[1::2]
    return ((high << 4) | low).tobytes()
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _unpack_4bit(data: bytes, length: int) -> NDArray:
    """Read ``length`` 4-bit indices from ``data`` (high nibble first).

    Args:
        data: Packed payload, two indices per byte; extra bytes are ignored.
        length: Number of indices to return.

    Returns:
        uint8 array of exactly ``length`` indices.

    Raises:
        ValueError: ``length`` is negative or ``data`` is too short to hold
            ``length`` indices (previously a short array was returned silently).
    """
    if length < 0:
        raise ValueError(f"length must be >= 0, got {length}")
    needed = (length + 1) // 2
    if len(data) < needed:
        raise ValueError(f"4-bit payload too short: need {needed} bytes, got {len(data)}")

    packed = np.frombuffer(data, dtype=np.uint8)
    nibbles = np.empty(len(packed) * 2, dtype=np.uint8)
    nibbles[0::2] = packed >> 4
    nibbles[1::2] = packed & 0x0F
    return nibbles[:length]
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _pack_2bit(indices: NDArray) -> bytes:
    """Pack indices four per byte, first index in the two highest bits.

    Values are clipped to 0..3 and the tail is zero-padded to a multiple of 4.
    """
    count = len(indices)
    crumbs = np.zeros(count + (-count % 4), dtype=np.uint8)
    crumbs[:count] = np.clip(indices, 0, 3)

    packed = np.zeros(crumbs.size // 4, dtype=np.uint8)
    for lane, shift in enumerate((6, 4, 2, 0)):
        packed |= crumbs[lane::4] << shift
    return packed.tobytes()
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _unpack_2bit(data: bytes, length: int) -> NDArray:
    """Read ``length`` 2-bit indices from ``data`` (highest bit pair first).

    Args:
        data: Packed payload, four indices per byte; extra bytes are ignored.
        length: Number of indices to return.

    Returns:
        uint8 array of exactly ``length`` indices.

    Raises:
        ValueError: ``length`` is negative or ``data`` is too short to hold
            ``length`` indices (previously a short array was returned silently).
    """
    if length < 0:
        raise ValueError(f"length must be >= 0, got {length}")
    needed = (length + 3) // 4
    if len(data) < needed:
        raise ValueError(f"2-bit payload too short: need {needed} bytes, got {len(data)}")

    packed = np.frombuffer(data, dtype=np.uint8)
    crumbs = np.empty(len(packed) * 4, dtype=np.uint8)
    for lane, shift in enumerate((6, 4, 2, 0)):
        crumbs[lane::4] = (packed >> shift) & 0x03
    return crumbs[:length]
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
# bit width -> (pack function, unpack function)
_PACKERS: dict[int, tuple] = {
    8: (_pack_8bit, _unpack_8bit),
    4: (_pack_4bit, _unpack_4bit),
    2: (_pack_2bit, _unpack_2bit),
}
|
|
142
|
+
|
|
143
|
+
# ---------------------------------------------------------------------------
|
|
144
|
+
# TurboQuantEncoder
|
|
145
|
+
# ---------------------------------------------------------------------------
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class TurboQuantEncoder:
    """Per-coordinate Lloyd-Max quantizer with random rotation.

    HR-ROT-01: Same rotation matrix for encode/decode.
    HR-CB-02: Codebooks computed ONCE at __init__.
    HR-SCIPY-01: No scipy dependency.
    """

    __slots__ = ("_config", "_d", "_S", "_codebooks")

    def __init__(self, config: PolarQuantConfig) -> None:
        """Load (or create) the rotation matrix and pre-compute all codebooks."""
        self._config = config
        self._d = config.dimension
        self._S = self._load_or_create_rotation_matrix()
        self._codebooks = self._compute_codebooks()

    def _load_or_create_rotation_matrix(self) -> NDArray:
        """Load/create rotation matrix with copy-on-detect (AUDIT C4-MED-01).

        Resolution order:

        1. ``config.rotation_matrix_path`` (or, when empty,
           ``~/.superlocalmemory/turbo_rotation_{d}.npy``) if it loads with
           shape ``(d, d)``.
        2. ``~/.superlocalmemory/polar_rotation_{d}.npy``, copied to the turbo
           path — only when no turbo file exists at all.
        3. A fresh matrix from a QR decomposition seeded with ``config.seed``.

        Persisting a fresh matrix is best effort: it is reproducible from the
        seed, so an unwritable cache location only costs a recomputation.
        """
        d = self._d
        slm_dir = Path.home() / ".superlocalmemory"
        turbo_path = Path(
            self._config.rotation_matrix_path or str(slm_dir / f"turbo_rotation_{d}.npy")
        )

        if turbo_path.exists():
            try:
                S = np.load(str(turbo_path))
                if S.shape == (d, d):
                    return S
                logger.warning("Turbo rotation shape %s != (%d,%d)", S.shape, d, d)
            except Exception as exc:
                logger.warning("Corrupt turbo rotation: %s", exc)

        # Copy-on-detect: reuse existing polar rotation matrix
        # NOTE(review): the config comment names the default polar file
        # "polar_rotation.npy" (no dimension suffix) — confirm this filename.
        polar_path = slm_dir / f"polar_rotation_{d}.npy"
        if polar_path.exists() and not turbo_path.exists():
            try:
                S = np.load(str(polar_path))
                if S.shape == (d, d):
                    turbo_path.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(str(polar_path), str(turbo_path))
                    logger.info("Copied polar rotation matrix for TurboQuant compatibility")
                    return S
            except Exception as exc:
                logger.warning("Could not copy polar rotation: %s", exc)

        # Generate new via Mezzadri-corrected QR
        rng = np.random.default_rng(self._config.seed)
        H = rng.standard_normal((d, d))
        Q, R = np.linalg.qr(H)
        S = Q @ np.diag(np.sign(np.diag(R)))

        try:
            turbo_path.parent.mkdir(parents=True, exist_ok=True)
            # Write through a file handle: np.save() given a *name* appends
            # ".npy" when it is missing, so a configured path without that
            # suffix would never be found again by the exists() check above.
            with open(turbo_path, "wb") as fh:
                np.save(fh, S)
            logger.info("Generated TurboQuant rotation (%d x %d) at %s", d, d, turbo_path)
        except OSError as exc:
            logger.warning("Could not persist turbo rotation at %s: %s", turbo_path, exc)
        return S

    def _compute_codebooks(self) -> dict[int, NDArray]:
        """Pre-compute Lloyd-Max codebooks for 2/4/8-bit.

        Uses sigma = 1/sqrt(d) for every supported bit width.
        """
        sigma = 1.0 / math.sqrt(self._d)
        codebooks: dict[int, NDArray] = {}
        for bw in sorted(SUPPORTED_BIT_WIDTHS):
            centroids = _compute_lloyd_max_gaussian(sigma, 2 ** bw)
            # Internal invariants of the codebook builder, not input checks.
            assert len(centroids) == 2 ** bw
            assert np.all(centroids[1:] >= centroids[:-1])
            codebooks[bw] = centroids
        return codebooks

    def encode(self, embedding: NDArray, bit_width: int = 4) -> TurboQuantResult:
        """Encode embedding. HR-ENC-01: pure. HR-ENC-02: radius=float16.

        Args:
            embedding: 1-D vector of shape ``(d,)``.
            bit_width: 2, 4, or 8.

        Returns:
            TurboQuantResult whose ``indices`` start with ``TQ_MAGIC``.

        Raises:
            ValueError: Unsupported bit width or shape mismatch.
        """
        if bit_width not in SUPPORTED_BIT_WIDTHS:
            raise ValueError(f"bit_width must be 2, 4, or 8, got {bit_width}")
        if embedding.shape != (self._d,):
            raise ValueError(f"shape mismatch: expected ({self._d},), got {embedding.shape}")

        pack_fn, _ = _PACKERS[bit_width]
        y = self._S @ embedding
        r = float(np.linalg.norm(y))

        if r < 1e-12:
            # Degenerate zero vector: all-zero indices and radius 0.
            packed = TQ_MAGIC + pack_fn(np.zeros(self._d, dtype=np.uint8))
            return TurboQuantResult(radius=0.0, indices=packed, bit_width=bit_width)

        y_unit = y / r
        centroids = self._codebooks[bit_width]
        last = len(centroids) - 1
        # searchsorted gives the right-hand neighbour; compare it with the
        # left-hand one and keep whichever centroid is closer.
        right = np.clip(np.searchsorted(centroids, y_unit), 0, last)
        left = np.clip(right - 1, 0, last)
        use_left = np.abs(y_unit - centroids[left]) < np.abs(y_unit - centroids[right])
        idx = np.where(use_left, left, right).astype(np.uint8)

        return TurboQuantResult(
            radius=float(np.float16(r)),
            indices=TQ_MAGIC + pack_fn(idx),
            bit_width=bit_width,
        )

    def decode(self, result: TurboQuantResult) -> NDArray:
        """Decode with format detection: TQ prefix -> turbo, else -> legacy polar.

        Raises:
            ValueError: A TQ-prefixed BLOB declares an unsupported bit width
                (previously surfaced as a bare KeyError).
        """
        blob = result.indices
        if blob[:2] != TQ_MAGIC:
            return self._decode_legacy_polar(result)

        if result.bit_width not in SUPPORTED_BIT_WIDTHS:
            raise ValueError(f"bit_width must be 2, 4, or 8, got {result.bit_width}")

        _, unpack_fn = _PACKERS[result.bit_width]
        indices = unpack_fn(blob[2:], self._d)
        centroids = self._codebooks[result.bit_width]
        y_unit_approx = centroids[np.clip(indices, 0, len(centroids) - 1)]
        # Inverse rotation via the transpose of S.
        return self._S.T @ (y_unit_approx * result.radius)

    def _decode_legacy_polar(self, result: TurboQuantResult) -> NDArray:
        """Decode legacy PolarQuant BLOB (no TQ prefix) for SLM <= 3.3.6."""
        from superlocalmemory.math.polar_quant import PolarQuantEncoder, _polar_to_cartesian

        n_angles = self._d - 1
        if result.bit_width == 8:
            indices = np.frombuffer(result.indices, dtype=np.uint8).copy()
        elif result.bit_width == 4:
            indices = PolarQuantEncoder.unpack_4bit(result.indices, n_angles)
        else:
            indices = PolarQuantEncoder.unpack_2bit(result.indices, n_angles)

        # Uniform angle codebook over [0, pi]: centroids are the cell midpoints.
        levels = 2 ** result.bit_width
        boundaries = np.linspace(0.0, math.pi, levels + 1)
        centroids = (boundaries[:-1] + boundaries[1:]) / 2.0
        angles = centroids[np.clip(indices, 0, len(centroids) - 1)]

        v_unit = _polar_to_cartesian(angles, self._d)
        return self._S.T @ (v_unit * result.radius)

    def approximate_similarity(self, query: NDArray, result: TurboQuantResult) -> float:
        """Cosine similarity via decode. Returns 0.0 on degenerate inputs."""
        decoded = self.decode(result)
        denom = np.linalg.norm(query) * np.linalg.norm(decoded)
        if denom < 1e-12:
            return 0.0
        sim = float(np.dot(query, decoded) / denom)
        return 0.0 if (math.isnan(sim) or math.isinf(sim)) else sim

    # Static pack/unpack (backward compat with PolarQuantEncoder API)

    @staticmethod
    def pack_4bit(indices: NDArray) -> bytes:
        """Pack indices two per byte (delegates to the module-level helper)."""
        return _pack_4bit(indices)

    @staticmethod
    def unpack_4bit(data: bytes, length: int) -> NDArray:
        """Unpack ``length`` 4-bit indices (delegates to the module-level helper)."""
        return _unpack_4bit(data, length)

    @staticmethod
    def pack_2bit(indices: NDArray) -> bytes:
        """Pack indices four per byte (delegates to the module-level helper)."""
        return _pack_2bit(indices)

    @staticmethod
    def unpack_2bit(data: bytes, length: int) -> NDArray:
        """Unpack ``length`` 2-bit indices (delegates to the module-level helper)."""
        return _unpack_2bit(data, length)
|
|
package/src/superlocalmemory/retrieval/reranker.py
CHANGED
|
@@ -16,6 +16,7 @@ License: MIT
|
|
|
16
16
|
|
|
17
17
|
from __future__ import annotations
|
|
18
18
|
|
|
19
|
+
import atexit
|
|
19
20
|
import json
|
|
20
21
|
import logging
|
|
21
22
|
import os
|
|
@@ -23,10 +24,14 @@ import subprocess
|
|
|
23
24
|
import sys
|
|
24
25
|
import threading
|
|
25
26
|
import time
|
|
27
|
+
import weakref
|
|
26
28
|
from typing import Any
|
|
27
29
|
|
|
28
30
|
from superlocalmemory.storage.models import AtomicFact
|
|
29
31
|
|
|
32
|
+
# Track all live reranker instances for atexit cleanup
|
|
33
|
+
_live_rerankers: set[weakref.ref] = set()
|
|
34
|
+
|
|
30
35
|
logger = logging.getLogger(__name__)
|
|
31
36
|
|
|
32
37
|
_IDLE_TIMEOUT_SECONDS = 120 # 2 min → kill worker
|
|
@@ -64,11 +69,22 @@ class CrossEncoderReranker:
|
|
|
64
69
|
self._idle_timer: threading.Timer | None = None
|
|
65
70
|
self._request_count: int = 0
|
|
66
71
|
|
|
72
|
+
# Register for atexit cleanup (prevent orphaned workers)
|
|
73
|
+
ref = weakref.ref(self, _live_rerankers.discard)
|
|
74
|
+
_live_rerankers.add(ref)
|
|
75
|
+
|
|
67
76
|
# Start background warmup immediately — worker loads model
|
|
68
77
|
# while the rest of init continues. First recall gets instant
|
|
69
78
|
# fallback; second recall uses the warm model.
|
|
70
79
|
self._start_background_warmup()
|
|
71
80
|
|
|
81
|
+
def __del__(self) -> None:
    """Best-effort finalizer: stop the worker subprocess owned by this reranker."""
    try:
        self._kill_worker()
    except Exception:
        # A finalizer must never propagate; the object may be half torn down.
        pass
|
|
87
|
+
|
|
72
88
|
# ------------------------------------------------------------------
|
|
73
89
|
# Background warmup (non-blocking model load)
|
|
74
90
|
# ------------------------------------------------------------------
|
|
@@ -330,3 +346,26 @@ class CrossEncoderReranker:
|
|
|
330
346
|
if resp is None:
|
|
331
347
|
return False
|
|
332
348
|
return resp.get("ok", False)
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
# ---------------------------------------------------------------------------
|
|
352
|
+
# Module-level atexit: kill ALL reranker workers on process exit
|
|
353
|
+
# ---------------------------------------------------------------------------
|
|
354
|
+
|
|
355
|
+
def _cleanup_all_rerankers() -> None:
    """Kill all reranker worker subprocesses on interpreter exit.

    Prevents orphaned 1.3 GB ONNX/PyTorch workers surviving after
    parent exits (especially during test runs with parallel agents).
    """
    # Iterate over a snapshot: the registry can shrink while rerankers die.
    for weak in tuple(_live_rerankers):
        instance = weak()
        if instance is None:
            continue  # already garbage-collected
        try:
            instance._kill_worker()
        except Exception:
            # Exit-time cleanup is best effort; keep going with the rest.
            pass
    _live_rerankers.clear()
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
atexit.register(_cleanup_all_rerankers)
|