superlocalmemory 3.3.6 → 3.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -3,7 +3,8 @@
3
3
  </p>
4
4
 
5
5
  <h1 align="center">SuperLocalMemory V3.3</h1>
6
- <p align="center"><strong>The first local-only AI memory to break 74% retrieval on LoCoMo.<br/>No cloud. No APIs. No data leaves your machine.</strong></p>
6
+ <p align="center"><strong>Every other AI forgets. Yours won't.</strong><br/><em>Infinite memory for Claude Code, Cursor, Windsurf &amp; 17+ AI tools.</em></p>
7
+ <p align="center"><code>v3.3.8</code> — Install once. Every session remembers the last. Automatically.</p>
7
8
 
8
9
  <p align="center">
9
10
  <code>+16pp vs Mem0 (zero cloud)</code> &nbsp;·&nbsp; <code>85% Open-Domain (best of any system)</code> &nbsp;·&nbsp; <code>EU AI Act Ready</code>
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "superlocalmemory",
3
- "version": "3.3.6",
3
+ "version": "3.3.8",
4
4
  "description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
5
5
  "keywords": [
6
6
  "ai-memory",
package/pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "superlocalmemory"
3
- version = "3.3.6"
3
+ version = "3.3.8"
4
4
  description = "Information-geometric agent memory with mathematical guarantees"
5
5
  readme = "README.md"
6
6
  license = {text = "MIT"}
@@ -310,6 +310,7 @@ class PolarQuantConfig:
310
310
  dimension: int = 768
311
311
  rotation_matrix_path: str = "" # empty = ~/.superlocalmemory/polar_rotation.npy
312
312
  seed: int = 42 # reproducible rotation matrix
313
+ codebook_method: str = "turbo" # "turbo" (default) or "polar_legacy"
313
314
 
314
315
 
315
316
  @dataclass(frozen=True)
@@ -338,7 +339,7 @@ class QuantizationConfig:
338
339
  eap_enabled: bool = True
339
340
  keep_float32_backup: bool = True
340
341
  auto_compact_interval_hours: int = 6
341
- polar_search_penalty: float = 0.95
342
+ polar_search_penalty: float = 0.97 # V3.3.8: 0.95→0.97, TurboQuant has lower MSE
342
343
 
343
344
 
344
345
  @dataclass(frozen=True)
@@ -23,9 +23,10 @@ Part of Qualixar | Author: Varun Pratap Bhardwaj
23
23
  from __future__ import annotations
24
24
 
25
25
  import json
26
+ import os
26
27
  import signal
27
28
  import sys
28
- import os
29
+ import threading
29
30
 
30
31
  # Force CPU BEFORE any torch import
31
32
  os.environ["CUDA_VISIBLE_DEVICES"] = ""
@@ -41,8 +42,33 @@ if sys.platform != "win32":
41
42
  signal.signal(signal.SIGTERM, lambda *_: sys.exit(0))
42
43
 
43
44
 
45
def _parent_is_alive(parent_pid: int) -> bool:
    """Report whether the process that spawned this worker still exists.

    Args:
        parent_pid: PID captured with ``os.getppid()`` when the watchdog started.

    Returns:
        ``True`` while the original parent is still our parent and can be
        probed; ``False`` once it is gone.
    """
    # An orphaned POSIX process is re-parented, so a different ppid means the
    # original parent died -- even if its PID was recycled by another process.
    if os.getppid() != parent_pid:
        return False
    try:
        os.kill(parent_pid, 0)  # signal 0: existence probe, nothing is delivered
    except PermissionError:
        # EPERM means the process exists but is owned by someone else.
        return True
    except OSError:
        return False
    return True


def _start_parent_watchdog() -> None:
    """Monitor parent process — self-terminate if parent dies.

    Prevents orphaned workers that consume 500-800 MB each when the parent
    process crashes, is killed, or exits without cleanup.

    V3.3.7: Added after incident where orphaned workers consumed 33 GB.

    The check runs every 5 seconds on a daemon thread named
    ``parent-watchdog`` and leaves via ``os._exit(0)`` so that no cleanup
    handler can keep the orphan alive.

    On Windows no watchdog is started: ``os.kill(pid, 0)`` is not a liveness
    probe there (0 is ``CTRL_C_EVENT``), so the worker relies on its stdin
    pipe closing instead.
    """
    if sys.platform == "win32":
        return

    parent_pid = os.getppid()

    def _watch() -> None:
        import time
        while True:
            time.sleep(5)
            if not _parent_is_alive(parent_pid):
                os._exit(0)

    t = threading.Thread(target=_watch, daemon=True, name="parent-watchdog")
    t.start()
66
+
67
+
44
68
  def _worker_main() -> None:
45
69
  """Main loop: read JSON requests from stdin, write responses to stdout."""
70
+ _start_parent_watchdog() # V3.3.7: self-terminate if parent dies
71
+
46
72
  import numpy as np
47
73
 
48
74
  model = None
@@ -15,6 +15,7 @@ Part of Qualixar | Author: Varun Pratap Bhardwaj
15
15
 
16
16
  from __future__ import annotations
17
17
 
18
+ import atexit
18
19
  import json
19
20
  import logging
20
21
  import os
@@ -22,11 +23,15 @@ import subprocess
22
23
  import sys
23
24
  import threading
24
25
  import time
26
+ import weakref
25
27
  from pathlib import Path
26
28
  from typing import TYPE_CHECKING
27
29
 
28
30
  import numpy as np
29
31
 
32
+ # Track all live embedding services for atexit cleanup
33
+ _live_embedding_services: set[weakref.ref] = set()
34
+
30
35
  if TYPE_CHECKING:
31
36
  from numpy.typing import NDArray
32
37
 
@@ -69,6 +74,17 @@ class EmbeddingService:
69
74
  self._worker_ready = False
70
75
  self._request_count: int = 0
71
76
 
77
+ # Register for atexit cleanup (prevent orphaned workers)
78
+ ref = weakref.ref(self, _live_embedding_services.discard)
79
+ _live_embedding_services.add(ref)
80
+
81
+ def __del__(self) -> None:
82
+ """Kill worker subprocess when service is garbage-collected."""
83
+ try:
84
+ self._kill_worker()
85
+ except Exception:
86
+ pass
87
+
72
88
  @property
73
89
  def is_available(self) -> bool:
74
90
  """Check if embedding service can produce embeddings."""
@@ -338,3 +354,26 @@ class EmbeddingService:
338
354
  raise DimensionMismatchError(
339
355
  f"Embedding dimension {actual} != expected {self._config.dimension}"
340
356
  )
357
+
358
+
359
+ # ---------------------------------------------------------------------------
360
+ # Module-level atexit: kill ALL embedding workers on process exit
361
+ # ---------------------------------------------------------------------------
362
+
363
def _cleanup_all_embedding_services() -> None:
    """Terminate every tracked embedding worker when the interpreter exits.

    Walks the weak references in ``_live_embedding_services`` and asks each
    service that is still alive to kill its worker subprocess, so that
    500-800 MB sentence-transformer workers do not outlive the parent
    (especially during test runs with parallel agents). Failures are ignored:
    this is best-effort cleanup at shutdown.
    """
    # Iterate over a snapshot: weakref callbacks may discard entries meanwhile.
    snapshot = tuple(_live_embedding_services)
    for weak in snapshot:
        service = weak()
        if service is None:
            continue
        try:
            service._kill_worker()
        except Exception:
            pass
    _live_embedding_services.clear()
377
+
378
+
379
+ atexit.register(_cleanup_all_embedding_services)
@@ -20,6 +20,7 @@ import json
20
20
  import os
21
21
  import signal
22
22
  import sys
23
+ import threading
23
24
 
24
25
  # Force CPU BEFORE any torch import
25
26
  os.environ["CUDA_VISIBLE_DEVICES"] = ""
@@ -34,6 +35,29 @@ os.environ["TORCH_DEVICE"] = "cpu"
34
35
  if sys.platform != "win32":
35
36
  signal.signal(signal.SIGTERM, lambda *_: sys.exit(0))
36
37
 
38
+
39
def _parent_is_alive(parent_pid: int) -> bool:
    """Report whether the process that spawned this worker still exists.

    Args:
        parent_pid: PID captured with ``os.getppid()`` when the watchdog started.

    Returns:
        ``True`` while the original parent is still our parent and can be
        probed; ``False`` once it is gone.
    """
    # An orphaned POSIX process is re-parented, so a different ppid means the
    # original parent died -- even if its PID was recycled by another process.
    if os.getppid() != parent_pid:
        return False
    try:
        os.kill(parent_pid, 0)  # signal 0: existence probe, nothing is delivered
    except PermissionError:
        # EPERM means the process exists but is owned by someone else.
        return True
    except OSError:
        return False
    return True


def _start_parent_watchdog() -> None:
    """Monitor parent process — self-terminate if parent dies.

    Prevents orphaned workers that consume 500+ MB each when the parent
    process crashes, is killed, or exits without cleanup.

    V3.3.7: Added after incident where orphaned workers consumed 33 GB.

    The check runs every 5 seconds on a daemon thread named
    ``parent-watchdog`` and leaves via ``os._exit(0)`` so that no cleanup
    handler can keep the orphan alive.

    On Windows no watchdog is started: ``os.kill(pid, 0)`` is not a liveness
    probe there (0 is ``CTRL_C_EVENT``); the stdin read loop ends when the
    parent's pipe closes.
    """
    if sys.platform == "win32":
        return

    parent_pid = os.getppid()

    def _watch() -> None:
        import time
        while True:
            time.sleep(5)
            if not _parent_is_alive(parent_pid):
                os._exit(0)

    t = threading.Thread(target=_watch, daemon=True, name="parent-watchdog")
    t.start()
60
+
37
61
  _engine = None
38
62
 
39
63
 
@@ -209,6 +233,8 @@ def _handle_status() -> dict:
209
233
 
210
234
  def _worker_main() -> None:
211
235
  """Main loop: read JSON requests from stdin, write responses to stdout."""
236
+ _start_parent_watchdog() # V3.3.7: self-terminate if parent dies
237
+
212
238
  for line in sys.stdin:
213
239
  line = line.strip()
214
240
  if not line:
@@ -83,13 +83,23 @@ class PolarQuantEncoder:
83
83
  HR-09: Angle indices as uint8, packed into bytes.
84
84
  """
85
85
 
86
- __slots__ = ("_config", "_d", "_S", "_codebooks")
86
+ __slots__ = ("_config", "_d", "_S", "_codebooks", "_turbo", "_use_turbo")
87
87
 
88
88
  def __init__(self, config: PolarQuantConfig) -> None:
89
89
  self._config = config
90
90
  self._d = config.dimension
91
- self._S = self._load_or_create_rotation_matrix()
92
- self._codebooks = self._generate_uniform_codebooks()
91
+ codebook_method = getattr(config, "codebook_method", "turbo")
92
+ if codebook_method == "turbo":
93
+ from superlocalmemory.math.turbo_quant import TurboQuantEncoder
94
+ self._turbo = TurboQuantEncoder(config)
95
+ self._S = self._turbo._S
96
+ self._codebooks = self._generate_uniform_codebooks() # for legacy decode
97
+ self._use_turbo = True
98
+ else:
99
+ self._turbo = None
100
+ self._S = self._load_or_create_rotation_matrix()
101
+ self._codebooks = self._generate_uniform_codebooks()
102
+ self._use_turbo = False
93
103
 
94
104
  # -- Rotation matrix (HR-01, HR-02) ------------------------------------
95
105
 
@@ -156,14 +166,14 @@ class PolarQuantEncoder:
156
166
  # -- Encode ------------------------------------------------------------
157
167
 
158
168
  def encode(self, embedding: NDArray, bit_width: int = 4) -> QuantizedEmbedding:
159
- """Encode a float32 embedding into quantized polar representation.
169
+ """Encode a float32 embedding into quantized representation.
160
170
 
161
171
  Args:
162
172
  embedding: 1-D float vector of dimension self._d.
163
173
  bit_width: 2, 4, or 8.
164
174
 
165
175
  Returns:
166
- QuantizedEmbedding with packed angle indices.
176
+ QuantizedEmbedding with packed indices.
167
177
 
168
178
  Raises:
169
179
  ValueError: Invalid bit_width or dimension mismatch.
@@ -177,13 +187,25 @@ class PolarQuantEncoder:
177
187
  f"shape mismatch: expected ({self._d},), got {embedding.shape}"
178
188
  )
179
189
 
180
- # Step 1: Random rotation
181
- v_rot = self._S @ embedding
190
+ # V3.3.8: TurboQuant path (default)
191
+ if self._use_turbo:
192
+ result = self._turbo.encode(embedding, bit_width)
193
+ return QuantizedEmbedding(
194
+ fact_id="",
195
+ radius=result.radius,
196
+ angle_indices=result.indices,
197
+ bit_width=result.bit_width,
198
+ qjl_bits=None,
199
+ )
182
200
 
183
- # Step 2: Compute radius
201
+ # Legacy PolarQuant path
202
+ return self._encode_polar(embedding, bit_width)
203
+
204
+ def _encode_polar(self, embedding: NDArray, bit_width: int) -> QuantizedEmbedding:
205
+ """Legacy PolarQuant encode (polar coordinate transform)."""
206
+ v_rot = self._S @ embedding
184
207
  r = float(np.linalg.norm(v_rot))
185
208
 
186
- # Degenerate zero vector
187
209
  if r < 1e-12:
188
210
  zero_angles = np.zeros(self._d - 1, dtype=np.uint8)
189
211
  if bit_width == 8:
@@ -200,17 +222,11 @@ class PolarQuantEncoder:
200
222
  qjl_bits=None,
201
223
  )
202
224
 
203
- # Step 3: Normalize
204
225
  v_unit = v_rot / r
205
-
206
- # Step 4: Cartesian to polar angles
207
226
  angles = _cartesian_to_polar_angles(v_unit)
208
-
209
- # Step 5: Quantize angles using codebook
210
227
  cb = self._codebooks[bit_width]
211
228
  indices = np.digitize(angles, cb["boundaries"][1:-1]).astype(np.uint8)
212
229
 
213
- # Step 6: Pack into bytes
214
230
  if bit_width == 8:
215
231
  packed = indices.tobytes()
216
232
  elif bit_width == 4:
@@ -228,18 +244,43 @@ class PolarQuantEncoder:
228
244
 
229
245
  # -- Decode ------------------------------------------------------------
230
246
 
247
+ # TQ magic prefix for format detection (HR-MIG-02)
248
+ _TQ_MAGIC = b"\x54\x51"
249
+
231
250
  def decode(self, qe: QuantizedEmbedding) -> NDArray:
232
251
  """Decode a QuantizedEmbedding back to float64 vector.
233
252
 
253
+ V3.3.8: Detects "TQ" prefix (0x54, 0x51) to route between
254
+ TurboQuant and legacy PolarQuant decode paths.
255
+
234
256
  Args:
235
257
  qe: Quantized embedding produced by encode().
236
258
 
237
259
  Returns:
238
260
  Reconstructed vector of dimension self._d.
239
261
  """
262
+ # Format detection: TQ prefix = TurboQuant, else legacy polar
263
+ if qe.angle_indices[:2] == self._TQ_MAGIC:
264
+ return self._decode_turbo(qe)
265
+ return self._decode_polar(qe)
266
+
267
+ def _decode_turbo(self, qe: QuantizedEmbedding) -> NDArray:
268
+ """Decode TurboQuant-encoded BLOB (has TQ prefix)."""
269
+ if self._turbo is None:
270
+ from superlocalmemory.math.turbo_quant import TurboQuantEncoder
271
+ self._turbo = TurboQuantEncoder(self._config)
272
+ from superlocalmemory.math.turbo_quant import TurboQuantResult
273
+ result = TurboQuantResult(
274
+ radius=qe.radius,
275
+ indices=qe.angle_indices,
276
+ bit_width=qe.bit_width,
277
+ )
278
+ return self._turbo.decode(result)
279
+
280
+ def _decode_polar(self, qe: QuantizedEmbedding) -> NDArray:
281
+ """Decode legacy PolarQuant BLOB (no TQ prefix)."""
240
282
  n_angles = self._d - 1
241
283
 
242
- # Step 1: Unpack angle indices
243
284
  if qe.bit_width == 8:
244
285
  indices = np.frombuffer(qe.angle_indices, dtype=np.uint8).copy()
245
286
  elif qe.bit_width == 4:
@@ -247,19 +288,12 @@ class PolarQuantEncoder:
247
288
  else:
248
289
  indices = self.unpack_2bit(qe.angle_indices, n_angles)
249
290
 
250
- # Step 2: Dequantize -- map indices to centroid angles
251
291
  centroids = self._codebooks[qe.bit_width]["centroids"]
252
- # Clip indices to valid range
253
292
  indices = np.clip(indices, 0, len(centroids) - 1)
254
293
  angles = centroids[indices]
255
294
 
256
- # Step 3: Polar to Cartesian
257
295
  v_unit = _polar_to_cartesian(angles, self._d)
258
-
259
- # Step 4: Scale by radius
260
296
  v_rot = v_unit * qe.radius
261
-
262
- # Step 5: Inverse rotation (S is orthogonal, so S^T = S^{-1})
263
297
  v_orig = self._S.T @ v_rot
264
298
 
265
299
  return v_orig
@@ -0,0 +1,308 @@
1
+ # Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
2
+ # Licensed under the MIT License - see LICENSE file
3
+ # Part of SuperLocalMemory V3
4
+
5
+ """TurboQuant embedding quantization (ICLR 2026).
6
+
7
+ Per-coordinate Lloyd-Max scalar quantization after random orthogonal rotation.
8
+ D_mse <= sqrt(3*pi/2) / 4^b. No scipy (HR-SCIPY-01). 2-byte "TQ" prefix on
9
+ all BLOBs (HR-MIG-02). Bit-widths: 2, 4, 8 only (HR-3BIT-01).
10
+
11
+ References: TurboQuant (arXiv 2504.19874), PolarQuant (arXiv 2502.02617).
12
+ Part of Qualixar | Author: Varun Pratap Bhardwaj | License: MIT
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import logging
18
+ import math
19
+ import shutil
20
+ from dataclasses import dataclass
21
+ from pathlib import Path
22
+
23
+ import numpy as np
24
+ from numpy.typing import NDArray
25
+
26
+ from superlocalmemory.core.config import PolarQuantConfig
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+ TQ_MAGIC = b"\x54\x51" # 2-byte prefix for TurboQuant BLOBs (HR-MIG-02)
31
+ SUPPORTED_BIT_WIDTHS: frozenset[int] = frozenset({2, 4, 8})
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # Data types
35
+ # ---------------------------------------------------------------------------
36
+
37
+
38
@dataclass(frozen=True)
class TurboQuantResult:
    """Immutable container for one TurboQuant-encoded embedding."""

    # Norm of the rotated vector; the encoder rounds it to float16 precision
    # before storing it here as a plain Python float.
    radius: float
    # TQ_MAGIC prefix followed by the bit-packed codebook indices.
    indices: bytes
    # Number of bits used per coordinate index.
    bit_width: int
45
+
46
+
47
+ # ---------------------------------------------------------------------------
48
+ # Lloyd-Max codebook (HR-SCIPY-01: math.erf + math.exp only)
49
+ # ---------------------------------------------------------------------------
50
+
51
# Normalisation constants, evaluated once at import time.
_SQRT_2PI = math.sqrt(2.0 * math.pi)
_SQRT_2 = math.sqrt(2.0)


def _std_normal_pdf(x: float) -> float:
    """Return the standard normal density at ``x``."""
    exponent = -0.5 * x * x
    return math.exp(exponent) / _SQRT_2PI


def _std_normal_cdf(x: float) -> float:
    """Return the standard normal cumulative probability at ``x`` (via erf)."""
    erf_term = math.erf(x / _SQRT_2)
    return 0.5 * (1.0 + erf_term)
61
+
62
+
63
def _compute_lloyd_max_gaussian(
    sigma: float, n_levels: int, max_iter: int = 100, tol: float = 1e-10,
) -> NDArray:
    """Build a Lloyd-Max codebook for N(0, sigma^2). Deterministic (HR-CB-01).

    Starts from a uniform grid on [-5*sigma, 5*sigma] and alternates two
    steps: move each centroid to the conditional mean of its cell, then move
    each interior boundary to the midpoint of its neighbouring centroids.

    Args:
        sigma: Standard deviation of the source distribution.
        n_levels: Number of reconstruction levels.
        max_iter: Upper bound on refinement sweeps.
        tol: Stop once no centroid moves by this much in a sweep.

    Returns:
        The ``n_levels`` centroids in ascending order.
    """
    edges = np.linspace(-5.0 * sigma, 5.0 * sigma, n_levels + 1)
    levels = 0.5 * (edges[:-1] + edges[1:])

    for _ in range(max_iter):
        previous = levels.copy()

        # Centroid step: conditional mean of the Gaussian on each cell.
        for k, (lower_edge, upper_edge) in enumerate(zip(edges[:-1], edges[1:])):
            lower = float(lower_edge) / sigma
            upper = float(upper_edge) / sigma
            mass = _std_normal_cdf(upper) - _std_normal_cdf(lower)
            # Cells with (numerically) no probability mass keep their centroid.
            if mass > 1e-15:
                levels[k] = sigma * (_std_normal_pdf(lower) - _std_normal_pdf(upper)) / mass

        # Boundary step: interior edges only; the outer two stay at +/-5 sigma.
        edges[1:-1] = 0.5 * (levels[:-1] + levels[1:])

        if float(np.abs(levels - previous).max()) < tol:
            break

    return np.sort(levels)
87
+
88
+
89
+ # ---------------------------------------------------------------------------
90
+ # Bit packing
91
+ # ---------------------------------------------------------------------------
92
+
93
+
94
def _pack_8bit(indices: NDArray) -> bytes:
    """Serialize indices one per byte after casting them to uint8."""
    as_uint8 = indices.astype(np.uint8)
    return as_uint8.tobytes()
96
+
97
+
98
def _unpack_8bit(data: bytes, length: int) -> NDArray:
    """Return the first ``length`` bytes of ``data`` as a writable uint8 array."""
    view = np.frombuffer(data, dtype=np.uint8)
    # np.array copies, so the result does not alias the read-only buffer.
    return np.array(view[:length])
100
+
101
+
102
def _pack_4bit(indices: NDArray) -> bytes:
    """Pack indices two per byte, first index in the high nibble.

    Values are clipped to 0..15; an odd-length input is padded with a zero.
    """
    count = len(indices)
    nibbles = np.zeros(count + count % 2, dtype=np.uint8)
    nibbles[:count] = np.clip(indices, 0, 15)
    high, low = nibbles[0::2], nibbles[1::2]
    return np.bitwise_or(np.left_shift(high, 4), low).tobytes()
107
+
108
+
109
def _unpack_4bit(data: bytes, length: int) -> NDArray:
    """Split each byte into (high, low) nibbles and keep ``length`` values."""
    packed = np.frombuffer(data, dtype=np.uint8)
    # Column 0 holds the high nibbles, column 1 the low ones; flattening the
    # rows restores the original interleaved order.
    pairs = np.stack((packed >> 4, packed & 0x0F), axis=1)
    return pairs.ravel()[:length]
115
+
116
+
117
def _pack_2bit(indices: NDArray) -> bytes:
    """Pack indices four per byte, first index in the two highest bits.

    Values are clipped to 0..3; the input is zero-padded to a multiple of 4.
    """
    count = len(indices)
    padded = np.zeros(count + (-count) % 4, dtype=np.uint8)
    padded[:count] = np.clip(indices, 0, 3)
    # One row per output byte, one column per 2-bit slot.
    quads = padded.reshape(padded.size // 4, 4)
    packed = (quads[:, 0] << 6) | (quads[:, 1] << 4) | (quads[:, 2] << 2) | quads[:, 3]
    return packed.tobytes()
125
+
126
+
127
def _unpack_2bit(data: bytes, length: int) -> NDArray:
    """Split each byte into four 2-bit values (highest bits first)."""
    packed = np.frombuffer(data, dtype=np.uint8)
    shifts = np.array([6, 4, 2, 0], dtype=np.uint8)
    # Broadcasting yields one row per byte and one column per 2-bit slot.
    crumbs = (packed[:, np.newaxis] >> shifts) & 0x03
    return crumbs.ravel()[:length]
135
+
136
+
137
+ _PACKERS: dict[int, tuple] = {
138
+ 8: (_pack_8bit, _unpack_8bit),
139
+ 4: (_pack_4bit, _unpack_4bit),
140
+ 2: (_pack_2bit, _unpack_2bit),
141
+ }
142
+
143
+ # ---------------------------------------------------------------------------
144
+ # TurboQuantEncoder
145
+ # ---------------------------------------------------------------------------
146
+
147
+
148
class TurboQuantEncoder:
    """Per-coordinate Lloyd-Max quantizer with random rotation.

    HR-ROT-01: Same rotation matrix for encode/decode.
    HR-CB-02: Codebooks computed ONCE at __init__.
    HR-SCIPY-01: No scipy dependency.
    """

    __slots__ = ("_config", "_d", "_S", "_codebooks")

    def __init__(self, config: PolarQuantConfig) -> None:
        """Load (or create) the rotation matrix and pre-compute the codebooks.

        Args:
            config: Supplies ``dimension``, ``rotation_matrix_path`` and ``seed``.
        """
        self._config = config
        self._d = config.dimension
        self._S = self._load_or_create_rotation_matrix()
        self._codebooks = self._compute_codebooks()

    def _load_or_create_rotation_matrix(self) -> NDArray:
        """Load/create rotation matrix with copy-on-detect (AUDIT C4-MED-01).

        Resolution order:
          1. the turbo matrix file, if it exists with shape ``(d, d)``;
          2. an existing polar matrix, which is copied to the turbo path;
          3. a freshly generated matrix, seeded from the config and saved.

        Returns:
            The ``(d, d)`` rotation matrix.
        """
        d = self._d
        slm_dir = Path.home() / ".superlocalmemory"

        turbo_path_str = self._config.rotation_matrix_path
        if not turbo_path_str:
            turbo_path_str = str(slm_dir / f"turbo_rotation_{d}.npy")
        turbo_path = Path(turbo_path_str)

        if turbo_path.exists():
            try:
                S = np.load(str(turbo_path))
                if S.shape == (d, d):
                    return S
                logger.warning("Turbo rotation shape %s != (%d,%d)", S.shape, d, d)
            except Exception as exc:
                logger.warning("Corrupt turbo rotation: %s", exc)

        # Copy-on-detect: reuse existing polar rotation matrix
        polar_path = slm_dir / f"polar_rotation_{d}.npy"
        if polar_path.exists() and not turbo_path.exists():
            try:
                S = np.load(str(polar_path))
                if S.shape == (d, d):
                    turbo_path.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(str(polar_path), str(turbo_path))
                    logger.info("Copied polar rotation matrix for TurboQuant compatibility")
                    return S
            except Exception as exc:
                logger.warning("Could not copy polar rotation: %s", exc)

        # Generate new via Mezzadri-corrected QR: fixing the signs of R's
        # diagonal removes the sign ambiguity of the QR factorisation.
        rng = np.random.default_rng(self._config.seed)
        H = rng.standard_normal((d, d))
        Q, R = np.linalg.qr(H)
        S = Q @ np.diag(np.sign(np.diag(R)))

        turbo_path.parent.mkdir(parents=True, exist_ok=True)
        np.save(str(turbo_path), S)
        logger.info("Generated TurboQuant rotation (%d x %d) at %s", d, d, turbo_path)
        return S

    def _compute_codebooks(self) -> dict[int, NDArray]:
        """Pre-compute Lloyd-Max codebooks for 2/4/8-bit.

        Returns:
            Mapping of bit width to its ascending array of ``2**bw`` centroids,
            built for a Gaussian with sigma = 1/sqrt(d).
        """
        sigma = 1.0 / math.sqrt(self._d)
        codebooks: dict[int, NDArray] = {}
        for bw in sorted(SUPPORTED_BIT_WIDTHS):
            centroids = _compute_lloyd_max_gaussian(sigma, 2 ** bw)
            # Internal invariants: encode() relies on a sorted, full codebook.
            assert len(centroids) == 2 ** bw
            assert np.all(centroids[1:] >= centroids[:-1])
            codebooks[bw] = centroids
        return codebooks

    def encode(self, embedding: NDArray, bit_width: int = 4) -> TurboQuantResult:
        """Encode embedding. HR-ENC-01: pure. HR-ENC-02: radius=float16.

        Args:
            embedding: 1-D vector of shape ``(d,)``.
            bit_width: 2, 4, or 8.

        Returns:
            TurboQuantResult whose ``indices`` start with ``TQ_MAGIC``.

        Raises:
            ValueError: Unsupported bit_width or shape mismatch.
        """
        if bit_width not in SUPPORTED_BIT_WIDTHS:
            raise ValueError(f"bit_width must be 2, 4, or 8, got {bit_width}")
        if embedding.shape != (self._d,):
            raise ValueError(f"shape mismatch: expected ({self._d},), got {embedding.shape}")

        y = self._S @ embedding
        r = float(np.linalg.norm(y))

        if r < 1e-12:
            # Degenerate zero vector: all-zero indices and radius 0.
            pack_fn, _ = _PACKERS[bit_width]
            packed = TQ_MAGIC + pack_fn(np.zeros(self._d, dtype=np.uint8))
            return TurboQuantResult(radius=0.0, indices=packed, bit_width=bit_width)

        y_unit = y / r
        centroids = self._codebooks[bit_width]
        # Nearest centroid: searchsorted gives the right-hand neighbour, then
        # pick whichever of (left, right) is closer to each coordinate.
        idx = np.searchsorted(centroids, y_unit)
        idx = np.clip(idx, 0, len(centroids) - 1)
        left = np.clip(idx - 1, 0, len(centroids) - 1)
        use_left = np.abs(y_unit - centroids[left]) < np.abs(y_unit - centroids[idx])
        idx = np.where(use_left, left, idx).astype(np.uint8)

        pack_fn, _ = _PACKERS[bit_width]
        packed = TQ_MAGIC + pack_fn(idx)

        return TurboQuantResult(
            radius=float(np.float16(r)), indices=packed, bit_width=bit_width,
        )

    def _is_turbo_blob(self, blob: bytes, bit_width: int) -> bool:
        """Decide whether ``blob`` is a TurboQuant BLOB rather than legacy polar.

        The 2-byte magic alone is ambiguous: a legacy BLOB may start with the
        same two bytes by chance. encode() always emits exactly
        ``len(TQ_MAGIC) + ceil(d * bit_width / 8)`` bytes, so the length is
        checked as well.
        """
        if blob[:len(TQ_MAGIC)] != TQ_MAGIC:
            return False
        if bit_width not in SUPPORTED_BIT_WIDTHS:
            # Keep the previous behaviour: the turbo path reports the
            # unsupported width itself.
            return True
        expected_len = len(TQ_MAGIC) + (self._d * bit_width + 7) // 8
        return len(blob) == expected_len

    def decode(self, result: TurboQuantResult) -> NDArray:
        """Decode with format detection: TQ prefix -> turbo, else -> legacy polar.

        A BLOB is treated as TurboQuant only when it carries the TQ prefix AND
        has the exact TurboQuant length for its bit width; anything else is
        decoded as a legacy PolarQuant BLOB.

        Args:
            result: Encoded embedding (TurboQuant or legacy polar payload).

        Returns:
            Reconstructed vector of dimension ``d``.
        """
        blob = result.indices

        if not self._is_turbo_blob(blob, result.bit_width):
            return self._decode_legacy_polar(result)

        data = blob[len(TQ_MAGIC):]
        _, unpack_fn = _PACKERS[result.bit_width]
        indices = unpack_fn(data, self._d)
        centroids = self._codebooks[result.bit_width]
        y_unit_approx = centroids[np.clip(indices, 0, len(centroids) - 1)]
        # S is orthogonal, so its transpose undoes the rotation.
        return self._S.T @ (y_unit_approx * result.radius)

    def _decode_legacy_polar(self, result: TurboQuantResult) -> NDArray:
        """Decode legacy PolarQuant BLOB (no TQ prefix) for SLM <= 3.3.6."""
        # Imported lazily: polar_quant itself imports this module.
        from superlocalmemory.math.polar_quant import PolarQuantEncoder, _polar_to_cartesian

        n_angles = self._d - 1
        if result.bit_width == 8:
            indices = np.frombuffer(result.indices, dtype=np.uint8).copy()
        elif result.bit_width == 4:
            indices = PolarQuantEncoder.unpack_4bit(result.indices, n_angles)
        else:
            indices = PolarQuantEncoder.unpack_2bit(result.indices, n_angles)

        # Uniform codebook on [0, pi]: centroids are the cell midpoints.
        levels = 2 ** result.bit_width
        boundaries = np.linspace(0.0, math.pi, levels + 1)
        centroids = (boundaries[:-1] + boundaries[1:]) / 2.0
        angles = centroids[np.clip(indices, 0, len(centroids) - 1)]

        v_unit = _polar_to_cartesian(angles, self._d)
        return self._S.T @ (v_unit * result.radius)

    def approximate_similarity(self, query: NDArray, result: TurboQuantResult) -> float:
        """Cosine similarity via decode. Returns 0.0 on degenerate inputs."""
        decoded = self.decode(result)
        denom = np.linalg.norm(query) * np.linalg.norm(decoded)
        if denom < 1e-12:
            return 0.0
        sim = float(np.dot(query, decoded) / denom)
        return 0.0 if (math.isnan(sim) or math.isinf(sim)) else sim

    # Static pack/unpack (backward compat with PolarQuantEncoder API)

    @staticmethod
    def pack_4bit(indices: NDArray) -> bytes:
        """Pack indices two per byte (delegates to the module-level helper)."""
        return _pack_4bit(indices)

    @staticmethod
    def unpack_4bit(data: bytes, length: int) -> NDArray:
        """Unpack ``length`` 4-bit indices from ``data``."""
        return _unpack_4bit(data, length)

    @staticmethod
    def pack_2bit(indices: NDArray) -> bytes:
        """Pack indices four per byte (delegates to the module-level helper)."""
        return _pack_2bit(indices)

    @staticmethod
    def unpack_2bit(data: bytes, length: int) -> NDArray:
        """Unpack ``length`` 2-bit indices from ``data``."""
        return _unpack_2bit(data, length)
@@ -16,6 +16,7 @@ License: MIT
16
16
 
17
17
  from __future__ import annotations
18
18
 
19
+ import atexit
19
20
  import json
20
21
  import logging
21
22
  import os
@@ -23,10 +24,14 @@ import subprocess
23
24
  import sys
24
25
  import threading
25
26
  import time
27
+ import weakref
26
28
  from typing import Any
27
29
 
28
30
  from superlocalmemory.storage.models import AtomicFact
29
31
 
32
+ # Track all live reranker instances for atexit cleanup
33
+ _live_rerankers: set[weakref.ref] = set()
34
+
30
35
  logger = logging.getLogger(__name__)
31
36
 
32
37
  _IDLE_TIMEOUT_SECONDS = 120 # 2 min → kill worker
@@ -64,11 +69,22 @@ class CrossEncoderReranker:
64
69
  self._idle_timer: threading.Timer | None = None
65
70
  self._request_count: int = 0
66
71
 
72
+ # Register for atexit cleanup (prevent orphaned workers)
73
+ ref = weakref.ref(self, _live_rerankers.discard)
74
+ _live_rerankers.add(ref)
75
+
67
76
  # Start background warmup immediately — worker loads model
68
77
  # while the rest of init continues. First recall gets instant
69
78
  # fallback; second recall uses the warm model.
70
79
  self._start_background_warmup()
71
80
 
81
+ def __del__(self) -> None:
82
+ """Kill worker subprocess when reranker is garbage-collected."""
83
+ try:
84
+ self._kill_worker()
85
+ except Exception:
86
+ pass
87
+
72
88
  # ------------------------------------------------------------------
73
89
  # Background warmup (non-blocking model load)
74
90
  # ------------------------------------------------------------------
@@ -330,3 +346,26 @@ class CrossEncoderReranker:
330
346
  if resp is None:
331
347
  return False
332
348
  return resp.get("ok", False)
349
+
350
+
351
+ # ---------------------------------------------------------------------------
352
+ # Module-level atexit: kill ALL reranker workers on process exit
353
+ # ---------------------------------------------------------------------------
354
+
355
def _cleanup_all_rerankers() -> None:
    """Terminate every tracked reranker worker when the interpreter exits.

    Walks the weak references in ``_live_rerankers`` and asks each reranker
    that is still alive to kill its worker subprocess, so that 1.3 GB
    ONNX/PyTorch workers do not outlive the parent (especially during test
    runs with parallel agents). Failures are ignored: this is best-effort
    cleanup at shutdown.
    """
    # Iterate over a snapshot: weakref callbacks may discard entries meanwhile.
    snapshot = tuple(_live_rerankers)
    for weak in snapshot:
        instance = weak()
        if instance is None:
            continue
        try:
            instance._kill_worker()
        except Exception:
            pass
    _live_rerankers.clear()
369
+
370
+
371
+ atexit.register(_cleanup_all_rerankers)