npm - superlocalmemory - Versions diffs - 3.3.6 → 3.3.8 - Mend

superlocalmemory 3.3.6 → 3.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/README.md +2 -1
package/package.json +1 -1
package/pyproject.toml +1 -1
package/src/superlocalmemory/core/config.py +2 -1
package/src/superlocalmemory/core/embedding_worker.py +27 -1
package/src/superlocalmemory/core/embeddings.py +39 -0
package/src/superlocalmemory/core/recall_worker.py +26 -0
package/src/superlocalmemory/math/polar_quant.py +57 -23
package/src/superlocalmemory/math/turbo_quant.py +308 -0
package/src/superlocalmemory/retrieval/reranker.py +39 -0

package/README.md CHANGED Viewed

@@ -3,7 +3,8 @@
 </p>
 <h1 align="center">SuperLocalMemory V3.3</h1>
-<p align="center"><strong>The first local-only AI memory to break 74% retrieval on LoCoMo.<br/>No cloud. No APIs. No data leaves your machine.</strong></p>
+<p align="center"><strong>Every other AI forgets. Yours won't.</strong><br/><em>Infinite memory for Claude Code, Cursor, Windsurf & 17+ AI tools.</em></p>
+<p align="center"><code>v3.3.8</code> — Install once. Every session remembers the last. Automatically.</p>
 <p align="center">
   <code>+16pp vs Mem0 (zero cloud)</code> &nbsp;·&nbsp; <code>85% Open-Domain (best of any system)</code> &nbsp;·&nbsp; <code>EU AI Act Ready</code>

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "superlocalmemory",
-  "version": "3.3.6",
+  "version": "3.3.8",
   "description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
   "keywords": [
     "ai-memory",

package/pyproject.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "superlocalmemory"
-version = "3.3.6"
+version = "3.3.8"
 description = "Information-geometric agent memory with mathematical guarantees"
 readme = "README.md"
 license = {text = "MIT"}

package/src/superlocalmemory/core/config.py CHANGED Viewed

@@ -310,6 +310,7 @@ class PolarQuantConfig:
     dimension: int = 768
     rotation_matrix_path: str = ""  # empty = ~/.superlocalmemory/polar_rotation.npy
     seed: int = 42                  # reproducible rotation matrix
+    codebook_method: str = "turbo"  # "turbo" (default) or "polar_legacy"
 @dataclass(frozen=True)
@@ -338,7 +339,7 @@ class QuantizationConfig:
     eap_enabled: bool = True
     keep_float32_backup: bool = True
     auto_compact_interval_hours: int = 6
-    polar_search_penalty: float = 0.95
+    polar_search_penalty: float = 0.97  # V3.3.8: 0.95→0.97, TurboQuant has lower MSE
 @dataclass(frozen=True)

package/src/superlocalmemory/core/embedding_worker.py CHANGED Viewed

@@ -23,9 +23,10 @@ Part of Qualixar | Author: Varun Pratap Bhardwaj
 from __future__ import annotations
 import json
+import os
 import signal
 import sys
-import os
+import threading
 # Force CPU BEFORE any torch import
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
@@ -41,8 +42,33 @@ if sys.platform != "win32":
     signal.signal(signal.SIGTERM, lambda *_: sys.exit(0))
+def _start_parent_watchdog() -> None:
+    """Monitor parent process — self-terminate if parent dies.
+    Prevents orphaned workers that consume 500-800 MB each when the parent
+    process crashes, is killed, or exits without cleanup.
+    V3.3.7: Added after incident where orphaned workers consumed 33 GB.
+    The liveness probe is platform specific. On Windows ``os.kill(pid, 0)``
+    terminates the target instead of probing it, so a process handle is
+    polled there. On POSIX the parent PID is re-read and signal 0 is sent.
+    If no probe can be set up, the watchdog thread is not started.
+    """
+    import time
+    parent_pid = os.getppid()
+    if sys.platform == "win32":
+        import ctypes
+        from ctypes import wintypes
+        kernel32 = ctypes.WinDLL("kernel32", use_last_error=True)
+        kernel32.OpenProcess.argtypes = (wintypes.DWORD, wintypes.BOOL, wintypes.DWORD)
+        kernel32.OpenProcess.restype = wintypes.HANDLE
+        kernel32.WaitForSingleObject.argtypes = (wintypes.HANDLE, wintypes.DWORD)
+        kernel32.WaitForSingleObject.restype = wintypes.DWORD
+        # SYNCHRONIZE (0x00100000) is the access right needed to wait on a
+        # process handle.
+        handle = kernel32.OpenProcess(0x00100000, False, parent_pid)
+        if not handle:
+            # Parent cannot be observed; never fall back to os.kill on Windows.
+            return
+        def _parent_gone() -> bool:
+            # WAIT_OBJECT_0 (0): the process handle is signalled, i.e. it exited.
+            return kernel32.WaitForSingleObject(handle, 0) == 0
+    else:
+        def _parent_gone() -> bool:
+            # An orphaned child is re-parented, so its parent PID changes.
+            if os.getppid() != parent_pid:
+                return True
+            try:
+                os.kill(parent_pid, 0)
+            except ProcessLookupError:
+                return True
+            except OSError:
+                # e.g. EPERM: the process exists but may not be signalled.
+                return False
+            return False
+    def _watch() -> None:
+        while True:
+            time.sleep(5)
+            if _parent_gone():
+                # os._exit skips cleanup handlers; the worker must just vanish.
+                os._exit(0)
+    t = threading.Thread(target=_watch, daemon=True, name="parent-watchdog")
+    t.start()
 def _worker_main() -> None:
     """Main loop: read JSON requests from stdin, write responses to stdout."""
+    _start_parent_watchdog()  # V3.3.7: self-terminate if parent dies
     import numpy as np
     model = None

package/src/superlocalmemory/core/embeddings.py CHANGED Viewed

@@ -15,6 +15,7 @@ Part of Qualixar | Author: Varun Pratap Bhardwaj
 from __future__ import annotations
+import atexit
 import json
 import logging
 import os
@@ -22,11 +23,15 @@ import subprocess
 import sys
 import threading
 import time
+import weakref
 from pathlib import Path
 from typing import TYPE_CHECKING
 import numpy as np
+# Track all live embedding services for atexit cleanup
+_live_embedding_services: set[weakref.ref] = set()
 if TYPE_CHECKING:
     from numpy.typing import NDArray
@@ -69,6 +74,17 @@ class EmbeddingService:
         self._worker_ready = False
         self._request_count: int = 0
+        # Register for atexit cleanup (prevent orphaned workers)
+        ref = weakref.ref(self, _live_embedding_services.discard)
+        _live_embedding_services.add(ref)
+    def __del__(self) -> None:
+        """Best-effort finalizer: stop the worker subprocess of this service.
+        Errors are swallowed deliberately, since a finalizer must not raise.
+        """
+        try:
+            self._kill_worker()
+        except Exception:
+            # Nothing useful can be done about a failed kill at this point.
+            return
     @property
     def is_available(self) -> bool:
         """Check if embedding service can produce embeddings."""
@@ -338,3 +354,26 @@ class EmbeddingService:
             raise DimensionMismatchError(
                 f"Embedding dimension {actual} != expected {self._config.dimension}"
             )
+# ---------------------------------------------------------------------------
+# Module-level atexit: kill ALL embedding workers on process exit
+# ---------------------------------------------------------------------------
+def _cleanup_all_embedding_services() -> None:
+    """Kill every tracked embedding worker subprocess at interpreter exit.
+    Prevents orphaned 500-800 MB sentence-transformer workers surviving
+    after parent exits (especially during test runs with parallel agents).
+    Failures of individual services are ignored so one bad worker cannot
+    stop the others from being cleaned up.
+    """
+    # Iterate over a snapshot: the registry is a set of weak references whose
+    # callbacks remove entries, so it can change size while we walk it.
+    snapshot = tuple(_live_embedding_services)
+    for weak in snapshot:
+        service = weak()
+        if service is None:
+            continue
+        try:
+            service._kill_worker()
+        except Exception:
+            pass
+    _live_embedding_services.clear()
+atexit.register(_cleanup_all_embedding_services)

package/src/superlocalmemory/core/recall_worker.py CHANGED Viewed

@@ -20,6 +20,7 @@ import json
 import os
 import signal
 import sys
+import threading
 # Force CPU BEFORE any torch import
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
@@ -34,6 +35,29 @@ os.environ["TORCH_DEVICE"] = "cpu"
 if sys.platform != "win32":
     signal.signal(signal.SIGTERM, lambda *_: sys.exit(0))
+def _start_parent_watchdog() -> None:
+    """Monitor parent process — self-terminate if parent dies.
+    Prevents orphaned workers that consume 500+ MB each when the parent
+    process crashes, is killed, or exits without cleanup.
+    V3.3.7: Added after incident where orphaned workers consumed 33 GB.
+    The liveness probe is platform specific. On Windows ``os.kill(pid, 0)``
+    terminates the target instead of probing it, so a process handle is
+    polled there. On POSIX the parent PID is re-read and signal 0 is sent.
+    If no probe can be set up, the watchdog thread is not started.
+    """
+    import time
+    parent_pid = os.getppid()
+    if sys.platform == "win32":
+        import ctypes
+        from ctypes import wintypes
+        kernel32 = ctypes.WinDLL("kernel32", use_last_error=True)
+        kernel32.OpenProcess.argtypes = (wintypes.DWORD, wintypes.BOOL, wintypes.DWORD)
+        kernel32.OpenProcess.restype = wintypes.HANDLE
+        kernel32.WaitForSingleObject.argtypes = (wintypes.HANDLE, wintypes.DWORD)
+        kernel32.WaitForSingleObject.restype = wintypes.DWORD
+        # SYNCHRONIZE (0x00100000) is the access right needed to wait on a
+        # process handle.
+        handle = kernel32.OpenProcess(0x00100000, False, parent_pid)
+        if not handle:
+            # Parent cannot be observed; never fall back to os.kill on Windows.
+            return
+        def _parent_gone() -> bool:
+            # WAIT_OBJECT_0 (0): the process handle is signalled, i.e. it exited.
+            return kernel32.WaitForSingleObject(handle, 0) == 0
+    else:
+        def _parent_gone() -> bool:
+            # An orphaned child is re-parented, so its parent PID changes.
+            if os.getppid() != parent_pid:
+                return True
+            try:
+                os.kill(parent_pid, 0)
+            except ProcessLookupError:
+                return True
+            except OSError:
+                # e.g. EPERM: the process exists but may not be signalled.
+                return False
+            return False
+    def _watch() -> None:
+        while True:
+            time.sleep(5)
+            if _parent_gone():
+                # os._exit skips cleanup handlers; the worker must just vanish.
+                os._exit(0)
+    t = threading.Thread(target=_watch, daemon=True, name="parent-watchdog")
+    t.start()
 _engine = None
@@ -209,6 +233,8 @@ def _handle_status() -> dict:
 def _worker_main() -> None:
     """Main loop: read JSON requests from stdin, write responses to stdout."""
+    _start_parent_watchdog()  # V3.3.7: self-terminate if parent dies
     for line in sys.stdin:
         line = line.strip()
         if not line:

package/src/superlocalmemory/math/polar_quant.py CHANGED Viewed

@@ -83,13 +83,23 @@ class PolarQuantEncoder:
     HR-09: Angle indices as uint8, packed into bytes.
     """
-    __slots__ = ("_config", "_d", "_S", "_codebooks")
+    __slots__ = ("_config", "_d", "_S", "_codebooks", "_turbo", "_use_turbo")
     def __init__(self, config: PolarQuantConfig) -> None:
         self._config = config
         self._d = config.dimension
-        self._S = self._load_or_create_rotation_matrix()
-        self._codebooks = self._generate_uniform_codebooks()
+        codebook_method = getattr(config, "codebook_method", "turbo")
+        if codebook_method == "turbo":
+            from superlocalmemory.math.turbo_quant import TurboQuantEncoder
+            self._turbo = TurboQuantEncoder(config)
+            self._S = self._turbo._S
+            self._codebooks = self._generate_uniform_codebooks()  # for legacy decode
+            self._use_turbo = True
+        else:
+            self._turbo = None
+            self._S = self._load_or_create_rotation_matrix()
+            self._codebooks = self._generate_uniform_codebooks()
+            self._use_turbo = False
     # -- Rotation matrix (HR-01, HR-02) ------------------------------------
@@ -156,14 +166,14 @@ class PolarQuantEncoder:
     # -- Encode ------------------------------------------------------------
     def encode(self, embedding: NDArray, bit_width: int = 4) -> QuantizedEmbedding:
-        """Encode a float32 embedding into quantized polar representation.
+        """Encode a float32 embedding into quantized representation.
         Args:
             embedding: 1-D float vector of dimension self._d.
             bit_width:  2, 4, or 8.
         Returns:
-            QuantizedEmbedding with packed angle indices.
+            QuantizedEmbedding with packed indices.
         Raises:
             ValueError: Invalid bit_width or dimension mismatch.
@@ -177,13 +187,25 @@ class PolarQuantEncoder:
                 f"shape mismatch: expected ({self._d},), got {embedding.shape}"
             )
-        # Step 1: Random rotation
-        v_rot = self._S @ embedding
+        # V3.3.8: TurboQuant path (default)
+        if self._use_turbo:
+            result = self._turbo.encode(embedding, bit_width)
+            return QuantizedEmbedding(
+                fact_id="",
+                radius=result.radius,
+                angle_indices=result.indices,
+                bit_width=result.bit_width,
+                qjl_bits=None,
+            )
-        # Step 2: Compute radius
+        # Legacy PolarQuant path
+        return self._encode_polar(embedding, bit_width)
+    def _encode_polar(self, embedding: NDArray, bit_width: int) -> QuantizedEmbedding:
+        """Legacy PolarQuant encode (polar coordinate transform)."""
+        v_rot = self._S @ embedding
         r = float(np.linalg.norm(v_rot))
-        # Degenerate zero vector
         if r < 1e-12:
             zero_angles = np.zeros(self._d - 1, dtype=np.uint8)
             if bit_width == 8:
@@ -200,17 +222,11 @@ class PolarQuantEncoder:
                 qjl_bits=None,
             )
-        # Step 3: Normalize
         v_unit = v_rot / r
-        # Step 4: Cartesian to polar angles
         angles = _cartesian_to_polar_angles(v_unit)
-        # Step 5: Quantize angles using codebook
         cb = self._codebooks[bit_width]
         indices = np.digitize(angles, cb["boundaries"][1:-1]).astype(np.uint8)
-        # Step 6: Pack into bytes
         if bit_width == 8:
             packed = indices.tobytes()
         elif bit_width == 4:
@@ -228,18 +244,43 @@ class PolarQuantEncoder:
     # -- Decode ------------------------------------------------------------
+    # TQ magic prefix for format detection (HR-MIG-02)
+    _TQ_MAGIC = b"\x54\x51"
     def decode(self, qe: QuantizedEmbedding) -> NDArray:
         """Decode a QuantizedEmbedding back to float64 vector.
+        V3.3.8: Detects "TQ" prefix (0x54, 0x51) to route between
+        TurboQuant and legacy PolarQuant decode paths.
         Args:
             qe: Quantized embedding produced by encode().
         Returns:
             Reconstructed vector of dimension self._d.
         """
+        # Format detection: TQ prefix = TurboQuant, else legacy polar
+        if qe.angle_indices[:2] == self._TQ_MAGIC:
+            return self._decode_turbo(qe)
+        return self._decode_polar(qe)
+    def _decode_turbo(self, qe: QuantizedEmbedding) -> NDArray:
+        """Decode a TurboQuant-encoded BLOB (one carrying the TQ prefix).
+        The TurboQuant encoder is created lazily when this instance does not
+        hold one yet; the stored fields are then re-wrapped as a
+        TurboQuantResult and handed to that encoder.
+        """
+        from superlocalmemory.math.turbo_quant import TurboQuantEncoder, TurboQuantResult
+        encoder = self._turbo
+        if encoder is None:
+            encoder = TurboQuantEncoder(self._config)
+            self._turbo = encoder  # cache for subsequent decodes
+        wrapped = TurboQuantResult(
+            radius=qe.radius,
+            indices=qe.angle_indices,
+            bit_width=qe.bit_width,
+        )
+        return encoder.decode(wrapped)
+    def _decode_polar(self, qe: QuantizedEmbedding) -> NDArray:
+        """Decode legacy PolarQuant BLOB (no TQ prefix)."""
         n_angles = self._d - 1
-        # Step 1: Unpack angle indices
         if qe.bit_width == 8:
             indices = np.frombuffer(qe.angle_indices, dtype=np.uint8).copy()
         elif qe.bit_width == 4:
@@ -247,19 +288,12 @@ class PolarQuantEncoder:
         else:
             indices = self.unpack_2bit(qe.angle_indices, n_angles)
-        # Step 2: Dequantize -- map indices to centroid angles
         centroids = self._codebooks[qe.bit_width]["centroids"]
-        # Clip indices to valid range
         indices = np.clip(indices, 0, len(centroids) - 1)
         angles = centroids[indices]
-        # Step 3: Polar to Cartesian
         v_unit = _polar_to_cartesian(angles, self._d)
-        # Step 4: Scale by radius
         v_rot = v_unit * qe.radius
-        # Step 5: Inverse rotation (S is orthogonal, so S^T = S^{-1})
         v_orig = self._S.T @ v_rot
         return v_orig

package/src/superlocalmemory/math/turbo_quant.py ADDED Viewed

@@ -0,0 +1,308 @@
+# Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
+# Licensed under the MIT License - see LICENSE file
+# Part of SuperLocalMemory V3
+"""TurboQuant embedding quantization (ICLR 2026).
+Per-coordinate Lloyd-Max scalar quantization after random orthogonal rotation.
+D_mse <= sqrt(3*pi/2) / 4^b. No scipy (HR-SCIPY-01). 2-byte "TQ" prefix on
+all BLOBs (HR-MIG-02). Bit-widths: 2, 4, 8 only (HR-3BIT-01).
+References: TurboQuant (arXiv 2504.19874), PolarQuant (arXiv 2502.02617).
+Part of Qualixar | Author: Varun Pratap Bhardwaj | License: MIT
+"""
+from __future__ import annotations
+import logging
+import math
+import shutil
+from dataclasses import dataclass
+from pathlib import Path
+import numpy as np
+from numpy.typing import NDArray
+from superlocalmemory.core.config import PolarQuantConfig
+logger = logging.getLogger(__name__)
+TQ_MAGIC = b"\x54\x51"  # 2-byte prefix for TurboQuant BLOBs (HR-MIG-02)
+SUPPORTED_BIT_WIDTHS: frozenset[int] = frozenset({2, 4, 8})
+# ---------------------------------------------------------------------------
+# Data types
+# ---------------------------------------------------------------------------
+@dataclass(frozen=True)
+class TurboQuantResult:
+    """Immutable TurboQuant-encoded embedding. radius=float16, indices=TQ-prefixed.
+    Produced by TurboQuantEncoder.encode() and consumed by its decode().
+    """
+    # Norm of the rotated embedding, rounded through float16 by the encoder
+    # but held as a plain Python float.
+    radius: float
+    indices: bytes  # TQ_MAGIC + packed codebook indices
+    # Bits per coordinate; the encoder accepts 2, 4 or 8.
+    bit_width: int
+# ---------------------------------------------------------------------------
+# Lloyd-Max codebook (HR-SCIPY-01: math.erf + math.exp only)
+# ---------------------------------------------------------------------------
+_SQRT_2PI = math.sqrt(2.0 * math.pi)
+_SQRT_2 = math.sqrt(2.0)
+def _std_normal_pdf(x: float) -> float:
+    """Return the density of the standard normal distribution at ``x``."""
+    exponent = -0.5 * x * x
+    return math.exp(exponent) / _SQRT_2PI
+def _std_normal_cdf(x: float) -> float:
+    """Return the standard normal cumulative distribution at ``x`` via ``math.erf``."""
+    scaled = x / _SQRT_2
+    return 0.5 * (1.0 + math.erf(scaled))
+def _compute_lloyd_max_gaussian(
+    sigma: float, n_levels: int, max_iter: int = 100, tol: float = 1e-10,
+) -> NDArray:
+    """Lloyd-Max optimal codebook for N(0, sigma^2). Deterministic (HR-CB-01).
+    Starts from a uniform grid over [-5*sigma, 5*sigma] and alternates two
+    steps until the largest centroid move drops below ``tol`` or ``max_iter``
+    rounds have run: (1) each centroid becomes the conditional mean of its
+    cell, (2) each interior boundary becomes the midpoint of its neighbours.
+    The two outermost boundaries stay at +/-5*sigma throughout.
+    Returns the centroids sorted ascending.
+    """
+    edges = np.linspace(-5.0 * sigma, 5.0 * sigma, n_levels + 1)
+    levels = 0.5 * (edges[:-1] + edges[1:])
+    for _ in range(max_iter):
+        previous = levels.copy()
+        for k in range(n_levels):
+            lower = float(edges[k]) / sigma
+            upper = float(edges[k + 1]) / sigma
+            mass = _std_normal_cdf(upper) - _std_normal_cdf(lower)
+            # Cells with (numerically) no probability mass keep their centroid.
+            if mass > 1e-15:
+                levels[k] = sigma * (_std_normal_pdf(lower) - _std_normal_pdf(upper)) / mass
+        edges[1:-1] = 0.5 * (levels[:-1] + levels[1:])
+        shift = float(np.max(np.abs(levels - previous)))
+        if shift < tol:
+            break
+    return np.sort(levels)
+# ---------------------------------------------------------------------------
+# Bit packing
+# ---------------------------------------------------------------------------
+def _pack_8bit(indices: NDArray) -> bytes:
+    """Serialise indices as one unsigned byte each."""
+    as_u8 = indices.astype(np.uint8)
+    return as_u8.tobytes()
+def _unpack_8bit(data: bytes, length: int) -> NDArray:
+    """Read up to ``length`` one-byte indices from ``data`` into a writable array."""
+    view = np.frombuffer(data, dtype=np.uint8)
+    # frombuffer yields a read-only view of the bytes; hand back a copy.
+    return view[:length].copy()
+def _pack_4bit(indices: NDArray) -> bytes:
+    """Pack indices two per byte, first index in the high nibble.
+    Values are clipped to 0..15; an odd count is padded with a zero nibble.
+    """
+    count = len(indices)
+    nibbles = np.zeros(count + (count % 2), dtype=np.uint8)
+    nibbles[:count] = np.clip(indices, 0, 15)
+    high = nibbles[0::2] << 4
+    low = nibbles[1::2]
+    return (high | low).tobytes()
+def _unpack_4bit(data: bytes, length: int) -> NDArray:
+    """Expand packed bytes into nibbles (high nibble first), keeping ``length`` values."""
+    packed = np.frombuffer(data, dtype=np.uint8)
+    nibbles = np.empty(packed.size * 2, dtype=np.uint8)
+    nibbles[0::2] = packed >> 4    # first index of each pair
+    nibbles[1::2] = packed & 0x0F  # second index of each pair
+    return nibbles[:length]
+def _pack_2bit(indices: NDArray) -> bytes:
+    """Pack indices four per byte, first index in the two highest bits.
+    Values are clipped to 0..3; the tail is zero-padded to a multiple of four.
+    """
+    count = len(indices)
+    pad = (4 - count % 4) % 4
+    crumbs = np.zeros(count + pad, dtype=np.uint8)
+    crumbs[:count] = np.clip(indices, 0, 3)
+    first = crumbs[0::4] << 6
+    second = crumbs[1::4] << 4
+    third = crumbs[2::4] << 2
+    fourth = crumbs[3::4]
+    return (first | second | third | fourth).tobytes()
+def _unpack_2bit(data: bytes, length: int) -> NDArray:
+    """Expand packed bytes into 2-bit values (highest pair first), keeping ``length`` values."""
+    packed = np.frombuffer(data, dtype=np.uint8)
+    crumbs = np.empty(packed.size * 4, dtype=np.uint8)
+    # Slot i of every byte sits (6 - 2*i) bits above the least significant bit.
+    for slot, shift in enumerate((6, 4, 2, 0)):
+        crumbs[slot::4] = (packed >> shift) & 0x03
+    return crumbs[:length]
+# Dispatch table: bit width -> (pack function, unpack function).
+# Pack functions take an index array and return bytes; unpack functions take
+# (bytes, length) and return an index array.
+_PACKERS: dict[int, tuple] = {
+    8: (_pack_8bit, _unpack_8bit),
+    4: (_pack_4bit, _unpack_4bit),
+    2: (_pack_2bit, _unpack_2bit),
+}
+# ---------------------------------------------------------------------------
+# TurboQuantEncoder
+# ---------------------------------------------------------------------------
+class TurboQuantEncoder:
+    """Per-coordinate Lloyd-Max quantizer with random rotation.
+    Encoding rotates the embedding with a persisted matrix, stores the norm
+    (rounded through float16) and snaps every coordinate of the unit vector
+    to the nearest centroid of a Gaussian Lloyd-Max codebook.
+    HR-ROT-01: Same rotation matrix for encode/decode.
+    HR-CB-02: Codebooks computed ONCE at __init__.
+    HR-SCIPY-01: No scipy dependency.
+    """
+    __slots__ = ("_config", "_d", "_S", "_codebooks")
+    def __init__(self, config: PolarQuantConfig) -> None:
+        """Load (or create) the rotation matrix and pre-compute all codebooks."""
+        self._config = config
+        self._d = config.dimension
+        self._S = self._load_or_create_rotation_matrix()
+        self._codebooks = self._compute_codebooks()
+    def _load_or_create_rotation_matrix(self) -> NDArray:
+        """Load/create rotation matrix with copy-on-detect (AUDIT C4-MED-01).
+        Resolution order:
+        1. the TurboQuant matrix file (``config.rotation_matrix_path`` or
+           ``~/.superlocalmemory/turbo_rotation_<d>.npy``) if it loads with
+           shape (d, d);
+        2. a legacy ``polar_rotation_<d>.npy``, which is copied to the
+           TurboQuant path;
+        3. a freshly generated matrix, saved to the TurboQuant path.
+        """
+        d = self._d
+        slm_dir = Path.home() / ".superlocalmemory"
+        turbo_path_str = self._config.rotation_matrix_path
+        if not turbo_path_str:
+            turbo_path_str = str(slm_dir / f"turbo_rotation_{d}.npy")
+        turbo_path = Path(turbo_path_str)
+        if turbo_path.exists():
+            try:
+                S = np.load(str(turbo_path))
+                if S.shape == (d, d):
+                    return S
+                logger.warning("Turbo rotation shape %s != (%d,%d)", S.shape, d, d)
+            except Exception as exc:
+                logger.warning("Corrupt turbo rotation: %s", exc)
+        # Copy-on-detect: reuse existing polar rotation matrix. Reaching this
+        # point means the turbo file is missing OR unusable, so the fallback
+        # must not be skipped just because an unusable turbo file exists.
+        polar_path = slm_dir / f"polar_rotation_{d}.npy"
+        if polar_path.exists():
+            try:
+                S = np.load(str(polar_path))
+                if S.shape == (d, d):
+                    turbo_path.parent.mkdir(parents=True, exist_ok=True)
+                    shutil.copy2(str(polar_path), str(turbo_path))
+                    logger.info("Copied polar rotation matrix for TurboQuant compatibility")
+                    return S
+            except Exception as exc:
+                logger.warning("Could not copy polar rotation: %s", exc)
+        # Generate new via Mezzadri-corrected QR
+        rng = np.random.default_rng(self._config.seed)
+        H = rng.standard_normal((d, d))
+        Q, R = np.linalg.qr(H)
+        S = Q @ np.diag(np.sign(np.diag(R)))
+        turbo_path.parent.mkdir(parents=True, exist_ok=True)
+        np.save(str(turbo_path), S)
+        logger.info("Generated TurboQuant rotation (%d x %d) at %s", d, d, turbo_path)
+        return S
+    def _compute_codebooks(self) -> dict[int, NDArray]:
+        """Pre-compute Lloyd-Max codebooks for 2/4/8-bit.
+        Each codebook has 2**bit_width centroids for N(0, 1/d), sorted ascending.
+        """
+        sigma = 1.0 / math.sqrt(self._d)
+        codebooks: dict[int, NDArray] = {}
+        for bw in sorted(SUPPORTED_BIT_WIDTHS):
+            centroids = _compute_lloyd_max_gaussian(sigma, 2 ** bw)
+            # Internal invariants that encode() relies on (searchsorted needs
+            # ascending centroids).
+            assert len(centroids) == 2 ** bw
+            assert np.all(centroids[1:] >= centroids[:-1])
+            codebooks[bw] = centroids
+        return codebooks
+    def encode(self, embedding: NDArray, bit_width: int = 4) -> TurboQuantResult:
+        """Encode embedding. HR-ENC-01: pure. HR-ENC-02: radius=float16.
+        Args:
+            embedding: 1-D vector of shape (d,).
+            bit_width: 2, 4, or 8.
+        Returns:
+            TurboQuantResult whose ``indices`` start with TQ_MAGIC.
+        Raises:
+            ValueError: Unsupported bit_width or shape mismatch.
+        """
+        if bit_width not in SUPPORTED_BIT_WIDTHS:
+            raise ValueError(f"bit_width must be 2, 4, or 8, got {bit_width}")
+        if embedding.shape != (self._d,):
+            raise ValueError(f"shape mismatch: expected ({self._d},), got {embedding.shape}")
+        y = self._S @ embedding
+        r = float(np.linalg.norm(y))
+        pack_fn, _ = _PACKERS[bit_width]
+        if r < 1e-12:
+            # Degenerate (near-)zero vector: no direction to quantize.
+            packed = TQ_MAGIC + pack_fn(np.zeros(self._d, dtype=np.uint8))
+            return TurboQuantResult(radius=0.0, indices=packed, bit_width=bit_width)
+        y_unit = y / r
+        centroids = self._codebooks[bit_width]
+        # Nearest centroid: searchsorted gives the right neighbour; compare it
+        # with the left neighbour and keep whichever is closer.
+        idx = np.searchsorted(centroids, y_unit)
+        idx = np.clip(idx, 0, len(centroids) - 1)
+        left = np.clip(idx - 1, 0, len(centroids) - 1)
+        use_left = np.abs(y_unit - centroids[left]) < np.abs(y_unit - centroids[idx])
+        idx = np.where(use_left, left, idx).astype(np.uint8)
+        packed = TQ_MAGIC + pack_fn(idx)
+        # Clamp before the float16 round-trip: a norm above the float16
+        # maximum would otherwise be stored as inf and poison decode().
+        f16_max = float(np.finfo(np.float16).max)
+        radius = float(np.float16(min(r, f16_max)))
+        return TurboQuantResult(radius=radius, indices=packed, bit_width=bit_width)
+    def decode(self, result: TurboQuantResult) -> NDArray:
+        """Decode with format detection: TQ prefix -> turbo, else -> legacy polar.
+        Raises:
+            ValueError: A TQ payload holds fewer bytes than ``d`` indices need.
+        """
+        blob = result.indices
+        if blob[:2] == TQ_MAGIC:
+            data = blob[2:]
+        else:
+            return self._decode_legacy_polar(result)
+        _, unpack_fn = _PACKERS[result.bit_width]
+        needed = (self._d * result.bit_width + 7) // 8
+        if len(data) < needed:
+            raise ValueError(
+                f"truncated TurboQuant payload: expected {needed} bytes, got {len(data)}"
+            )
+        indices = unpack_fn(data, self._d)
+        centroids = self._codebooks[result.bit_width]
+        y_unit_approx = centroids[np.clip(indices, 0, len(centroids) - 1)]
+        # Undo the rotation with the transpose of the rotation matrix.
+        return self._S.T @ (y_unit_approx * result.radius)
+    def _decode_legacy_polar(self, result: TurboQuantResult) -> NDArray:
+        """Decode legacy PolarQuant BLOB (no TQ prefix) for SLM <= 3.3.6.
+        Uses uniform angle centroids over [0, pi] and this encoder's own
+        rotation matrix.
+        """
+        from superlocalmemory.math.polar_quant import PolarQuantEncoder, _polar_to_cartesian
+        n_angles = self._d - 1
+        if result.bit_width == 8:
+            indices = np.frombuffer(result.indices, dtype=np.uint8).copy()
+        elif result.bit_width == 4:
+            indices = PolarQuantEncoder.unpack_4bit(result.indices, n_angles)
+        else:
+            indices = PolarQuantEncoder.unpack_2bit(result.indices, n_angles)
+        levels = 2 ** result.bit_width
+        boundaries = np.linspace(0.0, math.pi, levels + 1)
+        centroids = (boundaries[:-1] + boundaries[1:]) / 2.0
+        angles = centroids[np.clip(indices, 0, len(centroids) - 1)]
+        v_unit = _polar_to_cartesian(angles, self._d)
+        return self._S.T @ (v_unit * result.radius)
+    def approximate_similarity(self, query: NDArray, result: TurboQuantResult) -> float:
+        """Cosine similarity via decode. Returns 0.0 on degenerate inputs."""
+        decoded = self.decode(result)
+        denom = np.linalg.norm(query) * np.linalg.norm(decoded)
+        if denom < 1e-12:
+            return 0.0
+        sim = float(np.dot(query, decoded) / denom)
+        return 0.0 if (math.isnan(sim) or math.isinf(sim)) else sim
+    # Static pack/unpack (backward compat with PolarQuantEncoder API)
+    @staticmethod
+    def pack_4bit(indices: NDArray) -> bytes:
+        """Pack indices two per byte (delegates to the module-level helper)."""
+        return _pack_4bit(indices)
+    @staticmethod
+    def unpack_4bit(data: bytes, length: int) -> NDArray:
+        """Unpack ``length`` 4-bit indices (delegates to the module-level helper)."""
+        return _unpack_4bit(data, length)
+    @staticmethod
+    def pack_2bit(indices: NDArray) -> bytes:
+        """Pack indices four per byte (delegates to the module-level helper)."""
+        return _pack_2bit(indices)
+    @staticmethod
+    def unpack_2bit(data: bytes, length: int) -> NDArray:
+        """Unpack ``length`` 2-bit indices (delegates to the module-level helper)."""
+        return _unpack_2bit(data, length)

package/src/superlocalmemory/retrieval/reranker.py CHANGED Viewed

@@ -16,6 +16,7 @@ License: MIT
 from __future__ import annotations
+import atexit
 import json
 import logging
 import os
@@ -23,10 +24,14 @@ import subprocess
 import sys
 import threading
 import time
+import weakref
 from typing import Any
 from superlocalmemory.storage.models import AtomicFact
+# Track all live reranker instances for atexit cleanup
+_live_rerankers: set[weakref.ref] = set()
 logger = logging.getLogger(__name__)
 _IDLE_TIMEOUT_SECONDS = 120  # 2 min → kill worker
@@ -64,11 +69,22 @@ class CrossEncoderReranker:
         self._idle_timer: threading.Timer | None = None
         self._request_count: int = 0
+        # Register for atexit cleanup (prevent orphaned workers)
+        ref = weakref.ref(self, _live_rerankers.discard)
+        _live_rerankers.add(ref)
         # Start background warmup immediately — worker loads model
         # while the rest of init continues. First recall gets instant
         # fallback; second recall uses the warm model.
         self._start_background_warmup()
+    def __del__(self) -> None:
+        """Best-effort finalizer: stop the worker subprocess of this reranker.
+        Errors are swallowed deliberately, since a finalizer must not raise.
+        """
+        try:
+            self._kill_worker()
+        except Exception:
+            # Nothing useful can be done about a failed kill at this point.
+            return
     # ------------------------------------------------------------------
     # Background warmup (non-blocking model load)
     # ------------------------------------------------------------------
@@ -330,3 +346,26 @@ class CrossEncoderReranker:
         if resp is None:
             return False
         return resp.get("ok", False)
+# ---------------------------------------------------------------------------
+# Module-level atexit: kill ALL reranker workers on process exit
+# ---------------------------------------------------------------------------
+def _cleanup_all_rerankers() -> None:
+    """Kill every tracked reranker worker subprocess at interpreter exit.
+    Prevents orphaned 1.3 GB ONNX/PyTorch workers surviving after
+    parent exits (especially during test runs with parallel agents).
+    Failures of individual rerankers are ignored so one bad worker cannot
+    stop the others from being cleaned up.
+    """
+    # Iterate over a snapshot: the registry is a set of weak references whose
+    # callbacks remove entries, so it can change size while we walk it.
+    snapshot = tuple(_live_rerankers)
+    for weak in snapshot:
+        instance = weak()
+        if instance is None:
+            continue
+        try:
+            instance._kill_worker()
+        except Exception:
+            pass
+    _live_rerankers.clear()
+atexit.register(_cleanup_all_rerankers)