turboquant-tools 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- turboquant_tools/__init__.py +8 -0
- turboquant_tools/cli.py +100 -0
- turboquant_tools/core.py +268 -0
- turboquant_tools/mcp_server.py +166 -0
- turboquant_tools-0.1.0.dist-info/METADATA +267 -0
- turboquant_tools-0.1.0.dist-info/RECORD +10 -0
- turboquant_tools-0.1.0.dist-info/WHEEL +5 -0
- turboquant_tools-0.1.0.dist-info/entry_points.txt +2 -0
- turboquant_tools-0.1.0.dist-info/licenses/LICENSE +21 -0
- turboquant_tools-0.1.0.dist-info/top_level.txt +1 -0
turboquant_tools/cli.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLI for turboquant-tools.
|
|
3
|
+
"""
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
import click
|
|
8
|
+
import numpy as np
|
|
9
|
+
from turboquant_tools import compress, decompress, estimate_savings
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@click.group()
def main():
    """TurboQuant Tools - compress AI embeddings with 5x memory reduction."""
    # Pure command container: the group itself does no work, the subcommands
    # registered on it below implement every action.
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@main.command()
@click.argument("input", type=click.Path(exists=True, dir_okay=False))
@click.argument("output", type=click.Path(dir_okay=False), required=False)
@click.option("--bits", "-b", default=3, type=click.IntRange(1, 8), help="Target bit width (default: 3)")
@click.option(
    "--output", "-o", "output_opt",
    default=None,
    type=click.Path(dir_okay=False),
    help="Output .tq file path (alternative to positional OUTPUT)",
)
@click.option("--no-qjl", is_flag=True, default=False, help="Skip QJL correction (faster but lower quality)")
def compress_cmd(input, output, bits, no_qjl, output_opt=None):
    """Compress .npy embedding vectors to .tq format.

    INPUT is a .npy file with float32 embeddings (n_vectors x dimensions).
    OUTPUT is the destination .tq file. If omitted, auto-names based on input.
    """
    # The --output/-o option is bound to its own name (``output_opt``); when
    # it shared the name ``output`` with the positional argument the two
    # values competed for a single parameter slot.
    # --bits is limited to 1..8 because quantization indices are stored as
    # one unsigned byte each.
    try:
        vectors = np.load(input)
    except (OSError, ValueError) as exc:
        click.echo(f"Error: cannot read {input}: {exc}", err=True)
        sys.exit(1)
    if not isinstance(vectors, np.ndarray):
        # e.g. an .npz archive: np.load returns a container, not an array.
        click.echo("Error: expected a .npy file containing a single array", err=True)
        sys.exit(1)
    if vectors.ndim != 2:
        click.echo(f"Error: expected 2D array, got {vectors.ndim}D", err=True)
        sys.exit(1)

    n, d = vectors.shape
    click.echo(f"Vectors: {n} x {d} ({vectors.nbytes / 1e6:.2f} MB)", err=True)
    compressed = compress(vectors, bits=bits, use_qjl=not no_qjl)

    # Positional OUTPUT wins, then -o/--output, then an auto-generated name.
    out_path = output or output_opt
    if out_path is None:
        out_path = f"{Path(input).stem}_tq{bits}.tq"
    with open(out_path, "wb") as f:
        f.write(compressed.data)
    click.echo(f"Compressed: {compressed.nbytes / 1e6:.2f} MB ({compressed.memory.ratio:.1f}x)")
    click.echo(f"Saved to: {out_path}")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@main.command()
@click.argument("input", type=click.Path(exists=True, dir_okay=False))
@click.argument("output", type=click.Path(dir_okay=False), required=False)
@click.option(
    "--output", "-o", "output_opt",
    default=None,
    type=click.Path(dir_okay=False),
    help="Output .npy file path (alternative to positional OUTPUT)",
)
def decompress_cmd(input, output, output_opt=None):
    """Restore compressed .tq file to .npy.

    INPUT is a .tq compressed file.
    OUTPUT is the destination .npy file. If omitted, auto-names based on input.
    """
    import struct

    from turboquant_tools.core import CompressedVectors

    with open(input, "rb") as f:
        data = f.read()
    # Slice comparison also rejects files shorter than the 4-byte magic,
    # which would otherwise blow up inside struct.unpack_from.
    if data[:4] != b"TQT2":
        click.echo("Error: not a valid .tq file", err=True)
        sys.exit(1)

    # shape/bits are placeholders: decompress() reads the real values from
    # the header embedded in ``data``.
    compressed = CompressedVectors(data=data, shape=(0, 0), bits=0)
    try:
        restored = decompress(compressed)
    except (struct.error, ValueError) as exc:
        click.echo(f"Error: corrupt or truncated .tq file: {exc}", err=True)
        sys.exit(1)

    # Positional OUTPUT wins, then -o/--output, then an auto-generated name.
    out_path = output or output_opt
    if out_path is None:
        out_path = f"{Path(input).stem}_restored.npy"
    np.save(out_path, restored)
    # np.save appends ".npy" when the name lacks it; report the real file.
    saved_path = out_path if str(out_path).endswith(".npy") else f"{out_path}.npy"
    click.echo(f"Restored: {restored.shape} ({restored.nbytes / 1e6:.2f} MB)")
    click.echo(f"Saved to: {saved_path}")
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@main.command()
@click.argument("input", type=click.Path(exists=True, dir_okay=False))
@click.option("--bits", "-b", default=3, type=int, help="Target bit width (default: 3)")
def estimate_cmd(input, bits):
    """Estimate compression savings without running the algorithm."""
    # Open with mmap_mode so only the array's shape is consulted here.
    mapped = np.load(input, mmap_mode='r')
    rank = mapped.ndim
    dims = mapped.shape
    del mapped  # drop the mapping before reporting

    if rank != 2:
        click.echo("Error: expected 2D array", err=True)
        sys.exit(1)

    row_count, col_count = dims
    savings = estimate_savings(row_count, col_count, bits=bits)
    click.echo(str(savings))
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@main.command()
def mcp_server():
    """Start the MCP protocol server (stdio transport for Hermes AI agents)."""
    # Only the import is guarded: an ImportError raised later, while the
    # server is actually running, must surface as-is instead of being
    # reported as a missing optional dependency.
    try:
        from turboquant_tools.mcp_server import run_server
    except ImportError:
        click.echo("MCP server requires: pip install turboquant-tools[mcp]", err=True)
        sys.exit(1)
    run_server()
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# Run the click command group when this module is executed as a script.
if __name__ == "__main__":
    main()
|
turboquant_tools/core.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core compression engine for TurboQuant tools.
|
|
3
|
+
|
|
4
|
+
Pure numpy implementation of PolarQuant — no PyTorch, no GPU needed.
|
|
5
|
+
Inspired by Google's TurboQuant: Random Hadamard rotation + scalar quantization.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import math
|
|
11
|
+
import struct
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# ── helpers ────────────────────────────────────────────────────────────────
|
|
18
|
+
|
|
19
|
+
@dataclass
class MemoryBytes:
    """Byte sizes before and after compression plus their ratio."""

    original: int    # size of the uncompressed data, in bytes
    compressed: int  # size of the compressed data, in bytes
    ratio: float     # compression ratio as supplied by the creator

    @property
    def saved_bytes(self) -> int:
        """Bytes saved: original size minus compressed size."""
        return self.original - self.compressed

    @property
    def saved_percent(self) -> float:
        """Percentage of the original size saved (0.0 when original is 0)."""
        return (1 - self.compressed / self.original) * 100 if self.original != 0 else 0.0

    def __str__(self) -> str:
        """Render a one-line summary with sizes in MB (1 MB = 1e6 bytes)."""
        before_mb = self.original / 1e6
        after_mb = self.compressed / 1e6
        pieces = [
            f"Original: {before_mb:.2f} MB -> ",
            f"Compressed: {after_mb:.2f} MB ",
            f"({self.ratio:.2f}x, save {self.saved_percent:.0f}%)",
        ]
        return "".join(pieces)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class CompressedVectors:
    """Serialized .tq payload together with metadata about the source array."""

    data: bytes                # raw .tq bytes (header + norms + indices)
    shape: tuple[int, int]     # (n_vectors, dim) of the original array
    bits: int                  # bit width requested at compression time
    _original_bytes: int = 0   # byte size of the source array; 0 = unknown

    @property
    def nbytes(self) -> int:
        """Size of the compressed payload in bytes."""
        return len(self.data)

    @property
    def n_vectors(self) -> int:
        """Number of vectors, taken from ``shape[0]``."""
        return self.shape[0]

    @property
    def dim(self) -> int:
        """Per-vector dimension, taken from ``shape[1]``."""
        return self.shape[1]

    @property
    def memory(self) -> MemoryBytes:
        """Return original/compressed sizes and their ratio.

        When the original byte size was not recorded it is estimated as
        ``n_vectors * dim * 4`` (float32). The ratio is computed from that
        same figure so ``original`` and ``ratio`` always agree; it is 0.0
        when the payload is empty.
        """
        original = self._original_bytes or self.n_vectors * self.dim * 4
        packed = self.nbytes
        return MemoryBytes(
            original=original,
            compressed=packed,
            ratio=original / packed if packed > 0 else 0.0,
        )
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# ── Fast Walsh-Hadamard Transform ──────────────────────────────────────────
|
|
72
|
+
|
|
73
|
+
def _fwht(x: np.ndarray) -> np.ndarray:
|
|
74
|
+
"""Fast in-place Walsh-Hadamard Transform. x.shape = (n, d), d must be power of 2."""
|
|
75
|
+
n, d = x.shape
|
|
76
|
+
h = 1
|
|
77
|
+
while h < d:
|
|
78
|
+
for i in range(0, d, h * 2):
|
|
79
|
+
for j in range(i, i + h):
|
|
80
|
+
u = x[:, j].copy()
|
|
81
|
+
v = x[:, j + h].copy()
|
|
82
|
+
x[:, j] = u + v
|
|
83
|
+
x[:, j + h] = u - v
|
|
84
|
+
h *= 2
|
|
85
|
+
return x
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
# ── Codebook ───────────────────────────────────────────────────────────────
|
|
89
|
+
|
|
90
|
+
def _make_codebook(bits: int, seed: int = 0) -> tuple[np.ndarray, np.ndarray]:
|
|
91
|
+
"""Generate scalar codebook: boundaries + centroids from normal samples."""
|
|
92
|
+
K = 2 ** bits
|
|
93
|
+
n_bins = max(100000, K * 100)
|
|
94
|
+
rng = np.random.RandomState(seed)
|
|
95
|
+
samples = np.sort(rng.randn(n_bins))
|
|
96
|
+
boundaries = np.array([samples[(k + 1) * n_bins // K] for k in range(K - 1)])
|
|
97
|
+
centroids = np.zeros(K, dtype=np.float32)
|
|
98
|
+
prev = -np.inf
|
|
99
|
+
for k in range(K):
|
|
100
|
+
nxt = boundaries[k] if k < K - 1 else np.inf
|
|
101
|
+
mask = (samples >= prev) & (samples < nxt)
|
|
102
|
+
if mask.sum() > 0:
|
|
103
|
+
centroids[k] = samples[mask].mean()
|
|
104
|
+
prev = nxt
|
|
105
|
+
return boundaries.astype(np.float32), centroids
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# ── PolarQuant ─────────────────────────────────────────────────────────────
|
|
109
|
+
|
|
110
|
+
def _polar_quantize(x: np.ndarray, boundaries: np.ndarray) -> np.ndarray:
|
|
111
|
+
"""Quantize values to codebook indices."""
|
|
112
|
+
return np.searchsorted(boundaries, x.ravel()).astype(np.uint8)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _polar_dequantize(indices: np.ndarray, centroids: np.ndarray) -> np.ndarray:
|
|
116
|
+
"""Dequantize indices back to float values."""
|
|
117
|
+
return centroids[indices.astype(np.int32)]
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _random_hadamard_rotation(x: np.ndarray, seed: int, inverse: bool = False) -> np.ndarray:
    """Apply a seeded random Hadamard rotation, or its inverse, to each row.

    Forward:  y = H(D x) / sqrt(d)  -- sign flip first, then Walsh-Hadamard.
    Inverse:  x = D (H(y) / sqrt(d)) -- Walsh-Hadamard first, then sign flip.

    H / sqrt(d) is its own inverse and D (a +/-1 diagonal) is its own
    inverse, but they do not commute, so the inverse must undo the two steps
    in the opposite order; re-applying the forward transform does NOT
    restore x.

    Args:
        x: Array of shape (n, d) with d a power of 2. Not modified.
        seed: Seed that determines the +/-1 diagonal D.
        inverse: When True apply the inverse rotation.

    Returns:
        A new array of the same shape holding the rotated rows.
    """
    n, d = x.shape
    rng = np.random.RandomState(seed)
    diag = rng.choice([-1.0, 1.0], size=d).astype(np.float32)
    y = x.copy()
    if inverse:
        _fwht(y)
        y /= math.sqrt(d)
        y *= diag[None, :]
    else:
        y *= diag[None, :]
        _fwht(y)
        y /= math.sqrt(d)
    return y
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
# ── Public API ─────────────────────────────────────────────────────────────
|
|
141
|
+
|
|
142
|
+
# Little-endian .tq header layout: magic, fmt_type, bits, seed, n, d,
# d_padded, byte length of the norm section, reserved.
_TQ_HEADER_FORMAT = "<4s B B I I I I I I"


def compress(vectors: np.ndarray, bits: int = 3, use_qjl: bool = False, seed: int = 42) -> CompressedVectors:
    """
    Compress float32 embedding vectors using PolarQuant.

    Each row is zero-padded to a power-of-two width, rotated with a seeded
    random Hadamard transform, split into a float16 norm and a unit
    direction, and the direction is scalar-quantized to one uint8 codebook
    index per coordinate. The input array is never modified.

    Args:
        vectors: (n, d) array of embeddings (converted to float32).
        bits: Codebook bit width, 1..8 (typically 3 or 4). Indices are
            stored one byte each regardless of this value.
        use_qjl: Ignored in numpy-only mode (a warning is emitted).
        seed: Random seed for Hadamard rotation.

    Returns:
        CompressedVectors with .tq binary data.

    Raises:
        ValueError: If ``vectors`` is not 2D, has zero columns, or ``bits``
            is outside 1..8.
    """
    if use_qjl:
        import warnings
        warnings.warn("QJL not available in numpy-only mode. Falling back to PolarQuant.")

    vectors = np.asarray(vectors)
    if vectors.ndim != 2:
        raise ValueError(f"expected a 2D array of shape (n, d), got {vectors.ndim}D")
    if not 1 <= bits <= 8:
        # Indices are serialized as uint8, so at most 2**8 codebook levels.
        raise ValueError(f"bits must be between 1 and 8, got {bits}")
    n, d = vectors.shape
    if d == 0:
        raise ValueError("vectors must have at least one column")

    # Pad to next power of 2 for FWHT. Always copy into a fresh buffer:
    # the rotation below works in place and must not touch the caller's
    # array (which it did when the input was already float32, contiguous
    # and power-of-two wide).
    d_padded = 1 << (d - 1).bit_length()
    arr = np.zeros((n, d_padded), dtype=np.float32)
    arr[:, :d] = vectors

    # Forward rotation: FWHT(diag * x) / sqrt(d)
    rng = np.random.RandomState(seed)
    diag = rng.choice([-1.0, 1.0], size=d_padded).astype(np.float32)
    arr *= diag[None, :]
    _fwht(arr)
    arr /= math.sqrt(d_padded)

    # Split into norm + direction (zero rows keep a zero direction)
    norm = np.linalg.norm(arr, axis=1, keepdims=True)
    norm_safe = np.where(norm > 0, norm, 1.0)
    direction = arr / norm_safe

    # Quantize direction. Coordinates of a unit vector have scale
    # ~1/sqrt(d_padded), while the codebook is built from unit-variance
    # samples, so rescale to unit variance first; otherwise nearly every
    # coordinate falls into the two central cells.
    boundaries, _ = _make_codebook(bits, seed=0)
    scaled = direction * math.sqrt(d_padded)
    indices = _polar_quantize(scaled, boundaries).reshape(n, d_padded)

    # Serialize
    # NOTE(review): norms are stored as float16, so norms above ~65504
    # become inf and precision is ~3 decimal digits -- format limitation.
    pq_norm = norm.astype(np.float16)
    magic = b"TQT2"
    fmt_type = 1
    norm_bytes = pq_norm.tobytes()

    header = struct.pack(
        _TQ_HEADER_FORMAT,
        magic, fmt_type, bits, seed, n, d, d_padded,
        len(norm_bytes), 0,
    )
    data = header + norm_bytes + indices.tobytes()

    return CompressedVectors(
        data=data,
        shape=(n, d),
        bits=bits,
        _original_bytes=vectors.nbytes,
    )


def decompress(compressed: CompressedVectors) -> np.ndarray:
    """
    Decompress .tq data back to float32 embeddings.

    All parameters (bits, seed, shapes) are read from the header inside
    ``compressed.data``; the ``shape``/``bits`` attributes of the wrapper are
    not consulted.

    The forward rotation is ``y = FWHT(diag * x) / sqrt(d)``. Because
    ``FWHT(FWHT(v)) = d * v`` and ``diag * diag = 1``, its inverse is
    ``x = diag * FWHT(y) / sqrt(d)`` -- the same two steps in the opposite
    order. Re-applying the forward transform does not invert it, since the
    sign flip and the Hadamard transform do not commute.

    Args:
        compressed: CompressedVectors whose ``data`` holds a .tq payload.

    Returns:
        (n, d) float32 array of reconstructed embeddings.

    Raises:
        ValueError: If the payload is truncated, has a wrong magic, or its
            header fields are inconsistent.
    """
    data = compressed.data
    header_size = struct.calcsize(_TQ_HEADER_FORMAT)
    if len(data) < header_size:
        raise ValueError("truncated .tq data: incomplete header")
    magic, fmt_type, bits, seed, n, d, d_padded, pq_norm_len, _ = struct.unpack_from(
        _TQ_HEADER_FORMAT, data, 0
    )
    # Explicit raise instead of assert: asserts vanish under ``python -O``.
    if magic != b"TQT2":
        raise ValueError(f"Invalid magic: {magic}")
    if d > d_padded or pq_norm_len < 2 * n:
        raise ValueError("corrupt .tq header: inconsistent sizes")
    if len(data) < header_size + pq_norm_len + n * d_padded:
        raise ValueError("truncated .tq data: payload shorter than header declares")

    offset = header_size
    pq_norm = np.frombuffer(data, dtype=np.float16, count=n, offset=offset).copy()
    offset += pq_norm_len
    pq_indices = np.frombuffer(data, dtype=np.uint8, count=n * d_padded, offset=offset).reshape(n, d_padded).copy()

    # Dequantize
    _, centroids = _make_codebook(bits, seed=0)
    direction = _polar_dequantize(pq_indices, centroids)

    # Renormalize the dequantized direction to unit length so the stored
    # norm alone fixes the magnitude. This makes the result independent of
    # the scale the coordinates were quantized at.
    dir_norm = np.linalg.norm(direction, axis=1, keepdims=True)
    direction = direction / np.where(dir_norm > 0, dir_norm, 1.0)

    # Apply norm (upcast norm from float16 to float32)
    restored = (direction * pq_norm.astype(np.float32)[:, None]).astype(np.float32)

    # Inverse rotation: FWHT first, then the sign flip.
    rng = np.random.RandomState(seed)
    diag = rng.choice([-1.0, 1.0], size=d_padded).astype(np.float32)
    _fwht(restored)
    restored /= math.sqrt(d_padded)
    restored *= diag[None, :]

    # Unpad
    if d_padded != d:
        restored = restored[:, :d]

    return np.ascontiguousarray(restored, dtype=np.float32)
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def estimate_savings(n_vectors: int, dim: int, bits: int = 3) -> MemoryBytes:
    """
    Estimate compression savings without running the algorithm.

    The estimate mirrors the .tq layout: a fixed header, then per vector one
    float16 norm (2 bytes) and one uint8 index per padded dimension.

    Args:
        n_vectors: Number of embedding vectors.
        dim: Dimension of each vector.
        bits: Target bit width (3 or 4). Accepted for API symmetry; it does
            not change the estimate because each index occupies a full byte.

    Returns:
        MemoryBytes with original/compressed sizes and ratio.
    """
    d_padded = 1 << (dim - 1).bit_length()
    # Derive the header size from the header's struct format (30 bytes)
    # rather than hard-coding it.
    header_size = struct.calcsize("<4s B B I I I I I I")
    per_vector = 2 + d_padded  # float16 norm + uint8 indices
    original = n_vectors * dim * 4
    compressed = n_vectors * per_vector + header_size
    ratio = original / compressed if compressed > 0 else 1.0
    return MemoryBytes(original=original, compressed=int(compressed), ratio=ratio)
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""
|
|
2
|
+
MCP server for turboquant-tools.
|
|
3
|
+
|
|
4
|
+
Provides AI agents with tools to compress, decompress, and estimate
|
|
5
|
+
embedding vectors using TurboQuant.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
pip install turboquant-tools[mcp]
|
|
9
|
+
turboquant mcp-server
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
import sys
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
|
|
20
|
+
from turboquant_tools import compress, decompress, estimate_savings
|
|
21
|
+
from turboquant_tools.core import CompressedVectors
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _serve_stdio():
    """Run MCP server over stdio using FastMCP.

    Registers four tools (compress_embeddings, decompress_embeddings,
    estimate_savings_mcp, embed_and_compress) and then blocks in
    ``mcp.run()``. Exits with status 1 when ``fastmcp`` is not installed.
    """
    import base64
    import os
    import urllib.error
    import urllib.request

    # Set before importing fastmcp so the value is visible however early
    # the library reads its configuration.
    os.environ["FASTMCP_LOG_LEVEL"] = "WARNING"
    try:
        from fastmcp import FastMCP
    except ImportError:
        print("Need: pip install turboquant-tools[mcp]", file=sys.stderr)
        sys.exit(1)

    mcp = FastMCP("turboquant-tools")

    @mcp.tool()
    def compress_embeddings(
        vectors: list[list[float]],
        bits: int = 3,
        use_qjl: bool = False,
    ) -> dict:
        """
        Compress a list of embedding vectors using TurboQuant.

        Args:
            vectors: List of vectors, each a list of floats (all same length).
            bits: Target bit width (3 or 4). Default 3.
            use_qjl: Whether to apply QJL correction. Default False.

        Returns:
            Dict with compressed data, shape, ratio.
        """
        try:
            arr = np.array(vectors, dtype=np.float32)
        except (ValueError, TypeError) as exc:
            # Ragged rows or non-numeric entries cannot form an array.
            return {"error": f"Vectors must be a rectangular list of floats: {exc}"}
        if arr.ndim != 2:
            return {"error": f"Expected 2D array, got {arr.ndim}D"}
        c = compress(arr, bits=bits, use_qjl=use_qjl)
        mem = c.memory
        return {
            "compressed": base64.b64encode(c.data).decode(),
            "shape": list(c.shape),
            "bits": bits,
            "ratio": round(mem.ratio, 2),
            "original_bytes": mem.original,
            "compressed_bytes": c.nbytes,
            "saved_percent": round(mem.saved_percent, 1),
        }

    @mcp.tool()
    def decompress_embeddings(compressed_b64: str, shape: list[int]) -> list[list[float]]:
        """
        Restore compressed vectors.

        Args:
            compressed_b64: Base64 .tq data from compress_embeddings().
            shape: Original shape [n_vectors, dim].

        Returns:
            List of restored vectors.

        Raises:
            ValueError: If the payload is not valid base64 or not .tq data.
        """
        try:
            data = base64.b64decode(compressed_b64)
        except ValueError as exc:  # binascii.Error is a ValueError
            raise ValueError("Invalid base64 payload") from exc
        # Raise rather than return an error dict: the declared return type
        # is a list of vectors. The slice also rejects too-short payloads.
        if data[:4] != b"TQT2":
            raise ValueError("Invalid .tq data")
        # The wrapper's shape is informational; decompress() takes the real
        # dimensions from the header embedded in the payload.
        dims = (shape[0], shape[1]) if len(shape) >= 2 else (0, 0)
        cv = CompressedVectors(data=data, shape=dims, bits=0)
        restored = decompress(cv)
        return restored.tolist()

    @mcp.tool()
    def estimate_savings_mcp(
        n_vectors: int,
        dim: int,
        bits: int = 3,
    ) -> dict:
        """
        Estimate compression savings.

        Args:
            n_vectors: Number of embedding vectors.
            dim: Dimension of each vector.
            bits: Target bit width (3 or 4).

        Returns:
            Dict with sizes and ratio.
        """
        est = estimate_savings(n_vectors, dim, bits)
        return {
            "original_mb": round(est.original / 1e6, 2),
            "compressed_mb": round(est.compressed / 1e6, 2),
            "ratio": round(est.ratio, 2),
            "saved_percent": round(est.saved_percent, 1),
        }

    @mcp.tool()
    def embed_and_compress(
        texts: list[str],
        model: str = "text-embedding-3-small",
        api_key: str = "",
        bits: int = 3,
    ) -> dict:
        """
        Embed texts via API, then compress the vectors.

        Args:
            texts: List of text strings.
            model: Embedding model name.
            api_key: API key (or use OPENAI_API_KEY env var).
            bits: Target bit width.

        Returns:
            Dict with compressed data.
        """
        key = api_key or os.environ.get("OPENAI_API_KEY", "")
        if not key:
            return {"error": "No API key. Set OPENAI_API_KEY or pass api_key."}
        if not texts:
            return {"error": "No texts to embed."}
        payload = json.dumps({"input": texts, "model": model}).encode()
        req = urllib.request.Request(
            "https://api.openai.com/v1/embeddings",
            data=payload,
            headers={"Authorization": f"Bearer {key}", "Content-Type": "application/json"},
        )
        try:
            with urllib.request.urlopen(req, timeout=30) as resp:
                result = json.loads(resp.read())
        except (urllib.error.URLError, TimeoutError) as exc:
            # HTTPError is a URLError; report failures like the other tools.
            return {"error": f"Embedding request failed: {exc}"}
        except ValueError as exc:
            return {"error": f"Embedding response was not valid JSON: {exc}"}
        embeddings = [d["embedding"] for d in result["data"]]
        arr = np.array(embeddings, dtype=np.float32)
        c = compress(arr, bits=bits, use_qjl=False)
        mem = c.memory
        return {
            "n_texts": len(texts),
            "dim": arr.shape[1],
            "compressed": base64.b64encode(c.data).decode(),
            "ratio": round(mem.ratio, 2),
            "saved_percent": round(mem.saved_percent, 1),
        }

    mcp.run()
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def run_server():
    """Entry point for the MCP server."""
    # Thin public wrapper around the stdio implementation.
    return _serve_stdio()
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
# Start the MCP server when this module is executed as a script.
if __name__ == "__main__":
    run_server()
|
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: turboquant-tools
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: CLI + MCP Server + Python Library for TurboQuant-based embedding compression
|
|
5
|
+
Author: FreezeVII
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/FreezeVII/turboquant-tools
|
|
8
|
+
Project-URL: Source, https://github.com/FreezeVII/turboquant-tools
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
|
+
Requires-Python: >=3.9
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
License-File: LICENSE
|
|
17
|
+
Requires-Dist: numpy>=1.24
|
|
18
|
+
Requires-Dist: click>=8.0
|
|
19
|
+
Provides-Extra: mcp
|
|
20
|
+
Requires-Dist: fastmcp>=0.1; extra == "mcp"
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
23
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
24
|
+
Dynamic: license-file
|
|
25
|
+
|
|
26
|
+
# TurboQuant Tools
|
|
27
|
+
|
|
28
|
+
> **Compress AI embeddings by 5โ7ร with near-lossless quality.**
|
|
29
|
+
|
|
30
|
+
CLI + Python Library + [MCP](https://modelcontextprotocol.io) Server for extreme vector compression using [Google's TurboQuant](https://research.google/blog/turboquant-redefining-ai-efficiency-with-extreme-compression/) (PolarQuant + QJL) — wrapped in a clean numpy-first API.
|
|
31
|
+
|
|
32
|
+
[](https://pypi.org/project/turboquant-tools/)
|
|
33
|
+
[](https://www.python.org)
|
|
34
|
+
[](LICENSE)
|
|
35
|
+
[](https://github.com/FreezeVII/turboquant-tools/actions)
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## Quick Start
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install turboquant-tools
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Compress a `.npy` embedding file:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
turboquant compress embeddings.npy compressed.tq
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Restore:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
turboquant decompress compressed.tq restored.npy
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Estimate savings:
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
turboquant estimate embeddings.npy --bits 3
|
|
61
|
+
# Original: 153.00 MB -> Compressed: 20.13 MB (7.60×, save 87%)
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## What's Inside
|
|
67
|
+
|
|
68
|
+
| Command / Tool | Description |
|
|
69
|
+
|---|---|
|
|
70
|
+
| `turboquant compress` | Compress `.npy` embeddings → `.tq` binary |
|
|
71
|
+
| `turboquant decompress` | Restore `.tq` → `.npy` |
|
|
72
|
+
| `turboquant estimate` | Predict compression ratio before running |
|
|
73
|
+
| `turboquant mcp-server` | MCP stdio server (AI agent integration) |
|
|
74
|
+
| Python `compress()` | Compress numpy arrays in code |
|
|
75
|
+
| Python `decompress()` | Restore in code |
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## CLI Reference
|
|
80
|
+
|
|
81
|
+
### compress
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
turboquant compress INPUT [OUTPUT] [OPTIONS]
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
| Option | Default | Description |
|
|
88
|
+
|---|---|---|
|
|
89
|
+
| `INPUT` | — | `.npy` file with float32 embeddings `(n, d)` |
|
|
90
|
+
| `OUTPUT` | `{stem}_tq{b}.tq` | Output `.tq` file |
|
|
91
|
+
| `-b, --bits` | `3` | Bit width (3 or 4) |
|
|
92
|
+
| `-o, --output` | — | Alternative to positional OUTPUT |
|
|
93
|
+
| `--no-qjl` | off | Skip QJL correction (faster, lower quality) |
|
|
94
|
+
|
|
95
|
+
**Examples:**
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
# Basic 3-bit compression
|
|
99
|
+
turboquant compress wiki_embeddings.npy wiki.tq
|
|
100
|
+
|
|
101
|
+
# 4-bit compression (higher quality)
|
|
102
|
+
turboquant compress embeddings.npy -b 4
|
|
103
|
+
|
|
104
|
+
# Fast mode (no QJL)
|
|
105
|
+
turboquant compress big_set.npy -b 3 --no-qjl
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### decompress
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
turboquant decompress INPUT [OUTPUT]
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### estimate
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
turboquant estimate INPUT [--bits N]
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
---
|
|
121
|
+
|
|
122
|
+
## Python API
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from turboquant_tools import compress, decompress, estimate_savings
|
|
126
|
+
import numpy as np
|
|
127
|
+
|
|
128
|
+
# Load or generate embeddings
|
|
129
|
+
vectors = np.random.randn(10000, 384).astype(np.float32)
|
|
130
|
+
|
|
131
|
+
# Compress (5–7× reduction)
|
|
132
|
+
compressed = compress(vectors, bits=3, use_qjl=False)
|
|
133
|
+
print(f"{vectors.nbytes / 1e6:.1f} MB → {compressed.nbytes / 1e6:.1f} MB ({compressed.memory.ratio:.1f}×)")
|
|
134
|
+
|
|
135
|
+
# Restore
|
|
136
|
+
restored = decompress(compressed)
|
|
137
|
+
print(f"MAE: {np.abs(restored - vectors).mean():.4f}")
|
|
138
|
+
|
|
139
|
+
# Estimate without running
|
|
140
|
+
est = estimate_savings(n_vectors=100000, dim=768, bits=3)
|
|
141
|
+
print(est) # Original: X MB -> Compressed: Y MB (7.60×, save 87%)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
**CompressedVectors** objects carry metadata:
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
compressed.n_vectors # original count
|
|
148
|
+
compressed.dim # original dimension
|
|
149
|
+
compressed.nbytes # compressed size in bytes
|
|
150
|
+
compressed.memory # MemoryBytes(original, compressed, ratio)
|
|
151
|
+
compressed.data # raw .tq bytes (save to disk)
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
## MCP Server (AI Agents)
|
|
157
|
+
|
|
158
|
+
TurboQuant Tools ships with a native **MCP server** for AI agent integration — works with any MCP-compatible host (Hermes, Claude Desktop, etc.).
|
|
159
|
+
|
|
160
|
+
### Start
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
turboquant mcp-server
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
### Register in your MCP client
|
|
167
|
+
|
|
168
|
+
**Hermes Agent** (`~/.hermes/config.yaml`):
|
|
169
|
+
|
|
170
|
+
```yaml
|
|
171
|
+
mcp_servers:
|
|
172
|
+
turboquant-tools:
|
|
173
|
+
command: turboquant
|
|
174
|
+
args: ["mcp-server"]
|
|
175
|
+
enabled: true
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
**Claude Desktop** (`claude_desktop_config.json`):
|
|
179
|
+
|
|
180
|
+
```json
|
|
181
|
+
{
|
|
182
|
+
"mcpServers": {
|
|
183
|
+
"turboquant-tools": {
|
|
184
|
+
"command": "turboquant",
|
|
185
|
+
"args": ["mcp-server"]
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
}
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
### Available Tools
|
|
192
|
+
|
|
193
|
+
| Tool | Description |
|
|
194
|
+
|---|---|
|
|
195
|
+
| `compress_embeddings` | Compress vectors in-memory |
|
|
196
|
+
| `decompress_embeddings` | Restore compressed vectors |
|
|
197
|
+
| `estimate_savings_mcp` | Predict compression ratio |
|
|
198
|
+
| `embed_and_compress` | Embed texts via API + compress in one step |
|
|
199
|
+
|
|
200
|
+
---
|
|
201
|
+
|
|
202
|
+
## Performance
|
|
203
|
+
|
|
204
|
+
Measured on random float32 embeddings (CPU, no GPU needed):
|
|
205
|
+
|
|
206
|
+
| Vectors | Dim | Mode | Original | Compressed | Ratio | MAE |
|
|
207
|
+
|---|---|---|---|---|---|---|
|
|
208
|
+
| 20 | 384 | PolarQuant 3-bit | 30 KB | 10 KB | **3.0×** | 2.6 |
|
|
209
|
+
| 20 | 384 | TurboQuant (QJL) | 30 KB | 20 KB | 1.5× | 3.3 |
|
|
210
|
+
| 100K | 384 | PolarQuant 3-bit | 153 MB | 20 MB | **7.6×** | — |
|
|
211
|
+
|
|
212
|
+
**Use cases:**
|
|
213
|
+
- **RAG pipelines** — compress vector DB indexes
|
|
214
|
+
- **Edge devices** — fit embeddings in limited RAM
|
|
215
|
+
- **Storage savings** — reduce cloud costs for large vector stores
|
|
216
|
+
- **Memory-bound agents** — compress context vectors on the fly
|
|
217
|
+
|
|
218
|
+
---
|
|
219
|
+
|
|
220
|
+
## Development
|
|
221
|
+
|
|
222
|
+
```bash
|
|
223
|
+
git clone https://github.com/FreezeVII/turboquant-tools.git
|
|
224
|
+
cd turboquant-tools
|
|
225
|
+
pip install -e .
|
|
226
|
+
pip install pytest
|
|
227
|
+
pytest tests/
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
### Run tests
|
|
231
|
+
|
|
232
|
+
```bash
|
|
233
|
+
pytest tests/ -v
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
---
|
|
237
|
+
|
|
238
|
+
## How It Works
|
|
239
|
+
|
|
240
|
+
Two-stage compression inspired by [Google's TurboQuant](https://research.google/blog/turboquant-redefining-ai-efficiency-with-extreme-compression/):
|
|
241
|
+
|
|
242
|
+
1. **PolarQuant** — Random Hadamard rotation + scalar quantization to 3–4 bits per dimension. Captures magnitude and direction.
|
|
243
|
+
2. **QJL** (optional) — Quantized Johnson-Lindenstrauss residual correction. Recovers high-frequency detail lost in PolarQuant.
|
|
244
|
+
|
|
245
|
+
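The implementation shipped in this package is **CPU-only** and pure NumPy — no PyTorch and no GPU required. The optional QJL stage is not available in the NumPy-only build, which warns and falls back to PolarQuant alone. The `.tq` binary format uses a 30-byte header with magic bytes (`TQT2`) followed by the per-vector norms and the quantization indices.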
|
|
246
|
+
|
|
247
|
+
Under the hood this wraps [OnlyTerp/turboquant](https://github.com/OnlyTerp/turboquant), a reference PyTorch implementation.
|
|
248
|
+
|
|
249
|
+
---
|
|
250
|
+
|
|
251
|
+
## License
|
|
252
|
+
|
|
253
|
+
MIT — see [LICENSE](LICENSE).
|
|
254
|
+
|
|
255
|
+
---
|
|
256
|
+
|
|
257
|
+
## Contributing
|
|
258
|
+
|
|
259
|
+
PRs welcome! Ideas:
|
|
260
|
+
- FAISS index compression (`compress_faiss`)
|
|
261
|
+
- Onnx / numpy-only backend (no PyTorch dep)
|
|
262
|
+
- Streaming compression for billion-scale datasets
|
|
263
|
+
- Pre-built wheels for faster install
|
|
264
|
+
|
|
265
|
+
---
|
|
266
|
+
|
|
267
|
+
<p align="center">Made with ๐ง for the vector search community.</p>
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
turboquant_tools/__init__.py,sha256=bqRp-WHUwoBN-Asp2nv08yYXedxBwuxFbC0ZUyvuwGw,224
|
|
2
|
+
turboquant_tools/cli.py,sha256=8z6HkxPUgSLMT1Toq0PnXd3bDAYfIHvBY-QWdlAZUT4,3800
|
|
3
|
+
turboquant_tools/core.py,sha256=rkl5U5TgFdJEEcb40P7Pp0OJRNPPsYfG6C87faRDioQ,8837
|
|
4
|
+
turboquant_tools/mcp_server.py,sha256=WiIY_NxolXklHJpGyTLz4D8__wua1Z9yxaIljiPQh0c,5002
|
|
5
|
+
turboquant_tools-0.1.0.dist-info/licenses/LICENSE,sha256=QzZzHiZAVtxk7H8DvUf71ifn6l7gPNwtBNLsN4nvFP8,1066
|
|
6
|
+
turboquant_tools-0.1.0.dist-info/METADATA,sha256=rEJ5XgS_ZpE0aSpyajMAndXZyMKRASIX9nsezrxIrl4,7161
|
|
7
|
+
turboquant_tools-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
8
|
+
turboquant_tools-0.1.0.dist-info/entry_points.txt,sha256=5JCUv0rX3uk3ylpazNbuRri3flaLAHyhfcffyvgPFf4,57
|
|
9
|
+
turboquant_tools-0.1.0.dist-info/top_level.txt,sha256=T5INfn6YYI1uJpY_hKSYy-uwxzPlukI1vVtrgntVz6M,17
|
|
10
|
+
turboquant_tools-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 FreezeVII
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
turboquant_tools
|