bitbudget 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bitbudget/__init__.py +28 -0
- bitbudget/_bittrie.c +92 -0
- bitbudget/_bittrie_build.py +94 -0
- bitbudget/bittrie.py +108 -0
- bitbudget/cli.py +231 -0
- bitbudget/datasets.py +49 -0
- bitbudget/embedders.py +46 -0
- bitbudget/eval.py +44 -0
- bitbudget/indexes.py +136 -0
- bitbudget/methods.py +114 -0
- bitbudget/metrics.py +37 -0
- bitbudget-0.1.0.dist-info/METADATA +148 -0
- bitbudget-0.1.0.dist-info/RECORD +17 -0
- bitbudget-0.1.0.dist-info/WHEEL +5 -0
- bitbudget-0.1.0.dist-info/entry_points.txt +2 -0
- bitbudget-0.1.0.dist-info/licenses/LICENSE +21 -0
- bitbudget-0.1.0.dist-info/top_level.txt +1 -0
bitbudget/__init__.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""BitBudget: how much retrieval quality do you keep per byte?
|
|
2
|
+
|
|
3
|
+
A reproducible benchmark for embedding compression. Given an embedder and a corpus, it
|
|
4
|
+
measures the retrieval quality (nDCG@10, recall@10) retained by each compression method
|
|
5
|
+
against the bytes it stores per vector -- the recall-per-byte frontier.
|
|
6
|
+
|
|
7
|
+
Add your own method in five lines::
|
|
8
|
+
|
|
9
|
+
from bitbudget import method
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
@method("my-2bit", bits=2)
|
|
13
|
+
def my_2bit(demb, qemb):
|
|
14
|
+
# return (query x doc similarity scores, bytes per stored vector)
|
|
15
|
+
codes = my_quantise(demb)
|
|
16
|
+
return qemb @ my_dequantise(codes).T, demb.shape[1] * 2 / 8
|
|
17
|
+
|
|
18
|
+
Then ``bitbudget run --embedder mxbai --corpus scifact`` scores it alongside the built-ins.
|
|
19
|
+
"""
|
|
20
|
+
from .methods import method, METHODS, list_methods
|
|
21
|
+
from .embedders import embedder, EMBEDDERS, list_embedders
|
|
22
|
+
from .eval import evaluate
|
|
23
|
+
from .indexes import index, INDEXES, list_indexes, bench_indexes
|
|
24
|
+
|
|
25
|
+
__version__ = "0.1.0"
|
|
26
|
+
__all__ = ["method", "METHODS", "list_methods", "embedder", "EMBEDDERS",
|
|
27
|
+
"list_embedders", "evaluate", "index", "INDEXES", "list_indexes",
|
|
28
|
+
"bench_indexes", "__version__"]
|
bitbudget/_bittrie.c
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
/* Fast batched bit-trie query: coarse-to-fine beam descent over sorted codes, then a
|
|
2
|
+
* full-precision re-ranking pass, looped over all queries in C. Plain C (no Python.h);
|
|
3
|
+
* loaded via ctypes from _bittrie_build.py and compiled on demand. Falls back to the
|
|
4
|
+
* numpy/Python path in bittrie.py if no compiler is available. */
|
|
5
|
+
#include <stdint.h>
|
|
6
|
+
#include <stdlib.h>
|
|
7
|
+
#include <string.h>
|
|
8
|
+
#include <math.h>
|
|
9
|
+
|
|
10
|
+
/* One beam entry: the half-open range [lo, hi) of the sorted code array whose codes share
 * the bit prefix `pre`, plus the accumulated `cost` that the beam is ordered by. */
typedef struct { double cost; long lo, hi; unsigned long long pre; } Node;
|
|
11
|
+
|
|
12
|
+
static int cmp_node(const void *a, const void *b) {
|
|
13
|
+
double d = ((const Node *)a)->cost - ((const Node *)b)->cost;
|
|
14
|
+
return (d > 0) - (d < 0);
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
/* Return the first index in [lo, hi) of the ascending array `a` whose value is >= key,
 * or `hi` when every element in the range is smaller. */
static long lower_bound_u64(const uint64_t *a, long lo, long hi, uint64_t key) {
    long first = lo;
    long count = hi - lo;
    while (count > 0) {
        long half = count / 2;
        if (a[first + half] < key) {
            /* everything up to and including the probe is too small */
            first += half + 1;
            count -= half + 1;
        } else {
            count = half;
        }
    }
    return first;
}
|
|
21
|
+
|
|
22
|
+
/* out: nq*k int64 of original doc ids (top-k by inner product), -1 padding if fewer found. */
|
|
23
|
+
/* Each query is independent and writes a disjoint slice of `out`, so the loop parallelises
|
|
24
|
+
* cleanly. Scratch buffers are allocated per thread inside the parallel region; when compiled
|
|
25
|
+
* without OpenMP the `omp` pragmas are ignored and this runs as one block, single-threaded. */
|
|
26
|
+
void bt_query_batch(const uint64_t *codes, const int64_t *docids,
|
|
27
|
+
const float *Xf, const float *qproj, const float *qf,
|
|
28
|
+
int n, int d, int b, int nq, int depth, int beam, int k,
|
|
29
|
+
int64_t *out) {
|
|
30
|
+
if (depth > b) depth = b;
|
|
31
|
+
if (beam < 1) beam = 1;
|
|
32
|
+
int cap = 2 * beam + 4;
|
|
33
|
+
|
|
34
|
+
#pragma omp parallel
|
|
35
|
+
{
|
|
36
|
+
Node *cur = (Node *)malloc(sizeof(Node) * cap);
|
|
37
|
+
Node *nxt = (Node *)malloc(sizeof(Node) * 2 * cap);
|
|
38
|
+
double *bscore = (double *)malloc(sizeof(double) * k);
|
|
39
|
+
long *bid = (long *)malloc(sizeof(long) * k);
|
|
40
|
+
|
|
41
|
+
#pragma omp for schedule(static)
|
|
42
|
+
for (int qi = 0; qi < nq; qi++) {
|
|
43
|
+
const float *qp = qproj + (size_t)qi * b;
|
|
44
|
+
const float *qv = qf + (size_t)qi * d;
|
|
45
|
+
int ncur = 1;
|
|
46
|
+
cur[0].cost = 0.0; cur[0].lo = 0; cur[0].hi = n; cur[0].pre = 0ULL;
|
|
47
|
+
|
|
48
|
+
for (int t = 0; t < depth; t++) {
|
|
49
|
+
int shift = b - t - 1;
|
|
50
|
+
int qb = qp[t] > 0.0f ? 1 : 0;
|
|
51
|
+
double c = fabs((double)qp[t]);
|
|
52
|
+
int nn = 0;
|
|
53
|
+
for (int i = 0; i < ncur; i++) {
|
|
54
|
+
long lo = cur[i].lo, hi = cur[i].hi;
|
|
55
|
+
unsigned long long pre = cur[i].pre;
|
|
56
|
+
unsigned long long hi_part = (b - t) >= 64 ? 0ULL : (pre << (b - t));
|
|
57
|
+
unsigned long long thresh = hi_part | (1ULL << shift);
|
|
58
|
+
long m = lower_bound_u64(codes, lo, hi, (uint64_t)thresh);
|
|
59
|
+
if (m > lo) { nxt[nn].cost = cur[i].cost + (qb == 0 ? 0.0 : c);
|
|
60
|
+
nxt[nn].lo = lo; nxt[nn].hi = m; nxt[nn].pre = (pre << 1); nn++; }
|
|
61
|
+
if (hi > m) { nxt[nn].cost = cur[i].cost + (qb == 1 ? 0.0 : c);
|
|
62
|
+
nxt[nn].lo = m; nxt[nn].hi = hi; nxt[nn].pre = (pre << 1) | 1ULL; nn++; }
|
|
63
|
+
}
|
|
64
|
+
qsort(nxt, nn, sizeof(Node), cmp_node);
|
|
65
|
+
int keep = nn < beam ? nn : beam;
|
|
66
|
+
memcpy(cur, nxt, sizeof(Node) * keep);
|
|
67
|
+
ncur = keep;
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
for (int j = 0; j < k; j++) { bscore[j] = -1e300; bid[j] = -1; }
|
|
71
|
+
for (int i = 0; i < ncur; i++) {
|
|
72
|
+
for (long j = cur[i].lo; j < cur[i].hi; j++) {
|
|
73
|
+
long id = (long)docids[j];
|
|
74
|
+
const float *xr = Xf + (size_t)id * d;
|
|
75
|
+
double s = 0.0;
|
|
76
|
+
for (int dd = 0; dd < d; dd++) s += (double)qv[dd] * (double)xr[dd];
|
|
77
|
+
int mn = 0;
|
|
78
|
+
for (int t2 = 1; t2 < k; t2++) if (bscore[t2] < bscore[mn]) mn = t2;
|
|
79
|
+
if (s > bscore[mn]) { bscore[mn] = s; bid[mn] = id; }
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
for (int a = 0; a < k; a++) { /* selection-sort the k winners, score descending */
|
|
83
|
+
int mx = a;
|
|
84
|
+
for (int b2 = a + 1; b2 < k; b2++) if (bscore[b2] > bscore[mx]) mx = b2;
|
|
85
|
+
double ts = bscore[a]; bscore[a] = bscore[mx]; bscore[mx] = ts;
|
|
86
|
+
long ti = bid[a]; bid[a] = bid[mx]; bid[mx] = ti;
|
|
87
|
+
}
|
|
88
|
+
for (int j = 0; j < k; j++) out[(size_t)qi * k + j] = bid[j];
|
|
89
|
+
}
|
|
90
|
+
free(cur); free(nxt); free(bscore); free(bid);
|
|
91
|
+
}
|
|
92
|
+
}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Compile the bit-trie C kernel on demand and load it via ctypes.
|
|
2
|
+
|
|
3
|
+
The package ships pure-Python on PyPI (the .c is data, not a built extension), so install never
|
|
4
|
+
needs a compiler. The first time the fast path is requested we compile _bittrie.c into a cached
|
|
5
|
+
shared library with the system C compiler; if no compiler is available we return None and the
|
|
6
|
+
caller falls back to the numpy/Python path. The compiled library is cached by source hash, so the
|
|
7
|
+
build happens once per machine.
|
|
8
|
+
"""
|
|
9
|
+
import ctypes
|
|
10
|
+
import hashlib
|
|
11
|
+
import os
|
|
12
|
+
import subprocess
|
|
13
|
+
|
|
14
|
+
_LIB = None
|
|
15
|
+
_TRIED = False
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _cache_dir():
|
|
19
|
+
d = os.environ.get("BITBUDGET_CACHE") or os.path.join(os.path.expanduser("~"), ".cache", "bitbudget")
|
|
20
|
+
os.makedirs(d, exist_ok=True)
|
|
21
|
+
return d
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _brew_libomp():
|
|
25
|
+
"""Return the Homebrew libomp prefix on macOS, or None."""
|
|
26
|
+
try:
|
|
27
|
+
p = subprocess.run(["brew", "--prefix", "libomp"], capture_output=True, text=True)
|
|
28
|
+
prefix = p.stdout.strip()
|
|
29
|
+
if p.returncode == 0 and prefix and os.path.isdir(prefix):
|
|
30
|
+
return prefix
|
|
31
|
+
except Exception:
|
|
32
|
+
pass
|
|
33
|
+
return None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _variants():
    """(cflags, lflags) compile attempts, OpenMP first, single-thread C last."""
    attempts = [
        (["-fopenmp"], ["-fopenmp"]),   # gcc / OpenMP-capable clang (Linux)
        ([], []),                       # no OpenMP: single-threaded C
    ]
    prefix = _brew_libomp()
    if prefix:
        # macOS clang + Homebrew libomp goes to the front of the queue
        brew_attempt = (
            ["-Xpreprocessor", "-fopenmp", f"-I{prefix}/include"],
            [f"-L{prefix}/lib", "-lomp", f"-Wl,-rpath,{prefix}/lib"],
        )
        attempts.insert(0, brew_attempt)
    return attempts
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _set_sig(lib):
|
|
49
|
+
f = lib.bt_query_batch
|
|
50
|
+
f.restype = None
|
|
51
|
+
P = ctypes.POINTER
|
|
52
|
+
f.argtypes = [P(ctypes.c_uint64), P(ctypes.c_int64), P(ctypes.c_float),
|
|
53
|
+
P(ctypes.c_float), P(ctypes.c_float),
|
|
54
|
+
ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int,
|
|
55
|
+
ctypes.c_int, ctypes.c_int, ctypes.c_int, P(ctypes.c_int64)]
|
|
56
|
+
return lib
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def get_lib():
    """Return the loaded ctypes library exposing bt_query_batch, or None if unavailable.

    Tries to build a multithreaded (OpenMP) kernel first, falling back to single-threaded C,
    then to None (the caller uses the numpy/Python path). The compiled library is cached by
    source hash; set BITBUDGET_NO_OMP=1 to force the single-threaded build. Forced
    single-threaded builds are cached under their own file name, so a previously cached
    OpenMP build is never loaded when BITBUDGET_NO_OMP is set.

    The result (including a failure) is memoised for the lifetime of the process.
    """
    global _LIB, _TRIED
    if _TRIED:
        return _LIB
    _TRIED = True
    src = os.path.join(os.path.dirname(__file__), "_bittrie.c")
    try:
        with open(src, "rb") as fh:
            digest = hashlib.sha1(fh.read()).hexdigest()[:12]
        cache = _cache_dir()
    except OSError:
        # missing source or unusable cache directory: the fast path is simply unavailable
        return None

    no_omp = bool(os.environ.get("BITBUDGET_NO_OMP"))
    tag = "_noomp" if no_omp else ""
    so = os.path.join(cache, f"_bittrie_{digest}{tag}.so")

    def _load(path):
        """Load ``path`` and attach the signature; drop the file and return None on failure."""
        try:
            return _set_sig(ctypes.CDLL(path))
        except (OSError, AttributeError):
            try:
                os.remove(path)
            except OSError:
                pass
            return None

    if os.path.exists(so):                  # cached: load directly
        _LIB = _load(so)
        if _LIB is not None:
            return _LIB

    cc = os.environ.get("CC") or "cc"
    variants = [([], [])] if no_omp else _variants()
    # Compile to a per-process temp name and rename into place, so another process never
    # loads a half-written library from the shared cache.
    tmp = f"{so}.{os.getpid()}.tmp"
    try:
        for cflags, lflags in variants:
            cmd = [cc, "-O3", "-shared", "-fPIC", *cflags, "-o", tmp, src, *lflags, "-lm"]
            try:
                subprocess.run(cmd, check=True, capture_output=True)
                os.replace(tmp, so)
            except (OSError, subprocess.SubprocessError):
                continue                    # no compiler / flags rejected: try the next variant
            _LIB = _load(so)                # must actually load (libomp present at runtime)
            if _LIB is not None:
                return _LIB
    finally:
        try:
            os.remove(tmp)
        except OSError:
            pass
    return None
|
bitbudget/bittrie.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""Bit-trie index: a van Emde Boas / PATRICIA-style radix trie over compact codes.
|
|
2
|
+
|
|
3
|
+
The sorted array of packed sign-codes *is* the trie: a node is a contiguous range of codes
|
|
4
|
+
sharing a bit prefix. Search is coarse-to-fine beam descent (the most discriminative bit is the
|
|
5
|
+
shallowest), then a re-ranking pass. Unsupervised (random rotation + sign), numpy-only, one knob
|
|
6
|
+
(``beam``). It stores compact codes rather than full vectors, so it sits on the organisation axis
|
|
7
|
+
at a fraction of a graph's footprint. Research entry to the BitBudget index benchmark; see the
|
|
8
|
+
projection-quantisation-organisation survey for the motivation.
|
|
9
|
+
|
|
10
|
+
This reference implementation packs into a single uint64, so ``n_bits <= 64``.
|
|
11
|
+
"""
|
|
12
|
+
import numpy as np
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class BitTrieIndex:
    """Implicit bit-trie over packed sign codes.

    ``fit`` projects the vectors, orders the projected dimensions by decreasing variance,
    packs their signs into one uint64 per vector and sorts the codes; a trie node is then a
    contiguous range of the sorted array. ``query`` / ``query_batch`` descend that trie with
    a cost-ordered beam and re-rank the surviving candidates.

    Attributes set by ``fit``: ``R`` (d x b projection), ``codes`` (sorted uint64 codes),
    ``docids`` (original row of each sorted code), ``X`` (float32 vectors), ``n``, ``d``.
    """

    def __init__(self, n_bits=64, seed=0):
        # Explicit raises instead of ``assert``: asserts are stripped under ``python -O``.
        if n_bits > 64:
            raise ValueError("this reference packs into one uint64; use a word-array for >64 bits")
        if n_bits < 1:
            raise ValueError("n_bits must be at least 1")
        self.b = n_bits
        self.seed = seed

    def fit(self, X):
        """Build the index over the rows of ``X`` (n x d) and return ``self``."""
        X = np.ascontiguousarray(X, dtype=np.float32)
        n, d = X.shape
        rng = np.random.RandomState(self.seed)
        R = rng.randn(d, self.b).astype(np.float32)     # unsupervised random rotation (RaBitQ)
        R, _ = np.linalg.qr(R) if self.b <= d else (R, None)
        P = X @ R
        order = np.argsort(-P.var(axis=0))              # coarse-to-fine: MSB = highest variance
        self.R = R[:, order]
        P = P[:, order]
        bits = (P > 0).astype(np.uint64)
        weights = (np.uint64(1) << np.arange(self.b - 1, -1, -1, dtype=np.uint64))
        codes = bits @ weights
        order_idx = np.argsort(codes, kind="stable")
        self.codes = codes[order_idx]                   # sorted: the implicit trie
        self.docids = order_idx.astype(np.int64)
        self.X = X                                      # kept only for optional float re-rank (cold)
        self.n, self.d = n, d
        return self

    def _descend(self, qbits, qconf, beam, depth):
        """Confidence-ordered beam over the sorted codes. Stop at ``depth`` bits so each surviving
        prefix is still a fat bucket (the coarse-to-fine / anytime property).

        ``depth`` is clamped to ``[0, n_bits]`` and ``beam`` to at least 1, matching the C
        kernel. Returns a list of ``(cost, lo, hi, prefix_int, depth)`` tuples.
        """
        b = self.b
        # Without the clamp a depth beyond the code width produces a negative shift below.
        depth = max(0, min(int(depth), b))
        beam = max(1, int(beam))
        state = [(0.0, 0, self.n, 0, 0)]                # (cost, lo, hi, prefix_int, depth)
        for t in range(depth):
            shift = b - t - 1
            nxt = []
            qb = int(qbits[t])
            c = float(qconf[t])
            for cost, lo, hi, pre, _ in state:
                # smallest code under this prefix whose bit t is 1: splits the range in two
                thresh = (pre << (b - t)) | (1 << shift)
                m = lo + int(np.searchsorted(self.codes[lo:hi], np.uint64(thresh), "left"))
                left = (lo, m, (pre << 1))
                right = (m, hi, (pre << 1) | 1)
                for child_bit, (clo, chi, cpre) in ((0, left), (1, right)):
                    if chi <= clo:
                        continue
                    add = 0.0 if child_bit == qb else c     # sibling costs its margin
                    nxt.append((cost + add, clo, chi, cpre, t + 1))
            nxt.sort(key=lambda e: e[0])
            state = nxt[:beam]
        return state

    def query(self, q, topk=10, beam=64, depth=28, rerank="float"):
        """Return up to ``topk`` original doc ids for one query vector, best first.

        ``rerank="float"`` scores candidates by inner product with the stored float vectors;
        any other value scores the float projection of the query against the +/-1 sign code.
        Fewer than ``topk`` ids (possibly none) are returned when the beam yields fewer
        candidates.
        """
        q = np.asarray(q, dtype=np.float32)
        p = q @ self.R
        qbits = (p > 0).astype(int)
        qconf = np.abs(p)
        ranges = self._descend(qbits, qconf, beam, depth)
        if not ranges:
            return np.empty(0, np.int64)
        cand = np.concatenate([self.docids[lo:hi] for _, lo, hi, _, _ in ranges])
        if cand.size == 0:
            return cand
        if rerank == "float":                           # DiskANN-style: cold float for re-rank only
            s = self.X[cand] @ q
        else:                                           # asymmetric: float query vs sign code, no float
            # codes[j] belongs to docids[j], so the candidates' codes are the same slices;
            # this avoids rebuilding an O(n) inverse permutation per query.
            cand_codes = np.concatenate([self.codes[lo:hi] for _, lo, hi, _, _ in ranges])
            shifts = np.arange(self.b - 1, -1, -1, dtype=np.uint64)
            sign = np.where(((cand_codes[:, None] >> shifts) & np.uint64(1)) > 0, 1.0, -1.0)
            s = sign @ p
        return cand[np.argsort(-s)[:topk]]

    def query_batch(self, Q, topk=10, beam=64, depth=28):
        """Top-k for every query in Q. Uses the compiled C kernel when available (one to two
        orders of magnitude faster), else falls back to the pure-Python per-query path.

        Always returns an int64 array of shape ``(len(Q), topk)``; rows with fewer than
        ``topk`` candidates are padded with -1 on both paths.
        """
        Q = np.ascontiguousarray(Q, dtype=np.float32)
        if Q.ndim != 2 or Q.shape[1] != self.d:
            # the C kernel reads raw pointers with a stride of self.d floats per query
            raise ValueError(f"Q must have shape (nq, {self.d}), got {Q.shape}")
        depth = min(depth, self.b)
        out = np.full((len(Q), topk), -1, dtype=np.int64)
        if len(Q) == 0 or topk == 0:
            return out
        try:
            from ._bittrie_build import get_lib
            lib = get_lib()
        except ImportError:
            lib = None                                  # build helper unavailable: Python path
        if lib is None:
            for i, q in enumerate(Q):
                ids = self.query(q, topk, beam, depth, rerank="float")
                out[i, :len(ids)] = ids
            return out
        import ctypes
        qproj = np.ascontiguousarray(Q @ self.R, dtype=np.float32)
        Xf = np.ascontiguousarray(self.X, dtype=np.float32)
        cast = lambda a, t: a.ctypes.data_as(ctypes.POINTER(t))
        lib.bt_query_batch(
            cast(self.codes, ctypes.c_uint64), cast(self.docids, ctypes.c_int64),
            cast(Xf, ctypes.c_float), cast(qproj, ctypes.c_float), cast(Q, ctypes.c_float),
            self.n, self.d, self.b, len(Q), int(depth), int(beam), int(topk),
            cast(out, ctypes.c_int64))
        return out

    def codes_full(self, cand):
        """Return the packed code of each original doc id in ``cand``."""
        inv = np.empty(self.n, np.int64)
        inv[self.docids] = np.arange(self.n)            # original id -> position in sorted order
        return self.codes[inv[cand]]

    def index_bytes(self):
        """Bytes that must be in RAM to route (codes + leaf ids); the trie is implicit in the sort."""
        return self.codes.nbytes + self.docids.nbytes
|
bitbudget/cli.py
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
"""BitBudget command line.
|
|
2
|
+
|
|
3
|
+
bitbudget methods # list registered compression methods
|
|
4
|
+
bitbudget embedders # list registered embedders
|
|
5
|
+
bitbudget embed --embedder mxbai --corpus scifact # torch step -> cached embeddings
|
|
6
|
+
bitbudget eval --embedder mxbai --corpus scifact # numpy step -> results card
|
|
7
|
+
bitbudget run --embedder mxbai --corpus scifact nfcorpus arguana fiqa # embed + eval
|
|
8
|
+
bitbudget leaderboard results/*.json # aggregate cards into a markdown table
|
|
9
|
+
bitbudget indexes # list registered indexes (organisation axis)
|
|
10
|
+
bitbudget bench-index --synthetic 100000 128 # recall vs QPS vs bytes (flat/hnsw/ivfpq/bittrie)
|
|
11
|
+
|
|
12
|
+
`run` embeds (torch) and evaluates (numpy) in one process, which is safe because the core
|
|
13
|
+
methods use no faiss. The faiss-backed indexes in `bench-index` import faiss only when run, so
|
|
14
|
+
run it in its own process (no torch) to avoid the OpenMP clash.
|
|
15
|
+
"""
|
|
16
|
+
import argparse
|
|
17
|
+
import glob
|
|
18
|
+
import json
|
|
19
|
+
import os
|
|
20
|
+
|
|
21
|
+
import numpy as np
|
|
22
|
+
|
|
23
|
+
from . import datasets
|
|
24
|
+
from .eval import evaluate, aggregate
|
|
25
|
+
from .methods import list_methods, METHODS
|
|
26
|
+
from .indexes import bench_indexes, list_indexes, INDEXES
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _emb_path(cache, embedder, corpus):
|
|
30
|
+
return os.path.join(os.path.expanduser(cache), f"emb_{embedder}_{corpus}.npz")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def cmd_embed(a):
    """Embed documents and queries of every requested corpus and cache them as .npz.

    Each archive stores ``demb``/``qemb`` (embeddings), ``cids``/``qids`` (their ids, in
    row order) and ``qrels`` (a JSON string of the judgements, restricted to documents that
    are present in the corpus).
    """
    from .embedders import EMBEDDERS   # lazy: imports torch
    encode = EMBEDDERS[a.embedder]
    for corpus in a.corpus:
        docs, queries, qrels = datasets.load(corpus, a.cache, a.split)
        doc_ids = list(docs)
        query_ids = list(queries)
        print(f"[embed] {a.embedder} x {corpus}: {len(doc_ids)} docs, {len(query_ids)} queries")
        doc_texts = [docs[c] for c in doc_ids]
        demb = encode(doc_texts, is_query=False, device=a.device, batch_size=a.batch_size)
        query_texts = [queries[q] for q in query_ids]
        qemb = encode(query_texts, is_query=True, device=a.device, batch_size=a.batch_size)
        os.makedirs(os.path.expanduser(a.cache), exist_ok=True)
        known_docs = set(doc_ids)
        kept_qrels = {}
        for q in query_ids:
            # drop judgements that point at documents missing from the corpus
            kept_qrels[q] = {d: s for d, s in qrels[q].items() if d in known_docs}
        target = _emb_path(a.cache, a.embedder, corpus)
        np.savez(target, demb=demb, qemb=qemb,
                 cids=np.array(doc_ids), qids=np.array(query_ids),
                 qrels=json.dumps(kept_qrels))
        print(f"[embed] cached -> {target} {demb.shape}")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _load_emb(a, corpus):
    """Load the cached embedding archive for ``a.embedder`` x ``corpus``.

    Returns ``(demb, qemb, cids, qids, qrels)`` where ``qrels`` is decoded from the JSON
    string stored in the archive. The archive is closed before returning; every array is
    read into memory inside the ``with`` block.
    """
    # NOTE(security): allow_pickle=True lets numpy unpickle object arrays, which executes
    # code from the file. Kept for compatibility with existing caches; only load archives
    # you produced yourself.
    with np.load(_emb_path(a.cache, a.embedder, corpus), allow_pickle=True) as z:
        return z["demb"], z["qemb"], z["cids"], z["qids"], json.loads(str(z["qrels"]))
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def cmd_eval(a, _embed_first=False):
    """Score the selected methods on every corpus, print the tables and write a results card.

    With ``_embed_first`` each corpus is embedded (via ``cmd_embed``) right before it is
    evaluated. The card is written to ``<a.out>/card_<embedder>.json`` and also returned;
    its ``dim`` is the embedding dimension of the first corpus.
    """
    if not a.corpus:
        raise SystemExit("eval needs at least one --corpus")
    methods = a.methods or list_methods()
    per_corpus = {}
    dim = None
    for corpus in a.corpus:
        if _embed_first:
            cmd_embed(argparse.Namespace(**{**vars(a), "corpus": [corpus]}))
        demb, qemb, cids, qids, qrels = _load_emb(a, corpus)
        if dim is None:
            # remember the first corpus' dimension instead of reloading its archive later
            dim = int(demb.shape[1])
        rows = evaluate(demb, qemb, cids, qids, qrels, methods)
        per_corpus[corpus] = rows
        print(f"\n=== {a.embedder} x {corpus} ({demb.shape[1]}-d) ===")
        print(f" {'method':16s}{'bytes':>8s}{'nDCG@10':>9s}{'recall@10':>11s}{'% float':>9s}")
        for r in rows:
            print(f" {r['method']:16s}{r['bytes']:8.0f}{r['ndcg']:9.3f}{r['recall']:11.3f}{r['pct_float']:8.0f}%")
    os.makedirs(a.out, exist_ok=True)
    card = dict(embedder=a.embedder, corpora=a.corpus,
                dim=dim,
                per_corpus=per_corpus, aggregate=aggregate(per_corpus))
    path = os.path.join(a.out, f"card_{a.embedder}.json")
    with open(path, "w") as fh:
        json.dump(card, fh, indent=2)
    print(f"\n=== aggregate over {len(a.corpus)} corpora (mean +/- std) ===")
    for r in card["aggregate"]:
        print(f" {r['method']:16s}{r['bytes']:8.0f}B nDCG {r['ndcg']:.3f} "
              f"{r['pct_float']:.0f}+/-{r['pct_float_std']:.0f}% of float")
    print(f"\nwrote results card -> {path}")
    return card
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def cmd_methods(a):
    """Print every registered compression method with its axis and bit width."""
    print("registered methods (name | axis | bits):")
    for name in list_methods():
        method_fn = METHODS[name]
        axis = getattr(method_fn, 'axis', '?')
        print(f" {name:16s} {axis:12s} {method_fn.bits}")
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def cmd_indexes(a):
    """Print every registered index with its axis ("organisation" when it declares none)."""
    print("registered indexes (name | axis):")
    for name in list_indexes():
        axis = getattr(INDEXES[name], 'axis', 'organisation')
        print(f" {name:14s} {axis}")
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def cmd_bench_index(a):
    """Build each index over document vectors and report recall@k, QPS and bytes/vec.

    faiss (its OpenMP) and the bit-trie (libomp for the C kernel) cannot share one process on
    macOS (two OpenMP runtimes). When both groups are requested we run each in its own subprocess
    and merge; the data is deterministic from the arguments, so the split is transparent.

    Writes ``<a.out>/index_card.json`` and returns the card dict.
    """
    sel = a.indexes or list_indexes()
    # NOTE(review): indexes whose ``axis`` is exactly "organisation" are treated as the
    # faiss-backed group; an index with no ``axis`` attribute lands in the other group here,
    # although ``cmd_indexes`` displays it as "organisation" — confirm that is intended.
    faiss_grp = [n for n in sel if getattr(INDEXES[n], "axis", "") == "organisation"]
    other_grp = [n for n in sel if n not in faiss_grp]
    if faiss_grp and other_grp and not getattr(a, "no_split", False):
        import subprocess
        import sys
        import tempfile
        merged, meta = [], {}
        for grp in (faiss_grp, other_grp):
            # the context manager removes the scratch dir even when the child process fails
            with tempfile.TemporaryDirectory() as td:
                cmd = [sys.executable, "-m", "bitbudget.cli", "bench-index",
                       *_bench_data_args(a), "--k", str(a.k), "--out", td, "--no-split",
                       "--indexes", *grp]
                subprocess.run(cmd, check=True, env=os.environ.copy())
                with open(os.path.join(td, "index_card.json")) as fh:
                    c = json.load(fh)
            merged += c["rows"]
            meta = c
        merged.sort(key=lambda r: r["bytes"])
        os.makedirs(a.out, exist_ok=True)
        card = dict(source=meta["source"], k=a.k, dim=meta["dim"], n=meta["n"], rows=merged)
        with open(os.path.join(a.out, "index_card.json"), "w") as fh:
            json.dump(card, fh, indent=2)
        print("\n=== board (faiss + bit-trie run in separate processes) ===")
        for r in merged:
            print(f" {r['method']:14s}{r['bytes']:8.0f}B recall@{a.k}={r['recall']:.3f} {r['qps']:9.0f} qps")
        print(f"wrote index card -> {os.path.join(a.out, 'index_card.json')}")
        return card

    xb, xq, label = _bench_load(a)
    print(f"[bench-index] {label}: base={tuple(xb.shape)}, queries={len(xq)}, k={a.k}")
    rows = bench_indexes(xb, xq, k=a.k, which=sel)
    os.makedirs(a.out, exist_ok=True)
    card = dict(source=label, k=a.k, dim=int(xb.shape[1]), n=int(len(xb)), rows=rows)
    path = os.path.join(a.out, "index_card.json")
    with open(path, "w") as fh:
        json.dump(card, fh, indent=2)
    print(f"\nwrote index card -> {path}")
    return card
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _bench_data_args(a):
|
|
142
|
+
if a.synthetic:
|
|
143
|
+
return ["--synthetic", str(a.synthetic[0]), str(a.synthetic[1])]
|
|
144
|
+
if a.npz:
|
|
145
|
+
return ["--npz", a.npz]
|
|
146
|
+
return ["--embedder", a.embedder, "--corpus", a.corpus, "--cache", a.cache]
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _bench_load(a):
|
|
150
|
+
if a.synthetic:
|
|
151
|
+
N, D = a.synthetic
|
|
152
|
+
rng = np.random.RandomState(0)
|
|
153
|
+
centers = rng.randn(max(2, N // 500), D).astype(np.float32) * 4.0
|
|
154
|
+
xb = centers[rng.randint(0, len(centers), N)] + rng.randn(N, D).astype(np.float32)
|
|
155
|
+
nq = min(10000, max(1, N // 10)) # enough queries to time throughput stably
|
|
156
|
+
xq = xb[rng.choice(N, nq, replace=False)] + 0.1 * rng.randn(nq, D).astype(np.float32)
|
|
157
|
+
return xb, xq, f"synthetic N={N} D={D}"
|
|
158
|
+
if a.npz:
|
|
159
|
+
z = np.load(os.path.expanduser(a.npz), allow_pickle=True)
|
|
160
|
+
xb = z["base"] if "base" in z.files else z["demb"]
|
|
161
|
+
xq = z["query"] if "query" in z.files else z["qemb"]
|
|
162
|
+
return xb, xq, a.npz
|
|
163
|
+
if a.embedder and a.corpus:
|
|
164
|
+
z = np.load(_emb_path(a.cache, a.embedder, a.corpus), allow_pickle=True)
|
|
165
|
+
return z["demb"], z["qemb"], f"{a.embedder} x {a.corpus}"
|
|
166
|
+
raise SystemExit("bench-index needs one of: --synthetic N D | --npz PATH | --embedder X --corpus Y")
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def cmd_embedders(a):
    """Print every registered embedder with its dimension, matryoshka flag and model name."""
    from .embedders import list_embedders, EMBEDDERS
    print("registered embedders (name | dim | matryoshka):")
    for name in list_embedders():
        entry = EMBEDDERS[name]
        print(f" {name:10s} dim={entry.dim} matryoshka={entry.matryoshka} ({entry.model})")
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def cmd_leaderboard(a):
    """Print a markdown leaderboard built from the results cards matching ``a.cards``.

    ``a.cards`` is a list of glob patterns. Cards are listed by decreasing embedding
    dimension; matches are read in sorted path order so equal-dimension cards print in a
    stable order.
    """
    cards = []
    for pat in a.cards:
        for p in sorted(glob.glob(pat)):
            with open(p) as fh:
                cards.append(json.load(fh))
    print("# BitBudget leaderboard\n")
    for c in sorted(cards, key=lambda c: -c["dim"]):
        print(f"## {c['embedder']} ({c['dim']}-d), mean over {len(c['corpora'])} corpora\n")
        print("| Method | Axis | Bytes/vec | nDCG@10 | % of float |")
        print("|---|---|---|---|---|")
        for r in c["aggregate"]:
            print(f"| {r['method']} | {r['axis']} | {r['bytes']:.0f} | "
                  f"{r['ndcg']:.3f} | {r['pct_float']:.0f} ± {r['pct_float_std']:.0f} |")
        print()
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def main(argv=None):
    """Parse ``argv`` (defaults to ``sys.argv[1:]``) and run the selected sub-command."""
    parser = argparse.ArgumentParser(prog="bitbudget", description="recall-per-byte benchmark for embedding compression")
    commands = parser.add_subparsers(dest="cmd", required=True)

    # embed / eval / run share one option set
    for name in ("embed", "eval", "run"):
        shared = commands.add_parser(name)
        shared.add_argument("--embedder", required=True)
        shared.add_argument("--corpus", nargs="+", default=datasets.CORPORA)
        shared.add_argument("--methods", nargs="+", default=None)
        shared.add_argument("--cache", default="~/.cache/bitbudget")
        shared.add_argument("--out", default="results")
        shared.add_argument("--device", default="cpu", help="cpu | mps (embedding step)")
        shared.add_argument("--batch-size", type=int, default=64)
        shared.add_argument("--split", default="test")

    for name in ("methods", "embedders", "indexes"):
        commands.add_parser(name)

    leaderboard = commands.add_parser("leaderboard")
    leaderboard.add_argument("cards", nargs="+")

    bench = commands.add_parser("bench-index")
    bench.add_argument("--synthetic", nargs=2, type=int, metavar=("N", "D"),
                       help="benchmark on N clustered Gaussian vectors of dimension D")
    bench.add_argument("--npz", help="load base/query (keys base,query or demb,qemb) from an .npz")
    bench.add_argument("--embedder", help="use a cached embedding (with --corpus): demb as base")
    bench.add_argument("--corpus")
    bench.add_argument("--cache", default="~/.cache/bitbudget")
    bench.add_argument("--k", type=int, default=10)
    bench.add_argument("--indexes", nargs="+", default=None, help="subset of indexes to run")
    bench.add_argument("--out", default="results")
    bench.add_argument("--no-split", dest="no_split", action="store_true",
                       help="run all indexes in one process (do not isolate faiss from the bit-trie)")

    args = parser.parse_args(argv)
    handlers = {
        "embed": cmd_embed,
        "eval": cmd_eval,
        "run": lambda ns: cmd_eval(ns, _embed_first=True),   # embed each corpus, then evaluate
        "methods": cmd_methods,
        "embedders": cmd_embedders,
        "indexes": cmd_indexes,
        "leaderboard": cmd_leaderboard,
        "bench-index": cmd_bench_index,
    }
    handler = handlers.get(args.cmd)
    if handler is not None:
        handler(args)
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
if __name__ == "__main__":
|
|
231
|
+
main()
|
bitbudget/datasets.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""BEIR corpus loading. Datasets auto-download to the cache dir on first use."""
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import shutil
|
|
5
|
+
import urllib.request
|
|
6
|
+
import zipfile
|
|
7
|
+
|
|
8
|
+
BEIR_URL = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip"
|
|
9
|
+
# the standard small/medium BEIR corpora the leaderboard is defined over
|
|
10
|
+
CORPORA = ["scifact", "nfcorpus", "arguana", "fiqa"]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _download(name, cache):
|
|
14
|
+
d = os.path.join(cache, name)
|
|
15
|
+
if not os.path.isdir(d):
|
|
16
|
+
os.makedirs(cache, exist_ok=True)
|
|
17
|
+
z = os.path.join(cache, name + ".zip")
|
|
18
|
+
if not os.path.exists(z):
|
|
19
|
+
req = urllib.request.Request(BEIR_URL.format(name), headers={"User-Agent": "Mozilla/5.0"})
|
|
20
|
+
with urllib.request.urlopen(req) as r, open(z + ".part", "wb") as f:
|
|
21
|
+
shutil.copyfileobj(r, f)
|
|
22
|
+
os.replace(z + ".part", z)
|
|
23
|
+
with zipfile.ZipFile(z) as zf:
|
|
24
|
+
zf.extractall(cache)
|
|
25
|
+
return d
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def load(name, cache="~/.cache/bitbudget", split="test"):
    """Load one BEIR dataset, downloading it into ``cache`` on first use.

    Returns ``(corpus, queries, qrels)``:
      * ``corpus``: doc id -> "title text" (stripped);
      * ``queries``: query id -> text, restricted to queries that have judgements;
      * ``qrels``: query id -> {doc id: score}, keeping only scores > 0.

    Files are read as UTF-8 regardless of the platform's default encoding, and blank lines
    are skipped.
    """
    cache = os.path.expanduser(cache)
    d = _download(name, cache)
    corpus = {}
    with open(os.path.join(d, "corpus.jsonl"), encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            o = json.loads(line)
            # ``or ""`` also covers an explicit JSON null title/text
            corpus[o["_id"]] = ((o.get("title") or "") + " " + (o.get("text") or "")).strip()
    queries = {}
    with open(os.path.join(d, "queries.jsonl"), encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            o = json.loads(line)
            queries[o["_id"]] = o["text"]
    qrels = {}
    with open(os.path.join(d, "qrels", split + ".tsv"), encoding="utf-8") as f:
        next(f, None)                       # skip the header row (tolerate an empty file)
        for line in f:
            if not line.strip():
                continue
            qid, did, score = line.strip().split("\t")
            if int(score) > 0:
                qrels.setdefault(qid, {})[did] = int(score)
    queries = {q: queries[q] for q in qrels if q in queries}
    return corpus, queries, qrels
|
bitbudget/embedders.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Embedders. A named embedder turns a list of texts into a float32 matrix. These import
|
|
2
|
+
sentence-transformers (torch) and so must run in a process that does NOT also import faiss
|
|
3
|
+
(OpenMP clash on macOS); the CLI keeps embedding and faiss-backed evaluation in separate
|
|
4
|
+
processes. Register your own with ``@embedder``.
|
|
5
|
+
"""
|
|
6
|
+
EMBEDDERS = {}
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def embedder(name, model=None, dim=None, query_prompt="", matryoshka=False):
    """Register an embedder. `query_prompt` is prepended to queries only (asymmetric models).

    Used as a decorator: the function is tagged with ``bitbudget_name`` and the given
    metadata, stored in ``EMBEDDERS`` under ``name``, and returned unchanged otherwise.
    """
    metadata = (
        ("bitbudget_name", name),
        ("model", model),
        ("dim", dim),
        ("query_prompt", query_prompt),
        ("matryoshka", matryoshka),
    )

    def deco(fn):
        for attribute, value in metadata:
            setattr(fn, attribute, value)
        EMBEDDERS[name] = fn
        return fn

    return deco
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def list_embedders():
    """Return the names of all registered embedders, sorted."""
    names = list(EMBEDDERS)
    names.sort()
    return names
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _st(model_name, device, max_seq_length):
    """Instantiate a SentenceTransformer, optionally overriding its max sequence length.

    sentence-transformers is imported here rather than at module import time.
    A falsy `max_seq_length` leaves the model's own setting untouched.
    """
    from sentence_transformers import SentenceTransformer

    # NOTE(review): trust_remote_code=True allows code shipped with the model
    # repository to run locally -- confirm this is intended for every model name
    # that can reach this helper.
    model = SentenceTransformer(model_name, device=device, trust_remote_code=True)
    if not max_seq_length:
        return model
    model.max_seq_length = max_seq_length
    return model
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _make_st_embedder(name, model_name, dim, query_prompt, matryoshka, max_seq_length=256):
    """Create and register a sentence-transformers embedder under `name`.

    The registered function ``fn(texts, is_query=False, device="cpu",
    batch_size=64)`` returns the normalised embeddings of `texts` as float32.
    When `is_query` is true and `query_prompt` is non-empty, the prompt is
    prepended to every text before encoding.

    The model is built lazily on first use and then kept per device, so
    embedding a corpus and its queries in one process builds it only once.
    """
    loaded = {}  # str(device) -> model instance, shared across calls of `fn`

    @embedder(name, model=model_name, dim=dim, query_prompt=query_prompt, matryoshka=matryoshka)
    def fn(texts, is_query=False, device="cpu", batch_size=64):
        import numpy as np

        key = str(device)
        if key not in loaded:
            loaded[key] = _st(model_name, device, max_seq_length)
        m = loaded[key]
        if is_query and query_prompt:
            texts = [query_prompt + t for t in texts]
        return m.encode(texts, batch_size=batch_size, normalize_embeddings=True,
                        show_progress_bar=True).astype(np.float32)

    return fn
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# built-in embedders used in the paper
_make_st_embedder(
    "minilm",
    "sentence-transformers/all-MiniLM-L6-v2",
    dim=384,
    query_prompt="",
    matryoshka=False,
)
_make_st_embedder(
    "mxbai",
    "mixedbread-ai/mxbai-embed-large-v1",
    dim=1024,
    query_prompt="Represent this sentence for searching relevant passages: ",
    matryoshka=True,
)
|
bitbudget/eval.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""The BitBudget protocol: given document/query embeddings and judgements, score every
|
|
2
|
+
registered compression method on retrieval quality against the bytes it stores per vector."""
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
from .methods import METHODS
|
|
6
|
+
from .metrics import ndcg_at_k, recall_at_k
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def evaluate(demb, qemb, cids, qids, qrels, methods=None, recall_k=10, ndcg_k=10):
    """Score compression methods and return one result dict per method.

    `methods` defaults to every name in ``METHODS``. Each row has the keys
    ``method, axis, bytes, ndcg, recall, pct_float`` and rows are returned in
    ascending order of ``bytes``. nDCG is computed against `qrels`; recall is
    measured against the top-k of the exact ``qemb @ demb.T`` scores;
    ``pct_float`` is the method's nDCG as a percentage of the nDCG of those
    exact scores (NaN when that baseline is zero).
    """
    selected = methods or list(METHODS)
    exact = qemb @ demb.T  # uncompressed scores: recall ground truth and nDCG baseline
    base = ndcg_at_k(exact, qids, cids, qrels, ndcg_k)

    def score_one(name):
        fn = METHODS[name]
        scores, bpv = fn(demb, qemb)
        ndcg = ndcg_at_k(scores, qids, cids, qrels, ndcg_k)
        recall = recall_at_k(scores, exact, recall_k)
        if base:
            pct = round(100 * ndcg / base, 1)
        else:
            pct = float("nan")
        return {
            "method": name,
            "axis": getattr(fn, "axis", "?"),
            "bytes": float(bpv),
            "ndcg": round(ndcg, 4),
            "recall": round(recall, 4),
            "pct_float": pct,
        }

    rows = [score_one(name) for name in selected]
    rows.sort(key=lambda row: row["bytes"])
    return rows
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def aggregate(per_corpus):
    """Summarise per-corpus rows into one row per method.

    `per_corpus` maps a corpus name to its list of result rows. For each method
    the output row carries the mean and (population) standard deviation of
    ``ndcg`` and ``pct_float`` across corpora, the ``axis`` and ``bytes`` of the
    first row seen for that method, and ``n_corpora``. Rows are sorted by
    ``bytes``.
    """
    grouped = {}
    for rows in per_corpus.values():
        for row in rows:
            grouped.setdefault(row["method"], []).append(row)

    summary = []
    for name, group in grouped.items():
        first = group[0]
        ndcg = np.array([row["ndcg"] for row in group])
        pct = np.array([row["pct_float"] for row in group])
        summary.append({
            "method": name,
            "axis": first["axis"],
            "bytes": first["bytes"],
            "ndcg": round(float(ndcg.mean()), 3),
            "ndcg_std": round(float(ndcg.std()), 3),
            "pct_float": round(float(pct.mean()), 1),
            "pct_float_std": round(float(pct.std()), 1),
            "n_corpora": len(group),
        })
    summary.sort(key=lambda row: row["bytes"])
    return summary
|
bitbudget/indexes.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""The organisation axis: build an index over document vectors and measure recall@k, query
|
|
2
|
+
throughput (QPS) and the bytes it stores per vector.
|
|
3
|
+
|
|
4
|
+
This is the axis the compression leaderboard does not cover. A graph index such as HNSW does not
|
|
5
|
+
shrink the footprint, it *adds* bytes (the vectors plus a graph) and buys throughput; a compact
|
|
6
|
+
code such as the bit-trie shrinks it. Reporting recall, QPS and bytes together makes the
|
|
7
|
+
trade-off explicit, in the spirit of ann-benchmarks.
|
|
8
|
+
|
|
9
|
+
Register an index with ``@index``. The faiss-backed indexes (flat/hnsw/ivfpq) import faiss lazily
|
|
10
|
+
and are skipped with a message if faiss is not installed; the bit-trie is numpy-only. Vectors are
|
|
11
|
+
L2-normalised on ingest so inner-product and L2 rankings agree. Run in its own process (no torch)
|
|
12
|
+
to avoid the faiss/OpenMP clash (see README).
|
|
13
|
+
"""
|
|
14
|
+
import time
|
|
15
|
+
import numpy as np
|
|
16
|
+
|
|
17
|
+
# Registry of indexes: name -> callable, populated by the ``@index`` decorator.
INDEXES = {}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def index(name, axis="organisation"):
    """Return a decorator that registers an index function in ``INDEXES``.

    An index is ``fn(xb, xq, k) -> (topk_indices, bytes_per_vec, qps)``. The
    decorated function is tagged with ``bitbudget_name`` and ``axis``.
    """
    def deco(fn):
        for attr, value in (("bitbudget_name", name), ("axis", axis)):
            setattr(fn, attr, value)
        INDEXES[name] = fn
        return fn

    return deco
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def list_indexes():
    """Return the names of all registered indexes, sorted."""
    names = list(INDEXES)
    names.sort()
    return names
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _time(search, repeats=3):
|
|
35
|
+
best, I = float("inf"), None
|
|
36
|
+
for _ in range(repeats):
|
|
37
|
+
t = time.time(); I = search(); best = min(best, time.time() - t)
|
|
38
|
+
return I, best
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# ----------------------------------------------------------------- faiss-backed (organisation)
|
|
42
|
+
@index("flat")
|
|
43
|
+
def flat(xb, xq, k):
    """Exact inner-product search over full float vectors via ``faiss.IndexFlatIP``.

    Returns ``(topk_indices, bytes_per_vec, qps)`` with ``bytes_per_vec = 4 * D``.
    """
    import faiss

    dim = xb.shape[1]
    ix = faiss.IndexFlatIP(dim)
    ix.add(xb)

    def search():
        _, ids = ix.search(xq, k)
        return ids

    ids, seconds = _time(search)
    return ids, 4 * dim, len(xq) / seconds
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@index("hnsw")
|
|
52
|
+
def hnsw(xb, xq, k, M=24, efc=200, efs=128):
    """Inner-product HNSW graph index (``faiss.IndexHNSWFlat``).

    `M` is the graph degree parameter handed to faiss, `efc` sets
    ``efConstruction`` before the vectors are added and `efs` sets ``efSearch``
    before querying. The reported footprint is the float vector plus an
    approximate edge cost of ``M * 2 * 4`` bytes.
    """
    import faiss

    dim = xb.shape[1]
    graph = faiss.IndexHNSWFlat(dim, M, faiss.METRIC_INNER_PRODUCT)
    graph.hnsw.efConstruction = efc
    graph.add(xb)
    graph.hnsw.efSearch = efs

    def search():
        _, ids = graph.search(xq, k)
        return ids

    ids, seconds = _time(search)
    footprint = 4 * dim + M * 2 * 4  # vectors + graph edges (approximate)
    return ids, footprint, len(xq) / seconds
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@index("ivfpq")
|
|
62
|
+
def ivfpq(xb, xq, k, nprobe=64):
    """IVF + product-quantisation index (``faiss.IndexIVFPQ``, inner product).

    The number of sub-quantisers ``m`` is the largest divisor of ``D`` that is
    at most ``D // 2`` (at least 1), each coded on 8 bits, so the reported
    footprint is ``m`` bytes per vector. ``nlist`` is ``n // 100`` clamped to
    ``[1, 8192]``; training uses a fixed-seed sample of at most 200000 vectors.

    Returns ``(topk_indices, bytes_per_vec, qps)``.
    """
    import faiss

    n, D = xb.shape
    # max(1, ...) guards D == 1, where D // 2 == 0 would make `D % m` divide by zero
    m = max(1, D // 2)
    while D % m:
        m -= 1
    nlist = max(1, min(8192, n // 100))
    ix = faiss.IndexIVFPQ(faiss.IndexFlatIP(D), D, nlist, m, 8, faiss.METRIC_INNER_PRODUCT)
    samp = xb[np.random.RandomState(0).choice(n, min(200000, n), replace=False)]
    ix.train(samp)
    ix.add(xb)
    ix.nprobe = min(nprobe, nlist)  # cannot probe more lists than exist
    I, t = _time(lambda: ix.search(xq, k)[1])
    return I, float(m), len(xq) / t  # PQ code bytes
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# ----------------------------------------------------------------- numpy bit-trie (research entry)
|
|
77
|
+
@index("bittrie", axis="quantisation")
|
|
78
|
+
def bittrie(xb, xq, k, nbits=64, beam=128, depth=24):
    """Benchmark the package's ``BitTrieIndex`` with `nbits`-bit codes.

    Returns ``(topk_indices, bytes_per_vec, qps)`` where the footprint is the
    compact code only, ``nbits / 8`` bytes.
    """
    from .bittrie import BitTrieIndex

    ix = BitTrieIndex(n_bits=nbits, seed=0).fit(xb)
    query_args = dict(topk=k, beam=beam, depth=depth)

    # untimed warm-up on two queries (per the original note: compiles the C kernel once)
    ix.query_batch(xq[:2], **query_args)

    def search():
        return ix.query_batch(xq, **query_args)

    ids, seconds = _time(search, repeats=2)
    return ids, nbits / 8.0, len(xq) / seconds
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# ----------------------------------------------------------------- harness
|
|
87
|
+
def _exact(xb, xq, k, chunk=200000):
|
|
88
|
+
"""Exact top-k by inner product, chunked over documents to bound memory. The chunk shrinks
|
|
89
|
+
as the query count grows so the (nq x chunk) score block stays small regardless of nq."""
|
|
90
|
+
nq = len(xq)
|
|
91
|
+
chunk = max(1024, min(chunk, 20_000_000 // max(1, nq))) # keep nq*chunk ~ 20M elements
|
|
92
|
+
best_s = np.full((nq, k), -1e30, np.float32)
|
|
93
|
+
best_i = np.zeros((nq, k), np.int64)
|
|
94
|
+
for s in range(0, len(xb), chunk):
|
|
95
|
+
e = min(s + chunk, len(xb))
|
|
96
|
+
sc = xq @ xb[s:e].T
|
|
97
|
+
cs = np.concatenate([best_s, sc], 1)
|
|
98
|
+
ci = np.concatenate([best_i, np.broadcast_to(np.arange(s, e), (nq, e - s))], 1)
|
|
99
|
+
part = np.argpartition(-cs, k - 1, axis=1)[:, :k]
|
|
100
|
+
best_s = np.take_along_axis(cs, part, 1)
|
|
101
|
+
best_i = np.take_along_axis(ci, part, 1)
|
|
102
|
+
order = np.argsort(-best_s, 1)
|
|
103
|
+
return np.take_along_axis(best_i, order, 1)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def bench_indexes(xb, xq, k=10, gt=None, which=None, verbose=True):
    """Build each index and report recall@k, QPS and bytes/vec, sorted by footprint.

    Args:
        xb, xq: document and query vectors; they are copied to contiguous
            float32 and L2-normalised, and the caller's arrays are left
            untouched.
        k: neighbours per query.
        gt: optional ground-truth top-k indices; computed with ``_exact`` on the
            normalised vectors when omitted.
        which: index names to run (default: every registered index).
        verbose: print one line per index, including skipped ones.

    Returns:
        A list of dicts ``{method, axis, bytes, recall, qps}``. Indexes whose
        dependencies fail to import are skipped.
    """
    # np.array always copies. np.ascontiguousarray returns the *same* object for
    # input that is already contiguous float32, so the in-place normalisation
    # below would silently rescale the caller's data.
    xb = np.array(xb, dtype=np.float32, order="C")
    xq = np.array(xq, dtype=np.float32, order="C")
    xb /= (np.linalg.norm(xb, axis=1, keepdims=True) + 1e-9)
    xq /= (np.linalg.norm(xq, axis=1, keepdims=True) + 1e-9)
    if gt is None:
        gt = _exact(xb, xq, k)
    # the ground-truth sets do not depend on the index: build them once
    gt_sets = [set(gt[i][:k].tolist()) for i in range(len(xq))]
    rows = []
    # run faiss-backed indexes before the numpy bit-trie: the bit-trie loads its own OpenMP
    # runtime (libomp) for the C kernel, which clashes with faiss's OpenMP if faiss runs after it
    # in the same process (the macOS two-runtimes problem). Ordering it last avoids the clash.
    names = sorted(which or list_indexes(), key=lambda n: (n == "bittrie", n))
    for name in names:
        fn = INDEXES[name]
        try:
            I, bpv, qps = fn(xb, xq, k)
        except ImportError:
            if verbose:
                print(f"  [skip {name}: install bitbudget[faiss]]")
            continue
        rec = float(np.mean([len(set(I[i][:k].tolist()) & gt_sets[i]) / k
                             for i in range(len(xq))]))
        rows.append(dict(method=name, axis=getattr(fn, "axis", "organisation"),
                         bytes=float(bpv), recall=round(rec, 4), qps=round(float(qps), 1)))
        if verbose:
            print(f"  {name:14s}{bpv:8.0f}B  recall@{k}={rec:.3f}  {qps:9.0f} qps")
    rows.sort(key=lambda r: r["bytes"])
    return rows
|
bitbudget/methods.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""Compression methods. A method maps float document/query embeddings to a
|
|
2
|
+
query-by-document similarity matrix under some compression, and reports the bytes it stores
|
|
3
|
+
per document vector. Register your own with the ``@method`` decorator.
|
|
4
|
+
|
|
5
|
+
@method("my-method", bits=2)
|
|
6
|
+
def my_method(demb, qemb):
|
|
7
|
+
...
|
|
8
|
+
return scores, bytes_per_vec # scores: (n_queries, n_docs); bytes: float
|
|
9
|
+
|
|
10
|
+
Everything here is numpy-only so that evaluation runs in the same process as torch embedding
|
|
11
|
+
without the faiss/OpenMP clash (see README). faiss-backed methods live in methods_faiss.py and
|
|
12
|
+
run in a separate evaluation process.
|
|
13
|
+
"""
|
|
14
|
+
import numpy as np
|
|
15
|
+
|
|
16
|
+
# Registry of compression methods: name -> callable, populated by the ``@method`` decorator.
METHODS = {}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def method(name, bits=None, axis="quantisation"):
    """Return a decorator that registers a compression method in ``METHODS``.

    `bits` and `axis` are stored on the function as leaderboard metadata,
    alongside ``bitbudget_name``.
    """
    tags = {"bitbudget_name": name, "bits": bits, "axis": axis}

    def deco(fn):
        for attr, value in tags.items():
            setattr(fn, attr, value)
        METHODS[name] = fn
        return fn

    return deco
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def list_methods():
    """Return the names of all registered compression methods, sorted."""
    names = list(METHODS)
    names.sort()
    return names
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# ---------------------------------------------------------------- baselines / quantisation
|
|
35
|
+
@method("float32", bits=32, axis="none")
|
|
36
|
+
def float32(demb, qemb):
    """Uncompressed baseline: exact inner products, ``4 * D`` bytes per vector."""
    n_dims = demb.shape[1]
    scores = qemb @ demb.T
    return scores, 4 * n_dims
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@method("int8", bits=8)
|
|
41
|
+
def int8(demb, qemb):
    """Per-dimension symmetric scalar quantisation, ``D`` bytes per vector.

    Each dimension is scaled by its largest absolute document value over 127,
    rounded to the nearest integer step and dequantised before scoring against
    the unmodified queries.
    """
    step = np.abs(demb).max(0, keepdims=True) / 127.0 + 1e-9  # epsilon avoids a zero step
    codes = np.round(demb / step)
    dequantised = codes * step
    return qemb @ dequantised.T, demb.shape[1]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@method("binary", bits=1)
|
|
48
|
+
def binary(demb, qemb):
    """Sign codes on both sides: inner product of ``sign(q)`` and ``sign(d)``, ``D / 8`` bytes."""
    doc_signs = np.sign(demb)
    query_signs = np.sign(qemb)
    return query_signs @ doc_signs.T, demb.shape[1] / 8.0
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@method("binary+rerank", bits=1)
|
|
53
|
+
def binary_rerank(demb, qemb, depth=100):
    """Shortlist with sign codes, then re-score the shortlist with the float vectors.

    For each query the `depth` documents with the highest sign-code score get
    their exact inner product; every other document is scored ``-1e9``. The
    reported footprint is the sign code only (``D / 8`` bytes).
    """
    n_queries, n_docs = qemb.shape[0], demb.shape[0]
    coarse = np.sign(qemb) @ np.sign(demb).T
    shortlist = np.argsort(-coarse, axis=1)[:, :depth]

    scores = np.full((n_queries, n_docs), -1e9, dtype=np.float32)
    for row, cand in enumerate(shortlist):
        scores[row, cand] = qemb[row] @ demb[cand].T
    return scores, demb.shape[1] / 8.0  # compact code stored; full vectors fetched on re-rank
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@method("rabitq", bits=1)
|
|
63
|
+
def rabitq(demb, qemb, seed=0):
    """Sign codes after a seeded random orthogonal rotation, ``D / 8`` bytes.

    The rotation is the Q factor of a seeded Gaussian ``D x D`` matrix. Documents
    are reduced to the signs of their rotated coordinates; queries are rotated
    but kept in float.
    """
    D = demb.shape[1]
    gaussian = np.random.RandomState(seed).randn(D, D)
    rotation = np.linalg.qr(gaussian)[0].astype(np.float32)
    doc_codes = np.sign(demb @ rotation)
    rotated_queries = qemb @ rotation
    return rotated_queries @ doc_codes.T, D / 8.0
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# ---------------------------------------------------------------- product quantisation (numpy)
|
|
70
|
+
def _kmeans(X, k, iters=10, seed=0):
|
|
71
|
+
rng = np.random.RandomState(seed)
|
|
72
|
+
C = X[rng.choice(len(X), size=min(k, len(X)), replace=False)].copy()
|
|
73
|
+
for _ in range(iters):
|
|
74
|
+
d = ((X ** 2).sum(1, keepdims=True) - 2 * X @ C.T + (C ** 2).sum(1))
|
|
75
|
+
a = d.argmin(1)
|
|
76
|
+
for j in range(len(C)):
|
|
77
|
+
m = a == j
|
|
78
|
+
if m.any():
|
|
79
|
+
C[j] = X[m].mean(0)
|
|
80
|
+
return C, a
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@method("pq", bits=8, axis="quantisation")
|
|
84
|
+
def pq(demb, qemb, m=None, ksub=256):
    """Product quantisation with asymmetric scoring against reconstructed documents.

    The `D` dimensions are split into `m` contiguous sub-vectors, each quantised
    by its own k-means codebook of up to `ksub` centroids; with the default
    ``ksub=256`` a code is one byte per sub-vector, and ``m`` bytes is what is
    reported.

    Args:
        demb, qemb: document and query embeddings.
        m: number of sub-quantisers. When omitted (or 0) it is the largest
            divisor of `D` not exceeding ``D // 8`` (at least 1), so any `D`
            works by default.
        ksub: centroids per sub-quantiser.

    Raises:
        ValueError: if an explicit `m` does not divide `D`. This is raised
            rather than asserted so the check survives ``python -O``; without it
            the trailing dimensions of the reconstruction would stay
            uninitialised.
    """
    D = demb.shape[1]
    if not m:
        m = max(1, D // 8)
        while D % m:  # fall back to the nearest smaller divisor of D
            m -= 1
    if D % m != 0:
        raise ValueError(f"PQ needs D ({D}) divisible by m ({m})")
    sub = D // m
    recon = np.empty_like(demb)
    for s in range(m):
        sl = slice(s * sub, (s + 1) * sub)
        C, a = _kmeans(demb[:, sl], ksub, seed=s)  # one codebook per sub-space
        recon[:, sl] = C[a]
    return qemb @ recon.T, float(m)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# ---------------------------------------------------------------- projection axis (dimensions)
|
|
100
|
+
@method("matryoshka", bits=32, axis="projection")
|
|
101
|
+
def matryoshka(demb, qemb, dim=None):
    """Prefix-truncation. Fair only for Matryoshka-trained embedders; on others it is naive.

    Keeps the first `dim` coordinates of both documents and queries and scores
    by inner product; ``4 * dim`` bytes per vector. `dim` defaults to a quarter
    of the input dimensionality, but never less than one coordinate (for D < 4
    the integer division alone would give zero dimensions, i.e. all-zero scores
    at zero bytes).
    """
    dim = dim or max(1, demb.shape[1] // 4)
    return qemb[:, :dim] @ demb[:, :dim].T, 4 * dim
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
@method("pca", bits=32, axis="projection")
|
|
108
|
+
def pca(demb, qemb, dim=None):
    """PCA projection fitted on the documents; ``4 * dim`` bytes per vector.

    Documents are mean-centred, the top `dim` eigenvectors of their scatter
    matrix form the projection, and queries are centred with the document mean
    before projecting. `dim` defaults to a quarter of the input dimensionality,
    but never less than one component (for D < 4 the integer division alone
    would give an empty projection).
    """
    dim = dim or max(1, demb.shape[1] // 4)
    mean = demb.mean(0)
    dc = demb - mean
    _, V = np.linalg.eigh(dc.T @ dc)
    # eigh returns eigenvalues in ascending order: reverse, then keep the leading `dim`
    W = V[:, ::-1][:, :dim]
    return ((qemb - mean) @ W) @ (dc @ W).T, 4 * dim
|
bitbudget/metrics.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Retrieval metrics. nDCG@k uses graded BEIR judgements; recall@k uses the exact
|
|
2
|
+
floating-point neighbours as ground truth (the ANN-Benchmarks convention)."""
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _topk(scores_row, k):
|
|
7
|
+
k = min(k, scores_row.shape[0])
|
|
8
|
+
idx = np.argpartition(-scores_row, k - 1)[:k]
|
|
9
|
+
return idx[np.argsort(-scores_row[idx])]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def ndcg_at_k(scores, qids, cids, qrels, k=10):
    """Mean nDCG@k over the queries that have relevance judgements.

    ``scores[i]`` holds query ``qids[i]``'s score for every document, in `cids`
    order. Gains are ``2**rel - 1`` with a ``1 / log2(rank + 1)`` discount.
    Queries without an entry in `qrels` are skipped; NaN is returned when no
    query is judged.
    """
    discounts = 1.0 / np.log2(np.arange(2, k + 2))
    doc_ids = list(map(str, cids))

    def dcg(gains):
        return np.sum((2 ** gains - 1) * discounts[:len(gains)])

    per_query = []
    for row, qid in enumerate(qids):
        judged = qrels.get(str(qid), {})
        if judged:
            ranked = _topk(scores[row], k)
            got = np.array([judged.get(doc_ids[j], 0) for j in ranked], dtype=float)
            achieved = dcg(got)
            # ideal ranking: the judged grades in descending order, cut at k
            best = np.sort(np.array(list(judged.values()), dtype=float))[::-1][:k]
            ideal = dcg(best)
            per_query.append(achieved / ideal if ideal > 0 else 0.0)
    if not per_query:
        return float("nan")
    return float(np.mean(per_query))
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def recall_at_k(scores, exact_scores, k=10):
    """Mean overlap between each row's top-k under `scores` and under `exact_scores`.

    The overlap of the two top-k index sets is divided by `k` for every query
    row, then averaged.
    """
    overlaps = []
    for i, row in enumerate(scores):
        truth = set(_topk(exact_scores[i], k).tolist())
        found = set(_topk(row, k).tolist())
        overlaps.append(len(truth & found) / float(k))
    return float(np.mean(overlaps))
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: bitbudget
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: How much retrieval quality do you keep per byte? A reproducible benchmark for embedding compression.
|
|
5
|
+
Author: Sean Moran
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Paper, https://arxiv.org/abs/2510.04127
|
|
8
|
+
Project-URL: Leaderboard, https://github.com/sjmoran/bitbudget/blob/main/LEADERBOARD.md
|
|
9
|
+
Keywords: retrieval,embeddings,quantisation,hashing,compression,ANN,RAG
|
|
10
|
+
Requires-Python: >=3.9
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: numpy>=1.21
|
|
14
|
+
Provides-Extra: embed
|
|
15
|
+
Requires-Dist: sentence-transformers>=2.2; extra == "embed"
|
|
16
|
+
Provides-Extra: faiss
|
|
17
|
+
Requires-Dist: faiss-cpu>=1.7.4; extra == "faiss"
|
|
18
|
+
Provides-Extra: all
|
|
19
|
+
Requires-Dist: sentence-transformers>=2.2; extra == "all"
|
|
20
|
+
Requires-Dist: faiss-cpu>=1.7.4; extra == "all"
|
|
21
|
+
Dynamic: license-file
|
|
22
|
+
|
|
23
|
+
# BitBudget
|
|
24
|
+
|
|
25
|
+
**How much retrieval quality do you keep per byte?**
|
|
26
|
+
|
|
27
|
+
BitBudget is a small, reproducible benchmark for **embedding compression**. Give it an
|
|
28
|
+
embedder and a corpus and it reports the retrieval quality (nDCG@10, recall@10) that each
|
|
29
|
+
compression method retains against the **bytes it stores per vector** — the recall‑per‑byte
|
|
30
|
+
frontier that every RAG and vector‑database deployment actually lives on.
|
|
31
|
+
|
|
32
|
+
It is the companion benchmark to the survey *“Projection and Quantisation: A Unifying View of
|
|
33
|
+
Learning to Hash, from Random Projections to the RAG Era”* and exists to answer one question
|
|
34
|
+
that today is mostly answered by vendor blog posts: **when you binarise / int8 / RaBitQ /
|
|
35
|
+
product‑quantise / Matryoshka‑truncate your embeddings, what do you actually lose?**
|
|
36
|
+
|
|
37
|
+
## The headline finding
|
|
38
|
+
|
|
39
|
+
> **Bits beat dimensions.** Spending a fixed byte budget on *more coarsely quantised*
|
|
40
|
+
> coordinates beats spending it on *fewer full‑precision* coordinates, at every budget and
|
|
41
|
+
> for every embedder we have tried. One‑bit codes with a cheap re‑ranking pass are **32×
|
|
42
|
+
> smaller than float at no measurable loss**.
|
|
43
|
+
|
|
44
|
+
```
|
|
45
|
+
mxbai‑embed‑large (1024‑d), mean over 4 BEIR corpora
|
|
46
|
+
binary+rerank 128 B nDCG 0.509 100% of float ← 32× smaller, lossless
|
|
47
|
+
pq 128 B nDCG 0.488 96%
|
|
48
|
+
rabitq 128 B nDCG 0.487 96%
|
|
49
|
+
matryoshka 1024 B nDCG 0.439 86% ← 4× smaller, projection axis
|
|
50
|
+
float32 4096 B nDCG 0.508 100%
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
See **[LEADERBOARD.md](LEADERBOARD.md)** for the full table.
|
|
54
|
+
|
|
55
|
+
## Install
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
pip install bitbudget # evaluation only (numpy)
|
|
59
|
+
pip install "bitbudget[all]" # + sentence-transformers (embedding) + faiss
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Quickstart
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
bitbudget methods # list compression methods
|
|
66
|
+
bitbudget run --embedder mxbai --corpus scifact # embed + evaluate, print a results card
|
|
67
|
+
bitbudget leaderboard results/card_*.json # render a markdown leaderboard
|
|
68
|
+
|
|
69
|
+
bitbudget indexes # list indexes (organisation axis)
|
|
70
|
+
bitbudget bench-index --synthetic 100000 128 # recall vs QPS vs bytes: flat/hnsw/ivfpq/bittrie
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
`run` embeds (torch) and evaluates (numpy) in one process. The corpora auto‑download.
|
|
74
|
+
|
|
75
|
+
### The organisation axis (`bench-index`)
|
|
76
|
+
|
|
77
|
+
The compression leaderboard answers *quality per byte*; `bench-index` answers the orthogonal
|
|
78
|
+
*recall per query-second*. It builds an index over the document vectors and reports recall@k,
|
|
79
|
+
throughput (QPS) and bytes per vector, so HNSW (which buys throughput and *adds* bytes) and IVF‑PQ
|
|
80
|
+
can be compared against compact‑code indexes on one frontier. Run it on synthetic data, on a
|
|
81
|
+
cached embedding (`--embedder mxbai --corpus scifact`), or on your own vectors (`--npz`). The
|
|
82
|
+
faiss‑backed indexes need `pip install bitbudget[faiss]`; the numpy `bittrie` runs without it.
|
|
83
|
+
|
|
84
|
+
The `bittrie` index ships a small C kernel (`_bittrie.c`) for the query hot‑path, compiled on
|
|
85
|
+
first use and cached (no compiler needed to *install* — the wheel stays pure‑Python, and it falls
|
|
86
|
+
back to numpy if no compiler is present). It builds **multithreaded** when OpenMP is available
|
|
87
|
+
(GCC/clang on Linux, Homebrew `libomp` on macOS) and single‑threaded otherwise; results are
|
|
88
|
+
bit‑identical to the numpy path, and recall/footprint are algorithmic and unchanged either way.
|
|
89
|
+
|
|
90
|
+
Because faiss carries its own OpenMP runtime, it cannot share a process with the bit‑trie's
|
|
91
|
+
`libomp` on macOS. `bench-index` therefore runs the faiss indexes and the bit‑trie in **separate
|
|
92
|
+
subprocesses** and merges the results, so a single `bitbudget bench-index ...` works everywhere
|
|
93
|
+
(pass `--no-split` to force one process, e.g. on Linux where both share one OpenMP runtime).
|
|
94
|
+
|
|
95
|
+
> **macOS note.** torch and faiss each bundle their own OpenMP runtime and crash if imported
|
|
96
|
+
> in the same process. The core methods are numpy‑only, so `run` is safe; if you add a
|
|
97
|
+
> faiss‑backed method, run `bitbudget embed` (torch) and `bitbudget eval` (numpy/faiss)
|
|
98
|
+
> as separate processes.
|
|
99
|
+
|
|
100
|
+
## The protocol (frozen, so results are comparable)
|
|
101
|
+
|
|
102
|
+
- **Corpora:** the BEIR subsets `scifact`, `nfcorpus`, `arguana`, `fiqa` (small enough to run
|
|
103
|
+
on a laptop, diverse enough to be honest). Numbers are the mean over corpora; `±` is the
|
|
104
|
+
standard deviation across them.
|
|
105
|
+
- **Metrics:** `nDCG@10` against the graded BEIR judgements, and `recall@10` against the exact
|
|
106
|
+
floating‑point neighbours. `% of float` is nDCG relative to the uncompressed embedding.
|
|
107
|
+
- **Memory:** bytes stored per document vector (`4D` float, `D` int8, `D/8` binary, `M` for an
|
|
108
|
+
`M`‑byte product code, `4·dim` for a truncated/PCA‑reduced vector).
|
|
109
|
+
- **Embedders:** `minilm` (384‑d) and `mxbai` (1024‑d, Matryoshka) ship built in.
|
|
110
|
+
|
|
111
|
+
## Add your method in five lines
|
|
112
|
+
|
|
113
|
+
This is the point of the benchmark: drop in your compressor and it is scored against every
|
|
114
|
+
built‑in on the same protocol.
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
from bitbudget import method
|
|
118
|
+
import numpy as np
|
|
119
|
+
|
|
120
|
+
@method("my-2bit", bits=2)
|
|
121
|
+
def my_2bit(demb, qemb):
|
|
122
|
+
codes = my_quantise(demb) # your compression
|
|
123
|
+
scores = qemb @ my_reconstruct(codes).T # (queries x docs) similarity
|
|
124
|
+
return scores, demb.shape[1] * 2 / 8 # scores, bytes per stored vector
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
bitbudget run --embedder mxbai --corpus scifact --methods my-2bit binary+rerank float32
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Then open a pull request adding your row to [LEADERBOARD.md](LEADERBOARD.md). See
|
|
132
|
+
[CONTRIBUTING.md](CONTRIBUTING.md).
|
|
133
|
+
|
|
134
|
+
## Cite
|
|
135
|
+
|
|
136
|
+
If BitBudget helps your work, please cite the survey:
|
|
137
|
+
|
|
138
|
+
```bibtex
|
|
139
|
+
@article{moran2025projection,
|
|
140
|
+
title = {Projection and Quantisation: A Unifying View of Learning to Hash,
|
|
141
|
+
from Random Projections to the RAG Era},
|
|
142
|
+
author = {Moran, Sean},
|
|
143
|
+
journal = {arXiv preprint arXiv:2510.04127},
|
|
144
|
+
year = {2025}
|
|
145
|
+
}
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
MIT licensed.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
bitbudget/__init__.py,sha256=83ZS2fMiaZojifkwbfMYYTXL8SwIHG9GPcL8TykB-sE,1157
|
|
2
|
+
bitbudget/_bittrie.c,sha256=jbFyD_NwvkupJtC9DfgvYz0XuHTZQe4B-hdXI4gj0Hc,4496
|
|
3
|
+
bitbudget/_bittrie_build.py,sha256=3HCcmnfWbVzQD4BVPM9z8aDT8r6N-reEShfRtxgbf6c,3672
|
|
4
|
+
bitbudget/bittrie.py,sha256=3EQa1Qu6VAf8nxm8_sGvD60Iwbzz4ncKM8Ar3xPfIAs,5516
|
|
5
|
+
bitbudget/cli.py,sha256=6dubvJF7yas9ytXm-vyUEhlvzIPFMVuNbGdUjM-slsE,10992
|
|
6
|
+
bitbudget/datasets.py,sha256=0lQJbD3ALfIQF7wzCT-gIkMJ8tD1pm3z3EM4IpB_mOQ,1799
|
|
7
|
+
bitbudget/embedders.py,sha256=_F6pHVRhQaK724317Fc1uIjDmeTnPKaV5nuAzdoZ9Bk,1909
|
|
8
|
+
bitbudget/eval.py,sha256=O3nUaD6plbeOgrgD57EOrkNA54zgfPOrI2w-aapjzR0,2090
|
|
9
|
+
bitbudget/indexes.py,sha256=yQ2O3NzcPoaONdpLagZrNz_AzT8NQcNQN70R4FXnBlM,5830
|
|
10
|
+
bitbudget/methods.py,sha256=D4B66oKXUmq5xezc0w9WHQ9nThOY9dhagbuvWWzxiWk,3909
|
|
11
|
+
bitbudget/metrics.py,sha256=7K2D3lgtpUDsACP6smPjytw5lbckWjE6HRpdzKdzCs8,1450
|
|
12
|
+
bitbudget-0.1.0.dist-info/licenses/LICENSE,sha256=nUSDmf1Ud4HyOItkD93o4_Vs-hHzoheLsIW_BI9zfEo,1067
|
|
13
|
+
bitbudget-0.1.0.dist-info/METADATA,sha256=wIstOwmYomqfT8Chz4mzZefQiwOGNvgLmct3OX3qXiU,6717
|
|
14
|
+
bitbudget-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
15
|
+
bitbudget-0.1.0.dist-info/entry_points.txt,sha256=f-1IR4Yvl1Zygj95DC9w0LgaSPOIjsFQg6Oz4v8tPeo,49
|
|
16
|
+
bitbudget-0.1.0.dist-info/top_level.txt,sha256=jBmwsZBMtA-eonXvguqSaVB05M8M6W1-UAl3OY6VFls,10
|
|
17
|
+
bitbudget-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Sean Moran
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
bitbudget
|