bitbudget 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bitbudget/__init__.py ADDED
@@ -0,0 +1,28 @@
1
+ """BitBudget: how much retrieval quality do you keep per byte?
2
+
3
+ A reproducible benchmark for embedding compression. Given an embedder and a corpus, it
4
+ measures the retrieval quality (nDCG@10, recall@10) retained by each compression method
5
+ against the bytes it stores per vector -- the recall-per-byte frontier.
6
+
7
+ Add your own method in five lines::
8
+
9
+ from bitbudget import method
10
+ import numpy as np
11
+
12
+ @method("my-2bit", bits=2)
13
+ def my_2bit(demb, qemb):
14
+ # return (query x doc similarity scores, bytes per stored vector)
15
+ codes = my_quantise(demb)
16
+ return qemb @ my_dequantise(codes).T, demb.shape[1] * 2 / 8
17
+
18
+ Then ``bitbudget run --embedder mxbai --corpus scifact`` scores it alongside the built-ins.
19
+ """
20
+ from .methods import method, METHODS, list_methods
21
+ from .embedders import embedder, EMBEDDERS, list_embedders
22
+ from .eval import evaluate
23
+ from .indexes import index, INDEXES, list_indexes, bench_indexes
24
+
25
+ __version__ = "0.1.0"
26
+ __all__ = ["method", "METHODS", "list_methods", "embedder", "EMBEDDERS",
27
+ "list_embedders", "evaluate", "index", "INDEXES", "list_indexes",
28
+ "bench_indexes", "__version__"]
bitbudget/_bittrie.c ADDED
@@ -0,0 +1,92 @@
1
+ /* Fast batched bit-trie query: coarse-to-fine beam descent over sorted codes, then a
2
+ * full-precision re-ranking pass, looped over all queries in C. Plain C (no Python.h);
3
+ * loaded via ctypes from _bittrie_build.py and compiled on demand. Falls back to the
4
+ * numpy/Python path in bittrie.py if no compiler is available. */
5
+ #include <stdint.h>
6
+ #include <stdlib.h>
7
+ #include <string.h>
8
+ #include <math.h>
9
+
10
/* One beam entry: the half-open range [lo, hi) of the sorted code array reached by following
 * the bit prefix `pre`, plus the accumulated mismatch cost of that path. */
typedef struct { double cost; long lo, hi; unsigned long long pre; } Node;

/* qsort comparator: orders Nodes by ascending cost (-1 / 0 / +1). */
static int cmp_node(const void *a, const void *b) {
    const Node *x = (const Node *)a;
    const Node *y = (const Node *)b;
    if (x->cost > y->cost) return 1;
    if (x->cost < y->cost) return -1;
    return 0;
}
16
+
17
/* Binary search over the sorted slice a[lo, hi): returns the first index whose value is
 * >= key, or hi when every element of the slice is smaller than key. */
static long lower_bound_u64(const uint64_t *a, long lo, long hi, uint64_t key) {
    long first = lo;
    long count = hi - lo;
    while (count > 0) {
        long half = count / 2;
        if (a[first + half] < key) {
            first += half + 1;      /* answer lies strictly right of the probe */
            count -= half + 1;
        } else {
            count = half;           /* probe itself may be the answer */
        }
    }
    return first;
}
21
+
22
+ /* out: nq*k int64 of original doc ids (top-k by inner product), -1 padding if fewer found. */
23
+ /* Each query is independent and writes a disjoint slice of `out`, so the loop parallelises
24
+ * cleanly. Scratch buffers are allocated per thread inside the parallel region; when compiled
25
+ * without OpenMP the `omp` pragmas are ignored and this runs as one block, single-threaded. */
26
+ void bt_query_batch(const uint64_t *codes, const int64_t *docids,
27
+ const float *Xf, const float *qproj, const float *qf,
28
+ int n, int d, int b, int nq, int depth, int beam, int k,
29
+ int64_t *out) {
30
+ if (depth > b) depth = b;
31
+ if (beam < 1) beam = 1;
32
+ int cap = 2 * beam + 4;
33
+
34
+ #pragma omp parallel
35
+ {
36
+ Node *cur = (Node *)malloc(sizeof(Node) * cap);
37
+ Node *nxt = (Node *)malloc(sizeof(Node) * 2 * cap);
38
+ double *bscore = (double *)malloc(sizeof(double) * k);
39
+ long *bid = (long *)malloc(sizeof(long) * k);
40
+
41
+ #pragma omp for schedule(static)
42
+ for (int qi = 0; qi < nq; qi++) {
43
+ const float *qp = qproj + (size_t)qi * b;
44
+ const float *qv = qf + (size_t)qi * d;
45
+ int ncur = 1;
46
+ cur[0].cost = 0.0; cur[0].lo = 0; cur[0].hi = n; cur[0].pre = 0ULL;
47
+
48
+ for (int t = 0; t < depth; t++) {
49
+ int shift = b - t - 1;
50
+ int qb = qp[t] > 0.0f ? 1 : 0;
51
+ double c = fabs((double)qp[t]);
52
+ int nn = 0;
53
+ for (int i = 0; i < ncur; i++) {
54
+ long lo = cur[i].lo, hi = cur[i].hi;
55
+ unsigned long long pre = cur[i].pre;
56
+ unsigned long long hi_part = (b - t) >= 64 ? 0ULL : (pre << (b - t));
57
+ unsigned long long thresh = hi_part | (1ULL << shift);
58
+ long m = lower_bound_u64(codes, lo, hi, (uint64_t)thresh);
59
+ if (m > lo) { nxt[nn].cost = cur[i].cost + (qb == 0 ? 0.0 : c);
60
+ nxt[nn].lo = lo; nxt[nn].hi = m; nxt[nn].pre = (pre << 1); nn++; }
61
+ if (hi > m) { nxt[nn].cost = cur[i].cost + (qb == 1 ? 0.0 : c);
62
+ nxt[nn].lo = m; nxt[nn].hi = hi; nxt[nn].pre = (pre << 1) | 1ULL; nn++; }
63
+ }
64
+ qsort(nxt, nn, sizeof(Node), cmp_node);
65
+ int keep = nn < beam ? nn : beam;
66
+ memcpy(cur, nxt, sizeof(Node) * keep);
67
+ ncur = keep;
68
+ }
69
+
70
+ for (int j = 0; j < k; j++) { bscore[j] = -1e300; bid[j] = -1; }
71
+ for (int i = 0; i < ncur; i++) {
72
+ for (long j = cur[i].lo; j < cur[i].hi; j++) {
73
+ long id = (long)docids[j];
74
+ const float *xr = Xf + (size_t)id * d;
75
+ double s = 0.0;
76
+ for (int dd = 0; dd < d; dd++) s += (double)qv[dd] * (double)xr[dd];
77
+ int mn = 0;
78
+ for (int t2 = 1; t2 < k; t2++) if (bscore[t2] < bscore[mn]) mn = t2;
79
+ if (s > bscore[mn]) { bscore[mn] = s; bid[mn] = id; }
80
+ }
81
+ }
82
+ for (int a = 0; a < k; a++) { /* selection-sort the k winners, score descending */
83
+ int mx = a;
84
+ for (int b2 = a + 1; b2 < k; b2++) if (bscore[b2] > bscore[mx]) mx = b2;
85
+ double ts = bscore[a]; bscore[a] = bscore[mx]; bscore[mx] = ts;
86
+ long ti = bid[a]; bid[a] = bid[mx]; bid[mx] = ti;
87
+ }
88
+ for (int j = 0; j < k; j++) out[(size_t)qi * k + j] = bid[j];
89
+ }
90
+ free(cur); free(nxt); free(bscore); free(bid);
91
+ }
92
+ }
bitbudget/_bittrie_build.py ADDED
@@ -0,0 +1,94 @@
1
+ """Compile the bit-trie C kernel on demand and load it via ctypes.
2
+
3
+ The package ships pure-Python on PyPI (the .c is data, not a built extension), so install never
4
+ needs a compiler. The first time the fast path is requested we compile _bittrie.c into a cached
5
+ shared library with the system C compiler; if no compiler is available we return None and the
6
+ caller falls back to the numpy/Python path. The compiled library is cached by source hash, so the
7
+ build happens once per machine.
8
+ """
9
+ import ctypes
10
+ import hashlib
11
+ import os
12
+ import subprocess
13
+
14
+ _LIB = None
15
+ _TRIED = False
16
+
17
+
18
def _cache_dir():
    """Return the directory used for compiled kernels, creating it if needed.

    A non-empty ``BITBUDGET_CACHE`` environment variable wins; otherwise the
    directory is ``~/.cache/bitbudget``.
    """
    override = os.environ.get("BITBUDGET_CACHE")
    if override:
        path = override
    else:
        path = os.path.join(os.path.expanduser("~"), ".cache", "bitbudget")
    os.makedirs(path, exist_ok=True)
    return path
22
+
23
+
24
def _brew_libomp():
    """Return the Homebrew libomp prefix on macOS, or None.

    This is a best-effort probe: a missing ``brew`` executable, a non-zero exit
    status, a hung or undecodable ``brew`` invocation, or a prefix that is not a
    directory all yield None.
    """
    try:
        # timeout: a wedged `brew` must not block the first query indefinitely
        p = subprocess.run(["brew", "--prefix", "libomp"], capture_output=True,
                           text=True, timeout=30)
    except (OSError, ValueError, subprocess.SubprocessError):
        # OSError: brew not installed / not executable; ValueError: undecodable
        # output; SubprocessError: timeout.
        return None
    prefix = p.stdout.strip()
    if p.returncode == 0 and prefix and os.path.isdir(prefix):
        return prefix
    return None
34
+
35
+
36
def _variants():
    """Return the (cflags, lflags) compile attempts in the order they should be tried.

    OpenMP builds come first and the plain single-threaded build is always last.
    The Homebrew libomp variant is placed at the front when a prefix is found.
    """
    attempts = [
        (["-fopenmp"], ["-fopenmp"]),  # gcc / OpenMP-capable clang (Linux)
        ([], []),                      # no OpenMP: single-threaded C
    ]
    omp = _brew_libomp()
    if omp:  # macOS clang + Homebrew libomp (try first)
        homebrew = (
            ["-Xpreprocessor", "-fopenmp", f"-I{omp}/include"],
            [f"-L{omp}/lib", "-lomp", f"-Wl,-rpath,{omp}/lib"],
        )
        attempts.insert(0, homebrew)
    return attempts
46
+
47
+
48
def _set_sig(lib):
    """Declare the ctypes prototype of ``bt_query_batch`` on ``lib`` and return ``lib``.

    The prototype is five input pointers (uint64 codes, int64 doc ids, three
    float arrays), seven C ints, and one int64 output pointer; the function
    returns nothing.
    """
    ptr = ctypes.POINTER
    inputs = [ptr(ctypes.c_uint64), ptr(ctypes.c_int64)] + [ptr(ctypes.c_float)] * 3
    sizes = [ctypes.c_int] * 7
    kernel = lib.bt_query_batch
    kernel.restype = None
    kernel.argtypes = inputs + sizes + [ptr(ctypes.c_int64)]
    return lib
57
+
58
+
59
def _remove_quietly(path):
    """Delete ``path`` if possible; a missing or undeletable file is not an error."""
    try:
        os.remove(path)
    except OSError:
        pass


def get_lib():
    """Return the loaded ctypes library exposing bt_query_batch, or None if unavailable.

    Tries to build a multithreaded (OpenMP) kernel first, falling back to single-threaded C,
    then to None (the caller uses the numpy/Python path). The compiled library is cached by
    source hash; set BITBUDGET_NO_OMP=1 to force the single-threaded build. The compiler is
    taken from ``CC`` (default ``cc``).

    The outcome is memoised in the module globals, so the build/load is attempted at most
    once per process.
    """
    global _LIB, _TRIED
    if _TRIED:
        return _LIB
    _TRIED = True
    src = os.path.join(os.path.dirname(__file__), "_bittrie.c")
    if not os.path.exists(src):
        return None
    try:
        with open(src, "rb") as fh:
            digest = hashlib.sha1(fh.read()).hexdigest()[:12]
        so = os.path.join(_cache_dir(), f"_bittrie_{digest}.so")
    except OSError:
        return None  # unreadable source or unwritable cache dir: use the Python path
    if os.path.exists(so):  # cached: load directly
        try:
            _LIB = _set_sig(ctypes.CDLL(so))
            return _LIB
        except (OSError, AttributeError):
            _remove_quietly(so)  # stale or corrupt cache entry: rebuild below
    cc = os.environ.get("CC") or "cc"
    variants = [([], [])] if os.environ.get("BITBUDGET_NO_OMP") else _variants()
    # Build into a per-process temp name and rename into place, so another process
    # never sees (or loads) a half-written library.
    tmp = f"{so}.{os.getpid()}.tmp"
    for cflags, lflags in variants:
        cmd = [cc, "-O3", "-shared", "-fPIC", *cflags, "-o", tmp, src, *lflags, "-lm"]
        try:
            subprocess.run(cmd, check=True, capture_output=True)
            os.replace(tmp, so)
        except (OSError, subprocess.SubprocessError):
            _remove_quietly(tmp)
            continue
        try:  # must actually load (libomp present at runtime)
            _LIB = _set_sig(ctypes.CDLL(so))
            return _LIB
        except (OSError, AttributeError):
            _remove_quietly(so)
    return None
bitbudget/bittrie.py ADDED
@@ -0,0 +1,108 @@
1
+ """Bit-trie index: a van Emde Boas / PATRICIA-style radix trie over compact codes.
2
+
3
+ The sorted array of packed sign-codes *is* the trie: a node is a contiguous range of codes
4
+ sharing a bit prefix. Search is coarse-to-fine beam descent (the most discriminative bit is the
5
+ shallowest), then a re-ranking pass. Unsupervised (random rotation + sign), numpy-only, one knob
6
+ (``beam``). It stores compact codes rather than full vectors, so it sits on the organisation axis
7
+ at a fraction of a graph's footprint. Research entry to the BitBudget index benchmark; see the
8
+ projection-quantisation-organisation survey for the motivation.
9
+
10
+ This reference implementation packs into a single uint64, so ``n_bits <= 64``.
11
+ """
12
+ import numpy as np
13
+
14
+
15
class BitTrieIndex:
    """Implicit radix trie over packed sign codes.

    ``fit`` projects the vectors with a seeded random matrix, orders the projected
    dimensions by decreasing variance, packs the signs into one uint64 per vector
    (most significant bit = highest-variance dimension) and sorts the codes. A trie
    node is then a contiguous range of the sorted array sharing a bit prefix.
    ``query`` / ``query_batch`` run a beam descent over those ranges and re-rank the
    surviving candidates.
    """

    def __init__(self, n_bits=64, seed=0):
        """``n_bits``: code length (at most 64); ``seed``: seed of the random projection."""
        assert n_bits <= 64, "this reference packs into one uint64; use a word-array for >64 bits"
        self.b = n_bits
        self.seed = seed

    def fit(self, X):
        """Build the index over the rows of ``X`` (n x d) and return ``self``."""
        X = np.ascontiguousarray(X, dtype=np.float32)
        n, d = X.shape
        rng = np.random.RandomState(self.seed)
        R = rng.randn(d, self.b).astype(np.float32)  # unsupervised random rotation (RaBitQ)
        if self.b <= d:
            R, _ = np.linalg.qr(R)                   # orthonormal columns when they fit
        P = X @ R
        order = np.argsort(-P.var(axis=0))           # coarse-to-fine: MSB = highest variance
        self.R = R[:, order]
        P = P[:, order]
        bits = (P > 0).astype(np.uint64)
        weights = np.uint64(1) << np.arange(self.b - 1, -1, -1, dtype=np.uint64)
        codes = bits @ weights
        order_idx = np.argsort(codes, kind="stable")
        self.codes = codes[order_idx]                # sorted: the implicit trie
        self.docids = order_idx.astype(np.int64)
        self.X = X                                   # kept only for optional float re-rank (cold)
        self.n, self.d = n, d
        self._inv = None                             # doc id -> sorted position, built lazily
        return self

    def _descend(self, qbits, qconf, beam, depth):
        """Confidence-ordered beam over the sorted codes. Stop at ``depth`` bits so each surviving
        prefix is still a fat bucket (the coarse-to-fine / anytime property).

        ``depth`` is clamped to ``[0, n_bits]`` and ``beam`` to at least 1, matching the C
        kernel. Returns a list of ``(cost, lo, hi, prefix_int, depth)`` tuples, cheapest first.
        """
        b = self.b
        depth = max(0, min(int(depth), b))  # deeper than the code length would shift by < 0
        beam = max(1, int(beam))
        state = [(0.0, 0, self.n, 0, 0)]    # (cost, lo, hi, prefix_int, depth)
        for t in range(depth):
            shift = b - t - 1
            qb = int(qbits[t])
            c = float(qconf[t])
            nxt = []
            for cost, lo, hi, pre, _ in state:
                # smallest code under this prefix whose bit t is set
                thresh = (pre << (b - t)) | (1 << shift)
                m = lo + int(np.searchsorted(self.codes[lo:hi], np.uint64(thresh), "left"))
                for child_bit, clo, chi in ((0, lo, m), (1, m, hi)):
                    if chi <= clo:
                        continue
                    add = 0.0 if child_bit == qb else c  # sibling costs its margin
                    nxt.append((cost + add, clo, chi, (pre << 1) | child_bit, t + 1))
            nxt.sort(key=lambda e: e[0])
            state = nxt[:beam]
        return state

    def query(self, q, topk=10, beam=64, depth=28, rerank="float"):
        """Return up to ``topk`` original doc ids for one query vector, best first.

        ``rerank="float"`` scores candidates against the stored float vectors; any other
        value scores the float query against the candidates' sign codes.
        """
        q = np.asarray(q, dtype=np.float32)
        p = q @ self.R
        qbits = (p > 0).astype(int)
        qconf = np.abs(p)
        ranges = self._descend(qbits, qconf, beam, depth)
        if ranges:
            cand = np.concatenate([self.docids[lo:hi] for _, lo, hi, _, _ in ranges])
        else:
            cand = np.empty(0, np.int64)
        if cand.size == 0:
            return cand
        if rerank == "float":  # DiskANN-style: cold float for re-rank only
            s = self.X[cand] @ q
        else:                  # asymmetric: float query vs sign code, no float
            shifts = np.arange(self.b - 1, -1, -1, dtype=np.uint64)
            sign = np.where(((self.codes_full(cand)[:, None] >> shifts) & np.uint64(1)) > 0, 1.0, -1.0)
            s = sign @ p
        return cand[np.argsort(-s)[:topk]]

    def query_batch(self, Q, topk=10, beam=64, depth=28):
        """Top-k for every query in Q. Uses the compiled C kernel when available (one to two
        orders of magnitude faster), else falls back to the pure-Python per-query path.

        Always returns an int64 array of shape ``(len(Q), topk)``; rows with fewer than
        ``topk`` candidates are padded with -1 on both paths.
        """
        Q = np.ascontiguousarray(Q, dtype=np.float32)
        topk = max(int(topk), 0)
        depth = max(0, min(int(depth), self.b))
        out = np.full((len(Q), topk), -1, dtype=np.int64)
        if topk == 0 or len(Q) == 0:
            return out  # nothing to compute; never hand the kernel a zero-width output
        try:
            from ._bittrie_build import get_lib
            lib = get_lib()
        except ImportError:
            lib = None  # kernel loader not importable: use the Python path
        if lib is None:
            for row, q in zip(out, Q):
                ids = self.query(q, topk, beam, depth, rerank="float")
                row[:len(ids)] = ids
            return out
        import ctypes
        qproj = np.ascontiguousarray(Q @ self.R, dtype=np.float32)
        Xf = np.ascontiguousarray(self.X, dtype=np.float32)
        cast = lambda a, t: a.ctypes.data_as(ctypes.POINTER(t))
        lib.bt_query_batch(
            cast(self.codes, ctypes.c_uint64), cast(self.docids, ctypes.c_int64),
            cast(Xf, ctypes.c_float), cast(qproj, ctypes.c_float), cast(Q, ctypes.c_float),
            self.n, self.d, self.b, len(Q), int(depth), int(beam), int(topk),
            cast(out, ctypes.c_int64))
        return out

    def codes_full(self, cand):
        """Return the packed codes of the original doc ids in ``cand``."""
        inv = getattr(self, "_inv", None)
        if inv is None:  # build the inverse permutation once instead of on every call
            inv = np.empty(self.n, np.int64)
            inv[self.docids] = np.arange(self.n)
            self._inv = inv
        return self.codes[inv[cand]]

    def index_bytes(self):
        """Bytes that must be in RAM to route (codes + leaf ids); the trie is implicit in the sort."""
        return self.codes.nbytes + self.docids.nbytes
bitbudget/cli.py ADDED
@@ -0,0 +1,231 @@
1
+ """BitBudget command line.
2
+
3
+ bitbudget methods # list registered compression methods
4
+ bitbudget embedders # list registered embedders
5
+ bitbudget embed --embedder mxbai --corpus scifact # torch step -> cached embeddings
6
+ bitbudget eval --embedder mxbai --corpus scifact # numpy step -> results card
7
+ bitbudget run --embedder mxbai --corpus scifact nfcorpus arguana fiqa # embed + eval
8
+ bitbudget leaderboard results/*.json # aggregate cards into a markdown table
9
+ bitbudget indexes # list registered indexes (organisation axis)
10
+ bitbudget bench-index --synthetic 100000 128 # recall vs QPS vs bytes (flat/hnsw/ivfpq/bittrie)
11
+
12
+ `run` embeds (torch) and evaluates (numpy) in one process, which is safe because the core
13
+ methods use no faiss. The faiss-backed indexes in `bench-index` import faiss only when run, so
14
+ run it in its own process (no torch) to avoid the OpenMP clash.
15
+ """
16
+ import argparse
17
+ import glob
18
+ import json
19
+ import os
20
+
21
+ import numpy as np
22
+
23
+ from . import datasets
24
+ from .eval import evaluate, aggregate
25
+ from .methods import list_methods, METHODS
26
+ from .indexes import bench_indexes, list_indexes, INDEXES
27
+
28
+
29
def _emb_path(cache, embedder, corpus):
    """Return the ``.npz`` path caching the embeddings of ``corpus`` under ``embedder``.

    ``cache`` may start with ``~``; it is expanded to the user's home directory.
    """
    root = os.path.expanduser(cache)
    filename = f"emb_{embedder}_{corpus}.npz"
    return os.path.join(root, filename)
31
+
32
+
33
def cmd_embed(a):
    """Embed every corpus in ``a.corpus`` with ``a.embedder`` and cache the result as ``.npz``.

    Each cache file holds the document and query embeddings, their ids, and the
    relevance judgements (as a JSON string) restricted to documents in the corpus.
    """
    from .embedders import EMBEDDERS  # lazy: imports torch
    encode = EMBEDDERS[a.embedder]
    for corpus in a.corpus:
        docs, queries, qrels = datasets.load(corpus, a.cache, a.split)
        cids = list(docs)
        qids = list(queries)
        print(f"[embed] {a.embedder} x {corpus}: {len(cids)} docs, {len(qids)} queries")
        demb = encode([docs[c] for c in cids], is_query=False,
                      device=a.device, batch_size=a.batch_size)
        qemb = encode([queries[q] for q in qids], is_query=True,
                      device=a.device, batch_size=a.batch_size)
        os.makedirs(os.path.expanduser(a.cache), exist_ok=True)
        known = set(cids)
        kept = {}
        for q in qids:
            # drop judgements that point at documents missing from the corpus
            kept[q] = {doc: score for doc, score in qrels[q].items() if doc in known}
        target = _emb_path(a.cache, a.embedder, corpus)
        np.savez(target, demb=demb, qemb=qemb,
                 cids=np.array(cids), qids=np.array(qids),
                 qrels=json.dumps(kept))
        print(f"[embed] cached -> {target} {demb.shape}")
48
+
49
+
50
def _load_emb(a, corpus):
    """Load the cached embeddings of ``corpus`` for ``a.embedder``.

    Returns ``(demb, qemb, cids, qids, qrels)`` where ``qrels`` is decoded from the
    JSON string stored in the archive. The archive is closed before returning.
    """
    # NOTE(security): allow_pickle=True unpickles object arrays, which can execute code;
    # only load cache files this tool wrote itself.
    with np.load(_emb_path(a.cache, a.embedder, corpus), allow_pickle=True) as z:
        return z["demb"], z["qemb"], z["cids"], z["qids"], json.loads(str(z["qrels"]))
53
+
54
+
55
def cmd_eval(a, _embed_first=False):
    """Evaluate the selected methods on every corpus and write a results card.

    For each corpus the cached embeddings are loaded (after embedding them first when
    ``_embed_first`` is true), every method in ``a.methods`` (default: all registered)
    is scored, and a table is printed. The per-corpus rows and their aggregate are
    written to ``<a.out>/card_<embedder>.json``; the card dict is returned.
    """
    if not a.corpus:
        raise SystemExit("eval needs at least one corpus")
    methods = a.methods or list_methods()
    per_corpus = {}
    dim = None
    for corpus in a.corpus:
        if _embed_first:
            cmd_embed(argparse.Namespace(**{**vars(a), "corpus": [corpus]}))
        demb, qemb, cids, qids, qrels = _load_emb(a, corpus)
        if dim is None:
            # the card reports the first corpus's dimension; remember it here instead
            # of re-reading the whole archive afterwards
            dim = int(demb.shape[1])
        rows = evaluate(demb, qemb, cids, qids, qrels, methods)
        per_corpus[corpus] = rows
        print(f"\n=== {a.embedder} x {corpus} ({demb.shape[1]}-d) ===")
        print(f" {'method':16s}{'bytes':>8s}{'nDCG@10':>9s}{'recall@10':>11s}{'% float':>9s}")
        for r in rows:
            print(f" {r['method']:16s}{r['bytes']:8.0f}{r['ndcg']:9.3f}{r['recall']:11.3f}{r['pct_float']:8.0f}%")
    os.makedirs(a.out, exist_ok=True)
    card = dict(embedder=a.embedder, corpora=a.corpus,
                dim=dim,
                per_corpus=per_corpus, aggregate=aggregate(per_corpus))
    path = os.path.join(a.out, f"card_{a.embedder}.json")
    with open(path, "w") as fh:  # close (and flush) the card deterministically
        json.dump(card, fh, indent=2)
    print(f"\n=== aggregate over {len(a.corpus)} corpora (mean +/- std) ===")
    for r in card["aggregate"]:
        print(f" {r['method']:16s}{r['bytes']:8.0f}B nDCG {r['ndcg']:.3f} "
              f"{r['pct_float']:.0f}+/-{r['pct_float_std']:.0f}% of float")
    print(f"\nwrote results card -> {path}")
    return card
80
+
81
+
82
def cmd_methods(a):
    """Print one line per registered compression method: name, axis and bits."""
    print("registered methods (name | axis | bits):")
    for name in list_methods():
        fn = METHODS[name]
        axis = getattr(fn, 'axis', '?')
        print(f" {name:16s} {axis:12s} {fn.bits}")
87
+
88
+
89
def cmd_indexes(a):
    """Print one line per registered index: name and axis (default ``organisation``)."""
    print("registered indexes (name | axis):")
    for name in list_indexes():
        axis = getattr(INDEXES[name], 'axis', 'organisation')
        print(f" {name:14s} {axis}")
93
+
94
+
95
def cmd_bench_index(a):
    """Build each index over document vectors and report recall@k, QPS and bytes/vec.

    faiss (its OpenMP) and the bit-trie (libomp for the C kernel) cannot share one process on
    macOS (two OpenMP runtimes). When both groups are requested we run each in its own subprocess
    and merge; the data is deterministic from the arguments, so the split is transparent.

    Writes ``<a.out>/index_card.json`` and returns the card dict.
    """
    sel = a.indexes or list_indexes()
    faiss_grp = [n for n in sel if getattr(INDEXES[n], "axis", "") == "organisation"]
    other_grp = [n for n in sel if n not in faiss_grp]
    if faiss_grp and other_grp and not getattr(a, "no_split", False):
        import subprocess
        import sys
        import tempfile
        import shutil
        merged, meta = [], {}
        for grp in (faiss_grp, other_grp):
            td = tempfile.mkdtemp()
            try:
                cmd = [sys.executable, "-m", "bitbudget.cli", "bench-index",
                       *_bench_data_args(a), "--k", str(a.k), "--out", td, "--no-split",
                       "--indexes", *grp]
                subprocess.run(cmd, check=True, env=os.environ.copy())
                with open(os.path.join(td, "index_card.json")) as fh:
                    c = json.load(fh)
            finally:
                # always clean up, including when the child process fails
                shutil.rmtree(td, ignore_errors=True)
            merged += c["rows"]
            meta = c
        merged.sort(key=lambda r: r["bytes"])
        os.makedirs(a.out, exist_ok=True)
        card = dict(source=meta["source"], k=a.k, dim=meta["dim"], n=meta["n"], rows=merged)
        path = os.path.join(a.out, "index_card.json")
        with open(path, "w") as fh:
            json.dump(card, fh, indent=2)
        print("\n=== board (faiss + bit-trie run in separate processes) ===")
        for r in merged:
            print(f" {r['method']:14s}{r['bytes']:8.0f}B recall@{a.k}={r['recall']:.3f} {r['qps']:9.0f} qps")
        print(f"wrote index card -> {os.path.join(a.out, 'index_card.json')}")
        return card

    xb, xq, label = _bench_load(a)
    print(f"[bench-index] {label}: base={tuple(xb.shape)}, queries={len(xq)}, k={a.k}")
    rows = bench_indexes(xb, xq, k=a.k, which=sel)
    os.makedirs(a.out, exist_ok=True)
    card = dict(source=label, k=a.k, dim=int(xb.shape[1]), n=int(len(xb)), rows=rows)
    path = os.path.join(a.out, "index_card.json")
    with open(path, "w") as fh:
        json.dump(card, fh, indent=2)
    print(f"\nwrote index card -> {path}")
    return card
139
+
140
+
141
def _bench_data_args(a):
    """Return the command-line arguments that reproduce ``a``'s data source in a child process.

    Precedence is ``--synthetic``, then ``--npz``, then ``--embedder`` with ``--corpus``.

    Raises:
        SystemExit: when no data source was given, instead of returning a list that
            contains ``None`` (which the subprocess call cannot accept).
    """
    if a.synthetic:
        return ["--synthetic", str(a.synthetic[0]), str(a.synthetic[1])]
    if a.npz:
        return ["--npz", a.npz]
    if a.embedder and a.corpus:
        return ["--embedder", a.embedder, "--corpus", a.corpus, "--cache", a.cache]
    raise SystemExit("bench-index needs one of: --synthetic N D | --npz PATH | --embedder X --corpus Y")
147
+
148
+
149
def _bench_load(a):
    """Return ``(base_vectors, query_vectors, label)`` for the bench-index command.

    Sources, in order of precedence:
      * ``a.synthetic = (N, D)``: N clustered Gaussian vectors of dimension D drawn from a
        fixed seed, with queries that are lightly perturbed base vectors;
      * ``a.npz``: an archive with keys ``base``/``query`` (or ``demb``/``qemb``);
      * ``a.embedder`` + ``a.corpus``: the cached embedding archive.

    Archives are closed before returning.

    Raises:
        SystemExit: when none of the sources is specified.
    """
    if a.synthetic:
        N, D = a.synthetic
        rng = np.random.RandomState(0)
        centers = rng.randn(max(2, N // 500), D).astype(np.float32) * 4.0
        xb = centers[rng.randint(0, len(centers), N)] + rng.randn(N, D).astype(np.float32)
        nq = min(10000, max(1, N // 10))  # enough queries to time throughput stably
        xq = xb[rng.choice(N, nq, replace=False)] + 0.1 * rng.randn(nq, D).astype(np.float32)
        return xb, xq, f"synthetic N={N} D={D}"
    # NOTE(security): allow_pickle=True unpickles object arrays, which can execute code;
    # only point this at archives you trust.
    if a.npz:
        with np.load(os.path.expanduser(a.npz), allow_pickle=True) as z:
            xb = z["base"] if "base" in z.files else z["demb"]
            xq = z["query"] if "query" in z.files else z["qemb"]
        return xb, xq, a.npz
    if a.embedder and a.corpus:
        with np.load(_emb_path(a.cache, a.embedder, a.corpus), allow_pickle=True) as z:
            xb, xq = z["demb"], z["qemb"]
        return xb, xq, f"{a.embedder} x {a.corpus}"
    raise SystemExit("bench-index needs one of: --synthetic N D | --npz PATH | --embedder X --corpus Y")
167
+
168
+
169
def cmd_embedders(a):
    """Print one line per registered embedder: name, dimension, matryoshka flag and model."""
    from .embedders import list_embedders, EMBEDDERS
    print("registered embedders (name | dim | matryoshka):")
    for name in list_embedders():
        entry = EMBEDDERS[name]
        print(f" {name:10s} dim={entry.dim} matryoshka={entry.matryoshka} ({entry.model})")
175
+
176
+
177
def _read_card(path):
    """Load one results card from ``path``, closing the file afterwards."""
    with open(path) as fh:
        return json.load(fh)


def cmd_leaderboard(a):
    """Print a markdown leaderboard built from the result cards matching ``a.cards``.

    Each entry of ``a.cards`` is a glob pattern. Matches are read in sorted order so the
    output does not depend on directory listing order; cards are then listed by
    decreasing embedding dimension.
    """
    cards = [_read_card(p) for pat in a.cards for p in sorted(glob.glob(pat))]
    print("# BitBudget leaderboard\n")
    for c in sorted(cards, key=lambda c: -c["dim"]):
        print(f"## {c['embedder']} ({c['dim']}-d), mean over {len(c['corpora'])} corpora\n")
        print("| Method | Axis | Bytes/vec | nDCG@10 | % of float |")
        print("|---|---|---|---|---|")
        for r in c["aggregate"]:
            print(f"| {r['method']} | {r['axis']} | {r['bytes']:.0f} | "
                  f"{r['ndcg']:.3f} | {r['pct_float']:.0f} ± {r['pct_float_std']:.0f} |")
        print()
188
+
189
+
190
def main(argv=None):
    """Parse ``argv`` (default: ``sys.argv[1:]``) and run the selected sub-command."""
    parser = argparse.ArgumentParser(prog="bitbudget", description="recall-per-byte benchmark for embedding compression")
    sub = parser.add_subparsers(dest="cmd", required=True)

    # embed / eval / run share one option set
    for name in ("embed", "eval", "run"):
        s = sub.add_parser(name)
        s.add_argument("--embedder", required=True)
        s.add_argument("--corpus", nargs="+", default=datasets.CORPORA)
        s.add_argument("--methods", nargs="+", default=None)
        s.add_argument("--cache", default="~/.cache/bitbudget")
        s.add_argument("--out", default="results")
        s.add_argument("--device", default="cpu", help="cpu | mps (embedding step)")
        s.add_argument("--batch-size", type=int, default=64)
        s.add_argument("--split", default="test")

    for name in ("methods", "embedders", "indexes"):
        sub.add_parser(name)

    lb = sub.add_parser("leaderboard")
    lb.add_argument("cards", nargs="+")

    bi = sub.add_parser("bench-index")
    bi.add_argument("--synthetic", nargs=2, type=int, metavar=("N", "D"),
                    help="benchmark on N clustered Gaussian vectors of dimension D")
    bi.add_argument("--npz", help="load base/query (keys base,query or demb,qemb) from an .npz")
    bi.add_argument("--embedder", help="use a cached embedding (with --corpus): demb as base")
    bi.add_argument("--corpus")
    bi.add_argument("--cache", default="~/.cache/bitbudget")
    bi.add_argument("--k", type=int, default=10)
    bi.add_argument("--indexes", nargs="+", default=None, help="subset of indexes to run")
    bi.add_argument("--out", default="results")
    bi.add_argument("--no-split", dest="no_split", action="store_true",
                    help="run all indexes in one process (do not isolate faiss from the bit-trie)")

    a = parser.parse_args(argv)
    handlers = {
        "embed": cmd_embed,
        "eval": cmd_eval,
        "run": lambda ns: cmd_eval(ns, _embed_first=True),
        "methods": cmd_methods,
        "embedders": cmd_embedders,
        "indexes": cmd_indexes,
        "leaderboard": cmd_leaderboard,
        "bench-index": cmd_bench_index,
    }
    handler = handlers.get(a.cmd)
    if handler is not None:
        handler(a)


if __name__ == "__main__":
    main()
bitbudget/datasets.py ADDED
@@ -0,0 +1,49 @@
1
+ """BEIR corpus loading. Datasets auto-download to the cache dir on first use."""
2
+ import json
3
+ import os
4
+ import shutil
5
+ import urllib.request
6
+ import zipfile
7
+
8
+ BEIR_URL = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip"
9
+ # the standard small/medium BEIR corpora the leaderboard is defined over
10
+ CORPORA = ["scifact", "nfcorpus", "arguana", "fiqa"]
11
+
12
+
13
def _download(name, cache):
    """Return the directory of BEIR dataset ``name`` under ``cache``, fetching it if absent.

    When the directory is missing, the zip archive is downloaded (unless it is already in
    ``cache``) via a ``.part`` file that is renamed once complete, then extracted into
    ``cache``.
    """
    target = os.path.join(cache, name)
    if os.path.isdir(target):
        return target
    os.makedirs(cache, exist_ok=True)
    archive = os.path.join(cache, name + ".zip")
    if not os.path.exists(archive):
        partial = archive + ".part"
        request = urllib.request.Request(BEIR_URL.format(name), headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(request) as response, open(partial, "wb") as sink:
            shutil.copyfileobj(response, sink)
        os.replace(partial, archive)  # only a fully written archive gets the final name
    with zipfile.ZipFile(archive) as bundle:
        bundle.extractall(cache)
    return target
26
+
27
+
28
def load(name, cache="~/.cache/bitbudget", split="test"):
    """Load BEIR dataset ``name`` and return ``(corpus, queries, qrels)``.

    * ``corpus``: doc id -> title and text joined by a space, stripped.
    * ``queries``: query id -> text, restricted to queries that have a positive judgement.
    * ``qrels``: query id -> {doc id: score}, read from ``qrels/<split>.tsv`` and keeping
      only scores > 0.

    Files are read as UTF-8 regardless of the platform default encoding; blank lines
    are skipped.
    """
    cache = os.path.expanduser(cache)
    d = _download(name, cache)
    corpus = {}
    with open(os.path.join(d, "corpus.jsonl"), encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            o = json.loads(line)
            corpus[o["_id"]] = (o.get("title", "") + " " + o.get("text", "")).strip()
    queries = {}
    with open(os.path.join(d, "queries.jsonl"), encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            o = json.loads(line)
            queries[o["_id"]] = o["text"]
    qrels = {}
    with open(os.path.join(d, "qrels", split + ".tsv"), encoding="utf-8") as f:
        next(f, None)  # header row; the default keeps an empty file from raising
        for line in f:
            line = line.strip()
            if not line:
                continue
            qid, did, score = line.split("\t")
            if int(score) > 0:
                qrels.setdefault(qid, {})[did] = int(score)
    queries = {q: queries[q] for q in qrels if q in queries}
    return corpus, queries, qrels
bitbudget/embedders.py ADDED
@@ -0,0 +1,46 @@
1
+ """Embedders. A named embedder turns a list of texts into a float32 matrix. These import
2
+ sentence-transformers (torch) and so must run in a process that does NOT also import faiss
3
+ (OpenMP clash on macOS); the CLI keeps embedding and faiss-backed evaluation in separate
4
+ processes. Register your own with ``@embedder``.
5
+ """
6
+ EMBEDDERS = {}
7
+
8
+
9
def embedder(name, model=None, dim=None, query_prompt="", matryoshka=False):
    """Return a decorator that registers a function as the embedder ``name``.

    The decorated function is stored in ``EMBEDDERS`` and annotated with
    ``bitbudget_name``, ``model``, ``dim``, ``query_prompt`` and ``matryoshka``.
    `query_prompt` is prepended to queries only (asymmetric models).
    """
    def register(fn):
        fn.bitbudget_name = name
        fn.model = model
        fn.dim = dim
        fn.query_prompt = query_prompt
        fn.matryoshka = matryoshka
        EMBEDDERS[name] = fn
        return fn
    return register
17
+
18
+
19
def list_embedders():
    """Return the names of all registered embedders in sorted order."""
    names = list(EMBEDDERS)
    names.sort()
    return names
21
+
22
+
23
def _st(model_name, device, max_seq_length):
    """Load ``model_name`` as a SentenceTransformer on ``device``.

    A truthy ``max_seq_length`` overrides the model's own maximum sequence length.
    """
    from sentence_transformers import SentenceTransformer
    # NOTE(security): trust_remote_code=True lets the model repository run its own code.
    model = SentenceTransformer(model_name, device=device, trust_remote_code=True)
    if not max_seq_length:
        return model
    model.max_seq_length = max_seq_length
    return model
29
+
30
+
31
def _make_st_embedder(name, model_name, dim, query_prompt, matryoshka, max_seq_length=256):
    """Register and return a sentence-transformers embedder under ``name``.

    The returned function encodes ``texts`` into a float32 matrix of normalised
    embeddings; when ``is_query`` is true and ``query_prompt`` is non-empty, the prompt
    is prepended to each text first. The model is loaded on each call.
    """
    def fn(texts, is_query=False, device="cpu", batch_size=64):
        import numpy as np
        model = _st(model_name, device, max_seq_length)
        if is_query and query_prompt:
            inputs = [query_prompt + t for t in texts]
        else:
            inputs = texts
        vectors = model.encode(inputs, batch_size=batch_size, normalize_embeddings=True,
                               show_progress_bar=True)
        return vectors.astype(np.float32)

    register = embedder(name, model=model_name, dim=dim, query_prompt=query_prompt,
                        matryoshka=matryoshka)
    return register(fn)
41
+
42
+
43
# built-in embedders used in the paper
_make_st_embedder(
    name="minilm",
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    dim=384,
    query_prompt="",
    matryoshka=False,
)
_make_st_embedder(
    name="mxbai",
    model_name="mixedbread-ai/mxbai-embed-large-v1",
    dim=1024,
    query_prompt="Represent this sentence for searching relevant passages: ",
    matryoshka=True,
)
bitbudget/eval.py ADDED
@@ -0,0 +1,44 @@
1
+ """The BitBudget protocol: given document/query embeddings and judgements, score every
2
+ registered compression method on retrieval quality against the bytes it stores per vector."""
3
+ import numpy as np
4
+
5
+ from .methods import METHODS
6
+ from .metrics import ndcg_at_k, recall_at_k
7
+
8
+
9
def evaluate(demb, qemb, cids, qids, qrels, methods=None, recall_k=10, ndcg_k=10):
    """Score compression methods on one corpus.

    Runs each name in ``methods`` (default: every registered method) and returns a list
    of dicts ``{method, axis, bytes, ndcg, recall, pct_float}`` sorted by ``bytes``.
    nDCG is computed against ``qrels``; recall is measured against the exact float
    scores ``qemb @ demb.T``; ``pct_float`` is the method's nDCG as a percentage of the
    exact-score nDCG (NaN when that baseline is zero).
    """
    names = methods if methods else list(METHODS)
    exact = qemb @ demb.T  # exact float scores = recall ground truth
    base = ndcg_at_k(exact, qids, cids, qrels, ndcg_k)
    results = []
    for name in names:
        fn = METHODS[name]
        scores, bpv = fn(demb, qemb)
        quality = ndcg_at_k(scores, qids, cids, qrels, ndcg_k)
        overlap = recall_at_k(scores, exact, recall_k)
        if base:
            relative = round(100 * quality / base, 1)
        else:
            relative = float("nan")
        results.append({
            "method": name,
            "axis": getattr(fn, "axis", "?"),
            "bytes": float(bpv),
            "ndcg": round(quality, 4),
            "recall": round(overlap, 4),
            "pct_float": relative,
        })
    return sorted(results, key=lambda row: row["bytes"])
28
+
29
+
30
def aggregate(per_corpus):
    """Summarise each method across corpora, given a ``{corpus: rows}`` mapping.

    Returns one dict per method with the mean and (population) standard deviation of
    ``ndcg`` and ``pct_float``, the ``axis`` and ``bytes`` of the method's first row, and
    the number of corpora it appeared in, sorted by ``bytes``.
    """
    grouped = {}
    for rows in per_corpus.values():
        for row in rows:
            if row["method"] not in grouped:
                grouped[row["method"]] = []
            grouped[row["method"]].append(row)

    summary = []
    for name, group in grouped.items():
        ndcg = np.array([row["ndcg"] for row in group])
        pct = np.array([row["pct_float"] for row in group])
        first = group[0]
        summary.append({
            "method": name,
            "axis": first["axis"],
            "bytes": first["bytes"],
            "ndcg": round(float(ndcg.mean()), 3),
            "ndcg_std": round(float(ndcg.std()), 3),
            "pct_float": round(float(pct.mean()), 1),
            "pct_float_std": round(float(pct.std()), 1),
            "n_corpora": len(group),
        })
    return sorted(summary, key=lambda entry: entry["bytes"])
bitbudget/indexes.py ADDED
@@ -0,0 +1,136 @@
1
+ """The organisation axis: build an index over document vectors and measure recall@k, query
2
+ throughput (QPS) and the bytes it stores per vector.
3
+
4
+ This is the axis the compression leaderboard does not cover. A graph index such as HNSW does not
5
+ shrink the footprint, it *adds* bytes (the vectors plus a graph) and buys throughput; a compact
6
+ code such as the bit-trie shrinks it. Reporting recall, QPS and bytes together makes the
7
+ trade-off explicit, in the spirit of ann-benchmarks.
8
+
9
+ Register an index with ``@index``. The faiss-backed indexes (flat/hnsw/ivfpq) import faiss lazily
10
+ and are skipped with a message if faiss is not installed; the bit-trie is numpy-only. Vectors are
11
+ L2-normalised on ingest so inner-product and L2 rankings agree. Run in its own process (no torch)
12
+ to avoid the faiss/OpenMP clash (see README).
13
+ """
14
+ import time
15
+ import numpy as np
16
+
17
+ INDEXES = {}
18
+
19
+
20
def index(name, axis="organisation"):
    """Decorator factory that registers an index under `name`.

    The decorated callable is expected to have the shape
    ``fn(xb, xq, k) -> (topk_indices, bytes_per_vec, qps)``. It is tagged with
    ``bitbudget_name`` and ``axis`` attributes, stored in ``INDEXES[name]`` and
    returned unchanged.
    """
    def register(fn):
        fn.bitbudget_name = name
        fn.axis = axis
        INDEXES[name] = fn
        return fn

    return register
28
+
29
+
30
def list_indexes():
    """Return the names of all registered indexes in sorted order."""
    names = list(INDEXES)
    names.sort()
    return names
32
+
33
+
34
+ def _time(search, repeats=3):
35
+ best, I = float("inf"), None
36
+ for _ in range(repeats):
37
+ t = time.time(); I = search(); best = min(best, time.time() - t)
38
+ return I, best
39
+
40
+
41
+ # ----------------------------------------------------------------- faiss-backed (organisation)
42
@index("flat")
def flat(xb, xq, k):
    """Exact inner-product search with ``faiss.IndexFlatIP``.

    Returns ``(topk_indices, bytes_per_vec, qps)``; the footprint is the full
    float32 vector (4 bytes per dimension). faiss is imported lazily so the
    module loads without it.
    """
    import faiss

    dim = xb.shape[1]
    exact_index = faiss.IndexFlatIP(dim)
    exact_index.add(xb)

    def run():
        _, ids = exact_index.search(xq, k)
        return ids

    ids, seconds = _time(run)
    return ids, 4 * dim, len(xq) / seconds  # full float vectors, exact
49
+
50
+
51
@index("hnsw")
def hnsw(xb, xq, k, M=24, efc=200, efs=128):
    """HNSW graph search over full float vectors (``faiss.IndexHNSWFlat``).

    `M` is the graph degree parameter passed to faiss, `efc` the construction
    beam (``efConstruction``) and `efs` the search beam (``efSearch``). Returns
    ``(topk_indices, bytes_per_vec, qps)``; the footprint is the float32 vector
    plus an approximate allowance of ``M * 2`` four-byte graph edges.
    """
    import faiss

    dim = xb.shape[1]
    graph = faiss.IndexHNSWFlat(dim, M, faiss.METRIC_INNER_PRODUCT)
    graph.hnsw.efConstruction = efc  # must be set before vectors are added
    graph.add(xb)
    graph.hnsw.efSearch = efs

    def run():
        _, ids = graph.search(xq, k)
        return ids

    ids, seconds = _time(run)
    bytes_per_vec = 4 * dim + M * 2 * 4  # vectors + graph edges (both levels approx)
    return ids, bytes_per_vec, len(xq) / seconds
59
+
60
+
61
@index("ivfpq")
def ivfpq(xb, xq, k, nprobe=64):
    """Inverted-file index with product-quantised codes (``faiss.IndexIVFPQ``).

    The number of PQ sub-quantisers `m` is the largest divisor of the dimension
    that does not exceed ``D // 2`` (at least 1), each using 8 bits. The number
    of inverted lists is ``n // 100`` clamped to ``[1, 8192]``, and `nprobe` is
    capped at that list count. Training uses a fixed-seed sample of at most
    200000 vectors.

    Returns ``(topk_indices, bytes_per_vec, qps)`` where the footprint counts
    the PQ code only (``m`` bytes).
    """
    import faiss
    n, D = xb.shape
    # Start from at least 1: D // 2 is 0 for one-dimensional input, and the
    # divisor search below would otherwise evaluate D % 0.
    m = max(1, D // 2)
    while D % m:
        m -= 1
    nlist = max(1, min(8192, n // 100))
    ix = faiss.IndexIVFPQ(faiss.IndexFlatIP(D), D, nlist, m, 8, faiss.METRIC_INNER_PRODUCT)
    samp = xb[np.random.RandomState(0).choice(n, min(200000, n), replace=False)]
    ix.train(samp)
    ix.add(xb)
    ix.nprobe = min(nprobe, nlist)
    I, t = _time(lambda: ix.search(xq, k)[1])
    return I, float(m), len(xq) / t  # PQ code bytes
74
+
75
+
76
+ # ----------------------------------------------------------------- numpy bit-trie (research entry)
77
@index("bittrie", axis="quantisation")
def bittrie(xb, xq, k, nbits=64, beam=128, depth=24):
    """Bit-trie index from ``bitbudget.bittrie``.

    Fits a ``BitTrieIndex`` with `nbits`-bit codes (seed 0), issues a two-query
    warm-up call, then times the full query batch. Returns
    ``(topk_indices, bytes_per_vec, qps)`` with a footprint of ``nbits / 8``
    bytes.
    """
    from .bittrie import BitTrieIndex

    trie = BitTrieIndex(n_bits=nbits, seed=0).fit(xb)
    search_args = dict(topk=k, beam=beam, depth=depth)

    # warm up (compile C kernel once) so the timed runs exclude that cost
    trie.query_batch(xq[:2], **search_args)

    def run():
        return trie.query_batch(xq, **search_args)

    ids, seconds = _time(run, repeats=2)
    # compact routing code (float kept cold for re-rank)
    return ids, nbits / 8.0, len(xq) / seconds
84
+
85
+
86
+ # ----------------------------------------------------------------- harness
87
+ def _exact(xb, xq, k, chunk=200000):
88
+ """Exact top-k by inner product, chunked over documents to bound memory. The chunk shrinks
89
+ as the query count grows so the (nq x chunk) score block stays small regardless of nq."""
90
+ nq = len(xq)
91
+ chunk = max(1024, min(chunk, 20_000_000 // max(1, nq))) # keep nq*chunk ~ 20M elements
92
+ best_s = np.full((nq, k), -1e30, np.float32)
93
+ best_i = np.zeros((nq, k), np.int64)
94
+ for s in range(0, len(xb), chunk):
95
+ e = min(s + chunk, len(xb))
96
+ sc = xq @ xb[s:e].T
97
+ cs = np.concatenate([best_s, sc], 1)
98
+ ci = np.concatenate([best_i, np.broadcast_to(np.arange(s, e), (nq, e - s))], 1)
99
+ part = np.argpartition(-cs, k - 1, axis=1)[:, :k]
100
+ best_s = np.take_along_axis(cs, part, 1)
101
+ best_i = np.take_along_axis(ci, part, 1)
102
+ order = np.argsort(-best_s, 1)
103
+ return np.take_along_axis(best_i, order, 1)
104
+
105
+
106
def bench_indexes(xb, xq, k=10, gt=None, which=None, verbose=True):
    """Build each index and report recall@k, QPS and bytes/vec.

    `xb` / `xq` are the document / query vectors. They are copied to contiguous
    float32 and L2-normalised so the inner-product ground truth is
    metric-consistent across indexes; the caller's arrays are never modified.
    `gt` is an optional precomputed ground-truth index array (computed with
    `_exact` when omitted) and `which` an optional iterable of index names
    (all registered indexes when falsy). Indexes whose import fails are skipped.

    Returns a list of dicts with keys ``method, axis, bytes, recall, qps``,
    sorted by footprint.
    """
    # np.array copies by default. np.ascontiguousarray would hand back the
    # caller's own buffer when it is already contiguous float32, and the
    # in-place normalisation below would then silently rewrite the caller's data.
    xb = np.array(xb, dtype=np.float32, order="C")
    xq = np.array(xq, dtype=np.float32, order="C")
    xb /= (np.linalg.norm(xb, axis=1, keepdims=True) + 1e-9)
    xq /= (np.linalg.norm(xq, axis=1, keepdims=True) + 1e-9)
    if gt is None:
        gt = _exact(xb, xq, k)
    rows = []
    # run faiss-backed indexes before the numpy bit-trie: the bit-trie loads its own OpenMP
    # runtime (libomp) for the C kernel, which clashes with faiss's OpenMP if faiss runs after it
    # in the same process (the macOS two-runtimes problem). Ordering it last avoids the clash.
    names = sorted(which or list_indexes(), key=lambda n: (n == "bittrie", n))
    for name in names:
        fn = INDEXES[name]
        try:
            I, bpv, qps = fn(xb, xq, k)
        except ImportError:
            if verbose:
                print(f" [skip {name}: install bitbudget[faiss]]")
            continue
        # Per-query overlap between returned and ground-truth top-k, averaged.
        rec = float(np.mean([len(set(I[i][:k].tolist()) & set(gt[i][:k].tolist())) / k
                             for i in range(len(xq))]))
        rows.append(dict(method=name, axis=getattr(fn, "axis", "organisation"),
                         bytes=float(bpv), recall=round(rec, 4), qps=round(float(qps), 1)))
        if verbose:
            print(f" {name:14s}{bpv:8.0f}B recall@{k}={rec:.3f} {qps:9.0f} qps")
    rows.sort(key=lambda r: r["bytes"])
    return rows
bitbudget/methods.py ADDED
@@ -0,0 +1,114 @@
1
+ """Compression methods. A method maps float document/query embeddings to a
2
+ query-by-document similarity matrix under some compression, and reports the bytes it stores
3
+ per document vector. Register your own with the ``@method`` decorator.
4
+
5
+ @method("my-method", bits=2)
6
+ def my_method(demb, qemb):
7
+ ...
8
+ return scores, bytes_per_vec # scores: (n_queries, n_docs); bytes: float
9
+
10
+ Everything here is numpy-only so that evaluation runs in the same process as torch embedding
11
+ without the faiss/OpenMP clash (see README). faiss-backed methods live in methods_faiss.py and
12
+ run in a separate evaluation process.
13
+ """
14
+ import numpy as np
15
+
16
+ METHODS = {}
17
+
18
+
19
def method(name, bits=None, axis="quantisation"):
    """Decorator factory that registers a compression method under `name`.

    `bits` and `axis` are leaderboard metadata. The decorated callable is tagged
    with ``bitbudget_name``, ``bits`` and ``axis`` attributes, stored in
    ``METHODS[name]`` and returned unchanged.
    """
    def register(fn):
        fn.bitbudget_name = name
        fn.bits = bits
        fn.axis = axis
        METHODS[name] = fn
        return fn

    return register
28
+
29
+
30
def list_methods():
    """Return the names of all registered compression methods in sorted order."""
    names = list(METHODS)
    names.sort()
    return names
32
+
33
+
34
+ # ---------------------------------------------------------------- baselines / quantisation
35
@method("float32", bits=32, axis="none")
def float32(demb, qemb):
    """Uncompressed baseline: exact inner products, 4 bytes per dimension."""
    scores = qemb @ demb.T
    bytes_per_vec = 4 * demb.shape[1]
    return scores, bytes_per_vec
38
+
39
+
40
@method("int8", bits=8)
def int8(demb, qemb):
    """Per-dimension scalar quantisation of documents to 127 levels either side of zero.

    Each dimension is scaled by its largest absolute document value, rounded to
    the nearest step and scaled back; queries stay in float. Reports one byte
    per dimension.
    """
    peak = np.abs(demb).max(0, keepdims=True)
    step = peak / 127.0 + 1e-9  # epsilon keeps all-zero dimensions from dividing by zero
    dequantised = np.round(demb / step) * step
    return qemb @ dequantised.T, demb.shape[1]
45
+
46
+
47
@method("binary", bits=1)
def binary(demb, qemb):
    """Symmetric 1-bit sign codes: score is the inner product of +/-1 vectors.

    Returns ``(scores, bytes_per_vec)`` with a footprint of one bit per
    dimension (``D / 8`` bytes).
    """
    def to_bits(x):
        # np.sign maps exact zeros to 0, which would make the code three-valued
        # and not storable in the single bit per dimension reported below.
        # Zero is therefore assigned to the +1 bucket.
        x = np.asarray(x)
        return np.where(x < 0, -1, 1).astype(x.dtype)

    return to_bits(qemb) @ to_bits(demb).T, demb.shape[1] / 8.0
50
+
51
+
52
@method("binary+rerank", bits=1)
def binary_rerank(demb, qemb, depth=100):
    """1-bit sign codes for candidate selection, then exact re-scoring.

    For every query the `depth` best documents under the +/-1 code inner
    product are re-scored with the full float vectors; every other document
    keeps the filler score ``-1e9``. Returns ``(scores, bytes_per_vec)`` with a
    footprint of ``D / 8`` bytes.
    """
    def to_bits(x):
        # np.sign maps exact zeros to 0, which a single stored bit cannot
        # represent; zero is assigned to the +1 bucket instead.
        x = np.asarray(x)
        return np.where(x < 0, -1, 1).astype(x.dtype)

    bq, bd = to_bits(qemb), to_bits(demb)
    cand = np.argsort(-(bq @ bd.T), axis=1)[:, :depth]
    scores = np.full((qemb.shape[0], demb.shape[0]), -1e9, dtype=np.float32)
    for i in range(qemb.shape[0]):
        scores[i, cand[i]] = qemb[i] @ demb[cand[i]].T
    return scores, demb.shape[1] / 8.0  # compact code stored; full vectors fetched on re-rank
60
+
61
+
62
@method("rabitq", bits=1)
def rabitq(demb, qemb, seed=0):
    """Sign codes after a seeded random orthogonal rotation.

    The rotation is the Q factor of a QR decomposition of a Gaussian matrix
    drawn from ``RandomState(seed)``. Documents are rotated and reduced to their
    signs; queries are rotated and kept in float. Reports ``D / 8`` bytes.
    """
    dim = demb.shape[1]
    gaussian = np.random.RandomState(seed).randn(dim, dim)
    rotation, _ = np.linalg.qr(gaussian)
    rotation = rotation.astype(np.float32)
    doc_codes = np.sign(demb @ rotation)
    rotated_queries = qemb @ rotation
    return rotated_queries @ doc_codes.T, dim / 8.0
67
+
68
+
69
+ # ---------------------------------------------------------------- product quantisation (numpy)
70
+ def _kmeans(X, k, iters=10, seed=0):
71
+ rng = np.random.RandomState(seed)
72
+ C = X[rng.choice(len(X), size=min(k, len(X)), replace=False)].copy()
73
+ for _ in range(iters):
74
+ d = ((X ** 2).sum(1, keepdims=True) - 2 * X @ C.T + (C ** 2).sum(1))
75
+ a = d.argmin(1)
76
+ for j in range(len(C)):
77
+ m = a == j
78
+ if m.any():
79
+ C[j] = X[m].mean(0)
80
+ return C, a
81
+
82
+
83
@method("pq", bits=8, axis="quantisation")
def pq(demb, qemb, m=None, ksub=256):
    """Product quantisation: m sub-quantisers, `ksub` centroids each (m bytes/vector
    at the default 256). Asymmetric scoring against reconstructed documents.

    When `m` is not given it defaults to ``D // 8``, reduced to the nearest
    smaller divisor of ``D`` when that value does not divide the dimension
    evenly. An explicit `m` must divide ``D``.
    """
    D = demb.shape[1]
    if not m:
        # The plain default D // 8 only divides D when D is a multiple of 8
        # times that quotient; step down to a divisor so any dimension works.
        m = max(1, D // 8)
        while D % m:
            m -= 1
    assert D % m == 0, f"PQ needs D ({D}) divisible by m ({m})"
    sub = D // m
    recon = np.empty_like(demb)
    for s in range(m):
        # Quantise each contiguous block of `sub` dimensions independently.
        sl = slice(s * sub, (s + 1) * sub)
        C, a = _kmeans(demb[:, sl], ksub, seed=s)
        recon[:, sl] = C[a]
    return qemb @ recon.T, float(m)
97
+
98
+
99
+ # ---------------------------------------------------------------- projection axis (dimensions)
100
@method("matryoshka", bits=32, axis="projection")
def matryoshka(demb, qemb, dim=None):
    """Prefix-truncation. Fair only for Matryoshka-trained embedders; on others it is naive.

    Keeps the first `dim` coordinates (a quarter of the dimension when `dim` is
    falsy) of both documents and queries and reports ``4 * dim`` bytes.
    """
    kept = dim or demb.shape[1] // 4
    doc_prefix = demb[:, :kept]
    query_prefix = qemb[:, :kept]
    return query_prefix @ doc_prefix.T, 4 * kept
105
+
106
+
107
@method("pca", bits=32, axis="projection")
def pca(demb, qemb, dim=None):
    """Project onto the top principal components of the document embeddings.

    Centres on the document mean, takes the `dim` leading eigenvectors of the
    document scatter matrix (a quarter of the dimension when `dim` is falsy),
    and scores projected queries against projected documents. Reports
    ``4 * dim`` bytes.
    """
    kept = dim or demb.shape[1] // 4
    centre = demb.mean(0)
    docs_centred = demb - centre
    scatter = docs_centred.T @ docs_centred
    _, eigvecs = np.linalg.eigh(scatter)
    # eigh returns eigenvalues in ascending order, so reverse the columns first.
    basis = eigvecs[:, ::-1][:, :kept]
    projected_queries = (qemb - centre) @ basis
    projected_docs = docs_centred @ basis
    return projected_queries @ projected_docs.T, 4 * kept
bitbudget/metrics.py ADDED
@@ -0,0 +1,37 @@
1
+ """Retrieval metrics. nDCG@k uses graded BEIR judgements; recall@k uses the exact
2
+ floating-point neighbours as ground truth (the ANN-Benchmarks convention)."""
3
+ import numpy as np
4
+
5
+
6
+ def _topk(scores_row, k):
7
+ k = min(k, scores_row.shape[0])
8
+ idx = np.argpartition(-scores_row, k - 1)[:k]
9
+ return idx[np.argsort(-scores_row[idx])]
10
+
11
+
12
def ndcg_at_k(scores, qids, cids, qrels, k=10):
    """Mean nDCG@k over the queries that have judgements in `qrels`.

    `scores[i]` is the score row of query ``qids[i]`` over the documents in
    `cids`; `qrels` maps ``str(query id)`` to a ``{doc id: grade}`` dict. Gains
    are ``2**grade - 1`` with a ``1 / log2(rank + 1)`` discount. Queries without
    judgements are skipped; NaN is returned when none remain.
    """
    discounts = 1.0 / np.log2(np.arange(2, k + 2))
    doc_ids = list(map(str, cids))

    def dcg(grades):
        return np.sum((2 ** grades - 1) * discounts[:len(grades)])

    per_query = []
    for row, qid in enumerate(qids):
        judged = qrels.get(str(qid), {})
        if not judged:
            continue
        ranked = _topk(scores[row], k)
        gains = np.array([judged.get(doc_ids[j], 0) for j in ranked], dtype=float)
        achieved = dcg(gains)
        # Ideal ranking: the judged grades in decreasing order, cut at k.
        ideal = np.sort(np.array(list(judged.values()), dtype=float))[::-1][:k]
        best = dcg(ideal)
        per_query.append(achieved / best if best > 0 else 0.0)

    if not per_query:
        return float("nan")
    return float(np.mean(per_query))
28
+
29
+
30
def recall_at_k(scores, exact_scores, k=10):
    """k-recall@k against the exact floating-point top-k (ground truth from `exact_scores`).

    For each query row, the fraction of the exact top-k documents that also
    appear in the top-k of `scores`, averaged over queries. The denominator is
    the size of the ground-truth set, which equals `k` unless fewer than `k`
    documents exist; a perfect method therefore scores 1.0 on a small corpus
    too. Rows with an empty ground-truth set are skipped, and NaN is returned
    when no row can be scored.
    """
    out = []
    for i in range(scores.shape[0]):
        gt = set(_topk(exact_scores[i], k).tolist())
        if not gt:
            continue  # nothing to recall (k == 0 or no documents)
        got = set(_topk(scores[i], k).tolist())
        out.append(len(gt & got) / float(len(gt)))
    return float(np.mean(out)) if out else float("nan")
@@ -0,0 +1,148 @@
1
+ Metadata-Version: 2.4
2
+ Name: bitbudget
3
+ Version: 0.1.0
4
+ Summary: How much retrieval quality do you keep per byte? A reproducible benchmark for embedding compression.
5
+ Author: Sean Moran
6
+ License: MIT
7
+ Project-URL: Paper, https://arxiv.org/abs/2510.04127
8
+ Project-URL: Leaderboard, https://github.com/sjmoran/bitbudget/blob/main/LEADERBOARD.md
9
+ Keywords: retrieval,embeddings,quantisation,hashing,compression,ANN,RAG
10
+ Requires-Python: >=3.9
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: numpy>=1.21
14
+ Provides-Extra: embed
15
+ Requires-Dist: sentence-transformers>=2.2; extra == "embed"
16
+ Provides-Extra: faiss
17
+ Requires-Dist: faiss-cpu>=1.7.4; extra == "faiss"
18
+ Provides-Extra: all
19
+ Requires-Dist: sentence-transformers>=2.2; extra == "all"
20
+ Requires-Dist: faiss-cpu>=1.7.4; extra == "all"
21
+ Dynamic: license-file
22
+
23
+ # BitBudget
24
+
25
+ **How much retrieval quality do you keep per byte?**
26
+
27
+ BitBudget is a small, reproducible benchmark for **embedding compression**. Give it an
28
+ embedder and a corpus and it reports the retrieval quality (nDCG@10, recall@10) that each
29
+ compression method retains against the **bytes it stores per vector** — the recall‑per‑byte
30
+ frontier that every RAG and vector‑database deployment actually lives on.
31
+
32
+ It is the companion benchmark to the survey *“Projection and Quantisation: A Unifying View of
33
+ Learning to Hash, from Random Projections to the RAG Era”* and exists to answer one question
34
+ that today is mostly answered by vendor blog posts: **when you binarise / int8 / RaBitQ /
35
+ product‑quantise / Matryoshka‑truncate your embeddings, what do you actually lose?**
36
+
37
+ ## The headline finding
38
+
39
+ > **Bits beat dimensions.** Spending a fixed byte budget on *more coarsely quantised*
40
+ > coordinates beats spending it on *fewer full‑precision* coordinates, at every budget and
41
+ > for every embedder we have tried. One‑bit codes with a cheap re‑ranking pass are **32×
42
+ > smaller than float at no measurable loss**.
43
+
44
+ ```
45
+ mxbai‑embed‑large (1024‑d), mean over 4 BEIR corpora
46
+ binary+rerank 128 B nDCG 0.509 100% of float ← 32× smaller, lossless
47
+ pq 128 B nDCG 0.488 96%
48
+ rabitq 128 B nDCG 0.487 96%
49
+ matryoshka 1024 B nDCG 0.439 86% ← 4× smaller, projection axis
50
+ float32 4096 B nDCG 0.508 100%
51
+ ```
52
+
53
+ See **[LEADERBOARD.md](LEADERBOARD.md)** for the full table.
54
+
55
+ ## Install
56
+
57
+ ```bash
58
+ pip install bitbudget # evaluation only (numpy)
59
+ pip install "bitbudget[all]" # + sentence-transformers (embedding) + faiss
60
+ ```
61
+
62
+ ## Quickstart
63
+
64
+ ```bash
65
+ bitbudget methods # list compression methods
66
+ bitbudget run --embedder mxbai --corpus scifact # embed + evaluate, print a results card
67
+ bitbudget leaderboard results/card_*.json # render a markdown leaderboard
68
+
69
+ bitbudget indexes # list indexes (organisation axis)
70
+ bitbudget bench-index --synthetic 100000 128 # recall vs QPS vs bytes: flat/hnsw/ivfpq/bittrie
71
+ ```
72
+
73
+ `run` embeds (torch) and evaluates (numpy) in one process. The corpora auto‑download.
74
+
75
+ ### The organisation axis (`bench-index`)
76
+
77
+ The compression leaderboard answers *quality per byte*; `bench-index` answers the orthogonal
78
+ *recall per query-second*. It builds an index over the document vectors and reports recall@k,
79
+ throughput (QPS) and bytes per vector, so HNSW and IVF‑PQ (which buy throughput and *add* bytes)
80
+ can be compared against compact‑code indexes on one frontier. Run it on synthetic data, on a
81
+ cached embedding (`--embedder mxbai --corpus scifact`), or on your own vectors (`--npz`). The
82
+ faiss‑backed indexes need `pip install bitbudget[faiss]`; the numpy `bittrie` runs without it.
83
+
84
+ The `bittrie` index ships a small C kernel (`_bittrie.c`) for the query hot‑path, compiled on
85
+ first use and cached (no compiler needed to *install* — the wheel stays pure‑Python, and it falls
86
+ back to numpy if no compiler is present). It builds **multithreaded** when OpenMP is available
87
+ (GCC/clang on Linux, Homebrew `libomp` on macOS) and single‑threaded otherwise; results are
88
+ bit‑identical to the numpy path, and recall/footprint are algorithmic and unchanged either way.
89
+
90
+ Because faiss carries its own OpenMP runtime, it cannot share a process with the bit‑trie's
91
+ `libomp` on macOS. `bench-index` therefore runs the faiss indexes and the bit‑trie in **separate
92
+ subprocesses** and merges the results, so a single `bitbudget bench-index ...` works everywhere
93
+ (pass `--no-split` to force one process, e.g. on Linux where both share one OpenMP runtime).
94
+
95
+ > **macOS note.** torch and faiss each bundle their own OpenMP runtime and crash if imported
96
+ > in the same process. The core methods are numpy‑only, so `run` is safe; if you add a
97
+ > faiss‑backed method, run `bitbudget embed` (torch) and `bitbudget eval` (numpy/faiss)
98
+ > as separate processes.
99
+
100
+ ## The protocol (frozen, so results are comparable)
101
+
102
+ - **Corpora:** the BEIR subsets `scifact`, `nfcorpus`, `arguana`, `fiqa` (small enough to run
103
+ on a laptop, diverse enough to be honest). Numbers are the mean over corpora; `±` is the
104
+ standard deviation across them.
105
+ - **Metrics:** `nDCG@10` against the graded BEIR judgements, and `recall@10` against the exact
106
+ floating‑point neighbours. `% of float` is nDCG relative to the uncompressed embedding.
107
+ - **Memory:** bytes stored per document vector (`4D` float, `D` int8, `D/8` binary, `M` for an
108
+ `M`‑byte product code, `4·dim` for a truncated/PCA‑reduced vector).
109
+ - **Embedders:** `minilm` (384‑d) and `mxbai` (1024‑d, Matryoshka) ship built in.
110
+
111
+ ## Add your method in five lines
112
+
113
+ This is the point of the benchmark: drop in your compressor and it is scored against every
114
+ built‑in on the same protocol.
115
+
116
+ ```python
117
+ from bitbudget import method
118
+ import numpy as np
119
+
120
+ @method("my-2bit", bits=2)
121
+ def my_2bit(demb, qemb):
122
+ codes = my_quantise(demb) # your compression
123
+ scores = qemb @ my_reconstruct(codes).T # (queries x docs) similarity
124
+ return scores, demb.shape[1] * 2 / 8 # scores, bytes per stored vector
125
+ ```
126
+
127
+ ```bash
128
+ bitbudget run --embedder mxbai --corpus scifact --methods my-2bit binary+rerank float32
129
+ ```
130
+
131
+ Then open a pull request adding your row to [LEADERBOARD.md](LEADERBOARD.md). See
132
+ [CONTRIBUTING.md](CONTRIBUTING.md).
133
+
134
+ ## Cite
135
+
136
+ If BitBudget helps your work, please cite the survey:
137
+
138
+ ```bibtex
139
+ @article{moran2025projection,
140
+ title = {Projection and Quantisation: A Unifying View of Learning to Hash,
141
+ from Random Projections to the RAG Era},
142
+ author = {Moran, Sean},
143
+ journal = {arXiv preprint arXiv:2510.04127},
144
+ year = {2025}
145
+ }
146
+ ```
147
+
148
+ MIT licensed.
@@ -0,0 +1,17 @@
1
+ bitbudget/__init__.py,sha256=83ZS2fMiaZojifkwbfMYYTXL8SwIHG9GPcL8TykB-sE,1157
2
+ bitbudget/_bittrie.c,sha256=jbFyD_NwvkupJtC9DfgvYz0XuHTZQe4B-hdXI4gj0Hc,4496
3
+ bitbudget/_bittrie_build.py,sha256=3HCcmnfWbVzQD4BVPM9z8aDT8r6N-reEShfRtxgbf6c,3672
4
+ bitbudget/bittrie.py,sha256=3EQa1Qu6VAf8nxm8_sGvD60Iwbzz4ncKM8Ar3xPfIAs,5516
5
+ bitbudget/cli.py,sha256=6dubvJF7yas9ytXm-vyUEhlvzIPFMVuNbGdUjM-slsE,10992
6
+ bitbudget/datasets.py,sha256=0lQJbD3ALfIQF7wzCT-gIkMJ8tD1pm3z3EM4IpB_mOQ,1799
7
+ bitbudget/embedders.py,sha256=_F6pHVRhQaK724317Fc1uIjDmeTnPKaV5nuAzdoZ9Bk,1909
8
+ bitbudget/eval.py,sha256=O3nUaD6plbeOgrgD57EOrkNA54zgfPOrI2w-aapjzR0,2090
9
+ bitbudget/indexes.py,sha256=yQ2O3NzcPoaONdpLagZrNz_AzT8NQcNQN70R4FXnBlM,5830
10
+ bitbudget/methods.py,sha256=D4B66oKXUmq5xezc0w9WHQ9nThOY9dhagbuvWWzxiWk,3909
11
+ bitbudget/metrics.py,sha256=7K2D3lgtpUDsACP6smPjytw5lbckWjE6HRpdzKdzCs8,1450
12
+ bitbudget-0.1.0.dist-info/licenses/LICENSE,sha256=nUSDmf1Ud4HyOItkD93o4_Vs-hHzoheLsIW_BI9zfEo,1067
13
+ bitbudget-0.1.0.dist-info/METADATA,sha256=wIstOwmYomqfT8Chz4mzZefQiwOGNvgLmct3OX3qXiU,6717
14
+ bitbudget-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
15
+ bitbudget-0.1.0.dist-info/entry_points.txt,sha256=f-1IR4Yvl1Zygj95DC9w0LgaSPOIjsFQg6Oz4v8tPeo,49
16
+ bitbudget-0.1.0.dist-info/top_level.txt,sha256=jBmwsZBMtA-eonXvguqSaVB05M8M6W1-UAl3OY6VFls,10
17
+ bitbudget-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ bitbudget = bitbudget.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Sean Moran
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ bitbudget