bitbudget 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bitbudget-0.1.0/LICENSE +21 -0
- bitbudget-0.1.0/PKG-INFO +148 -0
- bitbudget-0.1.0/README.md +126 -0
- bitbudget-0.1.0/pyproject.toml +39 -0
- bitbudget-0.1.0/setup.cfg +4 -0
- bitbudget-0.1.0/src/bitbudget/__init__.py +28 -0
- bitbudget-0.1.0/src/bitbudget/_bittrie.c +92 -0
- bitbudget-0.1.0/src/bitbudget/_bittrie_build.py +94 -0
- bitbudget-0.1.0/src/bitbudget/bittrie.py +108 -0
- bitbudget-0.1.0/src/bitbudget/cli.py +231 -0
- bitbudget-0.1.0/src/bitbudget/datasets.py +49 -0
- bitbudget-0.1.0/src/bitbudget/embedders.py +46 -0
- bitbudget-0.1.0/src/bitbudget/eval.py +44 -0
- bitbudget-0.1.0/src/bitbudget/indexes.py +136 -0
- bitbudget-0.1.0/src/bitbudget/methods.py +114 -0
- bitbudget-0.1.0/src/bitbudget/metrics.py +37 -0
- bitbudget-0.1.0/src/bitbudget.egg-info/PKG-INFO +148 -0
- bitbudget-0.1.0/src/bitbudget.egg-info/SOURCES.txt +22 -0
- bitbudget-0.1.0/src/bitbudget.egg-info/dependency_links.txt +1 -0
- bitbudget-0.1.0/src/bitbudget.egg-info/entry_points.txt +2 -0
- bitbudget-0.1.0/src/bitbudget.egg-info/requires.txt +11 -0
- bitbudget-0.1.0/src/bitbudget.egg-info/top_level.txt +1 -0
- bitbudget-0.1.0/tests/test_indexes.py +64 -0
- bitbudget-0.1.0/tests/test_protocol.py +44 -0
bitbudget-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Sean Moran
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
bitbudget-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: bitbudget
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: How much retrieval quality do you keep per byte? A reproducible benchmark for embedding compression.
|
|
5
|
+
Author: Sean Moran
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Paper, https://arxiv.org/abs/2510.04127
|
|
8
|
+
Project-URL: Leaderboard, https://github.com/sjmoran/bitbudget/blob/main/LEADERBOARD.md
|
|
9
|
+
Keywords: retrieval,embeddings,quantisation,hashing,compression,ANN,RAG
|
|
10
|
+
Requires-Python: >=3.9
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: numpy>=1.21
|
|
14
|
+
Provides-Extra: embed
|
|
15
|
+
Requires-Dist: sentence-transformers>=2.2; extra == "embed"
|
|
16
|
+
Provides-Extra: faiss
|
|
17
|
+
Requires-Dist: faiss-cpu>=1.7.4; extra == "faiss"
|
|
18
|
+
Provides-Extra: all
|
|
19
|
+
Requires-Dist: sentence-transformers>=2.2; extra == "all"
|
|
20
|
+
Requires-Dist: faiss-cpu>=1.7.4; extra == "all"
|
|
21
|
+
Dynamic: license-file
|
|
22
|
+
|
|
23
|
+
# BitBudget
|
|
24
|
+
|
|
25
|
+
**How much retrieval quality do you keep per byte?**
|
|
26
|
+
|
|
27
|
+
BitBudget is a small, reproducible benchmark for **embedding compression**. Give it an
|
|
28
|
+
embedder and a corpus and it reports the retrieval quality (nDCG@10, recall@10) that each
|
|
29
|
+
compression method retains against the **bytes it stores per vector** — the recall‑per‑byte
|
|
30
|
+
frontier that every RAG and vector‑database deployment actually lives on.
|
|
31
|
+
|
|
32
|
+
It is the companion benchmark to the survey *“Projection and Quantisation: A Unifying View of
|
|
33
|
+
Learning to Hash, from Random Projections to the RAG Era”* and exists to answer one question
|
|
34
|
+
that today is mostly answered by vendor blog posts: **when you binarise / int8 / RaBitQ /
|
|
35
|
+
product‑quantise / Matryoshka‑truncate your embeddings, what do you actually lose?**
|
|
36
|
+
|
|
37
|
+
## The headline finding
|
|
38
|
+
|
|
39
|
+
> **Bits beat dimensions.** Spending a fixed byte budget on *more coarsely quantised*
|
|
40
|
+
> coordinates beats spending it on *fewer full‑precision* coordinates, at every budget and
|
|
41
|
+
> for every embedder we have tried. One‑bit codes with a cheap re‑ranking pass are **32×
|
|
42
|
+
> smaller than float at no measurable loss**.
|
|
43
|
+
|
|
44
|
+
```
|
|
45
|
+
mxbai‑embed‑large (1024‑d), mean over 4 BEIR corpora
|
|
46
|
+
binary+rerank 128 B nDCG 0.509 100% of float ← 32× smaller, lossless
|
|
47
|
+
pq 128 B nDCG 0.488 96%
|
|
48
|
+
rabitq 128 B nDCG 0.487 96%
|
|
49
|
+
matryoshka 1024 B nDCG 0.439 86% ← 4× smaller, projection axis
|
|
50
|
+
float32 4096 B nDCG 0.508 100%
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
See **[LEADERBOARD.md](LEADERBOARD.md)** for the full table.
|
|
54
|
+
|
|
55
|
+
## Install
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
pip install bitbudget # evaluation only (numpy)
|
|
59
|
+
pip install "bitbudget[all]" # + sentence-transformers (embedding) + faiss
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Quickstart
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
bitbudget methods # list compression methods
|
|
66
|
+
bitbudget run --embedder mxbai --corpus scifact # embed + evaluate, print a results card
|
|
67
|
+
bitbudget leaderboard results/card_*.json # render a markdown leaderboard
|
|
68
|
+
|
|
69
|
+
bitbudget indexes # list indexes (organisation axis)
|
|
70
|
+
bitbudget bench-index --synthetic 100000 128 # recall vs QPS vs bytes: flat/hnsw/ivfpq/bittrie
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
`run` embeds (torch) and evaluates (numpy) in one process. The corpora auto‑download.
|
|
74
|
+
|
|
75
|
+
### The organisation axis (`bench-index`)
|
|
76
|
+
|
|
77
|
+
The compression leaderboard answers *quality per byte*; `bench-index` answers the orthogonal
|
|
78
|
+
*recall per query-second*. It builds an index over the document vectors and reports recall@k,
|
|
79
|
+
throughput (QPS) and bytes per vector, so HNSW and IVF‑PQ (which buy throughput and *add* bytes)
|
|
80
|
+
can be compared against compact‑code indexes on one frontier. Run it on synthetic data, on a
|
|
81
|
+
cached embedding (`--embedder mxbai --corpus scifact`), or on your own vectors (`--npz`). The
|
|
82
|
+
faiss‑backed indexes need `pip install "bitbudget[faiss]"`; the numpy `bittrie` runs without it.
|
|
83
|
+
|
|
84
|
+
The `bittrie` index ships a small C kernel (`_bittrie.c`) for the query hot‑path, compiled on
|
|
85
|
+
first use and cached (no compiler needed to *install* — the wheel stays pure‑Python, and it falls
|
|
86
|
+
back to numpy if no compiler is present). It builds **multithreaded** when OpenMP is available
|
|
87
|
+
(GCC/clang on Linux, Homebrew `libomp` on macOS) and single‑threaded otherwise; results are
|
|
88
|
+
bit‑identical to the numpy path, and recall/footprint are algorithmic and unchanged either way.
|
|
89
|
+
|
|
90
|
+
Because faiss carries its own OpenMP runtime, it cannot share a process with the bit‑trie's
|
|
91
|
+
`libomp` on macOS. `bench-index` therefore runs the faiss indexes and the bit‑trie in **separate
|
|
92
|
+
subprocesses** and merges the results, so a single `bitbudget bench-index ...` works everywhere
|
|
93
|
+
(pass `--no-split` to force one process, e.g. on Linux where both share one OpenMP runtime).
|
|
94
|
+
|
|
95
|
+
> **macOS note.** torch and faiss each bundle their own OpenMP runtime and crash if imported
|
|
96
|
+
> in the same process. The core methods are numpy‑only, so `run` is safe; if you add a
|
|
97
|
+
> faiss‑backed method, run `bitbudget embed` (torch) and `bitbudget eval` (numpy/faiss)
|
|
98
|
+
> as separate processes.
|
|
99
|
+
|
|
100
|
+
## The protocol (frozen, so results are comparable)
|
|
101
|
+
|
|
102
|
+
- **Corpora:** the BEIR subsets `scifact`, `nfcorpus`, `arguana`, `fiqa` (small enough to run
|
|
103
|
+
on a laptop, diverse enough to be honest). Numbers are the mean over corpora; `±` is the
|
|
104
|
+
standard deviation across them.
|
|
105
|
+
- **Metrics:** `nDCG@10` against the graded BEIR judgements, and `recall@10` against the exact
|
|
106
|
+
floating‑point neighbours. `% of float` is nDCG relative to the uncompressed embedding.
|
|
107
|
+
- **Memory:** bytes stored per document vector (`4D` float, `D` int8, `D/8` binary, `M` for an
|
|
108
|
+
`M`‑byte product code, `4·dim` for a truncated/PCA‑reduced vector).
|
|
109
|
+
- **Embedders:** `minilm` (384‑d) and `mxbai` (1024‑d, Matryoshka) ship built in.
|
|
110
|
+
|
|
111
|
+
## Add your method in five lines
|
|
112
|
+
|
|
113
|
+
This is the point of the benchmark: drop in your compressor and it is scored against every
|
|
114
|
+
built‑in on the same protocol.
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
from bitbudget import method
|
|
118
|
+
import numpy as np
|
|
119
|
+
|
|
120
|
+
@method("my-2bit", bits=2)
|
|
121
|
+
def my_2bit(demb, qemb):
|
|
122
|
+
codes = my_quantise(demb) # your compression
|
|
123
|
+
scores = qemb @ my_reconstruct(codes).T # (queries x docs) similarity
|
|
124
|
+
return scores, demb.shape[1] * 2 / 8 # scores, bytes per stored vector
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
bitbudget run --embedder mxbai --corpus scifact --methods my-2bit binary+rerank float32
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Then open a pull request adding your row to [LEADERBOARD.md](LEADERBOARD.md). See
|
|
132
|
+
[CONTRIBUTING.md](CONTRIBUTING.md).
|
|
133
|
+
|
|
134
|
+
## Cite
|
|
135
|
+
|
|
136
|
+
If BitBudget helps your work, please cite the survey:
|
|
137
|
+
|
|
138
|
+
```bibtex
|
|
139
|
+
@article{moran2025projection,
|
|
140
|
+
title = {Projection and Quantisation: A Unifying View of Learning to Hash,
|
|
141
|
+
from Random Projections to the RAG Era},
|
|
142
|
+
author = {Moran, Sean},
|
|
143
|
+
journal = {arXiv preprint arXiv:2510.04127},
|
|
144
|
+
year = {2025}
|
|
145
|
+
}
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
MIT licensed.
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
# BitBudget
|
|
2
|
+
|
|
3
|
+
**How much retrieval quality do you keep per byte?**
|
|
4
|
+
|
|
5
|
+
BitBudget is a small, reproducible benchmark for **embedding compression**. Give it an
|
|
6
|
+
embedder and a corpus and it reports the retrieval quality (nDCG@10, recall@10) that each
|
|
7
|
+
compression method retains against the **bytes it stores per vector** — the recall‑per‑byte
|
|
8
|
+
frontier that every RAG and vector‑database deployment actually lives on.
|
|
9
|
+
|
|
10
|
+
It is the companion benchmark to the survey *“Projection and Quantisation: A Unifying View of
|
|
11
|
+
Learning to Hash, from Random Projections to the RAG Era”* and exists to answer one question
|
|
12
|
+
that today is mostly answered by vendor blog posts: **when you binarise / int8 / RaBitQ /
|
|
13
|
+
product‑quantise / Matryoshka‑truncate your embeddings, what do you actually lose?**
|
|
14
|
+
|
|
15
|
+
## The headline finding
|
|
16
|
+
|
|
17
|
+
> **Bits beat dimensions.** Spending a fixed byte budget on *more coarsely quantised*
|
|
18
|
+
> coordinates beats spending it on *fewer full‑precision* coordinates, at every budget and
|
|
19
|
+
> for every embedder we have tried. One‑bit codes with a cheap re‑ranking pass are **32×
|
|
20
|
+
> smaller than float at no measurable loss**.
|
|
21
|
+
|
|
22
|
+
```
|
|
23
|
+
mxbai‑embed‑large (1024‑d), mean over 4 BEIR corpora
|
|
24
|
+
binary+rerank 128 B nDCG 0.509 100% of float ← 32× smaller, lossless
|
|
25
|
+
pq 128 B nDCG 0.488 96%
|
|
26
|
+
rabitq 128 B nDCG 0.487 96%
|
|
27
|
+
matryoshka 1024 B nDCG 0.439 86% ← 4× smaller, projection axis
|
|
28
|
+
float32 4096 B nDCG 0.508 100%
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
See **[LEADERBOARD.md](LEADERBOARD.md)** for the full table.
|
|
32
|
+
|
|
33
|
+
## Install
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install bitbudget # evaluation only (numpy)
|
|
37
|
+
pip install "bitbudget[all]" # + sentence-transformers (embedding) + faiss
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Quickstart
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
bitbudget methods # list compression methods
|
|
44
|
+
bitbudget run --embedder mxbai --corpus scifact # embed + evaluate, print a results card
|
|
45
|
+
bitbudget leaderboard results/card_*.json # render a markdown leaderboard
|
|
46
|
+
|
|
47
|
+
bitbudget indexes # list indexes (organisation axis)
|
|
48
|
+
bitbudget bench-index --synthetic 100000 128 # recall vs QPS vs bytes: flat/hnsw/ivfpq/bittrie
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
`run` embeds (torch) and evaluates (numpy) in one process. The corpora auto‑download.
|
|
52
|
+
|
|
53
|
+
### The organisation axis (`bench-index`)
|
|
54
|
+
|
|
55
|
+
The compression leaderboard answers *quality per byte*; `bench-index` answers the orthogonal
|
|
56
|
+
*recall per query-second*. It builds an index over the document vectors and reports recall@k,
|
|
57
|
+
throughput (QPS) and bytes per vector, so HNSW and IVF‑PQ (which buy throughput and *add* bytes)
|
|
58
|
+
can be compared against compact‑code indexes on one frontier. Run it on synthetic data, on a
|
|
59
|
+
cached embedding (`--embedder mxbai --corpus scifact`), or on your own vectors (`--npz`). The
|
|
60
|
+
faiss‑backed indexes need `pip install "bitbudget[faiss]"`; the numpy `bittrie` runs without it.
|
|
61
|
+
|
|
62
|
+
The `bittrie` index ships a small C kernel (`_bittrie.c`) for the query hot‑path, compiled on
|
|
63
|
+
first use and cached (no compiler needed to *install* — the wheel stays pure‑Python, and it falls
|
|
64
|
+
back to numpy if no compiler is present). It builds **multithreaded** when OpenMP is available
|
|
65
|
+
(GCC/clang on Linux, Homebrew `libomp` on macOS) and single‑threaded otherwise; results are
|
|
66
|
+
bit‑identical to the numpy path, and recall/footprint are algorithmic and unchanged either way.
|
|
67
|
+
|
|
68
|
+
Because faiss carries its own OpenMP runtime, it cannot share a process with the bit‑trie's
|
|
69
|
+
`libomp` on macOS. `bench-index` therefore runs the faiss indexes and the bit‑trie in **separate
|
|
70
|
+
subprocesses** and merges the results, so a single `bitbudget bench-index ...` works everywhere
|
|
71
|
+
(pass `--no-split` to force one process, e.g. on Linux where both share one OpenMP runtime).
|
|
72
|
+
|
|
73
|
+
> **macOS note.** torch and faiss each bundle their own OpenMP runtime and crash if imported
|
|
74
|
+
> in the same process. The core methods are numpy‑only, so `run` is safe; if you add a
|
|
75
|
+
> faiss‑backed method, run `bitbudget embed` (torch) and `bitbudget eval` (numpy/faiss)
|
|
76
|
+
> as separate processes.
|
|
77
|
+
|
|
78
|
+
## The protocol (frozen, so results are comparable)
|
|
79
|
+
|
|
80
|
+
- **Corpora:** the BEIR subsets `scifact`, `nfcorpus`, `arguana`, `fiqa` (small enough to run
|
|
81
|
+
on a laptop, diverse enough to be honest). Numbers are the mean over corpora; `±` is the
|
|
82
|
+
standard deviation across them.
|
|
83
|
+
- **Metrics:** `nDCG@10` against the graded BEIR judgements, and `recall@10` against the exact
|
|
84
|
+
floating‑point neighbours. `% of float` is nDCG relative to the uncompressed embedding.
|
|
85
|
+
- **Memory:** bytes stored per document vector (`4D` float, `D` int8, `D/8` binary, `M` for an
|
|
86
|
+
`M`‑byte product code, `4·dim` for a truncated/PCA‑reduced vector).
|
|
87
|
+
- **Embedders:** `minilm` (384‑d) and `mxbai` (1024‑d, Matryoshka) ship built in.
|
|
88
|
+
|
|
89
|
+
## Add your method in five lines
|
|
90
|
+
|
|
91
|
+
This is the point of the benchmark: drop in your compressor and it is scored against every
|
|
92
|
+
built‑in on the same protocol.
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
from bitbudget import method
|
|
96
|
+
import numpy as np
|
|
97
|
+
|
|
98
|
+
@method("my-2bit", bits=2)
|
|
99
|
+
def my_2bit(demb, qemb):
|
|
100
|
+
codes = my_quantise(demb) # your compression
|
|
101
|
+
scores = qemb @ my_reconstruct(codes).T # (queries x docs) similarity
|
|
102
|
+
return scores, demb.shape[1] * 2 / 8 # scores, bytes per stored vector
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
bitbudget run --embedder mxbai --corpus scifact --methods my-2bit binary+rerank float32
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Then open a pull request adding your row to [LEADERBOARD.md](LEADERBOARD.md). See
|
|
110
|
+
[CONTRIBUTING.md](CONTRIBUTING.md).
|
|
111
|
+
|
|
112
|
+
## Cite
|
|
113
|
+
|
|
114
|
+
If BitBudget helps your work, please cite the survey:
|
|
115
|
+
|
|
116
|
+
```bibtex
|
|
117
|
+
@article{moran2025projection,
|
|
118
|
+
title = {Projection and Quantisation: A Unifying View of Learning to Hash,
|
|
119
|
+
from Random Projections to the RAG Era},
|
|
120
|
+
author = {Moran, Sean},
|
|
121
|
+
journal = {arXiv preprint arXiv:2510.04127},
|
|
122
|
+
year = {2025}
|
|
123
|
+
}
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
MIT licensed.
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=64"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "bitbudget"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "How much retrieval quality do you keep per byte? A reproducible benchmark for embedding compression."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Sean Moran" }]
|
|
13
|
+
keywords = ["retrieval", "embeddings", "quantisation", "hashing", "compression", "ANN", "RAG"]
|
|
14
|
+
dependencies = [
|
|
15
|
+
"numpy>=1.21",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[project.optional-dependencies]
|
|
19
|
+
# embedding a corpus (torch); kept optional so evaluation-only installs stay light and
|
|
20
|
+
# avoid importing torch alongside faiss (they clash on macOS OpenMP -- see README).
|
|
21
|
+
embed = ["sentence-transformers>=2.2"]
|
|
22
|
+
# faiss-backed methods (graph/IVF) run in their own evaluation process.
|
|
23
|
+
faiss = ["faiss-cpu>=1.7.4"]
|
|
24
|
+
all = ["sentence-transformers>=2.2", "faiss-cpu>=1.7.4"]
|
|
25
|
+
|
|
26
|
+
[project.scripts]
|
|
27
|
+
bitbudget = "bitbudget.cli:main"
|
|
28
|
+
|
|
29
|
+
[project.urls]
|
|
30
|
+
Paper = "https://arxiv.org/abs/2510.04127"
|
|
31
|
+
Leaderboard = "https://github.com/sjmoran/bitbudget/blob/main/LEADERBOARD.md"
|
|
32
|
+
|
|
33
|
+
[tool.setuptools.packages.find]
|
|
34
|
+
where = ["src"]
|
|
35
|
+
|
|
36
|
+
# Ship the bit-trie C kernel as data (compiled on demand at runtime, see _bittrie_build.py).
|
|
37
|
+
# The wheel stays pure-Python (py3-none-any); no compiler is needed to install.
|
|
38
|
+
[tool.setuptools.package-data]
|
|
39
|
+
bitbudget = ["_bittrie.c"]
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""BitBudget: how much retrieval quality do you keep per byte?
|
|
2
|
+
|
|
3
|
+
A reproducible benchmark for embedding compression. Given an embedder and a corpus, it
|
|
4
|
+
measures the retrieval quality (nDCG@10, recall@10) retained by each compression method
|
|
5
|
+
against the bytes it stores per vector -- the recall-per-byte frontier.
|
|
6
|
+
|
|
7
|
+
Add your own method in five lines::
|
|
8
|
+
|
|
9
|
+
from bitbudget import method
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
@method("my-2bit", bits=2)
|
|
13
|
+
def my_2bit(demb, qemb):
|
|
14
|
+
# return (query x doc similarity scores, bytes per stored vector)
|
|
15
|
+
codes = my_quantise(demb)
|
|
16
|
+
return qemb @ my_dequantise(codes).T, demb.shape[1] * 2 / 8
|
|
17
|
+
|
|
18
|
+
Then ``bitbudget run --embedder mxbai --corpus scifact`` scores it alongside the built-ins.
|
|
19
|
+
"""
|
|
20
|
+
from .methods import method, METHODS, list_methods
|
|
21
|
+
from .embedders import embedder, EMBEDDERS, list_embedders
|
|
22
|
+
from .eval import evaluate
|
|
23
|
+
from .indexes import index, INDEXES, list_indexes, bench_indexes
|
|
24
|
+
|
|
25
|
+
__version__ = "0.1.0"
|
|
26
|
+
__all__ = ["method", "METHODS", "list_methods", "embedder", "EMBEDDERS",
|
|
27
|
+
"list_embedders", "evaluate", "index", "INDEXES", "list_indexes",
|
|
28
|
+
"bench_indexes", "__version__"]
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
/* Fast batched bit-trie query: coarse-to-fine beam descent over sorted codes, then a
|
|
2
|
+
* full-precision re-ranking pass, looped over all queries in C. Plain C (no Python.h);
|
|
3
|
+
* loaded via ctypes from _bittrie_build.py and compiled on demand. Falls back to the
|
|
4
|
+
* numpy/Python path in bittrie.py if no compiler is available. */
|
|
5
|
+
#include <stdint.h>
|
|
6
|
+
#include <stdlib.h>
|
|
7
|
+
#include <string.h>
|
|
8
|
+
#include <math.h>
|
|
9
|
+
|
|
10
|
+
typedef struct { double cost; long lo, hi; unsigned long long pre; } Node;
|
|
11
|
+
|
|
12
|
+
static int cmp_node(const void *a, const void *b) {
|
|
13
|
+
double d = ((const Node *)a)->cost - ((const Node *)b)->cost;
|
|
14
|
+
return (d > 0) - (d < 0);
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
/* Binary search over the sorted slice a[lo, hi): returns the first index whose value is
 * >= key, or hi when every element is smaller. An empty slice returns lo. */
static long lower_bound_u64(const uint64_t *a, long lo, long hi, uint64_t key) {
    long first = lo;
    long count = hi - lo;
    while (count > 0) {
        long half = count / 2;
        if (a[first + half] < key) {
            /* everything up to and including the probe is too small */
            first += half + 1;
            count -= half + 1;
        } else {
            count = half;
        }
    }
    return first;
}
|
|
21
|
+
|
|
22
|
+
/* out: nq*k int64 of original doc ids (top-k by inner product), -1 padding if fewer found. */
|
|
23
|
+
/* Each query is independent and writes a disjoint slice of `out`, so the loop parallelises
|
|
24
|
+
* cleanly. Scratch buffers are allocated per thread inside the parallel region; when compiled
|
|
25
|
+
* without OpenMP the `omp` pragmas are ignored and this runs as one block, single-threaded. */
|
|
26
|
+
void bt_query_batch(const uint64_t *codes, const int64_t *docids,
|
|
27
|
+
const float *Xf, const float *qproj, const float *qf,
|
|
28
|
+
int n, int d, int b, int nq, int depth, int beam, int k,
|
|
29
|
+
int64_t *out) {
|
|
30
|
+
if (depth > b) depth = b;
|
|
31
|
+
if (beam < 1) beam = 1;
|
|
32
|
+
int cap = 2 * beam + 4;
|
|
33
|
+
|
|
34
|
+
#pragma omp parallel
|
|
35
|
+
{
|
|
36
|
+
Node *cur = (Node *)malloc(sizeof(Node) * cap);
|
|
37
|
+
Node *nxt = (Node *)malloc(sizeof(Node) * 2 * cap);
|
|
38
|
+
double *bscore = (double *)malloc(sizeof(double) * k);
|
|
39
|
+
long *bid = (long *)malloc(sizeof(long) * k);
|
|
40
|
+
|
|
41
|
+
#pragma omp for schedule(static)
|
|
42
|
+
for (int qi = 0; qi < nq; qi++) {
|
|
43
|
+
const float *qp = qproj + (size_t)qi * b;
|
|
44
|
+
const float *qv = qf + (size_t)qi * d;
|
|
45
|
+
int ncur = 1;
|
|
46
|
+
cur[0].cost = 0.0; cur[0].lo = 0; cur[0].hi = n; cur[0].pre = 0ULL;
|
|
47
|
+
|
|
48
|
+
for (int t = 0; t < depth; t++) {
|
|
49
|
+
int shift = b - t - 1;
|
|
50
|
+
int qb = qp[t] > 0.0f ? 1 : 0;
|
|
51
|
+
double c = fabs((double)qp[t]);
|
|
52
|
+
int nn = 0;
|
|
53
|
+
for (int i = 0; i < ncur; i++) {
|
|
54
|
+
long lo = cur[i].lo, hi = cur[i].hi;
|
|
55
|
+
unsigned long long pre = cur[i].pre;
|
|
56
|
+
unsigned long long hi_part = (b - t) >= 64 ? 0ULL : (pre << (b - t));
|
|
57
|
+
unsigned long long thresh = hi_part | (1ULL << shift);
|
|
58
|
+
long m = lower_bound_u64(codes, lo, hi, (uint64_t)thresh);
|
|
59
|
+
if (m > lo) { nxt[nn].cost = cur[i].cost + (qb == 0 ? 0.0 : c);
|
|
60
|
+
nxt[nn].lo = lo; nxt[nn].hi = m; nxt[nn].pre = (pre << 1); nn++; }
|
|
61
|
+
if (hi > m) { nxt[nn].cost = cur[i].cost + (qb == 1 ? 0.0 : c);
|
|
62
|
+
nxt[nn].lo = m; nxt[nn].hi = hi; nxt[nn].pre = (pre << 1) | 1ULL; nn++; }
|
|
63
|
+
}
|
|
64
|
+
qsort(nxt, nn, sizeof(Node), cmp_node);
|
|
65
|
+
int keep = nn < beam ? nn : beam;
|
|
66
|
+
memcpy(cur, nxt, sizeof(Node) * keep);
|
|
67
|
+
ncur = keep;
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
for (int j = 0; j < k; j++) { bscore[j] = -1e300; bid[j] = -1; }
|
|
71
|
+
for (int i = 0; i < ncur; i++) {
|
|
72
|
+
for (long j = cur[i].lo; j < cur[i].hi; j++) {
|
|
73
|
+
long id = (long)docids[j];
|
|
74
|
+
const float *xr = Xf + (size_t)id * d;
|
|
75
|
+
double s = 0.0;
|
|
76
|
+
for (int dd = 0; dd < d; dd++) s += (double)qv[dd] * (double)xr[dd];
|
|
77
|
+
int mn = 0;
|
|
78
|
+
for (int t2 = 1; t2 < k; t2++) if (bscore[t2] < bscore[mn]) mn = t2;
|
|
79
|
+
if (s > bscore[mn]) { bscore[mn] = s; bid[mn] = id; }
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
for (int a = 0; a < k; a++) { /* selection-sort the k winners, score descending */
|
|
83
|
+
int mx = a;
|
|
84
|
+
for (int b2 = a + 1; b2 < k; b2++) if (bscore[b2] > bscore[mx]) mx = b2;
|
|
85
|
+
double ts = bscore[a]; bscore[a] = bscore[mx]; bscore[mx] = ts;
|
|
86
|
+
long ti = bid[a]; bid[a] = bid[mx]; bid[mx] = ti;
|
|
87
|
+
}
|
|
88
|
+
for (int j = 0; j < k; j++) out[(size_t)qi * k + j] = bid[j];
|
|
89
|
+
}
|
|
90
|
+
free(cur); free(nxt); free(bscore); free(bid);
|
|
91
|
+
}
|
|
92
|
+
}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Compile the bit-trie C kernel on demand and load it via ctypes.
|
|
2
|
+
|
|
3
|
+
The package ships pure-Python on PyPI (the .c is data, not a built extension), so install never
|
|
4
|
+
needs a compiler. The first time the fast path is requested we compile _bittrie.c into a cached
|
|
5
|
+
shared library with the system C compiler; if no compiler is available we return None and the
|
|
6
|
+
caller falls back to the numpy/Python path. The compiled library is cached by source hash, so the
|
|
7
|
+
build happens once per machine.
|
|
8
|
+
"""
|
|
9
|
+
import ctypes
|
|
10
|
+
import hashlib
|
|
11
|
+
import os
|
|
12
|
+
import subprocess
|
|
13
|
+
|
|
14
|
+
_LIB = None
|
|
15
|
+
_TRIED = False
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _cache_dir():
    """Return the directory used for cached kernel builds, creating it if needed.

    A non-empty ``BITBUDGET_CACHE`` environment variable takes precedence; otherwise the
    directory is ``~/.cache/bitbudget``.
    """
    override = os.environ.get("BITBUDGET_CACHE")
    if override:
        path = override
    else:
        path = os.path.join(os.path.expanduser("~"), ".cache", "bitbudget")
    os.makedirs(path, exist_ok=True)
    return path
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _brew_libomp():
    """Return the Homebrew libomp prefix on macOS, or None."""
    # Best-effort probe: a missing `brew`, a failing call or a non-directory answer all
    # simply mean "no Homebrew libomp here".
    try:
        proc = subprocess.run(["brew", "--prefix", "libomp"], capture_output=True, text=True)
        candidate = proc.stdout.strip()
        if proc.returncode != 0 or not candidate:
            return None
        return candidate if os.path.isdir(candidate) else None
    except Exception:
        return None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _variants():
    """(cflags, lflags) compile attempts, OpenMP first, single-thread C last."""
    attempts = []
    prefix = _brew_libomp()
    if prefix:
        # macOS clang + Homebrew libomp (tried first)
        cflags = ["-Xpreprocessor", "-fopenmp", f"-I{prefix}/include"]
        lflags = [f"-L{prefix}/lib", "-lomp", f"-Wl,-rpath,{prefix}/lib"]
        attempts.append((cflags, lflags))
    attempts += [
        (["-fopenmp"], ["-fopenmp"]),  # gcc / OpenMP-capable clang (Linux)
        ([], []),                      # no OpenMP: single-threaded C
    ]
    return attempts
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _set_sig(lib):
    """Declare the ctypes prototype of ``bt_query_batch`` on ``lib`` and return ``lib``.

    The prototype is five input pointers (uint64 codes, int64 ids, three float arrays),
    seven C ints, and an int64 output pointer; the function returns nothing.
    """
    fn = lib.bt_query_batch
    uint64_p = ctypes.POINTER(ctypes.c_uint64)
    int64_p = ctypes.POINTER(ctypes.c_int64)
    float_p = ctypes.POINTER(ctypes.c_float)
    fn.restype = None
    fn.argtypes = (
        [uint64_p, int64_p, float_p, float_p, float_p]
        + [ctypes.c_int] * 7
        + [int64_p]
    )
    return lib
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def get_lib():
    """Return the loaded ctypes library exposing bt_query_batch, or None if unavailable.

    Tries to build a multithreaded (OpenMP) kernel first, falling back to single-threaded C,
    then to None (the caller uses the numpy/Python path). The compiled library is cached by
    source hash; set BITBUDGET_NO_OMP=1 to force the single-threaded build. That build is
    cached under its own file name, so a previously cached OpenMP build is never loaded in
    its place.

    The outcome (including failure) is remembered for the rest of the process, so the build
    is attempted at most once per process.
    """
    global _LIB, _TRIED
    if _TRIED:
        return _LIB
    _TRIED = True

    def _discard(path):
        # Best-effort cleanup of a stale or unusable artefact.
        try:
            os.remove(path)
        except OSError:
            pass

    src = os.path.join(os.path.dirname(__file__), "_bittrie.c")
    try:
        with open(src, "rb") as fh:
            digest = hashlib.sha1(fh.read()).hexdigest()[:12]
        cache = _cache_dir()
    except OSError:
        # Missing source or unusable cache directory: behave as "no kernel available".
        return None

    no_omp = bool(os.environ.get("BITBUDGET_NO_OMP"))
    stem = f"_bittrie_{digest}" + ("_noomp" if no_omp else "")
    so = os.path.join(cache, stem + ".so")

    if os.path.exists(so):  # cached: load directly
        try:
            _LIB = _set_sig(ctypes.CDLL(so))
            return _LIB
        except (OSError, AttributeError):
            _discard(so)

    cc = os.environ.get("CC") or "cc"
    variants = [([], [])] if no_omp else _variants()
    # Build into a per-process temporary name and rename into place afterwards, so another
    # process never sees (or loads) a partially written library at the cache path.
    tmp = os.path.join(cache, f"{stem}.{os.getpid()}.tmp.so")
    for cflags, lflags in variants:
        cmd = [cc, "-O3", "-shared", "-fPIC", *cflags, "-o", tmp, src, *lflags, "-lm"]
        try:
            subprocess.run(cmd, check=True, capture_output=True)
        except (OSError, subprocess.SubprocessError):
            _discard(tmp)
            continue
        try:  # must actually load (libomp present at runtime)
            lib = _set_sig(ctypes.CDLL(tmp))
        except (OSError, AttributeError):
            _discard(tmp)
            continue
        try:
            os.replace(tmp, so)
        except OSError:
            # The library is already loaded and usable; only the cache entry is lost.
            _discard(tmp)
        _LIB = lib
        return _LIB
    return None
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""Bit-trie index: a van Emde Boas / PATRICIA-style radix trie over compact codes.
|
|
2
|
+
|
|
3
|
+
The sorted array of packed sign-codes *is* the trie: a node is a contiguous range of codes
|
|
4
|
+
sharing a bit prefix. Search is coarse-to-fine beam descent (the most discriminative bit is the
|
|
5
|
+
shallowest), then a re-ranking pass. Unsupervised (random rotation + sign), numpy-only, one knob
|
|
6
|
+
(``beam``). It stores compact codes rather than full vectors, so it sits on the organisation axis
|
|
7
|
+
at a fraction of a graph's footprint. Research entry to the BitBudget index benchmark; see the
|
|
8
|
+
projection-quantisation-organisation survey for the motivation.
|
|
9
|
+
|
|
10
|
+
This reference implementation packs into a single uint64, so ``n_bits <= 64``.
|
|
11
|
+
"""
|
|
12
|
+
import numpy as np
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class BitTrieIndex:
    """Implicit bit-trie over sorted sign codes, searched by beam descent plus re-ranking.

    ``fit`` projects the vectors with a seeded random matrix, keeps the sign of each
    projection as one bit (highest-variance projection in the most significant position),
    packs the bits into one uint64 per vector and sorts the codes. A trie node is then a
    contiguous range of the sorted array sharing a bit prefix.

    Attributes set by ``fit``: ``R`` (d x n_bits projection), ``codes`` (sorted uint64),
    ``docids`` (original row of each sorted code), ``X`` (the float32 input), ``n``, ``d``.
    """

    def __init__(self, n_bits=64, seed=0):
        # Explicit check instead of a bare ``assert`` so it is still enforced under
        # ``python -O``; the exception type is unchanged for existing callers.
        if n_bits > 64:
            raise AssertionError("this reference packs into one uint64; use a word-array for >64 bits")
        self.b = n_bits
        self.seed = seed

    def fit(self, X):
        """Build the index over the rows of ``X`` (converted to float32) and return ``self``."""
        X = np.ascontiguousarray(X, dtype=np.float32)
        n, d = X.shape
        rng = np.random.RandomState(self.seed)
        R = rng.randn(d, self.b).astype(np.float32)        # unsupervised random rotation (RaBitQ)
        # Orthonormalise the columns only when there are at most d of them.
        R, _ = np.linalg.qr(R) if self.b <= d else (R, None)
        P = X @ R
        order = np.argsort(-P.var(axis=0))                 # coarse-to-fine: MSB = highest variance
        self.R = R[:, order]
        P = P[:, order]
        bits = (P > 0).astype(np.uint64)
        weights = (np.uint64(1) << np.arange(self.b - 1, -1, -1, dtype=np.uint64))
        codes = bits @ weights                             # pack: column 0 is the top bit
        order_idx = np.argsort(codes, kind="stable")
        self.codes = codes[order_idx]                      # sorted: the implicit trie
        self.docids = order_idx.astype(np.int64)
        self.X = X                                         # kept only for optional float re-rank (cold)
        self.n, self.d = n, d
        return self

    def _descend(self, qbits, qconf, beam, depth):
        """Confidence-ordered beam over the sorted codes. Stop at ``depth`` bits so each surviving
        prefix is still a fat bucket (the coarse-to-fine / anytime property).

        ``depth`` is clamped to the code length and ``beam`` to at least 1, matching the C
        kernel. Returns a list of ``(cost, lo, hi, prefix_int, depth)`` tuples, cheapest
        first, where ``[lo, hi)`` is a range of ``self.codes``.
        """
        b = self.b
        # Without the clamp, depth > b produces a negative shift below (ValueError); this
        # happened with the default depth whenever n_bits < 28.
        depth = min(int(depth), b)
        beam = max(int(beam), 1)
        state = [(0.0, 0, self.n, 0, 0)]                   # (cost, lo, hi, prefix_int, depth)
        for t in range(depth):
            shift = b - t - 1
            qb = int(qbits[t])
            c = float(qconf[t])
            nxt = []
            for cost, lo, hi, pre, _ in state:
                # Smallest code carrying this prefix with bit t set: splits the range in two.
                thresh = (pre << (b - t)) | (1 << shift)
                m = lo + int(np.searchsorted(self.codes[lo:hi], np.uint64(thresh), "left"))
                left = (lo, m, (pre << 1))
                right = (m, hi, (pre << 1) | 1)
                for child_bit, (clo, chi, cpre) in ((0, left), (1, right)):
                    if chi <= clo:
                        continue
                    add = 0.0 if child_bit == qb else c    # sibling costs its margin
                    nxt.append((cost + add, clo, chi, cpre, t + 1))
            nxt.sort(key=lambda e: e[0])
            state = nxt[:beam]
        return state

    def query(self, q, topk=10, beam=64, depth=28, rerank="float"):
        """Return up to ``topk`` original row ids for query ``q``, best first.

        ``rerank="float"`` scores candidates by inner product with the stored float vectors;
        any other value scores the float query against the +/-1 sign codes instead.
        """
        q = np.asarray(q, dtype=np.float32)
        p = q @ self.R
        qbits = (p > 0).astype(int)
        qconf = np.abs(p)
        ranges = self._descend(qbits, qconf, beam, depth)
        cand = np.concatenate([self.docids[lo:hi] for _, lo, hi, _, _ in ranges]) if ranges else np.empty(0, np.int64)
        if cand.size == 0:
            return cand
        if rerank == "float":                              # DiskANN-style: cold float for re-rank only
            s = self.X[cand] @ q
        else:                                              # asymmetric: float query vs sign code, no float
            shifts = np.arange(self.b - 1, -1, -1, dtype=np.uint64)
            sign = np.where(((self.codes_full(cand)[:, None] >> shifts) & np.uint64(1)) > 0, 1.0, -1.0)
            s = sign @ p
        return cand[np.argsort(-s)[:topk]]

    def query_batch(self, Q, topk=10, beam=64, depth=28):
        """Top-k for every query in Q. Uses the compiled C kernel when available (one to two
        orders of magnitude faster), else falls back to the pure-Python per-query path.

        Returns an int64 array of shape ``(len(Q), topk)``; rows with fewer than ``topk``
        hits are padded with -1 on both paths.
        """
        Q = np.ascontiguousarray(Q, dtype=np.float32)
        depth = min(depth, self.b)
        topk = max(int(topk), 0)
        out = np.full((len(Q), topk), -1, dtype=np.int64)
        if out.size == 0:
            # No queries or no results requested: never hand zero-length buffers to C.
            return out
        from ._bittrie_build import get_lib
        lib = get_lib()
        if lib is None:
            for row, q in enumerate(Q):
                ids = self.query(q, topk, beam, depth, rerank="float")
                out[row, :len(ids)] = ids
            return out
        import ctypes
        qproj = np.ascontiguousarray(Q @ self.R, dtype=np.float32)
        Xf = np.ascontiguousarray(self.X, dtype=np.float32)
        cast = lambda a, t: a.ctypes.data_as(ctypes.POINTER(t))
        lib.bt_query_batch(
            cast(self.codes, ctypes.c_uint64), cast(self.docids, ctypes.c_int64),
            cast(Xf, ctypes.c_float), cast(qproj, ctypes.c_float), cast(Q, ctypes.c_float),
            self.n, self.d, self.b, len(Q), int(depth), int(beam), int(topk),
            cast(out, ctypes.c_int64))
        return out

    def codes_full(self, cand):
        """Return the packed codes of the original row ids in ``cand``."""
        # docids maps sorted position -> original row; invert it to look codes up by row.
        inv = np.empty(self.n, np.int64)
        inv[self.docids] = np.arange(self.n)
        return self.codes[inv[cand]]

    def index_bytes(self):
        """Bytes that must be in RAM to route (codes + leaf ids); the trie is implicit in the sort."""
        return self.codes.nbytes + self.docids.nbytes
|