bloom-torch 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bloom_torch-1.0.0/LICENSE +21 -0
- bloom_torch-1.0.0/PKG-INFO +73 -0
- bloom_torch-1.0.0/README.md +45 -0
- bloom_torch-1.0.0/pyproject.toml +52 -0
- bloom_torch-1.0.0/setup.cfg +4 -0
- bloom_torch-1.0.0/src/bloom_torch/__init__.py +25 -0
- bloom_torch-1.0.0/src/bloom_torch/bloom_hash.py +145 -0
- bloom_torch-1.0.0/src/bloom_torch/bloom_kmeans.py +571 -0
- bloom_torch-1.0.0/src/bloom_torch/bloom_matrix.py +421 -0
- bloom_torch-1.0.0/src/bloom_torch/py.typed +0 -0
- bloom_torch-1.0.0/src/bloom_torch.egg-info/PKG-INFO +73 -0
- bloom_torch-1.0.0/src/bloom_torch.egg-info/SOURCES.txt +13 -0
- bloom_torch-1.0.0/src/bloom_torch.egg-info/dependency_links.txt +1 -0
- bloom_torch-1.0.0/src/bloom_torch.egg-info/requires.txt +5 -0
- bloom_torch-1.0.0/src/bloom_torch.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 bloom-torch contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: bloom-torch
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Bloom-filter–accelerated clustering and set structures in PyTorch (not related to the BLOOM language model).
|
|
5
|
+
Author: bloom-torch contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/your-org/bloom-torch
|
|
8
|
+
Project-URL: Documentation, https://github.com/your-org/bloom-torch#readme
|
|
9
|
+
Project-URL: Repository, https://github.com/your-org/bloom-torch
|
|
10
|
+
Project-URL: Issues, https://github.com/your-org/bloom-torch/issues
|
|
11
|
+
Keywords: bloom-filter,pytorch,clustering,k-means,bloom-matrix
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: torch>=2.0
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
26
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
# bloom-torch
|
|
30
|
+
|
|
31
|
+
**bloom-torch** is a small PyTorch library for **Bloom filters**, **Bloom matrices** (bitwise set encodings), and **BloomKMeans** — K-means with Bloom-based candidate pruning when the number of clusters is large.
|
|
32
|
+
|
|
33
|
+
> **Note:** This package is **not** related to the [**BLOOM**](https://huggingface.co/bigscience/bloom) multilingual language model or other “BLOOM” model names in the Hugging Face ecosystem. The name refers to [**Bloom filters**](https://en.wikipedia.org/wiki/Bloom_filter) (probabilistic set membership, 1970).
|
|
34
|
+
|
|
35
|
+
## Install
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install bloom-torch
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Or, to work from a clone of the repository: `pip install -e .`
|
|
42
|
+
|
|
43
|
+
## Requirements
|
|
44
|
+
|
|
45
|
+
- Python ≥ 3.10
|
|
46
|
+
- PyTorch ≥ 2.0
|
|
47
|
+
|
|
48
|
+
## Quick start (v1.0.0)
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
import torch
|
|
52
|
+
from bloom_torch import BloomKMeans
|
|
53
|
+
|
|
54
|
+
# X: [n, d] float32 — e.g. token or item embeddings (cluster on raw or normalised data as you prefer)
|
|
55
|
+
X = torch.randn(10_000, 128, dtype=torch.float32)
|
|
56
|
+
|
|
57
|
+
km = BloomKMeans(n_clusters=256, topk_cache=16, bm_fp_rate=0.01, routing_fp_rate=0.01, seed=0)
|
|
58
|
+
km.fit(X, max_iters=20, use_bm_after=1, allow_bm_assign_small_k=False)
|
|
59
|
+
|
|
60
|
+
# Optional: build a cluster → element Bloom matrix for routing / masking
|
|
61
|
+
routing = km.build_routing_bloom(vocab_size=X.shape[0])
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Public API in **v1.0.0**: `BloomHasher`, `TorchBloomMatrix`, `BloomKMeans`.
|
|
65
|
+
Routing hooks, logits processors, and LLM-specific helpers may be added in later versions or live in application code.
|
|
66
|
+
|
|
67
|
+
## Relationship to research code
|
|
68
|
+
|
|
69
|
+
This repository was split out from the **PyBloomFilter** research prototype (`torch_bloom` package) so the core tensor algorithms can be versioned and published independently.
|
|
70
|
+
|
|
71
|
+
## License
|
|
72
|
+
|
|
73
|
+
MIT — see `LICENSE`.
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# bloom-torch
|
|
2
|
+
|
|
3
|
+
**bloom-torch** is a small PyTorch library for **Bloom filters**, **Bloom matrices** (bitwise set encodings), and **BloomKMeans** — K-means with Bloom-based candidate pruning when the number of clusters is large.
|
|
4
|
+
|
|
5
|
+
> **Note:** This package is **not** related to the [**BLOOM**](https://huggingface.co/bigscience/bloom) multilingual language model or other “BLOOM” model names in the Hugging Face ecosystem. The name refers to [**Bloom filters**](https://en.wikipedia.org/wiki/Bloom_filter) (probabilistic set membership, 1970).
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install bloom-torch
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Or, to work from a clone of the repository: `pip install -e .`
|
|
14
|
+
|
|
15
|
+
## Requirements
|
|
16
|
+
|
|
17
|
+
- Python ≥ 3.10
|
|
18
|
+
- PyTorch ≥ 2.0
|
|
19
|
+
|
|
20
|
+
## Quick start (v1.0.0)
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
import torch
|
|
24
|
+
from bloom_torch import BloomKMeans
|
|
25
|
+
|
|
26
|
+
# X: [n, d] float32 — e.g. token or item embeddings (cluster on raw or normalised data as you prefer)
|
|
27
|
+
X = torch.randn(10_000, 128, dtype=torch.float32)
|
|
28
|
+
|
|
29
|
+
km = BloomKMeans(n_clusters=256, topk_cache=16, bm_fp_rate=0.01, routing_fp_rate=0.01, seed=0)
|
|
30
|
+
km.fit(X, max_iters=20, use_bm_after=1, allow_bm_assign_small_k=False)
|
|
31
|
+
|
|
32
|
+
# Optional: build a cluster → element Bloom matrix for routing / masking
|
|
33
|
+
routing = km.build_routing_bloom(vocab_size=X.shape[0])
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
Public API in **v1.0.0**: `BloomHasher`, `TorchBloomMatrix`, `BloomKMeans`.
|
|
37
|
+
Routing hooks, logits processors, and LLM-specific helpers may be added in later versions or live in application code.
|
|
38
|
+
|
|
39
|
+
## Relationship to research code
|
|
40
|
+
|
|
41
|
+
This repository was split out from the **PyBloomFilter** research prototype (`torch_bloom` package) so the core tensor algorithms can be versioned and published independently.
|
|
42
|
+
|
|
43
|
+
## License
|
|
44
|
+
|
|
45
|
+
MIT — see `LICENSE`.
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "bloom-torch"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Bloom-filter–accelerated clustering and set structures in PyTorch (not related to the BLOOM language model)."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "bloom-torch contributors" },
|
|
14
|
+
]
|
|
15
|
+
keywords = [
|
|
16
|
+
"bloom-filter",
|
|
17
|
+
"pytorch",
|
|
18
|
+
"clustering",
|
|
19
|
+
"k-means",
|
|
20
|
+
"bloom-matrix",
|
|
21
|
+
]
|
|
22
|
+
classifiers = [
|
|
23
|
+
"Development Status :: 3 - Alpha",
|
|
24
|
+
"Intended Audience :: Science/Research",
|
|
25
|
+
"License :: OSI Approved :: MIT License",
|
|
26
|
+
"Programming Language :: Python :: 3",
|
|
27
|
+
"Programming Language :: Python :: 3.10",
|
|
28
|
+
"Programming Language :: Python :: 3.11",
|
|
29
|
+
"Programming Language :: Python :: 3.12",
|
|
30
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
31
|
+
]
|
|
32
|
+
dependencies = [
|
|
33
|
+
"torch>=2.0",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[project.optional-dependencies]
|
|
37
|
+
dev = [
|
|
38
|
+
"pytest>=7.0",
|
|
39
|
+
"ruff>=0.4",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
[project.urls]
|
|
43
|
+
Homepage = "https://github.com/your-org/bloom-torch"
|
|
44
|
+
Documentation = "https://github.com/your-org/bloom-torch#readme"
|
|
45
|
+
Repository = "https://github.com/your-org/bloom-torch"
|
|
46
|
+
Issues = "https://github.com/your-org/bloom-torch/issues"
|
|
47
|
+
|
|
48
|
+
[tool.setuptools.packages.find]
|
|
49
|
+
where = ["src"]
|
|
50
|
+
|
|
51
|
+
[tool.setuptools.package-data]
|
|
52
|
+
bloom_torch = ["py.typed"]
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""
|
|
2
|
+
bloom-torch — Bloom-filter–accelerated structures and clustering in PyTorch.
|
|
3
|
+
|
|
4
|
+
This package implements **Bloom filters** and **Bloom matrices** (Concas-style
|
|
5
|
+
multi-filters) in pure PyTorch for GPU/CPU/MPS, plus **BloomKMeans**, K-means
|
|
6
|
+
with Bloom-based candidate pruning on large cluster counts.
|
|
7
|
+
|
|
8
|
+
**Not related to** the `BLOOM`_ large language model or Hugging Face ``transformers``
|
|
9
|
+
model namesakes; the name refers to **Bloom filters** (Burton Howard Bloom, 1970).
|
|
10
|
+
|
|
11
|
+
.. _BLOOM: https://huggingface.co/bigscience/bloom
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from .bloom_hash import BloomHasher
|
|
15
|
+
from .bloom_matrix import TorchBloomMatrix
|
|
16
|
+
from .bloom_kmeans import BloomKMeans
|
|
17
|
+
|
|
18
|
+
__version__ = "1.0.0"
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
"BloomHasher",
|
|
22
|
+
"TorchBloomMatrix",
|
|
23
|
+
"BloomKMeans",
|
|
24
|
+
"__version__",
|
|
25
|
+
]
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""
|
|
2
|
+
BloomHasher — GPU-friendly universal hash family for integer labels.
|
|
3
|
+
|
|
4
|
+
IMPROVEMENT: Replaces mmh3 with a polynomial hash family that runs entirely
|
|
5
|
+
on PyTorch tensors (CPU / CUDA / MPS), enabling batched hashing via matmul.
|
|
6
|
+
No C extension; same code path on all devices.
|
|
7
|
+
|
|
8
|
+
Hash family
|
|
9
|
+
-----------
|
|
10
|
+
h_i(x) = ((a_i * x + b_i) % P) % m
|
|
11
|
+
|
|
12
|
+
where P = 2^31 − 1 (Mersenne prime) and (a_i, b_i) are randomly drawn
|
|
13
|
+
coefficient pairs. The Mersenne prime modulus ensures near-uniform distribution
|
|
14
|
+
without costly modular reduction — % P is cheap in int64.
|
|
15
|
+
|
|
16
|
+
Shapes
|
|
17
|
+
------
|
|
18
|
+
hash_ids(ids) : ids [N] → positions [k, N]
|
|
19
|
+
hash_single(id) : scalar id → positions [k]
|
|
20
|
+
|
|
21
|
+
Both return int64 values in [0, m). All ops are device-agnostic.
|
|
22
|
+
|
|
23
|
+
Compatibility note
|
|
24
|
+
------------------
|
|
25
|
+
Hash values differ from mmh3 (MurmurHash3 on string labels). The Torch hasher
|
|
26
|
+
is self-consistent: TorchBloomMatrix built and queried with the *same*
|
|
27
|
+
BloomHasher instance will produce zero false negatives, matching the invariant
|
|
28
|
+
of BloolMultifilters.BloomMatrix. The two implementations are *not*
|
|
29
|
+
interchangeable at the bit level.
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
from __future__ import annotations
|
|
33
|
+
|
|
34
|
+
import math
|
|
35
|
+
|
|
36
|
+
import torch
|
|
37
|
+
from torch import Tensor
|
|
38
|
+
|
|
39
|
+
# Mersenne prime as modulus — cheap mod and good mixing
|
|
40
|
+
_P: int = (1 << 31) - 1
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class BloomHasher:
|
|
44
|
+
"""
|
|
45
|
+
Family of k universal hash functions h_i : int → [0, m).
|
|
46
|
+
|
|
47
|
+
Parameters
|
|
48
|
+
----------
|
|
49
|
+
k : number of hash functions
|
|
50
|
+
m : output range (= number of rows in TorchBloomMatrix)
|
|
51
|
+
seed : RNG seed for reproducibility
|
|
52
|
+
"""
|
|
53
|
+
|
|
54
|
+
def __init__(self, k: int, m: int, seed: int = 42) -> None:
|
|
55
|
+
self.k = k
|
|
56
|
+
self.m = m
|
|
57
|
+
self._seed = seed
|
|
58
|
+
rng = torch.Generator()
|
|
59
|
+
rng.manual_seed(seed)
|
|
60
|
+
# Odd multipliers keep the full-period property; offsets are arbitrary.
|
|
61
|
+
a = torch.randint(1, _P, (k,), generator=rng, dtype=torch.int64)
|
|
62
|
+
a = a | 1 # force odd
|
|
63
|
+
b = torch.randint(0, _P, (k,), generator=rng, dtype=torch.int64)
|
|
64
|
+
# Store as plain tensors (not nn.Parameter); moved with .to().
|
|
65
|
+
self._a: Tensor = a
|
|
66
|
+
self._b: Tensor = b
|
|
67
|
+
|
|
68
|
+
# ------------------------------------------------------------------
|
|
69
|
+
# Device management
|
|
70
|
+
# ------------------------------------------------------------------
|
|
71
|
+
|
|
72
|
+
def to(self, device: torch.device | str) -> "BloomHasher":
|
|
73
|
+
"""Move coefficient tensors to *device* in-place; returns self."""
|
|
74
|
+
self._a = self._a.to(device)
|
|
75
|
+
self._b = self._b.to(device)
|
|
76
|
+
return self
|
|
77
|
+
|
|
78
|
+
@property
|
|
79
|
+
def device(self) -> torch.device:
|
|
80
|
+
return self._a.device
|
|
81
|
+
|
|
82
|
+
# ------------------------------------------------------------------
|
|
83
|
+
# Hashing — hot-path ops
|
|
84
|
+
# ------------------------------------------------------------------
|
|
85
|
+
|
|
86
|
+
def hash_ids(self, ids: Tensor) -> Tensor:
|
|
87
|
+
"""
|
|
88
|
+
Compute k hash positions for a batch of integer ids.
|
|
89
|
+
|
|
90
|
+
Parameters
|
|
91
|
+
----------
|
|
92
|
+
ids : [N] int64 (label ids, e.g. token indices or cluster indices)
|
|
93
|
+
|
|
94
|
+
Returns
|
|
95
|
+
-------
|
|
96
|
+
positions : [k, N] int64 (values in [0, m))
|
|
97
|
+
"""
|
|
98
|
+
ids = ids.to(dtype=torch.int64, device=self._a.device)
|
|
99
|
+
# [k, 1] * [1, N] + [k, 1] → [k, N]
|
|
100
|
+
raw = self._a[:, None] * ids[None, :] + self._b[:, None]
|
|
101
|
+
return raw % _P % self.m
|
|
102
|
+
|
|
103
|
+
def hash_single(self, label_id: int) -> Tensor:
|
|
104
|
+
"""
|
|
105
|
+
Compute k hash positions for a single integer label_id.
|
|
106
|
+
|
|
107
|
+
Returns
|
|
108
|
+
-------
|
|
109
|
+
positions : [k] int64 (values in [0, m))
|
|
110
|
+
"""
|
|
111
|
+
x = torch.tensor(label_id, dtype=torch.int64, device=self._a.device)
|
|
112
|
+
return (self._a * x + self._b) % _P % self.m
|
|
113
|
+
|
|
114
|
+
# ------------------------------------------------------------------
|
|
115
|
+
# Optimal sizing helpers (mirror BloolMultifilters.BFAux)
|
|
116
|
+
# ------------------------------------------------------------------
|
|
117
|
+
|
|
118
|
+
@staticmethod
|
|
119
|
+
def optimal_m(n: int, p: float) -> int:
|
|
120
|
+
"""
|
|
121
|
+
Optimal filter row count for *n* elements at false-positive rate *p*.
|
|
122
|
+
|
|
123
|
+
Formula: m = ceil(n · ln(1/p) / ln(2)²)
|
|
124
|
+
"""
|
|
125
|
+
if n <= 0 or p <= 0 or p >= 1:
|
|
126
|
+
return 1
|
|
127
|
+
return max(round(n * -math.log(p) / (math.log(2) ** 2)), 1)
|
|
128
|
+
|
|
129
|
+
@staticmethod
|
|
130
|
+
def optimal_k(p: float) -> int:
|
|
131
|
+
"""
|
|
132
|
+
Optimal number of hash functions for false-positive rate *p*.
|
|
133
|
+
|
|
134
|
+
Formula: k = round(ln(2) · m/n) = round(-ln(p) / ln(2))
|
|
135
|
+
"""
|
|
136
|
+
if p <= 0 or p >= 1:
|
|
137
|
+
return 1
|
|
138
|
+
return max(round(-math.log(p) / math.log(2)), 1)
|
|
139
|
+
|
|
140
|
+
# ------------------------------------------------------------------
|
|
141
|
+
# Repr
|
|
142
|
+
# ------------------------------------------------------------------
|
|
143
|
+
|
|
144
|
+
def __repr__(self) -> str:
|
|
145
|
+
return f"BloomHasher(k={self.k}, m={self.m}, device={self.device})"
|