bloom-torch 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 bloom-torch contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,73 @@
1
+ Metadata-Version: 2.4
2
+ Name: bloom-torch
3
+ Version: 1.0.0
4
+ Summary: Bloom-filter–accelerated clustering and set structures in PyTorch (not related to the BLOOM language model).
5
+ Author: bloom-torch contributors
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/your-org/bloom-torch
8
+ Project-URL: Documentation, https://github.com/your-org/bloom-torch#readme
9
+ Project-URL: Repository, https://github.com/your-org/bloom-torch
10
+ Project-URL: Issues, https://github.com/your-org/bloom-torch/issues
11
+ Keywords: bloom-filter,pytorch,clustering,k-means,bloom-matrix
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: torch>=2.0
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest>=7.0; extra == "dev"
26
+ Requires-Dist: ruff>=0.4; extra == "dev"
27
+ Dynamic: license-file
28
+
29
+ # bloom-torch
30
+
31
+ **bloom-torch** is a small PyTorch library for **Bloom filters**, **Bloom matrices** (bitwise set encodings), and **BloomKMeans** — K-means with Bloom-based candidate pruning when the number of clusters is large.
32
+
33
+ > **Note:** This package is **not** related to the [**BLOOM**](https://huggingface.co/bigscience/bloom) multilingual language model or other “BLOOM” model names in the Hugging Face ecosystem. The name refers to [**Bloom filters**](https://en.wikipedia.org/wiki/Bloom_filter) (probabilistic set membership, 1970).
34
+
35
+ ## Install
36
+
37
+ ```bash
38
+ pip install bloom-torch
39
+ ```
40
+
41
+ (After the first release; for now, from a clone: `pip install -e .`)
42
+
43
+ ## Requirements
44
+
45
+ - Python ≥ 3.10
46
+ - PyTorch ≥ 2.0
47
+
48
+ ## Quick start (v1.0)
49
+
50
+ ```python
51
+ import torch
52
+ from bloom_torch import BloomKMeans
53
+
54
+ # X: [n, d] float32 — e.g. token or item embeddings (cluster on raw or normalised data as you prefer)
55
+ X = torch.randn(10_000, 128, dtype=torch.float32)
56
+
57
+ km = BloomKMeans(n_clusters=256, topk_cache=16, bm_fp_rate=0.01, routing_fp_rate=0.01, seed=0)
58
+ km.fit(X, max_iters=20, use_bm_after=1, allow_bm_assign_small_k=False)
59
+
60
+ # Optional: build a cluster → element Bloom matrix for routing / masking
61
+ routing = km.build_routing_bloom(vocab_size=X.shape[0])
62
+ ```
63
+
64
+ Public API in **v1.0**: `BloomHasher`, `TorchBloomMatrix`, `BloomKMeans`.
65
+ Routing hooks, logits processors, and LLM-specific helpers may be added in later versions or live in application code.
66
+
67
+ ## Relationship to research code
68
+
69
+ This repository was split out from the **PyBloomFilter** research prototype (`torch_bloom` package) so the core tensor algorithms can be versioned and published independently.
70
+
71
+ ## License
72
+
73
+ MIT — see `LICENSE`.
@@ -0,0 +1,45 @@
1
+ # bloom-torch
2
+
3
+ **bloom-torch** is a small PyTorch library for **Bloom filters**, **Bloom matrices** (bitwise set encodings), and **BloomKMeans** — K-means with Bloom-based candidate pruning when the number of clusters is large.
4
+
5
+ > **Note:** This package is **not** related to the [**BLOOM**](https://huggingface.co/bigscience/bloom) multilingual language model or other “BLOOM” model names in the Hugging Face ecosystem. The name refers to [**Bloom filters**](https://en.wikipedia.org/wiki/Bloom_filter) (probabilistic set membership, 1970).
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ pip install bloom-torch
11
+ ```
12
+
13
+ (After the first release; for now, from a clone: `pip install -e .`)
14
+
15
+ ## Requirements
16
+
17
+ - Python ≥ 3.10
18
+ - PyTorch ≥ 2.0
19
+
20
+ ## Quick start (v1.0)
21
+
22
+ ```python
23
+ import torch
24
+ from bloom_torch import BloomKMeans
25
+
26
+ # X: [n, d] float32 — e.g. token or item embeddings (cluster on raw or normalised data as you prefer)
27
+ X = torch.randn(10_000, 128, dtype=torch.float32)
28
+
29
+ km = BloomKMeans(n_clusters=256, topk_cache=16, bm_fp_rate=0.01, routing_fp_rate=0.01, seed=0)
30
+ km.fit(X, max_iters=20, use_bm_after=1, allow_bm_assign_small_k=False)
31
+
32
+ # Optional: build a cluster → element Bloom matrix for routing / masking
33
+ routing = km.build_routing_bloom(vocab_size=X.shape[0])
34
+ ```
35
+
36
+ Public API in **v1.0**: `BloomHasher`, `TorchBloomMatrix`, `BloomKMeans`.
37
+ Routing hooks, logits processors, and LLM-specific helpers may be added in later versions or live in application code.
38
+
39
+ ## Relationship to research code
40
+
41
+ This repository was split out from the **PyBloomFilter** research prototype (`torch_bloom` package) so the core tensor algorithms can be versioned and published independently.
42
+
43
+ ## License
44
+
45
+ MIT — see `LICENSE`.
@@ -0,0 +1,52 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "bloom-torch"
7
+ version = "1.0.0"
8
+ description = "Bloom-filter–accelerated clustering and set structures in PyTorch (not related to the BLOOM language model)."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "MIT" }
12
+ authors = [
13
+ { name = "bloom-torch contributors" },
14
+ ]
15
+ keywords = [
16
+ "bloom-filter",
17
+ "pytorch",
18
+ "clustering",
19
+ "k-means",
20
+ "bloom-matrix",
21
+ ]
22
+ classifiers = [
23
+ "Development Status :: 3 - Alpha",
24
+ "Intended Audience :: Science/Research",
25
+ "License :: OSI Approved :: MIT License",
26
+ "Programming Language :: Python :: 3",
27
+ "Programming Language :: Python :: 3.10",
28
+ "Programming Language :: Python :: 3.11",
29
+ "Programming Language :: Python :: 3.12",
30
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
31
+ ]
32
+ dependencies = [
33
+ "torch>=2.0",
34
+ ]
35
+
36
+ [project.optional-dependencies]
37
+ dev = [
38
+ "pytest>=7.0",
39
+ "ruff>=0.4",
40
+ ]
41
+
42
+ [project.urls]
43
+ Homepage = "https://github.com/your-org/bloom-torch"
44
+ Documentation = "https://github.com/your-org/bloom-torch#readme"
45
+ Repository = "https://github.com/your-org/bloom-torch"
46
+ Issues = "https://github.com/your-org/bloom-torch/issues"
47
+
48
+ [tool.setuptools.packages.find]
49
+ where = ["src"]
50
+
51
+ [tool.setuptools.package-data]
52
+ bloom_torch = ["py.typed"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,25 @@
"""bloom-torch: Bloom filters, Bloom matrices, and Bloom-pruned K-means in PyTorch.

Pure-PyTorch implementations of Bloom filters and Bloom matrices
(Concas-style multi-filters) that run on CPU, CUDA, and MPS, together with
``BloomKMeans`` — K-means that uses Bloom-based candidate pruning when the
cluster count is large.

This project is unrelated to the `BLOOM`_ large language model and other
"BLOOM" model namesakes in the Hugging Face ``transformers`` ecosystem; the
name refers to **Bloom filters** (Burton Howard Bloom, 1970).

.. _BLOOM: https://huggingface.co/bigscience/bloom
"""

# Re-export the public API from the implementation submodules.
from .bloom_hash import BloomHasher
from .bloom_matrix import TorchBloomMatrix
from .bloom_kmeans import BloomKMeans

# NOTE: keep in sync with the `version` field in pyproject.toml.
__version__ = "1.0.0"

__all__ = ["BloomHasher", "TorchBloomMatrix", "BloomKMeans", "__version__"]
@@ -0,0 +1,145 @@
1
+ """
2
+ BloomHasher — GPU-friendly universal hash family for integer labels.
3
+
4
+ IMPROVEMENT: Replaces mmh3 with a polynomial hash family that runs entirely
5
+ on PyTorch tensors (CPU / CUDA / MPS), enabling batched hashing via matmul.
6
+ No C extension; same code path on all devices.
7
+
8
+ Hash family
9
+ -----------
10
+ h_i(x) = ((a_i * x + b_i) % P) % m
11
+
12
+ where P = 2^31 − 1 (Mersenne prime) and (a_i, b_i) are randomly drawn
13
+ coefficient pairs. The Mersenne prime modulus ensures near-uniform distribution
14
+ without costly modular reduction — % P is cheap in int64.
15
+
16
+ Shapes
17
+ ------
18
+ hash_ids(ids) : ids [N] → positions [k, N]
19
+ hash_single(id) : scalar id → positions [k]
20
+
21
+ Both return int64 values in [0, m). All ops are device-agnostic.
22
+
23
+ Compatibility note
24
+ ------------------
25
+ Hash values differ from mmh3 (MurmurHash3 on string labels). The Torch hasher
26
+ is self-consistent: TorchBloomMatrix built and queried with the *same*
27
+ BloomHasher instance will produce zero false negatives, matching the invariant
28
+ of BloolMultifilters.BloomMatrix. The two implementations are *not*
29
+ interchangeable at the bit level.
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import math
35
+
36
+ import torch
37
+ from torch import Tensor
38
+
39
+ # Mersenne prime as modulus — cheap mod and good mixing
40
+ _P: int = (1 << 31) - 1
41
+
42
+
43
+ class BloomHasher:
44
+ """
45
+ Family of k universal hash functions h_i : int → [0, m).
46
+
47
+ Parameters
48
+ ----------
49
+ k : number of hash functions
50
+ m : output range (= number of rows in TorchBloomMatrix)
51
+ seed : RNG seed for reproducibility
52
+ """
53
+
54
+ def __init__(self, k: int, m: int, seed: int = 42) -> None:
55
+ self.k = k
56
+ self.m = m
57
+ self._seed = seed
58
+ rng = torch.Generator()
59
+ rng.manual_seed(seed)
60
+ # Odd multipliers keep the full-period property; offsets are arbitrary.
61
+ a = torch.randint(1, _P, (k,), generator=rng, dtype=torch.int64)
62
+ a = a | 1 # force odd
63
+ b = torch.randint(0, _P, (k,), generator=rng, dtype=torch.int64)
64
+ # Store as plain tensors (not nn.Parameter); moved with .to().
65
+ self._a: Tensor = a
66
+ self._b: Tensor = b
67
+
68
+ # ------------------------------------------------------------------
69
+ # Device management
70
+ # ------------------------------------------------------------------
71
+
72
+ def to(self, device: torch.device | str) -> "BloomHasher":
73
+ """Move coefficient tensors to *device* in-place; returns self."""
74
+ self._a = self._a.to(device)
75
+ self._b = self._b.to(device)
76
+ return self
77
+
78
+ @property
79
+ def device(self) -> torch.device:
80
+ return self._a.device
81
+
82
+ # ------------------------------------------------------------------
83
+ # Hashing — hot-path ops
84
+ # ------------------------------------------------------------------
85
+
86
+ def hash_ids(self, ids: Tensor) -> Tensor:
87
+ """
88
+ Compute k hash positions for a batch of integer ids.
89
+
90
+ Parameters
91
+ ----------
92
+ ids : [N] int64 (label ids, e.g. token indices or cluster indices)
93
+
94
+ Returns
95
+ -------
96
+ positions : [k, N] int64 (values in [0, m))
97
+ """
98
+ ids = ids.to(dtype=torch.int64, device=self._a.device)
99
+ # [k, 1] * [1, N] + [k, 1] → [k, N]
100
+ raw = self._a[:, None] * ids[None, :] + self._b[:, None]
101
+ return raw % _P % self.m
102
+
103
+ def hash_single(self, label_id: int) -> Tensor:
104
+ """
105
+ Compute k hash positions for a single integer label_id.
106
+
107
+ Returns
108
+ -------
109
+ positions : [k] int64 (values in [0, m))
110
+ """
111
+ x = torch.tensor(label_id, dtype=torch.int64, device=self._a.device)
112
+ return (self._a * x + self._b) % _P % self.m
113
+
114
+ # ------------------------------------------------------------------
115
+ # Optimal sizing helpers (mirror BloolMultifilters.BFAux)
116
+ # ------------------------------------------------------------------
117
+
118
+ @staticmethod
119
+ def optimal_m(n: int, p: float) -> int:
120
+ """
121
+ Optimal filter row count for *n* elements at false-positive rate *p*.
122
+
123
+ Formula: m = ceil(n · ln(1/p) / ln(2)²)
124
+ """
125
+ if n <= 0 or p <= 0 or p >= 1:
126
+ return 1
127
+ return max(round(n * -math.log(p) / (math.log(2) ** 2)), 1)
128
+
129
+ @staticmethod
130
+ def optimal_k(p: float) -> int:
131
+ """
132
+ Optimal number of hash functions for false-positive rate *p*.
133
+
134
+ Formula: k = round(ln(2) · m/n) = round(-ln(p) / ln(2))
135
+ """
136
+ if p <= 0 or p >= 1:
137
+ return 1
138
+ return max(round(-math.log(p) / math.log(2)), 1)
139
+
140
+ # ------------------------------------------------------------------
141
+ # Repr
142
+ # ------------------------------------------------------------------
143
+
144
+ def __repr__(self) -> str:
145
+ return f"BloomHasher(k={self.k}, m={self.m}, device={self.device})"