muvera-python 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+
12
+ # Lock file (library repo — dependencies should stay flexible)
13
+ uv.lock
14
+
15
+ # Local Python version (managed by pyenv/uv, not part of the library)
16
+ .python-version
17
+
18
+ # Embedding cache (generated by examples)
19
+ examples/.cache/
20
+
21
+ .vscode/
22
+
23
+ # Benchmark results (generated, not committed)
24
+ benchmarks/results/*.json
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2026 craftsangjae
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,126 @@
1
+ Metadata-Version: 2.4
2
+ Name: muvera-python
3
+ Version: 0.1.1
4
+ Summary: MuVERA: Multi-Vector Retrieval via Fixed Dimensional Encodings
5
+ Project-URL: Homepage, https://github.com/craftsangjae/muvera-python
6
+ Project-URL: Repository, https://github.com/craftsangjae/muvera-python
7
+ Project-URL: Issues, https://github.com/craftsangjae/muvera-python/issues
8
+ Author: craftsangjae
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: embedding,fixed-dimensional-encoding,multi-vector,muvera,muvera-python,retrieval
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
24
+ Requires-Python: >=3.9
25
+ Requires-Dist: numpy>=1.22.0
26
+ Provides-Extra: dev
27
+ Requires-Dist: mypy>=1.0; extra == 'dev'
28
+ Requires-Dist: pre-commit>=3.0; extra == 'dev'
29
+ Requires-Dist: pytest>=7.0; extra == 'dev'
30
+ Requires-Dist: ruff>=0.4; extra == 'dev'
31
+ Requires-Dist: scipy>=1.13.1; extra == 'dev'
32
+ Description-Content-Type: text/markdown
33
+
34
+ # MuVERA
35
+
36
+ A Python implementation of **Mu**lti-**Ve**ctor **R**etrieval via Fixed Dimensional Encoding **A**lgorithm.
37
+
38
+ Converts multi-vector embeddings (point clouds) into fixed-dimensional single vectors, enabling the use of existing single-vector search infrastructure (MIPS, ANN, etc.) as-is.
39
+
40
+ ## Why this library?
41
+
42
+ The original MuVERA algorithm is described in a [research paper](https://arxiv.org/abs/2405.19504) and implemented in C++ within Google's [graph-mining](https://github.com/google/graph-mining) repository. While a Python reference exists, it exposes low-level config objects and separate functions for queries vs. documents, making it cumbersome to integrate into real workflows.
43
+
44
+ This library wraps the full algorithm behind a **single `Muvera` class** with a minimal, intuitive interface — initialize once, then call `encode_documents()` and `encode_queries()`. No config dataclasses, no encoding-type enums, no manual seed juggling. Just NumPy arrays in, NumPy arrays out.
45
+
46
+ ## Installation
47
+
48
+ ```bash
49
+ pip install muvera-python
50
+ ```
51
+
52
+ Development install:
53
+
54
+ ```bash
55
+ git clone https://github.com/craftsangjae/muvera-python.git
56
+ cd muvera-python
57
+ pip install -e .
58
+ ```
59
+
60
+ ## Quick Start
61
+
62
+ ```python
63
+ import numpy as np
64
+ from muvera import Muvera
65
+
66
+ # Initialize encoder
67
+ encoder = Muvera(
68
+ num_repetitions=10,
69
+ num_simhash_projections=4,
70
+ dimension=128,
71
+ seed=42,
72
+ )
73
+
74
+ # Encode documents (batch)
75
+ # shape: (num_documents, num_vectors_per_doc, embedding_dim)
76
+ documents = np.random.randn(100, 80, 128).astype(np.float32)
77
+ doc_fdes = encoder.encode_documents(documents) # (100, output_dimension)
78
+
79
+ # Encode queries (batch)
80
+ queries = np.random.randn(10, 32, 128).astype(np.float32)
81
+ query_fdes = encoder.encode_queries(queries) # (10, output_dimension)
82
+
83
+ # Compute similarity (dot product)
84
+ scores = query_fdes @ doc_fdes.T # (10, 100)
85
+ ```
86
+
87
+ ## Parameters
88
+
89
+ | Parameter | Default | Description |
90
+ |---|---|---|
91
+ | `num_repetitions` | 20 | Number of FDE repetitions. Higher values improve accuracy but increase output dimension |
92
+ | `num_simhash_projections` | 5 | Number of SimHash projections. Number of partitions = 2^n |
93
+ | `dimension` | 16 | Input embedding dimension |
94
+ | `projection_type` | `"identity"` | `"identity"` or `"ams_sketch"` |
95
+ | `projection_dimension` | None | Projected dimension when using AMS Sketch |
96
+ | `fill_empty_partitions` | True | Whether to fill empty partitions with the nearest vector |
97
+ | `final_projection_dimension` | None | Final dimension reduction via Count Sketch |
98
+ | `seed` | 42 | Random seed for reproducibility |
99
+
100
+ ## Benchmark
101
+
102
+ End-to-end retrieval on [NanoFiQA2018](https://huggingface.co/datasets/zeta-alpha-ai/NanoFiQA2018) (4598 documents, 50 queries) using `raphaelsty/neural-cherche-colbert` (dim=128):
103
+
104
+ ```
105
+ =====================================================================================
106
+ RESULTS
107
+ (zeta-alpha-ai/NanoFiQA2018)
108
+ =====================================================================================
109
+ Retriever | Index (s) | Query (ms) | Recall@25
110
+ -------------------------------------------------------------------------------------
111
+ ColBERT (Native MaxSim) | 240.04 | 836.94 | 0.8400
112
+ ColBERT + Muvera FDE | 77.29 | 69.48 | 0.7600
113
+ =====================================================================================
114
+ ```
115
+
116
+ FDE achieves **90% of native MaxSim recall** while being **12x faster** at query time. See `examples/colbert_nanobeir.py` to reproduce.
117
+
118
+ ## Acknowledgments
119
+
120
+ This library was inspired by [sionic-ai/muvera-py](https://github.com/sionic-ai/muvera-py), the first Python implementation of the MuVERA algorithm. Their faithful port of the C++ reference made it possible to validate correctness and understand the algorithm deeply. This project builds on that foundation with a simplified API designed for easier integration.
121
+
122
+ ## References
123
+
124
+ - [MuVERA: Multi-Vector Retrieval via Fixed Dimensional Encodings](https://arxiv.org/abs/2405.19504)
125
+ - [Google graph-mining C++ implementation](https://github.com/google/graph-mining/blob/main/sketching/point_cloud/fixed_dimensional_encoding.cc)
126
+ - [sionic-ai/muvera-py](https://github.com/sionic-ai/muvera-py)
@@ -0,0 +1,93 @@
1
+ # MuVERA
2
+
3
+ A Python implementation of **Mu**lti-**Ve**ctor **R**etrieval via Fixed Dimensional Encoding **A**lgorithm.
4
+
5
+ Converts multi-vector embeddings (point clouds) into fixed-dimensional single vectors, enabling the use of existing single-vector search infrastructure (MIPS, ANN, etc.) as-is.
6
+
7
+ ## Why this library?
8
+
9
+ The original MuVERA algorithm is described in a [research paper](https://arxiv.org/abs/2405.19504) and implemented in C++ within Google's [graph-mining](https://github.com/google/graph-mining) repository. While a Python reference exists, it exposes low-level config objects and separate functions for queries vs. documents, making it cumbersome to integrate into real workflows.
10
+
11
+ This library wraps the full algorithm behind a **single `Muvera` class** with a minimal, intuitive interface — initialize once, then call `encode_documents()` and `encode_queries()`. No config dataclasses, no encoding-type enums, no manual seed juggling. Just NumPy arrays in, NumPy arrays out.
12
+
13
+ ## Installation
14
+
15
+ ```bash
16
+ pip install muvera-python
17
+ ```
18
+
19
+ Development install:
20
+
21
+ ```bash
22
+ git clone https://github.com/craftsangjae/muvera-python.git
23
+ cd muvera-python
24
+ pip install -e .
25
+ ```
26
+
27
+ ## Quick Start
28
+
29
+ ```python
30
+ import numpy as np
31
+ from muvera import Muvera
32
+
33
+ # Initialize encoder
34
+ encoder = Muvera(
35
+ num_repetitions=10,
36
+ num_simhash_projections=4,
37
+ dimension=128,
38
+ seed=42,
39
+ )
40
+
41
+ # Encode documents (batch)
42
+ # shape: (num_documents, num_vectors_per_doc, embedding_dim)
43
+ documents = np.random.randn(100, 80, 128).astype(np.float32)
44
+ doc_fdes = encoder.encode_documents(documents) # (100, output_dimension)
45
+
46
+ # Encode queries (batch)
47
+ queries = np.random.randn(10, 32, 128).astype(np.float32)
48
+ query_fdes = encoder.encode_queries(queries) # (10, output_dimension)
49
+
50
+ # Compute similarity (dot product)
51
+ scores = query_fdes @ doc_fdes.T # (10, 100)
52
+ ```
53
+
54
+ ## Parameters
55
+
56
+ | Parameter | Default | Description |
57
+ |---|---|---|
58
+ | `num_repetitions` | 20 | Number of FDE repetitions. Higher values improve accuracy but increase output dimension |
59
+ | `num_simhash_projections` | 5 | Number of SimHash projections. Number of partitions = 2^n |
60
+ | `dimension` | 16 | Input embedding dimension |
61
+ | `projection_type` | `"identity"` | `"identity"` or `"ams_sketch"` |
62
+ | `projection_dimension` | None | Projected dimension when using AMS Sketch |
63
+ | `fill_empty_partitions` | True | Whether to fill empty partitions with the nearest vector |
64
+ | `final_projection_dimension` | None | Final dimension reduction via Count Sketch |
65
+ | `seed` | 42 | Random seed for reproducibility |
66
+
67
+ ## Benchmark
68
+
69
+ End-to-end retrieval on [NanoFiQA2018](https://huggingface.co/datasets/zeta-alpha-ai/NanoFiQA2018) (4598 documents, 50 queries) using `raphaelsty/neural-cherche-colbert` (dim=128):
70
+
71
+ ```
72
+ =====================================================================================
73
+ RESULTS
74
+ (zeta-alpha-ai/NanoFiQA2018)
75
+ =====================================================================================
76
+ Retriever | Index (s) | Query (ms) | Recall@25
77
+ -------------------------------------------------------------------------------------
78
+ ColBERT (Native MaxSim) | 240.04 | 836.94 | 0.8400
79
+ ColBERT + Muvera FDE | 77.29 | 69.48 | 0.7600
80
+ =====================================================================================
81
+ ```
82
+
83
+ FDE achieves **90% of native MaxSim recall** while being **12x faster** at query time. See `examples/colbert_nanobeir.py` to reproduce.
84
+
85
+ ## Acknowledgments
86
+
87
+ This library was inspired by [sionic-ai/muvera-py](https://github.com/sionic-ai/muvera-py), the first Python implementation of the MuVERA algorithm. Their faithful port of the C++ reference made it possible to validate correctness and understand the algorithm deeply. This project builds on that foundation with a simplified API designed for easier integration.
88
+
89
+ ## References
90
+
91
+ - [MuVERA: Multi-Vector Retrieval via Fixed Dimensional Encodings](https://arxiv.org/abs/2405.19504)
92
+ - [Google graph-mining C++ implementation](https://github.com/google/graph-mining/blob/main/sketching/point_cloud/fixed_dimensional_encoding.cc)
93
+ - [sionic-ai/muvera-py](https://github.com/sionic-ai/muvera-py)
@@ -0,0 +1,6 @@
1
+ """MuVERA: Multi-Vector Retrieval via Fixed Dimensional Encodings."""
2
+
3
+ from muvera.muvera import Muvera
4
+
5
+ __all__ = ["Muvera"]
6
+ __version__ = "0.1.1"
@@ -0,0 +1,205 @@
1
+ """Internal helper functions for MuVERA Fixed Dimensional Encoding.
2
+
3
+ This module contains low-level utilities for Gray code manipulation,
4
+ random projection matrix generation, Count Sketch, and SimHash-based
5
+ partition indexing. These are not part of the public API.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import numpy as np
11
+
12
+ # ---------------------------------------------------------------------------
13
+ # Gray code utilities
14
+ # ---------------------------------------------------------------------------
15
+
16
+
17
+ def append_to_gray_code(gray_code: int, bit: bool) -> int:
18
+ """Append a single bit to a Gray code value.
19
+
20
+ Parameters
21
+ ----------
22
+ gray_code : int
23
+ Current Gray code value.
24
+ bit : bool
25
+ Bit to append (True=1, False=0).
26
+
27
+ Returns
28
+ -------
29
+ int
30
+ Updated Gray code.
31
+ """
32
+ return (gray_code << 1) + (int(bit) ^ (gray_code & 1))
33
+
34
+
35
+ def gray_code_to_binary(num: int) -> int:
36
+ """Convert a Gray code value to its binary representation.
37
+
38
+ Parameters
39
+ ----------
40
+ num : int
41
+ Gray code value.
42
+
43
+ Returns
44
+ -------
45
+ int
46
+ Corresponding binary representation.
47
+ """
48
+ mask = num >> 1
49
+ while mask != 0:
50
+ num = num ^ mask
51
+ mask >>= 1
52
+ return num
53
+
54
+
55
+ # ---------------------------------------------------------------------------
56
+ # Random projection matrices
57
+ # ---------------------------------------------------------------------------
58
+
59
+
60
+ def simhash_matrix_from_seed(dimension: int, num_projections: int, seed: int) -> np.ndarray:
61
+ """Generate a Gaussian random projection matrix for SimHash.
62
+
63
+ Parameters
64
+ ----------
65
+ dimension : int
66
+ Input vector dimension.
67
+ num_projections : int
68
+ Number of SimHash projections.
69
+ seed : int
70
+ Random seed.
71
+
72
+ Returns
73
+ -------
74
+ numpy.ndarray
75
+ Float32 matrix of shape ``(dimension, num_projections)``.
76
+ """
77
+ rng = np.random.default_rng(seed)
78
+ return rng.normal(loc=0.0, scale=1.0, size=(dimension, num_projections)).astype(np.float32)
79
+
80
+
81
+ def ams_projection_matrix_from_seed(dimension: int, projection_dim: int, seed: int) -> np.ndarray:
82
+ """Generate an AMS Sketch projection matrix.
83
+
84
+ Each row has exactly one non-zero entry (+1 or -1), forming a sparse
85
+ random projection.
86
+
87
+ Parameters
88
+ ----------
89
+ dimension : int
90
+ Input vector dimension.
91
+ projection_dim : int
92
+ Output (projected) dimension.
93
+ seed : int
94
+ Random seed.
95
+
96
+ Returns
97
+ -------
98
+ numpy.ndarray
99
+ Float32 matrix of shape ``(dimension, projection_dim)``.
100
+ """
101
+ rng = np.random.default_rng(seed)
102
+ out = np.zeros((dimension, projection_dim), dtype=np.float32)
103
+ indices = rng.integers(0, projection_dim, size=dimension)
104
+ signs = rng.choice(np.array([-1.0, 1.0], dtype=np.float32), size=dimension)
105
+ out[np.arange(dimension), indices] = signs
106
+ return out
107
+
108
+
109
+ def count_sketch_vector_from_seed(
110
+ input_vector: np.ndarray, final_dimension: int, seed: int
111
+ ) -> np.ndarray:
112
+ """Project a vector to a lower dimension using Count Sketch.
113
+
114
+ Parameters
115
+ ----------
116
+ input_vector : numpy.ndarray
117
+ Input vector (1-D).
118
+ final_dimension : int
119
+ Output dimension.
120
+ seed : int
121
+ Random seed.
122
+
123
+ Returns
124
+ -------
125
+ numpy.ndarray
126
+ Float32 vector of shape ``(final_dimension,)``.
127
+ """
128
+ rng = np.random.default_rng(seed)
129
+ out = np.zeros(final_dimension, dtype=np.float32)
130
+ indices = rng.integers(0, final_dimension, size=input_vector.shape[0])
131
+ signs = rng.choice(np.array([-1.0, 1.0], dtype=np.float32), size=input_vector.shape[0])
132
+ np.add.at(out, indices, signs * input_vector)
133
+ return out
134
+
135
+
136
+ # ---------------------------------------------------------------------------
137
+ # Partition indexing
138
+ # ---------------------------------------------------------------------------
139
+
140
+
141
+ def partition_index_gray(sketch_vector: np.ndarray) -> int:
142
+ """Compute a Gray-code-based partition index from a SimHash sketch vector.
143
+
144
+ Parameters
145
+ ----------
146
+ sketch_vector : numpy.ndarray
147
+ SimHash projection result vector (1-D).
148
+
149
+ Returns
150
+ -------
151
+ int
152
+ Partition index.
153
+ """
154
+ partition_index = 0
155
+ for val in sketch_vector:
156
+ partition_index = append_to_gray_code(partition_index, val > 0)
157
+ return partition_index
158
+
159
+
160
+ def distance_to_partition(sketch_vector: np.ndarray, partition_index: int) -> int:
161
+ """Compute the Hamming distance between a sketch vector and a partition.
162
+
163
+ Parameters
164
+ ----------
165
+ sketch_vector : numpy.ndarray
166
+ SimHash projection result vector (1-D).
167
+ partition_index : int
168
+ Target partition index.
169
+
170
+ Returns
171
+ -------
172
+ int
173
+ Hamming distance.
174
+ """
175
+ num_projections = sketch_vector.size
176
+ binary_representation = gray_code_to_binary(partition_index)
177
+ sketch_bits = (sketch_vector > 0).astype(int)
178
+ binary_array = (binary_representation >> np.arange(num_projections - 1, -1, -1)) & 1
179
+ return int(np.sum(sketch_bits != binary_array))
180
+
181
+
182
+ # ---------------------------------------------------------------------------
183
+ # Vectorised batch helpers
184
+ # ---------------------------------------------------------------------------
185
+
186
+
187
+ def partition_indices_gray_batch(sketches: np.ndarray) -> np.ndarray:
188
+ """Compute Gray-code partition indices for a batch of sketch vectors.
189
+
190
+ Parameters
191
+ ----------
192
+ sketches : numpy.ndarray
193
+ SimHash sketch matrix of shape ``(N, num_projections)``.
194
+
195
+ Returns
196
+ -------
197
+ numpy.ndarray
198
+ Uint32 partition index array of shape ``(N,)``.
199
+ """
200
+ num_projections = sketches.shape[1]
201
+ bits = (sketches > 0).astype(np.uint32)
202
+ partition_indices = np.zeros(sketches.shape[0], dtype=np.uint32)
203
+ for bit_idx in range(num_projections):
204
+ partition_indices = (partition_indices << 1) + (bits[:, bit_idx] ^ (partition_indices & 1))
205
+ return partition_indices
@@ -0,0 +1,411 @@
1
+ """MuVERA (Multi-Vector Retrieval via Fixed Dimensional Encodings).
2
+
3
+ This module provides the Fixed Dimensional Encoding (FDE) algorithm that
4
+ converts multi-vector embeddings (point clouds) into single fixed-dimensional
5
+ vectors.
6
+
7
+ References
8
+ ----------
9
+ .. [1] Google graph-mining: fixed_dimensional_encoding.cc
10
+ https://github.com/google/graph-mining/blob/main/sketching/point_cloud/fixed_dimensional_encoding.cc
11
+ .. [2] sionic-ai/muvera-py: fde_generator.py
12
+ https://github.com/sionic-ai/muvera-py/blob/master/fde_generator.py
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from typing import Literal
18
+
19
+ import numpy as np
20
+
21
+ from muvera.helper import (
22
+ ams_projection_matrix_from_seed,
23
+ count_sketch_vector_from_seed,
24
+ distance_to_partition,
25
+ gray_code_to_binary,
26
+ partition_index_gray,
27
+ partition_indices_gray_batch,
28
+ simhash_matrix_from_seed,
29
+ )
30
+
31
+
32
+ class Muvera:
33
+ """Encoder that converts multi-vector embeddings into Fixed Dimensional Encodings.
34
+
35
+ Uses the MuVERA algorithm to encode variable-length multi-vector
36
+ representations (point clouds) into fixed-dimensional single vectors.
37
+ The dot product between encoded vectors approximates the Chamfer
38
+ similarity (MaxSim) between the original multi-vector sets.
39
+
40
+ Parameters
41
+ ----------
42
+ num_repetitions : int, default=20
43
+ Number of repetitions for FDE generation. Higher values improve
44
+ accuracy but proportionally increase the output dimension.
45
+ num_simhash_projections : int, default=5
46
+ Number of SimHash projections. The number of partitions is
47
+ ``2 ** num_simhash_projections``. Must be in ``[0, 31)``.
48
+ dimension : int, default=16
49
+ Dimension of the input embedding vectors.
50
+ projection_type : {'identity', 'ams_sketch'}, default='identity'
51
+ Inner projection method.
52
+
53
+ - ``'identity'``: Uses the original dimension as-is.
54
+ - ``'ams_sketch'``: Uses AMS Sketch for dimensionality reduction.
55
+ projection_dimension : int or None, default=None
56
+ Projected dimension when ``projection_type='ams_sketch'``.
57
+ Ignored when ``'identity'``.
58
+ fill_empty_partitions : bool, default=True
59
+ Whether to fill empty partitions with the nearest vector during
60
+ document encoding.
61
+ final_projection_dimension : int or None, default=None
62
+ Final Count Sketch projection dimension. ``None`` disables
63
+ final projection.
64
+ seed : int, default=42
65
+ Random seed for reproducibility.
66
+
67
+ Attributes
68
+ ----------
69
+ output_dimension : int
70
+ Dimension of the encoded output vector.
71
+
72
+ Examples
73
+ --------
74
+ >>> import numpy as np
75
+ >>> from muvera import Muvera
76
+ >>> encoder = Muvera(num_repetitions=10, num_simhash_projections=4,
77
+ ... dimension=128, seed=42)
78
+ >>> # Single document/query
79
+ >>> doc = np.random.randn(80, 128).astype(np.float32)
80
+ >>> query = np.random.randn(32, 128).astype(np.float32)
81
+ >>> doc_fde = encoder.encode_documents(doc)
82
+ >>> query_fde = encoder.encode_queries(query)
83
+ >>> score = query_fde @ doc_fde # similarity score
84
+ >>>
85
+ >>> # Batch of documents with variable lengths
86
+ >>> docs = [np.random.randn(80, 128).astype(np.float32) for _ in range(5)]
87
+ >>> doc_fdes = encoder.encode_documents(docs) # (5, output_dimension)
88
+ """
89
+
90
+ def __init__(
91
+ self,
92
+ num_repetitions: int = 20,
93
+ num_simhash_projections: int = 5,
94
+ dimension: int = 16,
95
+ projection_type: Literal["identity", "ams_sketch"] = "identity",
96
+ projection_dimension: int | None = None,
97
+ fill_empty_partitions: bool = True,
98
+ final_projection_dimension: int | None = None,
99
+ seed: int = 42,
100
+ ):
101
+ if num_repetitions <= 0:
102
+ raise ValueError(f"num_repetitions must be greater than 0, got {num_repetitions}")
103
+ if not (0 <= num_simhash_projections < 31):
104
+ raise ValueError(
105
+ f"num_simhash_projections must be in [0, 31), got {num_simhash_projections}"
106
+ )
107
+ if projection_type not in ("identity", "ams_sketch"):
108
+ raise ValueError(
109
+ f"projection_type must be 'identity' or 'ams_sketch', got '{projection_type}'"
110
+ )
111
+ if projection_type == "ams_sketch" and (
112
+ projection_dimension is None or projection_dimension <= 0
113
+ ):
114
+ raise ValueError(
115
+ "A positive projection_dimension is required when projection_type='ams_sketch'"
116
+ )
117
+
118
+ self.num_repetitions = num_repetitions
119
+ self.num_simhash_projections = num_simhash_projections
120
+ self.dimension = dimension
121
+ self.projection_type = projection_type
122
+ self.projection_dimension = projection_dimension
123
+ self.fill_empty_partitions = fill_empty_partitions
124
+ self.final_projection_dimension = final_projection_dimension
125
+ self.seed = seed
126
+
127
+ # Derived constants
128
+ self._num_partitions: int = 2**num_simhash_projections
129
+ self._use_identity: bool = projection_type == "identity"
130
+ self._proj_dim: int = dimension if self._use_identity else projection_dimension # type: ignore[assignment]
131
+ self._fde_dim: int = num_repetitions * self._num_partitions * self._proj_dim
132
+
133
+ @property
134
+ def output_dimension(self) -> int:
135
+ """Dimension of the encoded output vector."""
136
+ if self.final_projection_dimension is not None and self.final_projection_dimension > 0:
137
+ return self.final_projection_dimension
138
+ return self._fde_dim
139
+
140
+ # ------------------------------------------------------------------
141
+ # Public API
142
+ # ------------------------------------------------------------------
143
+
144
+ def encode_documents(self, documents: np.ndarray | list[np.ndarray]) -> np.ndarray:
145
+ """Encode document point clouds into Fixed Dimensional Encodings.
146
+
147
+ Uses AVERAGE aggregation within partitions. Empty partitions are filled
148
+ with the nearest vector when ``fill_empty_partitions=True``.
149
+
150
+ Parameters
151
+ ----------
152
+ documents : np.ndarray or list[np.ndarray]
153
+ - Single document: shape ``(num_vectors, dimension)``
154
+ - Batch of documents: ``list[np.ndarray]`` where each element has
155
+ shape ``(num_vectors_i, dimension)``
156
+
157
+ Returns
158
+ -------
159
+ np.ndarray
160
+ Encoded FDE vector(s). Single document returns shape ``(output_dimension,)``,
161
+ batch returns shape ``(num_documents, output_dimension)``.
162
+ """
163
+ if isinstance(documents, list):
164
+ return self._encode_batch(documents, is_query=False)
165
+ return self._encode_single(documents, is_query=False)
166
+
167
+ def encode_queries(self, queries: np.ndarray | list[np.ndarray]) -> np.ndarray:
168
+ """Encode query point clouds into Fixed Dimensional Encodings.
169
+
170
+ Uses SUM aggregation within partitions. Empty partitions are not filled.
171
+
172
+ Parameters
173
+ ----------
174
+ queries : np.ndarray or list[np.ndarray]
175
+ - Single query: shape ``(num_vectors, dimension)``
176
+ - Batch of queries: ``list[np.ndarray]`` where each element has
177
+ shape ``(num_vectors_i, dimension)``
178
+
179
+ Returns
180
+ -------
181
+ np.ndarray
182
+ Encoded FDE vector(s). Single query returns shape ``(output_dimension,)``,
183
+ batch returns shape ``(num_queries, output_dimension)``.
184
+ """
185
+ if isinstance(queries, list):
186
+ return self._encode_batch(queries, is_query=True)
187
+ return self._encode_single(queries, is_query=True)
188
+
189
+ # ------------------------------------------------------------------
190
+ # Core Encoding
191
+ # ------------------------------------------------------------------
192
+
193
+ def _encode_single(self, point_cloud: np.ndarray, *, is_query: bool) -> np.ndarray:
194
+ """Encode a single point cloud."""
195
+ if point_cloud.ndim != 2 or point_cloud.shape[1] != self.dimension:
196
+ raise ValueError(f"Expected shape (N, {self.dimension}), got {point_cloud.shape}")
197
+
198
+ point_cloud = np.asarray(point_cloud, dtype=np.float32)
199
+ out = np.zeros(self._fde_dim, dtype=np.float32)
200
+
201
+ for rep in range(self.num_repetitions):
202
+ current_seed = self.seed + rep
203
+ sketches = self._compute_sketches(point_cloud, current_seed)
204
+ projected = self._compute_projection(point_cloud, current_seed)
205
+ rep_fde = self._aggregate_single(sketches, projected, is_query)
206
+
207
+ rep_start = rep * self._num_partitions * self._proj_dim
208
+ out[rep_start : rep_start + rep_fde.size] = rep_fde
209
+
210
+ return self._apply_final_projection(out)
211
+
212
+ def _encode_batch(self, point_clouds: list[np.ndarray], *, is_query: bool) -> np.ndarray:
213
+ """Encode a batch of variable-length point clouds."""
214
+ batch_size = len(point_clouds)
215
+ if batch_size == 0:
216
+ return np.zeros((0, self.output_dimension), dtype=np.float32)
217
+
218
+ # Validate and flatten
219
+ for i, pc in enumerate(point_clouds):
220
+ if pc.ndim != 2 or pc.shape[1] != self.dimension:
221
+ raise ValueError(f"Element {i}: expected (N, {self.dimension}), got {pc.shape}")
222
+
223
+ flat_points, doc_indices, doc_boundaries = self._flatten_batch(point_clouds)
224
+ out = np.zeros((batch_size, self._fde_dim), dtype=np.float32)
225
+
226
+ for rep in range(self.num_repetitions):
227
+ current_seed = self.seed + rep
228
+ sketches = self._compute_sketches(flat_points, current_seed)
229
+ projected = self._compute_projection(flat_points, current_seed)
230
+ rep_fde = self._aggregate_batch(
231
+ sketches, projected, doc_indices, doc_boundaries, is_query
232
+ )
233
+
234
+ rep_start = rep * self._num_partitions * self._proj_dim
235
+ out[:, rep_start : rep_start + rep_fde.size // batch_size] = rep_fde
236
+
237
+ return self._apply_final_projection(out)
238
+
239
+ # ------------------------------------------------------------------
240
+ # Helper Methods
241
+ # ------------------------------------------------------------------
242
+
243
+ def _compute_sketches(self, vectors: np.ndarray, seed: int) -> np.ndarray:
244
+ """Compute SimHash sketches for partition assignment."""
245
+ sim_matrix = simhash_matrix_from_seed(self.dimension, self.num_simhash_projections, seed)
246
+ return vectors @ sim_matrix
247
+
248
+ def _compute_projection(self, vectors: np.ndarray, seed: int) -> np.ndarray:
249
+ """Project vectors using identity or AMS sketch."""
250
+ if self._use_identity:
251
+ return vectors
252
+
253
+ ams_matrix = ams_projection_matrix_from_seed(self.dimension, self._proj_dim, seed)
254
+ return vectors @ ams_matrix
255
+
256
+ def _aggregate_single(
257
+ self, sketches: np.ndarray, projected: np.ndarray, is_query: bool
258
+ ) -> np.ndarray:
259
+ """Aggregate vectors into partitions for a single point cloud."""
260
+ num_points = sketches.shape[0]
261
+ partition_counts = np.zeros(self._num_partitions, dtype=np.int32)
262
+ rep_fde = np.zeros(self._num_partitions * self._proj_dim, dtype=np.float32)
263
+
264
+ # Assign vectors to partitions
265
+ for i in range(num_points):
266
+ pidx = partition_index_gray(sketches[i])
267
+ start = pidx * self._proj_dim
268
+ rep_fde[start : start + self._proj_dim] += projected[i]
269
+ partition_counts[pidx] += 1
270
+
271
+ # Apply AVERAGE for documents, SUM for queries
272
+ if not is_query:
273
+ self._apply_average_and_fill(rep_fde, partition_counts, sketches, projected)
274
+
275
+ return rep_fde
276
+
277
+ def _aggregate_batch(
278
+ self,
279
+ all_sketches: np.ndarray,
280
+ all_projected: np.ndarray,
281
+ doc_indices: np.ndarray,
282
+ doc_boundaries: np.ndarray,
283
+ is_query: bool,
284
+ ) -> np.ndarray:
285
+ """Aggregate vectors into partitions for a batch of point clouds."""
286
+ batch_size = len(doc_boundaries) - 1
287
+ part_indices = partition_indices_gray_batch(all_sketches)
288
+ partition_counts = np.zeros((batch_size, self._num_partitions), dtype=np.int32)
289
+ rep_fde = np.zeros((batch_size, self._num_partitions, self._proj_dim), dtype=np.float32)
290
+
291
+ # Count partitions
292
+ np.add.at(partition_counts, (doc_indices, part_indices), 1)
293
+
294
+ # Scatter-add projected vectors
295
+ self._scatter_add(rep_fde, doc_indices, part_indices, all_projected)
296
+
297
+ # Apply AVERAGE for documents
298
+ if not is_query:
299
+ self._apply_average_batch(rep_fde, partition_counts)
300
+ self._fill_empty_batch(
301
+ rep_fde, partition_counts, all_sketches, all_projected, doc_boundaries
302
+ )
303
+
304
+ return rep_fde.reshape(batch_size, -1)
305
+
306
+ def _apply_average_and_fill(
307
+ self,
308
+ rep_fde: np.ndarray,
309
+ partition_counts: np.ndarray,
310
+ sketches: np.ndarray,
311
+ projected: np.ndarray,
312
+ ) -> None:
313
+ """Apply AVERAGE aggregation and fill empty partitions for single cloud."""
314
+ num_points = sketches.shape[0]
315
+
316
+ for pidx in range(self._num_partitions):
317
+ start = pidx * self._proj_dim
318
+ if partition_counts[pidx] > 0:
319
+ rep_fde[start : start + self._proj_dim] /= partition_counts[pidx]
320
+ elif self.fill_empty_partitions and num_points > 0 and self.num_simhash_projections > 0:
321
+ distances = np.array(
322
+ [distance_to_partition(sketches[j], pidx) for j in range(num_points)]
323
+ )
324
+ nearest = np.argmin(distances)
325
+ rep_fde[start : start + self._proj_dim] = projected[nearest]
326
+
327
+ def _apply_average_batch(self, rep_fde: np.ndarray, partition_counts: np.ndarray) -> None:
328
+ """Apply AVERAGE aggregation for batch."""
329
+ counts_3d = partition_counts[:, :, np.newaxis]
330
+ np.divide(rep_fde, counts_3d, out=rep_fde, where=counts_3d > 0)
331
+
332
+ def _fill_empty_batch(
333
+ self,
334
+ rep_fde: np.ndarray,
335
+ partition_counts: np.ndarray,
336
+ all_sketches: np.ndarray,
337
+ all_projected: np.ndarray,
338
+ doc_boundaries: np.ndarray,
339
+ ) -> None:
340
+ """Fill empty partitions for batch."""
341
+ if not self.fill_empty_partitions or self.num_simhash_projections == 0:
342
+ return
343
+
344
+ empty_docs, empty_parts = np.where(partition_counts == 0)
345
+ for doc_idx, pidx in zip(empty_docs, empty_parts):
346
+ doc_start, doc_end = doc_boundaries[doc_idx], doc_boundaries[doc_idx + 1]
347
+ if doc_start == doc_end:
348
+ continue
349
+
350
+ doc_sketches = all_sketches[doc_start:doc_end]
351
+ binary_rep = gray_code_to_binary(int(pidx))
352
+ target_bits = (binary_rep >> np.arange(self.num_simhash_projections - 1, -1, -1)) & 1
353
+ distances = np.sum((doc_sketches > 0).astype(int) != target_bits, axis=1)
354
+ nearest_local = np.argmin(distances)
355
+ rep_fde[doc_idx, pidx, :] = all_projected[doc_start + nearest_local]
356
+
357
+ def _scatter_add(
358
+ self,
359
+ rep_fde: np.ndarray,
360
+ doc_indices: np.ndarray,
361
+ part_indices: np.ndarray,
362
+ all_projected: np.ndarray,
363
+ ) -> None:
364
+ """Scatter-add projected vectors into partitions."""
365
+ doc_part = doc_indices * self._num_partitions + part_indices
366
+ base = doc_part * self._proj_dim
367
+ flat_rep_fde = rep_fde.reshape(-1)
368
+
369
+ for d in range(self._proj_dim):
370
+ np.add.at(flat_rep_fde, base + d, all_projected[:, d])
371
+
372
+ def _flatten_batch(
373
+ self, point_clouds: list[np.ndarray]
374
+ ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
375
+ """Flatten batch of point clouds into single array with metadata."""
376
+ doc_lengths = np.array([pc.shape[0] for pc in point_clouds], dtype=np.int32)
377
+ doc_boundaries = np.insert(np.cumsum(doc_lengths), 0, 0)
378
+ doc_indices = np.repeat(np.arange(len(point_clouds)), doc_lengths)
379
+ flat_points = np.vstack(point_clouds).astype(np.float32)
380
+
381
+ return flat_points, doc_indices, doc_boundaries
382
+
383
+ def _apply_final_projection(self, fdes: np.ndarray) -> np.ndarray:
384
+ """Apply optional Count Sketch final projection."""
385
+ if self.final_projection_dimension is None or self.final_projection_dimension <= 0:
386
+ return fdes
387
+
388
+ if fdes.ndim == 1:
389
+ return count_sketch_vector_from_seed(fdes, self.final_projection_dimension, self.seed)
390
+
391
+ # Batch
392
+ result = np.zeros((fdes.shape[0], self.final_projection_dimension), dtype=np.float32)
393
+ for i in range(fdes.shape[0]):
394
+ result[i] = count_sketch_vector_from_seed(
395
+ fdes[i], self.final_projection_dimension, self.seed
396
+ )
397
+ return result
398
+
399
+ def __repr__(self) -> str: # noqa: D105
400
+ return (
401
+ f"Muvera("
402
+ f"num_repetitions={self.num_repetitions}, "
403
+ f"num_simhash_projections={self.num_simhash_projections}, "
404
+ f"dimension={self.dimension}, "
405
+ f"projection_type='{self.projection_type}', "
406
+ f"projection_dimension={self.projection_dimension}, "
407
+ f"fill_empty_partitions={self.fill_empty_partitions}, "
408
+ f"final_projection_dimension={self.final_projection_dimension}, "
409
+ f"seed={self.seed}"
410
+ f")"
411
+ )
File without changes
@@ -0,0 +1,97 @@
1
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

# PEP 621 project metadata.
[project]
name = "muvera-python"
version = "0.1.1"
description = "MuVERA: Multi-Vector Retrieval via Fixed Dimensional Encodings"
readme = "README.md"
license = "MIT"
requires-python = ">=3.9"
authors = [
    { name = "craftsangjae" },
]
keywords = ["muvera", "muvera-python", "multi-vector", "retrieval", "embedding", "fixed-dimensional-encoding"]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: Scientific/Engineering :: Information Analysis",
]
# numpy is the sole runtime dependency.
dependencies = [
    "numpy>=1.22.0",
]

[project.optional-dependencies]
# Development tooling, installed via `pip install muvera-python[dev]`.
dev = [
    "pytest>=7.0",
    "mypy>=1.0",
    "ruff>=0.4",
    "pre-commit>=3.0",
    "scipy>=1.13.1",
]

[project.urls]
Homepage = "https://github.com/craftsangjae/muvera-python"
Repository = "https://github.com/craftsangjae/muvera-python"
Issues = "https://github.com/craftsangjae/muvera-python/issues"

# ---------------------------------------------------------------------------
# Hatch build targets
# ---------------------------------------------------------------------------

[tool.hatch.build.targets.wheel]
packages = ["muvera"]

[tool.hatch.build.targets.sdist]
include = ["muvera/", "README.md", "LICENSE"]

# ---------------------------------------------------------------------------
# Ruff
# ---------------------------------------------------------------------------

[tool.ruff]
target-version = "py39"
line-length = 100

[tool.ruff.lint]
select = [
    "E",    # pycodestyle errors
    "W",    # pycodestyle warnings
    "F",    # pyflakes
    "I",    # isort
    "N",    # pep8-naming
    "UP",   # pyupgrade
    "B",    # flake8-bugbear
    "SIM",  # flake8-simplify
    "D",    # pydocstyle
]
ignore = [
    "D100",  # Missing docstring in public module (module docstring already present)
    "D104",  # Missing docstring in public package
    "D203",  # 1 blank line required before class docstring (conflicts with D211)
    "D213",  # Multi-line docstring summary should start at the second line (conflicts with D212)
]

[tool.ruff.lint.pydocstyle]
convention = "numpy"

[tool.ruff.lint.isort]
known-first-party = ["muvera"]

# Docstring rules are relaxed outside the library package itself.
[tool.ruff.lint.per-file-ignores]
"tests/**" = ["D"]
"examples/**" = ["D"]

# ---------------------------------------------------------------------------
# Pytest
# ---------------------------------------------------------------------------

[tool.pytest.ini_options]
testpaths = ["tests"]
addopts = "-v --tb=short"