muvera-python 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- muvera_python-0.1.1/.gitignore +24 -0
- muvera_python-0.1.1/LICENSE +21 -0
- muvera_python-0.1.1/PKG-INFO +126 -0
- muvera_python-0.1.1/README.md +93 -0
- muvera_python-0.1.1/muvera/__init__.py +6 -0
- muvera_python-0.1.1/muvera/helper.py +205 -0
- muvera_python-0.1.1/muvera/muvera.py +411 -0
- muvera_python-0.1.1/muvera/py.typed +0 -0
- muvera_python-0.1.1/pyproject.toml +97 -0
muvera_python-0.1.1/.gitignore
@@ -0,0 +1,24 @@
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info

# Virtual environments
.venv

# Lock file (library repo — dependencies should stay flexible)
uv.lock

# Local Python version (managed by pyenv/uv, not part of the library)
.python-version

# Embedding cache (generated by examples)
examples/.cache/

.vscode/

# Benchmark results (generated, not committed)
benchmarks/results/*.json
muvera_python-0.1.1/LICENSE
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2026 craftsangjae

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
muvera_python-0.1.1/PKG-INFO
@@ -0,0 +1,126 @@
Metadata-Version: 2.4
Name: muvera-python
Version: 0.1.1
Summary: MuVERA: Multi-Vector Retrieval via Fixed Dimensional Encodings
Project-URL: Homepage, https://github.com/craftsangjae/muvera-python
Project-URL: Repository, https://github.com/craftsangjae/muvera-python
Project-URL: Issues, https://github.com/craftsangjae/muvera-python/issues
Author: craftsangjae
License-Expression: MIT
License-File: LICENSE
Keywords: embedding,fixed-dimensional-encoding,multi-vector,muvera,muvera-python,retrieval
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Scientific/Engineering :: Information Analysis
Requires-Python: >=3.9
Requires-Dist: numpy>=1.22.0
Provides-Extra: dev
Requires-Dist: mypy>=1.0; extra == 'dev'
Requires-Dist: pre-commit>=3.0; extra == 'dev'
Requires-Dist: pytest>=7.0; extra == 'dev'
Requires-Dist: ruff>=0.4; extra == 'dev'
Requires-Dist: scipy>=1.13.1; extra == 'dev'
Description-Content-Type: text/markdown

# MuVERA

A Python implementation of **Mu**lti-**Ve**ctor **R**etrieval via Fixed Dimensional Encoding **A**lgorithm.

Converts multi-vector embeddings (point clouds) into fixed-dimensional single vectors, enabling the use of existing single-vector search infrastructure (MIPS, ANN, etc.) as-is.

## Why this library?

The original MuVERA algorithm is described in a [research paper](https://arxiv.org/abs/2405.19504) and implemented in C++ within Google's [graph-mining](https://github.com/google/graph-mining) repository. While a Python reference exists, it exposes low-level config objects and separate functions for queries vs. documents, making it cumbersome to integrate into real workflows.

This library wraps the full algorithm behind a **single `Muvera` class** with a minimal, intuitive interface — initialize once, then call `encode_documents()` and `encode_queries()`. No config dataclasses, no encoding-type enums, no manual seed juggling. Just NumPy arrays in, NumPy arrays out.

## Installation

```bash
pip install muvera
```

Development install:

```bash
git clone https://github.com/craftsangjae/muvera-python.git
cd muvera-python
pip install -e .
```

## Quick Start

```python
import numpy as np
from muvera import Muvera

# Initialize encoder
encoder = Muvera(
    num_repetitions=10,
    num_simhash_projections=4,
    dimension=128,
    seed=42,
)

# Encode documents (batch)
# shape: (num_documents, num_vectors_per_doc, embedding_dim)
documents = np.random.randn(100, 80, 128).astype(np.float32)
doc_fdes = encoder.encode_documents(documents)  # (100, output_dimension)

# Encode queries (batch)
queries = np.random.randn(10, 32, 128).astype(np.float32)
query_fdes = encoder.encode_queries(queries)  # (10, output_dimension)

# Compute similarity (dot product)
scores = query_fdes @ doc_fdes.T  # (10, 100)
```

## Parameters

| Parameter | Default | Description |
|---|---|---|
| `num_repetitions` | 20 | Number of FDE repetitions. Higher values improve accuracy but increase output dimension |
| `num_simhash_projections` | 5 | Number of SimHash projections. Number of partitions = 2^n |
| `dimension` | 16 | Input embedding dimension |
| `projection_type` | `"identity"` | `"identity"` or `"ams_sketch"` |
| `projection_dimension` | None | Projected dimension when using AMS Sketch |
| `fill_empty_partitions` | True | Whether to fill empty partitions with the nearest vector |
| `final_projection_dimension` | None | Final dimension reduction via Count Sketch |
| `seed` | 42 | Random seed for reproducibility |

## Benchmark

End-to-end retrieval on [NanoFiQA2018](https://huggingface.co/datasets/zeta-alpha-ai/NanoFiQA2018) (4598 documents, 50 queries) using `raphaelsty/neural-cherche-colbert` (dim=128):

```
=====================================================================================
                                     RESULTS
                          (zeta-alpha-ai/NanoFiQA2018)
=====================================================================================
Retriever                          |  Index (s)  |  Query (ms)  |  Recall@25
-------------------------------------------------------------------------------------
ColBERT (Native MaxSim)            |     240.04  |      836.94  |     0.8400
ColBERT + Muvera FDE               |      77.29  |       69.48  |     0.7600
=====================================================================================
```

FDE achieves **90% of native MaxSim recall** while being **12x faster** at query time. See `examples/colbert_nanobeir.py` to reproduce.

## Acknowledgments

This library was inspired by [sionic-ai/muvera-py](https://github.com/sionic-ai/muvera-py), the first Python implementation of the MuVERA algorithm. Their faithful port of the C++ reference made it possible to validate correctness and understand the algorithm deeply. This project builds on that foundation with a simplified API designed for easier integration.

## References

- [MuVERA: Multi-Vector Retrieval via Fixed Dimensional Encodings](https://arxiv.org/abs/2405.19504)
- [Google graph-mining C++ implementation](https://github.com/google/graph-mining/blob/main/sketching/point_cloud/fixed_dimensional_encoding.cc)
- [sionic-ai/muvera-py](https://github.com/sionic-ai/muvera-py)
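The parameter table above determines the size of each encoding. For orientation, the sketch below mirrors the `output_dimension` logic defined in `muvera/muvera.py` further down in this diff (repetitions * 2^num_simhash_projections * per-partition width, unless `final_projection_dimension` caps it). It is an editorial snippet, not a file in the package.

```python
from muvera import Muvera

encoder = Muvera(num_repetitions=10, num_simhash_projections=4, dimension=128)
# repetitions * partitions * per-partition width (identity projection keeps dim=128)
expected = 10 * (2**4) * 128
assert encoder.output_dimension == expected  # 20480

# With the optional Count Sketch final projection, the output is capped instead:
small = Muvera(
    num_repetitions=10,
    num_simhash_projections=4,
    dimension=128,
    final_projection_dimension=1024,
)
assert small.output_dimension == 1024
```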
muvera_python-0.1.1/README.md
@@ -0,0 +1,93 @@
# MuVERA

A Python implementation of **Mu**lti-**Ve**ctor **R**etrieval via Fixed Dimensional Encoding **A**lgorithm.

Converts multi-vector embeddings (point clouds) into fixed-dimensional single vectors, enabling the use of existing single-vector search infrastructure (MIPS, ANN, etc.) as-is.

## Why this library?

The original MuVERA algorithm is described in a [research paper](https://arxiv.org/abs/2405.19504) and implemented in C++ within Google's [graph-mining](https://github.com/google/graph-mining) repository. While a Python reference exists, it exposes low-level config objects and separate functions for queries vs. documents, making it cumbersome to integrate into real workflows.

This library wraps the full algorithm behind a **single `Muvera` class** with a minimal, intuitive interface — initialize once, then call `encode_documents()` and `encode_queries()`. No config dataclasses, no encoding-type enums, no manual seed juggling. Just NumPy arrays in, NumPy arrays out.

## Installation

```bash
pip install muvera
```

Development install:

```bash
git clone https://github.com/craftsangjae/muvera-python.git
cd muvera-python
pip install -e .
```

## Quick Start

```python
import numpy as np
from muvera import Muvera

# Initialize encoder
encoder = Muvera(
    num_repetitions=10,
    num_simhash_projections=4,
    dimension=128,
    seed=42,
)

# Encode documents (batch)
# shape: (num_documents, num_vectors_per_doc, embedding_dim)
documents = np.random.randn(100, 80, 128).astype(np.float32)
doc_fdes = encoder.encode_documents(documents)  # (100, output_dimension)

# Encode queries (batch)
queries = np.random.randn(10, 32, 128).astype(np.float32)
query_fdes = encoder.encode_queries(queries)  # (10, output_dimension)

# Compute similarity (dot product)
scores = query_fdes @ doc_fdes.T  # (10, 100)
```

## Parameters

| Parameter | Default | Description |
|---|---|---|
| `num_repetitions` | 20 | Number of FDE repetitions. Higher values improve accuracy but increase output dimension |
| `num_simhash_projections` | 5 | Number of SimHash projections. Number of partitions = 2^n |
| `dimension` | 16 | Input embedding dimension |
| `projection_type` | `"identity"` | `"identity"` or `"ams_sketch"` |
| `projection_dimension` | None | Projected dimension when using AMS Sketch |
| `fill_empty_partitions` | True | Whether to fill empty partitions with the nearest vector |
| `final_projection_dimension` | None | Final dimension reduction via Count Sketch |
| `seed` | 42 | Random seed for reproducibility |

## Benchmark

End-to-end retrieval on [NanoFiQA2018](https://huggingface.co/datasets/zeta-alpha-ai/NanoFiQA2018) (4598 documents, 50 queries) using `raphaelsty/neural-cherche-colbert` (dim=128):

```
=====================================================================================
                                     RESULTS
                          (zeta-alpha-ai/NanoFiQA2018)
=====================================================================================
Retriever                          |  Index (s)  |  Query (ms)  |  Recall@25
-------------------------------------------------------------------------------------
ColBERT (Native MaxSim)            |     240.04  |      836.94  |     0.8400
ColBERT + Muvera FDE               |      77.29  |       69.48  |     0.7600
=====================================================================================
```

FDE achieves **90% of native MaxSim recall** while being **12x faster** at query time. See `examples/colbert_nanobeir.py` to reproduce.

## Acknowledgments

This library was inspired by [sionic-ai/muvera-py](https://github.com/sionic-ai/muvera-py), the first Python implementation of the MuVERA algorithm. Their faithful port of the C++ reference made it possible to validate correctness and understand the algorithm deeply. This project builds on that foundation with a simplified API designed for easier integration.

## References

- [MuVERA: Multi-Vector Retrieval via Fixed Dimensional Encodings](https://arxiv.org/abs/2405.19504)
- [Google graph-mining C++ implementation](https://github.com/google/graph-mining/blob/main/sketching/point_cloud/fixed_dimensional_encoding.cc)
- [sionic-ai/muvera-py](https://github.com/sionic-ai/muvera-py)
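The class docstring in `muvera/muvera.py` (below) states that the dot product between a query FDE and a document FDE approximates the Chamfer similarity (MaxSim) of the original multi-vector sets. The following sanity check is an editorial sketch under that assumption and is not shipped with the package; how close the two numbers land depends on `num_repetitions` and `num_simhash_projections`.

```python
import numpy as np

from muvera import Muvera

rng = np.random.default_rng(0)
query = rng.standard_normal((32, 128)).astype(np.float32)  # 32 query token vectors
doc = rng.standard_normal((80, 128)).astype(np.float32)    # 80 document token vectors

# Exact Chamfer / MaxSim: best-matching document vector for each query vector, summed.
exact = float(np.sum(np.max(query @ doc.T, axis=1)))

encoder = Muvera(num_repetitions=20, num_simhash_projections=5, dimension=128, seed=42)
approx = float(encoder.encode_queries(query) @ encoder.encode_documents(doc))

print(f"exact MaxSim={exact:.2f}, FDE dot product={approx:.2f}")
```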
muvera_python-0.1.1/muvera/helper.py
@@ -0,0 +1,205 @@
"""Internal helper functions for MuVERA Fixed Dimensional Encoding.

This module contains low-level utilities for Gray code manipulation,
random projection matrix generation, Count Sketch, and SimHash-based
partition indexing. These are not part of the public API.
"""

from __future__ import annotations

import numpy as np

# ---------------------------------------------------------------------------
# Gray code utilities
# ---------------------------------------------------------------------------


def append_to_gray_code(gray_code: int, bit: bool) -> int:
    """Append a single bit to a Gray code value.

    Parameters
    ----------
    gray_code : int
        Current Gray code value.
    bit : bool
        Bit to append (True=1, False=0).

    Returns
    -------
    int
        Updated Gray code.
    """
    return (gray_code << 1) + (int(bit) ^ (gray_code & 1))


def gray_code_to_binary(num: int) -> int:
    """Convert a Gray code value to its binary representation.

    Parameters
    ----------
    num : int
        Gray code value.

    Returns
    -------
    int
        Corresponding binary representation.
    """
    mask = num >> 1
    while mask != 0:
        num = num ^ mask
        mask >>= 1
    return num


# ---------------------------------------------------------------------------
# Random projection matrices
# ---------------------------------------------------------------------------


def simhash_matrix_from_seed(dimension: int, num_projections: int, seed: int) -> np.ndarray:
    """Generate a Gaussian random projection matrix for SimHash.

    Parameters
    ----------
    dimension : int
        Input vector dimension.
    num_projections : int
        Number of SimHash projections.
    seed : int
        Random seed.

    Returns
    -------
    numpy.ndarray
        Float32 matrix of shape ``(dimension, num_projections)``.
    """
    rng = np.random.default_rng(seed)
    return rng.normal(loc=0.0, scale=1.0, size=(dimension, num_projections)).astype(np.float32)


def ams_projection_matrix_from_seed(dimension: int, projection_dim: int, seed: int) -> np.ndarray:
    """Generate an AMS Sketch projection matrix.

    Each row has exactly one non-zero entry (+1 or -1), forming a sparse
    random projection.

    Parameters
    ----------
    dimension : int
        Input vector dimension.
    projection_dim : int
        Output (projected) dimension.
    seed : int
        Random seed.

    Returns
    -------
    numpy.ndarray
        Float32 matrix of shape ``(dimension, projection_dim)``.
    """
    rng = np.random.default_rng(seed)
    out = np.zeros((dimension, projection_dim), dtype=np.float32)
    indices = rng.integers(0, projection_dim, size=dimension)
    signs = rng.choice(np.array([-1.0, 1.0], dtype=np.float32), size=dimension)
    out[np.arange(dimension), indices] = signs
    return out


def count_sketch_vector_from_seed(
    input_vector: np.ndarray, final_dimension: int, seed: int
) -> np.ndarray:
    """Project a vector to a lower dimension using Count Sketch.

    Parameters
    ----------
    input_vector : numpy.ndarray
        Input vector (1-D).
    final_dimension : int
        Output dimension.
    seed : int
        Random seed.

    Returns
    -------
    numpy.ndarray
        Float32 vector of shape ``(final_dimension,)``.
    """
    rng = np.random.default_rng(seed)
    out = np.zeros(final_dimension, dtype=np.float32)
    indices = rng.integers(0, final_dimension, size=input_vector.shape[0])
    signs = rng.choice(np.array([-1.0, 1.0], dtype=np.float32), size=input_vector.shape[0])
    np.add.at(out, indices, signs * input_vector)
    return out


# ---------------------------------------------------------------------------
# Partition indexing
# ---------------------------------------------------------------------------


def partition_index_gray(sketch_vector: np.ndarray) -> int:
    """Compute a Gray-code-based partition index from a SimHash sketch vector.

    Parameters
    ----------
    sketch_vector : numpy.ndarray
        SimHash projection result vector (1-D).

    Returns
    -------
    int
        Partition index.
    """
    partition_index = 0
    for val in sketch_vector:
        partition_index = append_to_gray_code(partition_index, val > 0)
    return partition_index


def distance_to_partition(sketch_vector: np.ndarray, partition_index: int) -> int:
    """Compute the Hamming distance between a sketch vector and a partition.

    Parameters
    ----------
    sketch_vector : numpy.ndarray
        SimHash projection result vector (1-D).
    partition_index : int
        Target partition index.

    Returns
    -------
    int
        Hamming distance.
    """
    num_projections = sketch_vector.size
    binary_representation = gray_code_to_binary(partition_index)
    sketch_bits = (sketch_vector > 0).astype(int)
    binary_array = (binary_representation >> np.arange(num_projections - 1, -1, -1)) & 1
    return int(np.sum(sketch_bits != binary_array))


# ---------------------------------------------------------------------------
# Vectorised batch helpers
# ---------------------------------------------------------------------------


def partition_indices_gray_batch(sketches: np.ndarray) -> np.ndarray:
    """Compute Gray-code partition indices for a batch of sketch vectors.

    Parameters
    ----------
    sketches : numpy.ndarray
        SimHash sketch matrix of shape ``(N, num_projections)``.

    Returns
    -------
    numpy.ndarray
        Uint32 partition index array of shape ``(N,)``.
    """
    num_projections = sketches.shape[1]
    bits = (sketches > 0).astype(np.uint32)
    partition_indices = np.zeros(sketches.shape[0], dtype=np.uint32)
    for bit_idx in range(num_projections):
        partition_indices = (partition_indices << 1) + (bits[:, bit_idx] ^ (partition_indices & 1))
    return partition_indices
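As a quick orientation to the helpers above: each point is SimHash-projected, the sign pattern of the resulting sketch is folded into a Gray-code bucket id in `[0, 2**num_projections)`, and the scalar and vectorised paths compute the same index. The snippet below is an editorial usage sketch, not part of `helper.py`.

```python
import numpy as np

from muvera.helper import (
    partition_index_gray,
    partition_indices_gray_batch,
    simhash_matrix_from_seed,
)

rng = np.random.default_rng(0)
vectors = rng.standard_normal((6, 16)).astype(np.float32)  # 6 points, dim 16
sim = simhash_matrix_from_seed(dimension=16, num_projections=3, seed=42)
sketches = vectors @ sim  # (6, 3) SimHash sketch per point

batch_ids = partition_indices_gray_batch(sketches)        # vectorised path
single_ids = [partition_index_gray(s) for s in sketches]  # scalar path

assert [int(i) for i in batch_ids] == [int(i) for i in single_ids]
assert all(0 <= int(i) < 2**3 for i in batch_ids)  # 8 partitions
```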
muvera_python-0.1.1/muvera/muvera.py
@@ -0,0 +1,411 @@
"""MuVERA (Multi-Vector Retrieval via Fixed Dimensional Encodings).

This module provides the Fixed Dimensional Encoding (FDE) algorithm that
converts multi-vector embeddings (point clouds) into single fixed-dimensional
vectors.

References
----------
.. [1] Google graph-mining: fixed_dimensional_encoding.cc
   https://github.com/google/graph-mining/blob/main/sketching/point_cloud/fixed_dimensional_encoding.cc
.. [2] sionic-ai/muvera-py: fde_generator.py
   https://github.com/sionic-ai/muvera-py/blob/master/fde_generator.py
"""

from __future__ import annotations

from typing import Literal

import numpy as np

from muvera.helper import (
    ams_projection_matrix_from_seed,
    count_sketch_vector_from_seed,
    distance_to_partition,
    gray_code_to_binary,
    partition_index_gray,
    partition_indices_gray_batch,
    simhash_matrix_from_seed,
)


class Muvera:
    """Encoder that converts multi-vector embeddings into Fixed Dimensional Encodings.

    Uses the MuVERA algorithm to encode variable-length multi-vector
    representations (point clouds) into fixed-dimensional single vectors.
    The dot product between encoded vectors approximates the Chamfer
    similarity (MaxSim) between the original multi-vector sets.

    Parameters
    ----------
    num_repetitions : int, default=20
        Number of repetitions for FDE generation. Higher values improve
        accuracy but proportionally increase the output dimension.
    num_simhash_projections : int, default=5
        Number of SimHash projections. The number of partitions is
        ``2 ** num_simhash_projections``. Must be in ``[0, 31)``.
    dimension : int, default=16
        Dimension of the input embedding vectors.
    projection_type : {'identity', 'ams_sketch'}, default='identity'
        Inner projection method.

        - ``'identity'``: Uses the original dimension as-is.
        - ``'ams_sketch'``: Uses AMS Sketch for dimensionality reduction.
    projection_dimension : int or None, default=None
        Projected dimension when ``projection_type='ams_sketch'``.
        Ignored when ``'identity'``.
    fill_empty_partitions : bool, default=True
        Whether to fill empty partitions with the nearest vector during
        document encoding.
    final_projection_dimension : int or None, default=None
        Final Count Sketch projection dimension. ``None`` disables
        final projection.
    seed : int, default=42
        Random seed for reproducibility.

    Attributes
    ----------
    output_dimension : int
        Dimension of the encoded output vector.

    Examples
    --------
    >>> import numpy as np
    >>> from muvera import Muvera
    >>> encoder = Muvera(num_repetitions=10, num_simhash_projections=4,
    ...                  dimension=128, seed=42)
    >>> # Single document/query
    >>> doc = np.random.randn(80, 128).astype(np.float32)
    >>> query = np.random.randn(32, 128).astype(np.float32)
    >>> doc_fde = encoder.encode_documents(doc)
    >>> query_fde = encoder.encode_queries(query)
    >>> score = query_fde @ doc_fde  # similarity score
    >>>
    >>> # Batch of documents with variable lengths
    >>> docs = [np.random.randn(80, 128).astype(np.float32) for _ in range(5)]
    >>> doc_fdes = encoder.encode_documents(docs)  # (5, output_dimension)
    """

    def __init__(
        self,
        num_repetitions: int = 20,
        num_simhash_projections: int = 5,
        dimension: int = 16,
        projection_type: Literal["identity", "ams_sketch"] = "identity",
        projection_dimension: int | None = None,
        fill_empty_partitions: bool = True,
        final_projection_dimension: int | None = None,
        seed: int = 42,
    ):
        if num_repetitions <= 0:
            raise ValueError(f"num_repetitions must be greater than 0, got {num_repetitions}")
        if not (0 <= num_simhash_projections < 31):
            raise ValueError(
                f"num_simhash_projections must be in [0, 31), got {num_simhash_projections}"
            )
        if projection_type not in ("identity", "ams_sketch"):
            raise ValueError(
                f"projection_type must be 'identity' or 'ams_sketch', got '{projection_type}'"
            )
        if projection_type == "ams_sketch" and (
            projection_dimension is None or projection_dimension <= 0
        ):
            raise ValueError(
                "A positive projection_dimension is required when projection_type='ams_sketch'"
            )

        self.num_repetitions = num_repetitions
        self.num_simhash_projections = num_simhash_projections
        self.dimension = dimension
        self.projection_type = projection_type
        self.projection_dimension = projection_dimension
        self.fill_empty_partitions = fill_empty_partitions
        self.final_projection_dimension = final_projection_dimension
        self.seed = seed

        # Derived constants
        self._num_partitions: int = 2**num_simhash_projections
        self._use_identity: bool = projection_type == "identity"
        self._proj_dim: int = dimension if self._use_identity else projection_dimension  # type: ignore[assignment]
        self._fde_dim: int = num_repetitions * self._num_partitions * self._proj_dim

    @property
    def output_dimension(self) -> int:
        """Dimension of the encoded output vector."""
        if self.final_projection_dimension is not None and self.final_projection_dimension > 0:
            return self.final_projection_dimension
        return self._fde_dim

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def encode_documents(self, documents: np.ndarray | list[np.ndarray]) -> np.ndarray:
        """Encode document point clouds into Fixed Dimensional Encodings.

        Uses AVERAGE aggregation within partitions. Empty partitions are filled
        with the nearest vector when ``fill_empty_partitions=True``.

        Parameters
        ----------
        documents : np.ndarray or list[np.ndarray]
            - Single document: shape ``(num_vectors, dimension)``
            - Batch of documents: ``list[np.ndarray]`` where each element has
              shape ``(num_vectors_i, dimension)``

        Returns
        -------
        np.ndarray
            Encoded FDE vector(s). Single document returns shape ``(output_dimension,)``,
            batch returns shape ``(num_documents, output_dimension)``.
        """
        if isinstance(documents, list):
            return self._encode_batch(documents, is_query=False)
        return self._encode_single(documents, is_query=False)

    def encode_queries(self, queries: np.ndarray | list[np.ndarray]) -> np.ndarray:
        """Encode query point clouds into Fixed Dimensional Encodings.

        Uses SUM aggregation within partitions. Empty partitions are not filled.

        Parameters
        ----------
        queries : np.ndarray or list[np.ndarray]
            - Single query: shape ``(num_vectors, dimension)``
            - Batch of queries: ``list[np.ndarray]`` where each element has
              shape ``(num_vectors_i, dimension)``

        Returns
        -------
        np.ndarray
            Encoded FDE vector(s). Single query returns shape ``(output_dimension,)``,
            batch returns shape ``(num_queries, output_dimension)``.
        """
        if isinstance(queries, list):
            return self._encode_batch(queries, is_query=True)
        return self._encode_single(queries, is_query=True)

    # ------------------------------------------------------------------
    # Core Encoding
    # ------------------------------------------------------------------

    def _encode_single(self, point_cloud: np.ndarray, *, is_query: bool) -> np.ndarray:
        """Encode a single point cloud."""
        if point_cloud.ndim != 2 or point_cloud.shape[1] != self.dimension:
            raise ValueError(f"Expected shape (N, {self.dimension}), got {point_cloud.shape}")

        point_cloud = np.asarray(point_cloud, dtype=np.float32)
        out = np.zeros(self._fde_dim, dtype=np.float32)

        for rep in range(self.num_repetitions):
            current_seed = self.seed + rep
            sketches = self._compute_sketches(point_cloud, current_seed)
            projected = self._compute_projection(point_cloud, current_seed)
            rep_fde = self._aggregate_single(sketches, projected, is_query)

            rep_start = rep * self._num_partitions * self._proj_dim
            out[rep_start : rep_start + rep_fde.size] = rep_fde

        return self._apply_final_projection(out)

    def _encode_batch(self, point_clouds: list[np.ndarray], *, is_query: bool) -> np.ndarray:
        """Encode a batch of variable-length point clouds."""
        batch_size = len(point_clouds)
        if batch_size == 0:
            return np.zeros((0, self.output_dimension), dtype=np.float32)

        # Validate and flatten
        for i, pc in enumerate(point_clouds):
            if pc.ndim != 2 or pc.shape[1] != self.dimension:
                raise ValueError(f"Element {i}: expected (N, {self.dimension}), got {pc.shape}")

        flat_points, doc_indices, doc_boundaries = self._flatten_batch(point_clouds)
        out = np.zeros((batch_size, self._fde_dim), dtype=np.float32)

        for rep in range(self.num_repetitions):
            current_seed = self.seed + rep
            sketches = self._compute_sketches(flat_points, current_seed)
            projected = self._compute_projection(flat_points, current_seed)
            rep_fde = self._aggregate_batch(
                sketches, projected, doc_indices, doc_boundaries, is_query
            )

            rep_start = rep * self._num_partitions * self._proj_dim
            out[:, rep_start : rep_start + rep_fde.size // batch_size] = rep_fde

        return self._apply_final_projection(out)

    # ------------------------------------------------------------------
    # Helper Methods
    # ------------------------------------------------------------------

    def _compute_sketches(self, vectors: np.ndarray, seed: int) -> np.ndarray:
        """Compute SimHash sketches for partition assignment."""
        sim_matrix = simhash_matrix_from_seed(self.dimension, self.num_simhash_projections, seed)
        return vectors @ sim_matrix

    def _compute_projection(self, vectors: np.ndarray, seed: int) -> np.ndarray:
        """Project vectors using identity or AMS sketch."""
        if self._use_identity:
            return vectors

        ams_matrix = ams_projection_matrix_from_seed(self.dimension, self._proj_dim, seed)
        return vectors @ ams_matrix

    def _aggregate_single(
        self, sketches: np.ndarray, projected: np.ndarray, is_query: bool
    ) -> np.ndarray:
        """Aggregate vectors into partitions for a single point cloud."""
        num_points = sketches.shape[0]
        partition_counts = np.zeros(self._num_partitions, dtype=np.int32)
        rep_fde = np.zeros(self._num_partitions * self._proj_dim, dtype=np.float32)

        # Assign vectors to partitions
        for i in range(num_points):
            pidx = partition_index_gray(sketches[i])
            start = pidx * self._proj_dim
            rep_fde[start : start + self._proj_dim] += projected[i]
            partition_counts[pidx] += 1

        # Apply AVERAGE for documents, SUM for queries
        if not is_query:
            self._apply_average_and_fill(rep_fde, partition_counts, sketches, projected)

        return rep_fde

    def _aggregate_batch(
        self,
        all_sketches: np.ndarray,
        all_projected: np.ndarray,
        doc_indices: np.ndarray,
        doc_boundaries: np.ndarray,
        is_query: bool,
    ) -> np.ndarray:
        """Aggregate vectors into partitions for a batch of point clouds."""
        batch_size = len(doc_boundaries) - 1
        part_indices = partition_indices_gray_batch(all_sketches)
        partition_counts = np.zeros((batch_size, self._num_partitions), dtype=np.int32)
        rep_fde = np.zeros((batch_size, self._num_partitions, self._proj_dim), dtype=np.float32)

        # Count partitions
        np.add.at(partition_counts, (doc_indices, part_indices), 1)

        # Scatter-add projected vectors
        self._scatter_add(rep_fde, doc_indices, part_indices, all_projected)

        # Apply AVERAGE for documents
        if not is_query:
            self._apply_average_batch(rep_fde, partition_counts)
            self._fill_empty_batch(
                rep_fde, partition_counts, all_sketches, all_projected, doc_boundaries
            )

        return rep_fde.reshape(batch_size, -1)

    def _apply_average_and_fill(
        self,
        rep_fde: np.ndarray,
        partition_counts: np.ndarray,
        sketches: np.ndarray,
        projected: np.ndarray,
    ) -> None:
        """Apply AVERAGE aggregation and fill empty partitions for single cloud."""
        num_points = sketches.shape[0]

        for pidx in range(self._num_partitions):
            start = pidx * self._proj_dim
            if partition_counts[pidx] > 0:
                rep_fde[start : start + self._proj_dim] /= partition_counts[pidx]
            elif self.fill_empty_partitions and num_points > 0 and self.num_simhash_projections > 0:
                distances = np.array(
                    [distance_to_partition(sketches[j], pidx) for j in range(num_points)]
                )
                nearest = np.argmin(distances)
                rep_fde[start : start + self._proj_dim] = projected[nearest]

    def _apply_average_batch(self, rep_fde: np.ndarray, partition_counts: np.ndarray) -> None:
        """Apply AVERAGE aggregation for batch."""
        counts_3d = partition_counts[:, :, np.newaxis]
        np.divide(rep_fde, counts_3d, out=rep_fde, where=counts_3d > 0)

    def _fill_empty_batch(
        self,
        rep_fde: np.ndarray,
        partition_counts: np.ndarray,
        all_sketches: np.ndarray,
        all_projected: np.ndarray,
        doc_boundaries: np.ndarray,
    ) -> None:
        """Fill empty partitions for batch."""
        if not self.fill_empty_partitions or self.num_simhash_projections == 0:
            return

        empty_docs, empty_parts = np.where(partition_counts == 0)
        for doc_idx, pidx in zip(empty_docs, empty_parts):
            doc_start, doc_end = doc_boundaries[doc_idx], doc_boundaries[doc_idx + 1]
            if doc_start == doc_end:
                continue

            doc_sketches = all_sketches[doc_start:doc_end]
            binary_rep = gray_code_to_binary(int(pidx))
            target_bits = (binary_rep >> np.arange(self.num_simhash_projections - 1, -1, -1)) & 1
            distances = np.sum((doc_sketches > 0).astype(int) != target_bits, axis=1)
            nearest_local = np.argmin(distances)
            rep_fde[doc_idx, pidx, :] = all_projected[doc_start + nearest_local]

    def _scatter_add(
        self,
        rep_fde: np.ndarray,
        doc_indices: np.ndarray,
        part_indices: np.ndarray,
        all_projected: np.ndarray,
    ) -> None:
        """Scatter-add projected vectors into partitions."""
        doc_part = doc_indices * self._num_partitions + part_indices
        base = doc_part * self._proj_dim
        flat_rep_fde = rep_fde.reshape(-1)

        for d in range(self._proj_dim):
            np.add.at(flat_rep_fde, base + d, all_projected[:, d])

    def _flatten_batch(
        self, point_clouds: list[np.ndarray]
    ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Flatten batch of point clouds into single array with metadata."""
        doc_lengths = np.array([pc.shape[0] for pc in point_clouds], dtype=np.int32)
        doc_boundaries = np.insert(np.cumsum(doc_lengths), 0, 0)
        doc_indices = np.repeat(np.arange(len(point_clouds)), doc_lengths)
        flat_points = np.vstack(point_clouds).astype(np.float32)

        return flat_points, doc_indices, doc_boundaries

    def _apply_final_projection(self, fdes: np.ndarray) -> np.ndarray:
        """Apply optional Count Sketch final projection."""
        if self.final_projection_dimension is None or self.final_projection_dimension <= 0:
            return fdes

        if fdes.ndim == 1:
            return count_sketch_vector_from_seed(fdes, self.final_projection_dimension, self.seed)

        # Batch
        result = np.zeros((fdes.shape[0], self.final_projection_dimension), dtype=np.float32)
        for i in range(fdes.shape[0]):
            result[i] = count_sketch_vector_from_seed(
                fdes[i], self.final_projection_dimension, self.seed
            )
        return result

    def __repr__(self) -> str:  # noqa: D105
        return (
            f"Muvera("
            f"num_repetitions={self.num_repetitions}, "
            f"num_simhash_projections={self.num_simhash_projections}, "
            f"dimension={self.dimension}, "
            f"projection_type='{self.projection_type}', "
            f"projection_dimension={self.projection_dimension}, "
            f"fill_empty_partitions={self.fill_empty_partitions}, "
            f"final_projection_dimension={self.final_projection_dimension}, "
            f"seed={self.seed}"
            f")"
        )
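Beyond the Quick Start in the README, the signatures above also accept variable-length batches (a `list` of `(n_i, dimension)` arrays) and expose the AMS Sketch and Count Sketch compression options. The following is an editorial sketch of those options based on the class definition above, not an additional file in the package.

```python
import numpy as np

from muvera import Muvera

rng = np.random.default_rng(0)

# Documents with different numbers of token vectors are passed as a list.
docs = [rng.standard_normal((n, 128)).astype(np.float32) for n in (40, 80, 65)]

encoder = Muvera(
    num_repetitions=10,
    num_simhash_projections=4,
    dimension=128,
    projection_type="ams_sketch",     # compress each vector to 32 dims first
    projection_dimension=32,
    final_projection_dimension=2048,  # then Count Sketch the concatenated FDE
    seed=42,
)

doc_fdes = encoder.encode_documents(docs)
print(doc_fdes.shape)  # (3, 2048): final_projection_dimension caps output_dimension
```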
muvera_python-0.1.1/muvera/py.typed
File without changes
muvera_python-0.1.1/pyproject.toml
@@ -0,0 +1,97 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "muvera-python"
version = "0.1.1"
description = "MuVERA: Multi-Vector Retrieval via Fixed Dimensional Encodings"
readme = "README.md"
license = "MIT"
requires-python = ">=3.9"
authors = [
    { name = "craftsangjae" },
]
keywords = ["muvera", "muvera-python", "multi-vector", "retrieval", "embedding", "fixed-dimensional-encoding"]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: Scientific/Engineering :: Information Analysis",
]
dependencies = [
    "numpy>=1.22.0",
]

[project.optional-dependencies]
dev = [
    "pytest>=7.0",
    "mypy>=1.0",
    "ruff>=0.4",
    "pre-commit>=3.0",
    "scipy>=1.13.1",
]

[project.urls]
Homepage = "https://github.com/craftsangjae/muvera-python"
Repository = "https://github.com/craftsangjae/muvera-python"
Issues = "https://github.com/craftsangjae/muvera-python/issues"

[tool.hatch.build.targets.wheel]
packages = ["muvera"]

[tool.hatch.build.targets.sdist]
include = ["muvera/", "README.md", "LICENSE"]

# ---------------------------------------------------------------------------
# Ruff
# ---------------------------------------------------------------------------

[tool.ruff]
target-version = "py39"
line-length = 100

[tool.ruff.lint]
select = [
    "E",    # pycodestyle errors
    "W",    # pycodestyle warnings
    "F",    # pyflakes
    "I",    # isort
    "N",    # pep8-naming
    "UP",   # pyupgrade
    "B",    # flake8-bugbear
    "SIM",  # flake8-simplify
    "D",    # pydocstyle
]
ignore = [
    "D100",  # Missing docstring in public module (module docstring already present)
    "D104",  # Missing docstring in public package
    "D203",  # 1 blank line required before class docstring (conflicts with D211)
    "D213",  # Multi-line docstring summary should start at the second line (conflicts with D212)
]

[tool.ruff.lint.pydocstyle]
convention = "numpy"

[tool.ruff.lint.isort]
known-first-party = ["muvera"]

[tool.ruff.lint.per-file-ignores]
"tests/**" = ["D"]
"examples/**" = ["D"]

# ---------------------------------------------------------------------------
# Pytest
# ---------------------------------------------------------------------------

[tool.pytest.ini_options]
testpaths = ["tests"]
addopts = "-v --tb=short"