muvera-python 0.1.3__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. muvera_python-0.2.0/.github/workflows/publish.yml +144 -0
  2. muvera_python-0.2.0/.github/workflows/test.yml +42 -0
  3. {muvera_python-0.1.3 → muvera_python-0.2.0}/.gitignore +8 -1
  4. muvera_python-0.2.0/.pre-commit-config.yaml +24 -0
  5. muvera_python-0.2.0/CLAUDE.md +253 -0
  6. muvera_python-0.2.0/Cargo.lock +270 -0
  7. muvera_python-0.2.0/Cargo.toml +14 -0
  8. {muvera_python-0.1.3 → muvera_python-0.2.0}/PKG-INFO +53 -24
  9. {muvera_python-0.1.3 → muvera_python-0.2.0}/README.md +36 -7
  10. muvera_python-0.2.0/RELEASE_GUIDE.md +336 -0
  11. muvera_python-0.2.0/benchmarks/bench_speed.py +298 -0
  12. muvera_python-0.2.0/examples/basic_usage.py +155 -0
  13. muvera_python-0.2.0/examples/colbert_nanobeir.py +332 -0
  14. {muvera_python-0.1.3 → muvera_python-0.2.0}/muvera/__init__.py +7 -0
  15. muvera_python-0.2.0/muvera/_rust_kernels.pyi +34 -0
  16. {muvera_python-0.1.3 → muvera_python-0.2.0}/muvera/helper.py +32 -10
  17. {muvera_python-0.1.3 → muvera_python-0.2.0}/muvera/muvera.py +54 -0
  18. muvera_python-0.2.0/muvera/py.typed +0 -0
  19. {muvera_python-0.1.3 → muvera_python-0.2.0}/pyproject.toml +6 -8
  20. muvera_python-0.2.0/references/muvera(2405).pdf +0 -0
  21. muvera_python-0.2.0/scripts/generate_test_fixtures.py +138 -0
  22. muvera_python-0.2.0/src/fill_empty.rs +147 -0
  23. muvera_python-0.2.0/src/gray_code.rs +47 -0
  24. muvera_python-0.2.0/src/lib.rs +109 -0
  25. muvera_python-0.2.0/src/partition.rs +56 -0
  26. muvera_python-0.2.0/src/scatter.rs +53 -0
  27. muvera_python-0.2.0/tests/__init__.py +0 -0
  28. muvera_python-0.2.0/tests/fixtures/colbert_nanobeir/documents.npz +0 -0
  29. muvera_python-0.2.0/tests/fixtures/colbert_nanobeir/qrels.json +47 -0
  30. muvera_python-0.2.0/tests/fixtures/colbert_nanobeir/queries.npz +0 -0
  31. muvera_python-0.2.0/tests/test_helper.py +172 -0
  32. muvera_python-0.2.0/tests/test_muvera.py +241 -0
  33. muvera_python-0.2.0/tests/test_real_colbert.py +218 -0
  34. muvera_python-0.2.0/tests/test_reference.py +279 -0
  35. muvera_python-0.2.0/tests/test_rust_equivalence.py +214 -0
  36. {muvera_python-0.1.3 → muvera_python-0.2.0}/LICENSE +0 -0
  37. /muvera_python-0.1.3/muvera/py.typed → /muvera_python-0.2.0/benchmarks/results/.gitkeep +0 -0
@@ -0,0 +1,144 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+
7
+ jobs:
8
+ check-version:
9
+ runs-on: ubuntu-latest
10
+ outputs:
11
+ should_release: ${{ steps.check.outputs.should_release }}
12
+ version: ${{ steps.check.outputs.version }}
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+ with:
16
+ fetch-depth: 0
17
+
18
+ - name: Check if version tag already exists
19
+ id: check
20
+ run: |
21
+ VERSION=$(grep '^version = ' pyproject.toml | cut -d'"' -f2)
22
+ echo "version=$VERSION" >> "$GITHUB_OUTPUT"
23
+ if git rev-parse "v$VERSION" >/dev/null 2>&1; then
24
+ echo "Tag v$VERSION already exists, skipping release"
25
+ echo "should_release=false" >> "$GITHUB_OUTPUT"
26
+ else
27
+ echo "Tag v$VERSION does not exist, proceeding with release"
28
+ echo "should_release=true" >> "$GITHUB_OUTPUT"
29
+ fi
30
+
31
+ test:
32
+ needs: check-version
33
+ if: needs.check-version.outputs.should_release == 'true'
34
+ runs-on: ubuntu-latest
35
+ steps:
36
+ - uses: actions/checkout@v4
37
+
38
+ - name: Set up Python
39
+ uses: actions/setup-python@v5
40
+ with:
41
+ python-version: "3.11"
42
+
43
+ - name: Install Rust toolchain
44
+ uses: dtolnay/rust-toolchain@stable
45
+
46
+ - name: Install dependencies
47
+ run: |
48
+ python -m pip install --upgrade pip
49
+ pip install ".[dev]"
50
+
51
+ - name: Run tests
52
+ run: pytest -v
53
+
54
+ build-wheels:
55
+ needs: [check-version, test]
56
+ if: needs.check-version.outputs.should_release == 'true'
57
+ strategy:
58
+ fail-fast: false
59
+ matrix:
60
+ include:
61
+ - os: ubuntu-latest
62
+ target: x86_64
63
+ - os: ubuntu-latest
64
+ target: aarch64
65
+ - os: macos-latest
66
+ target: x86_64
67
+ - os: macos-latest
68
+ target: aarch64
69
+ runs-on: ${{ matrix.os }}
70
+ steps:
71
+ - uses: actions/checkout@v4
72
+
73
+ - name: Build wheels
74
+ uses: PyO3/maturin-action@v1
75
+ with:
76
+ target: ${{ matrix.target }}
77
+ args: --release --out dist --interpreter 3.9 3.10 3.11 3.12 3.13
78
+ manylinux: auto
79
+
80
+ - name: Upload wheels
81
+ uses: actions/upload-artifact@v4
82
+ with:
83
+ name: wheels-${{ matrix.os }}-${{ matrix.target }}
84
+ path: dist
85
+
86
+ build-sdist:
87
+ needs: [check-version, test]
88
+ if: needs.check-version.outputs.should_release == 'true'
89
+ runs-on: ubuntu-latest
90
+ steps:
91
+ - uses: actions/checkout@v4
92
+
93
+ - name: Build sdist
94
+ uses: PyO3/maturin-action@v1
95
+ with:
96
+ command: sdist
97
+ args: --out dist
98
+
99
+ - name: Upload sdist
100
+ uses: actions/upload-artifact@v4
101
+ with:
102
+ name: wheels-sdist
103
+ path: dist
104
+
105
+ publish:
106
+ needs: [check-version, build-wheels, build-sdist]
107
+ if: always() && needs.check-version.outputs.should_release == 'true' && (needs.build-wheels.result == 'success' || needs.build-sdist.result == 'success')
108
+ runs-on: ubuntu-latest
109
+ permissions:
110
+ id-token: write
111
+ contents: write
112
+
113
+ steps:
114
+ - uses: actions/checkout@v4
115
+
116
+ - name: Create version tag
117
+ run: |
118
+ VERSION=${{ needs.check-version.outputs.version }}
119
+ git tag "v$VERSION"
120
+ git push origin "v$VERSION"
121
+
122
+ - name: Download all artifacts
123
+ uses: actions/download-artifact@v4
124
+ with:
125
+ pattern: wheels-*
126
+ merge-multiple: true
127
+ path: dist
128
+
129
+ - name: Publish to PyPI
130
+ uses: pypa/gh-action-pypi-publish@release/v1
131
+
132
+ - name: Create GitHub Release
133
+ uses: softprops/action-gh-release@v2
134
+ with:
135
+ tag_name: v${{ needs.check-version.outputs.version }}
136
+ files: dist/*
137
+ generate_release_notes: true
138
+ body: |
139
+ ## Installation
140
+ ```bash
141
+ pip install muvera-python==${{ needs.check-version.outputs.version }}
142
+ ```
143
+
144
+ See [PyPI](https://pypi.org/project/muvera-python/${{ needs.check-version.outputs.version }}/) for full package details.
@@ -0,0 +1,42 @@
1
+ name: Tests
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Set up Python ${{ matrix.python-version }}
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+
24
+ - name: Install Rust toolchain
25
+ uses: dtolnay/rust-toolchain@stable
26
+
27
+ - name: Install dependencies
28
+ run: |
29
+ python -m pip install --upgrade pip
30
+ pip install ".[dev]"
31
+
32
+ - name: Run ruff (lint)
33
+ run: ruff check .
34
+
35
+ - name: Run ruff (format check)
36
+ run: ruff format --check .
37
+
38
+ - name: Run mypy
39
+ run: mypy muvera
40
+
41
+ - name: Run pytest
42
+ run: pytest -v --tb=short
@@ -21,4 +21,11 @@ examples/.cache/
21
21
  .vscode/
22
22
 
23
23
  # Benchmark results (generated, not committed)
24
- benchmarks/results/*.json
24
+ benchmarks/results/*.json
25
+
26
+ target/
27
+
28
+ # Rust/maturin build artifacts
29
+ *.so
30
+ *.dylib
31
+ *.pyd
@@ -0,0 +1,24 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ rev: v0.9.7
4
+ hooks:
5
+ - id: ruff
6
+ args: [--fix, --exit-non-zero-on-fix]
7
+ - id: ruff-format
8
+
9
+ - repo: https://github.com/pre-commit/mirrors-mypy
10
+ rev: v1.13.0
11
+ hooks:
12
+ - id: mypy
13
+ additional_dependencies: [numpy>=1.22.0]
14
+ files: ^muvera/
15
+
16
+ - repo: local
17
+ hooks:
18
+ - id: pytest
19
+ name: pytest
20
+ entry: pytest
21
+ language: system
22
+ pass_filenames: false
23
+ always_run: true
24
+ args: [tests/, -q]
@@ -0,0 +1,253 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Project Overview
6
+
7
+ MuVERA (Multi-Vector Retrieval via Fixed Dimensional Encoding Algorithm) is a Python library that converts multi-vector embeddings (point clouds) into fixed-dimensional single vectors. This enables using existing single-vector search infrastructure (MIPS, ANN) without modification.
8
+
9
+ ### Project Goals
10
+
11
+ **Simplicity First**: This project pursues a simple, intuitive interface to make MuVERA easy to use. Unlike the reference implementation which exposes low-level config objects and separate functions for queries vs. documents, this library wraps everything behind a single `Muvera` class:
12
+ - `encode_documents()` - encodes document embeddings using AVERAGE method
13
+ - `encode_queries()` - encodes query embeddings using SUM method
14
+
15
+ No config dataclasses, no encoding-type enums, no manual seed juggling. Just NumPy arrays in, NumPy arrays out.
16
+
17
+ **Distribution Plan**: The library will be published to PyPI for easy installation via `pip install muvera-python`.
18
+
19
+ **Key use case**: Efficiently encode ColBERT-style multi-vector embeddings for retrieval without specialized infrastructure.
20
+
21
+ ## Git & PR Conventions
22
+ - **Do NOT** add `Co-Authored-By` lines to commit messages.
23
+ - **Do NOT** add "Generated with Claude Code" or similar attribution to PR descriptions.
24
+
25
+ ## Development Commands
26
+
27
+ ### Environment Activation
28
+ **IMPORTANT**: Always activate the virtual environment before running any commands:
29
+ ```bash
30
+ source .venv/bin/activate
31
+ ```
32
+
33
+ ### Post-Code-Writing Checklist
34
+ **IMPORTANT**: After writing or modifying any code, always run:
35
+ ```bash
36
+ ruff check . # Lint check (must pass)
37
+ pytest # Tests (must pass)
38
+ ```
39
+
40
+ ### Setup
41
+ ```bash
42
+ pip install maturin # Required for building Rust extension
43
+ maturin develop --release # Build Rust extension (needs Rust toolchain)
44
+ pip install -e ".[dev]" # Install with dev dependencies
45
+ ```
46
+
47
+ ### Testing
48
+ ```bash
49
+ pytest # Run all tests
50
+ pytest tests/test_muvera.py # Run specific test file
51
+ pytest -v --tb=short # Verbose output with short traceback
52
+ pytest -k test_name # Run specific test by name
53
+ ```
54
+
55
+ ### Code Quality
56
+ ```bash
57
+ ruff check . # Lint all files
58
+ ruff check --fix . # Lint and auto-fix issues
59
+ ruff format . # Format code
60
+ mypy muvera # Type checking
61
+ pre-commit run --all-files # Run all pre-commit hooks
62
+ ```
63
+
64
+ ### Running Examples
65
+ ```bash
66
+ python examples/basic_usage.py
67
+ python examples/colbert_nanobeir.py
68
+ ```
69
+
70
+ ### Deployment
71
+
72
+ **Version bump → merge → auto-release:**
73
+
74
+ ```bash
75
+ # 1. Update version in pyproject.toml
76
+ # version = "0.2.0"
77
+
78
+ # 2. Create PR and merge to main
79
+ # GitHub Actions will automatically:
80
+ # - Detect the new version (tag doesn't exist yet)
81
+ # - Run full test suite
82
+ # - Create git tag v0.2.0
83
+ # - Build wheel and sdist
84
+ # - Publish to PyPI via OIDC
85
+ # - Create GitHub Release
86
+ #
87
+ # If version is unchanged, all release steps are skipped.
88
+ ```
89
+
90
+ **Local build (for testing):**
91
+ ```bash
92
+ maturin build --release # Build wheel with Rust extension
93
+ ```
94
+
95
+ ## Architecture
96
+
97
+ ### Core Components
98
+
99
+ **`muvera/muvera.py`** - Main `Muvera` class implementing Fixed Dimensional Encoding (FDE)
100
+ - Two encoding paths: single document, variable-length batch
101
+ - Document encoding uses AVERAGE aggregation within partitions
102
+ - Query encoding uses SUM aggregation within partitions
103
+ - Optional final dimensionality reduction via Count Sketch
104
+ - Hot-path methods (`_aggregate_single`, `_scatter_add`, `_fill_empty_batch`) delegate to Rust kernels when available
105
+
106
+ **`muvera/helper.py`** - Low-level utilities (not public API)
107
+ - Gray code manipulation for partition indexing
108
+ - Random projection matrices (SimHash, AMS Sketch, Count Sketch)
109
+ - Vectorized batch partition indexing
110
+ - `partition_index_gray` and `partition_indices_gray_batch` delegate to Rust when available
111
+
112
+ **`src/`** - Rust extension module (`muvera._rust_kernels`) via PyO3/maturin
113
+ - `gray_code.rs` — Gray code append and binary conversion
114
+ - `partition.rs` — Single and batch Gray-code partition indexing
115
+ - `scatter.rs` — Scatter-add kernel for batch aggregation
116
+ - `fill_empty.rs` — Single-point-cloud aggregation + batch empty partition filling
117
+ - `lib.rs` — PyO3 module definition exposing 5 functions
118
+
119
+ **`muvera/_rust_kernels.pyi`** - Type stubs for the Rust extension module
120
+
121
+ ### Algorithm Flow
122
+
123
+ 1. **SimHash Projection**: Maps each vector to a partition using random Gaussian projections
124
+ 2. **Partition Assignment**: Uses Gray code to assign vectors to one of `2^num_simhash_projections` partitions
125
+ 3. **Inner Projection**: Optionally reduces dimension via AMS Sketch (or uses identity)
126
+ 4. **Aggregation**:
127
+ - Documents: compute centroid (average) of vectors in each partition
128
+ - Queries: compute sum of vectors in each partition
129
+ 5. **Empty Partition Filling** (documents only): Fill empty partitions with nearest vector by Hamming distance
130
+ 6. **Repetitions**: Repeat steps 1-5 with different random seeds, concatenating results
131
+ 7. **Final Projection** (optional): Apply Count Sketch to reduce final dimension
132
+
133
+ ### Rust Acceleration
134
+
135
+ Performance-critical inner loops are implemented in Rust via PyO3, with automatic fallback to pure Python:
136
+
137
+ ```python
138
+ # muvera/__init__.py
139
+ try:
140
+ import muvera._rust_kernels
141
+ _RUST_AVAILABLE = True
142
+ except ImportError:
143
+ _RUST_AVAILABLE = False
144
+ ```
145
+
146
+ **Accelerated functions:**
147
+ | Rust function | Python fallback | Speedup |
148
+ |---|---|---|
149
+ | `aggregate_single` | `Muvera._aggregate_single_python` | 8-17x (single doc) |
150
+ | `scatter_add_partitions` | `Muvera._scatter_add` (np.add.at loop) | 1-2.5x (batch) |
151
+ | `fill_empty_partitions_batch` | `Muvera._fill_empty_batch` (Python loop) | 1-2.5x (batch) |
152
+ | `partition_index_gray` | `helper._partition_index_gray_python` | part of aggregate |
153
+ | `partition_indices_gray_batch` | `helper._partition_indices_gray_batch_python` | part of batch |
154
+
155
+ **What is NOT in Rust** (intentionally kept in NumPy for seed compatibility):
156
+ - `simhash_matrix_from_seed`, `ams_projection_matrix_from_seed` — depend on `np.random.default_rng`
157
+ - `count_sketch_vector_from_seed` — same reason
158
+ - `Muvera.__init__`, public API signatures — 100% unchanged
159
+
160
+ ### Batch Processing
161
+
162
+ The library supports two input formats:
163
+ - **Single**: `(num_vectors, dimension)` - processes one point cloud
164
+ - **Variable-length batch**: `list[np.ndarray]` - each point cloud has different length (recommended for real-world data)
165
+
166
+ Variable-length batch processing flattens all point clouds, processes them together, then aggregates per-document using Rust `scatter_add_partitions` (or `np.add.at()` fallback).
167
+
168
+ ## Code Conventions
169
+
170
+ ### Python
171
+ - NumPy-style docstrings (configured in pyproject.toml)
172
+ - Type hints required (use `|` for unions per PEP 604; this is native from Python 3.10, so on Python 3.9 annotations need `from __future__ import annotations`)
173
+ - Line length: 100 characters
174
+ - Use `np.float32` for all embeddings (memory efficiency)
175
+ - Use `np.uint32` for partition indices
176
+ - Random number generation via `np.random.default_rng(seed)` for reproducibility
177
+
178
+ ### Rust
179
+ - Edition 2021
180
+ - Dependencies: `pyo3` 0.23, `numpy` 0.23 (Rust crate, not Python package), `ndarray` 0.16
181
+ - All three crates are version-locked together (upgrade all at once)
182
+ - Use `f32` for all floating-point data, `u32` for partition indices, `i32` for counts, `i64` for boundaries
183
+ - PyO3 functions accept `PyReadonlyArray*` for input arrays and `&Bound<PyArray*>` for in-place mutation
184
+
185
+ ## Testing
186
+
187
+ ### Test Organization
188
+
189
+ - **`test_helper.py`**: Low-level helper function tests (Gray code, projections, etc.)
190
+ - **`test_muvera.py`**: Core Muvera class tests (shapes, validation, reproducibility)
191
+ - **`test_reference.py`**: Validation against reference implementation (sionic-ai/muvera-py)
192
+ - **`test_real_colbert.py`**: Real-world ColBERT embedding tests using NanoBEIR fixtures
193
+ - **`test_rust_equivalence.py`**: Numerical equivalence tests between Rust kernels and Python fallbacks (skipped if Rust extension is unavailable)
194
+
195
+ ### Real Data Testing
196
+
197
+ `test_real_colbert.py` uses cached ColBERT embeddings from NanoBEIR to validate performance on real data:
198
+ - **Fixtures**: 35 documents, 5 queries, 35 relevance judgments (~2.2MB cached in git)
199
+ - **Tests**: FDE encoding, correlation with native MaxSim, Recall@K metrics
200
+ - **Generation**: `python scripts/generate_test_fixtures.py` (requires pylate, datasets, torch)
201
+
202
+ Fixtures are cached in `tests/fixtures/colbert_nanobeir/` to avoid slow model inference during CI/testing.
203
+
204
+ ## Key Parameters
205
+
206
+ - `num_repetitions`: Controls accuracy/dimension trade-off (default: 20)
207
+ - `num_simhash_projections`: Determines partition count as `2^n` (default: 5, range: [0, 31))
208
+ - `fill_empty_partitions`: Whether to fill empty partitions with nearest vector (default: True, documents only)
209
+ - `projection_type`: "identity" or "ams_sketch" for dimensionality reduction
210
+ - `final_projection_dimension`: Optional Count Sketch final projection
211
+
212
+ Output dimension: `num_repetitions * 2^num_simhash_projections * projection_dimension` (or `final_projection_dimension` if set)
213
+
214
+ ## CI/CD and Deployment
215
+
216
+ ### GitHub Actions Workflows
217
+
218
+ **`.github/workflows/test.yml`** - Continuous Integration
219
+ - Triggers: Push to main, pull requests targeting main
220
+ - Tests across Python 3.9-3.13
221
+ - Installs Rust toolchain via `dtolnay/rust-toolchain@stable`
222
+ - Builds Rust extension via `pip install ".[dev]"` (maturin build backend)
223
+ - Runs ruff (lint + format check), mypy (type checking), pytest
224
+
225
+ **`.github/workflows/publish.yml`** - PyPI Publishing
226
+ - Triggers: Push to main
227
+ - Checks if `v{version}` tag already exists; skips release if it does
228
+ - Runs full test suite
229
+ - Builds cross-platform wheels via `PyO3/maturin-action@v1` (Linux x86_64/aarch64, macOS x86_64/aarch64; the build matrix has no Windows target)
230
+ - Builds sdist separately
231
+ - Creates git tag, publishes to PyPI via OIDC, creates GitHub Release
232
+
233
+ ### Deployment Policy
234
+
235
+ **Auto-release on version bump:**
236
+ - Merging a PR that changes the version in `pyproject.toml` triggers a release
237
+ - If the version is unchanged, all release steps are skipped
238
+ - Uses OpenID Connect (OIDC) for secure, token-free authentication to PyPI
239
+
240
+ **Pre-deployment checklist:**
241
+ 1. Update `version` in `pyproject.toml`
242
+ 2. Run tests locally: `pytest`
243
+ 3. Check code quality: `ruff check . && mypy muvera`
244
+ 4. Create PR and merge to main
245
+
246
+ **OIDC Setup (one-time):**
247
+ 1. Go to [PyPI](https://pypi.org) → Account settings → Publishing
248
+ 2. Add a new pending publisher:
249
+ - PyPI Project Name: `muvera-python`
250
+ - Owner: `craftsangjae`
251
+ - Repository: `muvera-python`
252
+ - Workflow: `publish.yml`
253
+ - Environment: (leave empty)