muvera-python 0.1.3__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- muvera_python-0.2.0/.github/workflows/publish.yml +144 -0
- muvera_python-0.2.0/.github/workflows/test.yml +42 -0
- {muvera_python-0.1.3 → muvera_python-0.2.0}/.gitignore +8 -1
- muvera_python-0.2.0/.pre-commit-config.yaml +24 -0
- muvera_python-0.2.0/CLAUDE.md +253 -0
- muvera_python-0.2.0/Cargo.lock +270 -0
- muvera_python-0.2.0/Cargo.toml +14 -0
- {muvera_python-0.1.3 → muvera_python-0.2.0}/PKG-INFO +53 -24
- {muvera_python-0.1.3 → muvera_python-0.2.0}/README.md +36 -7
- muvera_python-0.2.0/RELEASE_GUIDE.md +336 -0
- muvera_python-0.2.0/benchmarks/bench_speed.py +298 -0
- muvera_python-0.2.0/examples/basic_usage.py +155 -0
- muvera_python-0.2.0/examples/colbert_nanobeir.py +332 -0
- {muvera_python-0.1.3 → muvera_python-0.2.0}/muvera/__init__.py +7 -0
- muvera_python-0.2.0/muvera/_rust_kernels.pyi +34 -0
- {muvera_python-0.1.3 → muvera_python-0.2.0}/muvera/helper.py +32 -10
- {muvera_python-0.1.3 → muvera_python-0.2.0}/muvera/muvera.py +54 -0
- muvera_python-0.2.0/muvera/py.typed +0 -0
- {muvera_python-0.1.3 → muvera_python-0.2.0}/pyproject.toml +6 -8
- muvera_python-0.2.0/references/muvera(2405).pdf +0 -0
- muvera_python-0.2.0/scripts/generate_test_fixtures.py +138 -0
- muvera_python-0.2.0/src/fill_empty.rs +147 -0
- muvera_python-0.2.0/src/gray_code.rs +47 -0
- muvera_python-0.2.0/src/lib.rs +109 -0
- muvera_python-0.2.0/src/partition.rs +56 -0
- muvera_python-0.2.0/src/scatter.rs +53 -0
- muvera_python-0.2.0/tests/__init__.py +0 -0
- muvera_python-0.2.0/tests/fixtures/colbert_nanobeir/documents.npz +0 -0
- muvera_python-0.2.0/tests/fixtures/colbert_nanobeir/qrels.json +47 -0
- muvera_python-0.2.0/tests/fixtures/colbert_nanobeir/queries.npz +0 -0
- muvera_python-0.2.0/tests/test_helper.py +172 -0
- muvera_python-0.2.0/tests/test_muvera.py +241 -0
- muvera_python-0.2.0/tests/test_real_colbert.py +218 -0
- muvera_python-0.2.0/tests/test_reference.py +279 -0
- muvera_python-0.2.0/tests/test_rust_equivalence.py +214 -0
- {muvera_python-0.1.3 → muvera_python-0.2.0}/LICENSE +0 -0
- /muvera_python-0.1.3/muvera/py.typed → /muvera_python-0.2.0/benchmarks/results/.gitkeep +0 -0
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
# Release pipeline, triggered by every push to main.
#
# Flow: check-version -> test -> build-wheels + build-sdist -> publish.
# A release happens only when pyproject.toml carries a version for which no
# `v<version>` git tag exists yet; otherwise every downstream job is skipped.
name: Publish to PyPI

on:
  push:
    branches: [main]

jobs:
  # Decides whether this push should produce a release and exposes the
  # version string to the downstream jobs.
  check-version:
    runs-on: ubuntu-latest
    outputs:
      should_release: ${{ steps.check.outputs.should_release }}
      version: ${{ steps.check.outputs.version }}
    steps:
      - uses: actions/checkout@v4
        with:
          # Full history so that existing release tags are available locally.
          fetch-depth: 0

      - name: Check if version tag already exists
        id: check
        run: |
          VERSION=$(grep -m1 '^version = ' pyproject.toml | cut -d'"' -f2)
          # An empty version would make the tag lookup below fail and be
          # misread as "new version, release it" -- stop instead.
          if [ -z "$VERSION" ]; then
            echo "::error::Could not read version from pyproject.toml"
            exit 1
          fi
          echo "version=$VERSION" >> "$GITHUB_OUTPUT"
          # Look up the tag ref explicitly so that a branch or other ref
          # that happens to be named v$VERSION is not mistaken for a release tag.
          if git rev-parse -q --verify "refs/tags/v$VERSION" >/dev/null 2>&1; then
            echo "Tag v$VERSION already exists, skipping release"
            echo "should_release=false" >> "$GITHUB_OUTPUT"
          else
            echo "Tag v$VERSION does not exist, proceeding with release"
            echo "should_release=true" >> "$GITHUB_OUTPUT"
          fi

  # Runs the test suite once (Python 3.11) as a gate before building.
  test:
    needs: check-version
    if: needs.check-version.outputs.should_release == 'true'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Install Rust toolchain
        uses: dtolnay/rust-toolchain@stable

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install ".[dev]"

      - name: Run tests
        run: pytest -v

  # Builds binary wheels for every os/target pair in the matrix.
  build-wheels:
    needs: [check-version, test]
    if: needs.check-version.outputs.should_release == 'true'
    strategy:
      # Let every platform finish so that all build failures are visible at once.
      fail-fast: false
      matrix:
        include:
          - os: ubuntu-latest
            target: x86_64
          - os: ubuntu-latest
            target: aarch64
          - os: macos-latest
            target: x86_64
          - os: macos-latest
            target: aarch64
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v4

      - name: Build wheels
        uses: PyO3/maturin-action@v1
        with:
          target: ${{ matrix.target }}
          args: --release --out dist --interpreter 3.9 3.10 3.11 3.12 3.13
          manylinux: auto

      - name: Upload wheels
        uses: actions/upload-artifact@v4
        with:
          name: wheels-${{ matrix.os }}-${{ matrix.target }}
          path: dist

  # Builds the source distribution.
  build-sdist:
    needs: [check-version, test]
    if: needs.check-version.outputs.should_release == 'true'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Build sdist
        uses: PyO3/maturin-action@v1
        with:
          command: sdist
          args: --out dist

      - name: Upload sdist
        uses: actions/upload-artifact@v4
        with:
          # Shares the "wheels-" prefix so the publish job's download pattern picks it up.
          name: wheels-sdist
          path: dist

  # Uploads all artifacts to PyPI, then tags the commit and creates the GitHub Release.
  publish:
    needs: [check-version, build-wheels, build-sdist]
    # No always(): the implicit success() check requires that BOTH build jobs
    # succeeded. A PyPI upload cannot be undone, so a release with missing
    # wheels or a missing sdist must not go out.
    if: needs.check-version.outputs.should_release == 'true'
    runs-on: ubuntu-latest
    permissions:
      id-token: write # OIDC trusted publishing to PyPI
      contents: write # push the tag and create the GitHub Release

    steps:
      - uses: actions/checkout@v4

      - name: Download all artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: wheels-*
          merge-multiple: true
          path: dist

      - name: Publish to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1

      # Tag only after the upload succeeded: check-version skips the release
      # whenever the tag exists, so tagging first would turn a failed upload
      # into a version that can never be released by a re-run.
      - name: Create version tag
        env:
          # Passed through the environment rather than interpolated into the
          # script text, so the value is never evaluated as shell code.
          VERSION: ${{ needs.check-version.outputs.version }}
        run: |
          git tag "v$VERSION"
          git push origin "v$VERSION"

      - name: Create GitHub Release
        uses: softprops/action-gh-release@v2
        with:
          tag_name: v${{ needs.check-version.outputs.version }}
          files: dist/*
          generate_release_notes: true
          body: |
            ## Installation
            ```bash
            pip install muvera-python==${{ needs.check-version.outputs.version }}
            ```

            See [PyPI](https://pypi.org/project/muvera-python/${{ needs.check-version.outputs.version }}/) for full package details.
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Continuous integration: lint, format check, type check and tests.
# Runs on pushes to main and on pull requests that target main.
name: Tests

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Versions are quoted so that "3.10" is not parsed as the float 3.1.
        python-version:
          - "3.9"
          - "3.10"
          - "3.11"
          - "3.12"
          - "3.13"

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      # Installed before the pip step below, which installs the package itself.
      - name: Install Rust toolchain
        uses: dtolnay/rust-toolchain@stable

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install ".[dev]"

      - name: Run ruff (lint)
        run: ruff check .

      - name: Run ruff (format check)
        run: ruff format --check .

      - name: Run mypy
        run: mypy muvera

      - name: Run pytest
        run: pytest -v --tb=short
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# pre-commit hooks: ruff (lint + format), mypy on the muvera package,
# and the local pytest suite.
repos:
  # Linting (with auto-fix) and formatting.
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.9.7
    hooks:
      - id: ruff
        args:
          - "--fix"
          - "--exit-non-zero-on-fix"
      - id: ruff-format

  # Static type checking, restricted to files under muvera/.
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.13.0
    hooks:
      - id: mypy
        additional_dependencies:
          - "numpy>=1.22.0"
        files: "^muvera/"

  # Test suite, run with the pytest found on the system PATH.
  - repo: local
    hooks:
      - id: pytest
        name: pytest
        entry: pytest
        language: system
        # Always run the whole suite instead of passing the changed files.
        pass_filenames: false
        always_run: true
        args:
          - "tests/"
          - "-q"
|
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Project Overview
|
|
6
|
+
|
|
7
|
+
MuVERA (Multi-Vector Retrieval via Fixed Dimensional Encoding Algorithm) is a Python library that converts multi-vector embeddings (point clouds) into fixed-dimensional single vectors. This enables using existing single-vector search infrastructure (MIPS, ANN) without modification.
|
|
8
|
+
|
|
9
|
+
### Project Goals
|
|
10
|
+
|
|
11
|
+
**Simplicity First**: This project pursues a simple, intuitive interface to make MuVERA easy to use. Unlike the reference implementation, which exposes low-level config objects and separate functions for queries vs. documents, this library wraps everything behind a single `Muvera` class:
|
|
12
|
+
- `encode_documents()` - encodes document embeddings using AVERAGE method
|
|
13
|
+
- `encode_queries()` - encodes query embeddings using SUM method
|
|
14
|
+
|
|
15
|
+
No config dataclasses, no encoding-type enums, no manual seed juggling. Just NumPy arrays in, NumPy arrays out.
|
|
16
|
+
|
|
17
|
+
**Distribution Plan**: The library will be published to PyPI for easy installation via `pip install muvera-python`.
|
|
18
|
+
|
|
19
|
+
**Key use case**: Efficiently encode ColBERT-style multi-vector embeddings for retrieval without specialized infrastructure.
|
|
20
|
+
|
|
21
|
+
## Git & PR Conventions
|
|
22
|
+
- **Do NOT** add `Co-Authored-By` lines to commit messages.
|
|
23
|
+
- **Do NOT** add "Generated with Claude Code" or similar attribution to PR descriptions.
|
|
24
|
+
|
|
25
|
+
## Development Commands
|
|
26
|
+
|
|
27
|
+
### Environment Activation
|
|
28
|
+
**IMPORTANT**: Always activate the virtual environment before running any commands:
|
|
29
|
+
```bash
|
|
30
|
+
source .venv/bin/activate
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
### Post-Code-Writing Checklist
|
|
34
|
+
**IMPORTANT**: After writing or modifying any code, always run:
|
|
35
|
+
```bash
|
|
36
|
+
ruff check . # Lint check (must pass)
|
|
37
|
+
pytest # Tests (must pass)
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### Setup
|
|
41
|
+
```bash
|
|
42
|
+
pip install maturin # Required for building Rust extension
|
|
43
|
+
maturin develop --release # Build Rust extension (needs Rust toolchain)
|
|
44
|
+
pip install -e ".[dev]" # Install with dev dependencies
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### Testing
|
|
48
|
+
```bash
|
|
49
|
+
pytest # Run all tests
|
|
50
|
+
pytest tests/test_muvera.py # Run specific test file
|
|
51
|
+
pytest -v --tb=short # Verbose output with short traceback
|
|
52
|
+
pytest -k test_name # Run specific test by name
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Code Quality
|
|
56
|
+
```bash
|
|
57
|
+
ruff check . # Lint all files
|
|
58
|
+
ruff check --fix . # Lint and auto-fix issues
|
|
59
|
+
ruff format . # Format code
|
|
60
|
+
mypy muvera # Type checking
|
|
61
|
+
pre-commit run --all-files # Run all pre-commit hooks
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Running Examples
|
|
65
|
+
```bash
|
|
66
|
+
python examples/basic_usage.py
|
|
67
|
+
python examples/colbert_nanobeir.py
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Deployment
|
|
71
|
+
|
|
72
|
+
**Version bump → merge → auto-release:**
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
# 1. Update version in pyproject.toml
|
|
76
|
+
# version = "0.2.0"
|
|
77
|
+
|
|
78
|
+
# 2. Create PR and merge to main
|
|
79
|
+
# GitHub Actions will automatically:
|
|
80
|
+
# - Detect the new version (tag doesn't exist yet)
|
|
81
|
+
# - Run full test suite
|
|
82
|
+
# - Create git tag v0.2.0
|
|
83
|
+
# - Build wheel and sdist
|
|
84
|
+
# - Publish to PyPI via OIDC
|
|
85
|
+
# - Create GitHub Release
|
|
86
|
+
#
|
|
87
|
+
# If version is unchanged, all release steps are skipped.
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
**Local build (for testing):**
|
|
91
|
+
```bash
|
|
92
|
+
maturin build --release # Build wheel with Rust extension
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## Architecture
|
|
96
|
+
|
|
97
|
+
### Core Components
|
|
98
|
+
|
|
99
|
+
**`muvera/muvera.py`** - Main `Muvera` class implementing Fixed Dimensional Encoding (FDE)
|
|
100
|
+
- Two encoding paths: single document, variable-length batch
|
|
101
|
+
- Document encoding uses AVERAGE aggregation within partitions
|
|
102
|
+
- Query encoding uses SUM aggregation within partitions
|
|
103
|
+
- Optional final dimensionality reduction via Count Sketch
|
|
104
|
+
- Hot-path methods (`_aggregate_single`, `_scatter_add`, `_fill_empty_batch`) delegate to Rust kernels when available
|
|
105
|
+
|
|
106
|
+
**`muvera/helper.py`** - Low-level utilities (not public API)
|
|
107
|
+
- Gray code manipulation for partition indexing
|
|
108
|
+
- Random projection matrices (SimHash, AMS Sketch, Count Sketch)
|
|
109
|
+
- Vectorized batch partition indexing
|
|
110
|
+
- `partition_index_gray` and `partition_indices_gray_batch` delegate to Rust when available
|
|
111
|
+
|
|
112
|
+
**`src/`** - Rust extension module (`muvera._rust_kernels`) via PyO3/maturin
|
|
113
|
+
- `gray_code.rs` — Gray code append and binary conversion
|
|
114
|
+
- `partition.rs` — Single and batch Gray-code partition indexing
|
|
115
|
+
- `scatter.rs` — Scatter-add kernel for batch aggregation
|
|
116
|
+
- `fill_empty.rs` — Single-point-cloud aggregation + batch empty partition filling
|
|
117
|
+
- `lib.rs` — PyO3 module definition exposing 5 functions
|
|
118
|
+
|
|
119
|
+
**`muvera/_rust_kernels.pyi`** - Type stubs for the Rust extension module
|
|
120
|
+
|
|
121
|
+
### Algorithm Flow
|
|
122
|
+
|
|
123
|
+
1. **SimHash Projection**: Maps each vector to a partition using random Gaussian projections
|
|
124
|
+
2. **Partition Assignment**: Uses Gray code to assign vectors to one of `2^num_simhash_projections` partitions
|
|
125
|
+
3. **Inner Projection**: Optionally reduces dimension via AMS Sketch (or uses identity)
|
|
126
|
+
4. **Aggregation**:
|
|
127
|
+
- Documents: compute centroid (average) of vectors in each partition
|
|
128
|
+
- Queries: compute sum of vectors in each partition
|
|
129
|
+
5. **Empty Partition Filling** (documents only): Fill empty partitions with nearest vector by Hamming distance
|
|
130
|
+
6. **Repetitions**: Repeat steps 1-5 with different random seeds, concatenating results
|
|
131
|
+
7. **Final Projection** (optional): Apply Count Sketch to reduce final dimension
|
|
132
|
+
|
|
133
|
+
### Rust Acceleration
|
|
134
|
+
|
|
135
|
+
Performance-critical inner loops are implemented in Rust via PyO3, with automatic fallback to pure Python:
|
|
136
|
+
|
|
137
|
+
```python
|
|
138
|
+
# muvera/__init__.py
|
|
139
|
+
try:
|
|
140
|
+
import muvera._rust_kernels
|
|
141
|
+
_RUST_AVAILABLE = True
|
|
142
|
+
except ImportError:
|
|
143
|
+
_RUST_AVAILABLE = False
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
**Accelerated functions:**
|
|
147
|
+
| Rust function | Python fallback | Speedup |
|
|
148
|
+
|---|---|---|
|
|
149
|
+
| `aggregate_single` | `Muvera._aggregate_single_python` | 8-17x (single doc) |
|
|
150
|
+
| `scatter_add_partitions` | `Muvera._scatter_add` (np.add.at loop) | 1-2.5x (batch) |
|
|
151
|
+
| `fill_empty_partitions_batch` | `Muvera._fill_empty_batch` (Python loop) | 1-2.5x (batch) |
|
|
152
|
+
| `partition_index_gray` | `helper._partition_index_gray_python` | part of aggregate |
|
|
153
|
+
| `partition_indices_gray_batch` | `helper._partition_indices_gray_batch_python` | part of batch |
|
|
154
|
+
|
|
155
|
+
**What is NOT in Rust** (intentionally kept in NumPy for seed compatibility):
|
|
156
|
+
- `simhash_matrix_from_seed`, `ams_projection_matrix_from_seed` — depend on `np.random.default_rng`
|
|
157
|
+
- `count_sketch_vector_from_seed` — same reason
|
|
158
|
+
- `Muvera.__init__`, public API signatures — 100% unchanged
|
|
159
|
+
|
|
160
|
+
### Batch Processing
|
|
161
|
+
|
|
162
|
+
The library supports two input formats:
|
|
163
|
+
- **Single**: `(num_vectors, dimension)` - processes one point cloud
|
|
164
|
+
- **Variable-length batch**: `list[np.ndarray]` - each point cloud has different length (recommended for real-world data)
|
|
165
|
+
|
|
166
|
+
Variable-length batch processing flattens all point clouds, processes them together, then aggregates per-document using Rust `scatter_add_partitions` (or `np.add.at()` fallback).
|
|
167
|
+
|
|
168
|
+
## Code Conventions
|
|
169
|
+
|
|
170
|
+
### Python
|
|
171
|
+
- NumPy-style docstrings (configured in pyproject.toml)
|
|
172
|
+
- Type hints required (use `|` for unions; on Python 3.9 this only works in annotations and requires `from __future__ import annotations`)
|
|
173
|
+
- Line length: 100 characters
|
|
174
|
+
- Use `np.float32` for all embeddings (memory efficiency)
|
|
175
|
+
- Use `np.uint32` for partition indices
|
|
176
|
+
- Random number generation via `np.random.default_rng(seed)` for reproducibility
|
|
177
|
+
|
|
178
|
+
### Rust
|
|
179
|
+
- Edition 2021
|
|
180
|
+
- Dependencies: `pyo3` 0.23, `numpy` 0.23 (Rust crate, not Python package), `ndarray` 0.16
|
|
181
|
+
- All three crates are version-locked together (upgrade all at once)
|
|
182
|
+
- Use `f32` for all floating-point data, `u32` for partition indices, `i32` for counts, `i64` for boundaries
|
|
183
|
+
- PyO3 functions accept `PyReadonlyArray*` for input arrays and `&Bound<PyArray*>` for in-place mutation
|
|
184
|
+
|
|
185
|
+
## Testing
|
|
186
|
+
|
|
187
|
+
### Test Organization
|
|
188
|
+
|
|
189
|
+
- **`test_helper.py`**: Low-level helper function tests (Gray code, projections, etc.)
|
|
190
|
+
- **`test_muvera.py`**: Core Muvera class tests (shapes, validation, reproducibility)
|
|
191
|
+
- **`test_reference.py`**: Validation against reference implementation (sionic-ai/muvera-py)
|
|
192
|
+
- **`test_real_colbert.py`**: Real-world ColBERT embedding tests using NanoBEIR fixtures
|
|
193
|
+
- **`test_rust_equivalence.py`**: Numerical equivalence tests between Rust kernels and Python fallbacks (skipped if Rust extension is unavailable)
|
|
194
|
+
|
|
195
|
+
### Real Data Testing
|
|
196
|
+
|
|
197
|
+
`test_real_colbert.py` uses cached ColBERT embeddings from NanoBEIR to validate performance on real data:
|
|
198
|
+
- **Fixtures**: 35 documents, 5 queries, 35 relevance judgments (~2.2MB cached in git)
|
|
199
|
+
- **Tests**: FDE encoding, correlation with native MaxSim, Recall@K metrics
|
|
200
|
+
- **Generation**: `python scripts/generate_test_fixtures.py` (requires pylate, datasets, torch)
|
|
201
|
+
|
|
202
|
+
Fixtures are cached in `tests/fixtures/colbert_nanobeir/` to avoid slow model inference during CI/testing.
|
|
203
|
+
|
|
204
|
+
## Key Parameters
|
|
205
|
+
|
|
206
|
+
- `num_repetitions`: Controls accuracy/dimension trade-off (default: 20)
|
|
207
|
+
- `num_simhash_projections`: Determines partition count as `2^n` (default: 5, range: [0, 31))
|
|
208
|
+
- `fill_empty_partitions`: Whether to fill empty partitions with nearest vector (default: True, documents only)
|
|
209
|
+
- `projection_type`: "identity" or "ams_sketch" for dimensionality reduction
|
|
210
|
+
- `final_projection_dimension`: Optional Count Sketch final projection
|
|
211
|
+
|
|
212
|
+
Output dimension: `num_repetitions * 2^num_simhash_projections * projection_dimension` (or `final_projection_dimension` if set)
|
|
213
|
+
|
|
214
|
+
## CI/CD and Deployment
|
|
215
|
+
|
|
216
|
+
### GitHub Actions Workflows
|
|
217
|
+
|
|
218
|
+
**`.github/workflows/test.yml`** - Continuous Integration
|
|
219
|
+
- Triggers: Push to main, pull requests targeting main
|
|
220
|
+
- Tests across Python 3.9-3.13
|
|
221
|
+
- Installs Rust toolchain via `dtolnay/rust-toolchain@stable`
|
|
222
|
+
- Builds Rust extension via `pip install ".[dev]"` (maturin build backend)
|
|
223
|
+
- Runs ruff (lint + format check), mypy (type checking), pytest
|
|
224
|
+
|
|
225
|
+
**`.github/workflows/publish.yml`** - PyPI Publishing
|
|
226
|
+
- Triggers: Push to main
|
|
227
|
+
- Checks if `v{version}` tag already exists; skips release if it does
|
|
228
|
+
- Runs full test suite
|
|
229
|
+
- Builds cross-platform wheels via `PyO3/maturin-action@v1` (Linux x86_64/aarch64, macOS x86_64/aarch64)
|
|
230
|
+
- Builds sdist separately
|
|
231
|
+
- Creates git tag, publishes to PyPI via OIDC, creates GitHub Release
|
|
232
|
+
|
|
233
|
+
### Deployment Policy
|
|
234
|
+
|
|
235
|
+
**Auto-release on version bump:**
|
|
236
|
+
- Merging a PR that changes the version in `pyproject.toml` triggers a release
|
|
237
|
+
- If the version is unchanged, all release steps are skipped
|
|
238
|
+
- Uses OpenID Connect (OIDC) for secure, token-free authentication to PyPI
|
|
239
|
+
|
|
240
|
+
**Pre-deployment checklist:**
|
|
241
|
+
1. Update `version` in `pyproject.toml`
|
|
242
|
+
2. Run tests locally: `pytest`
|
|
243
|
+
3. Check code quality: `ruff check . && mypy muvera`
|
|
244
|
+
4. Create PR and merge to main
|
|
245
|
+
|
|
246
|
+
**OIDC Setup (one-time):**
|
|
247
|
+
1. Go to [PyPI](https://pypi.org) → Account settings → Publishing
|
|
248
|
+
2. Add a new pending publisher:
|
|
249
|
+
- PyPI Project Name: `muvera-python`
|
|
250
|
+
- Owner: `craftsangjae`
|
|
251
|
+
- Repository: `muvera-python`
|
|
252
|
+
- Workflow: `publish.yml`
|
|
253
|
+
- Environment: (leave empty)
|