zvec-db 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zvec_db-0.3.0/PKG-INFO +720 -0
- zvec_db-0.3.0/README.md +668 -0
- zvec_db-0.3.0/pyproject.toml +76 -0
- zvec_db-0.3.0/setup.cfg +36 -0
- zvec_db-0.3.0/tests/test_defaults.py +158 -0
- zvec_db-0.3.0/tests/test_integration.py +326 -0
- zvec_db-0.3.0/tests/test_zvec_compat.py +502 -0
- zvec_db-0.3.0/zvec_db/__init__.py +104 -0
- zvec_db-0.3.0/zvec_db/embedders/__init__.py +36 -0
- zvec_db-0.3.0/zvec_db/embedders/base.py +651 -0
- zvec_db-0.3.0/zvec_db/embedders/defaults.py +214 -0
- zvec_db-0.3.0/zvec_db/embedders/dense/__init__.py +11 -0
- zvec_db-0.3.0/zvec_db/embedders/dense/embedders.py +182 -0
- zvec_db-0.3.0/zvec_db/embedders/dense/openai.py +467 -0
- zvec_db-0.3.0/zvec_db/embedders/dense/sentence_transformers.py +319 -0
- zvec_db-0.3.0/zvec_db/embedders/sparse/__init__.py +17 -0
- zvec_db-0.3.0/zvec_db/embedders/sparse/bm25.py +251 -0
- zvec_db-0.3.0/zvec_db/embedders/sparse/bm25l.py +261 -0
- zvec_db-0.3.0/zvec_db/embedders/sparse/bm25plus.py +295 -0
- zvec_db-0.3.0/zvec_db/embedders/sparse/count.py +111 -0
- zvec_db-0.3.0/zvec_db/embedders/sparse/dismax.py +361 -0
- zvec_db-0.3.0/zvec_db/embedders/sparse/tfidf.py +117 -0
- zvec_db-0.3.0/zvec_db/evaluation/__init__.py +71 -0
- zvec_db-0.3.0/zvec_db/evaluation/metrics.py +469 -0
- zvec_db-0.3.0/zvec_db/preprocessing/__init__.py +90 -0
- zvec_db-0.3.0/zvec_db/preprocessing/config.py +350 -0
- zvec_db-0.3.0/zvec_db/preprocessing/normalization.py +79 -0
- zvec_db-0.3.0/zvec_db/preprocessing/stemming.py +99 -0
- zvec_db-0.3.0/zvec_db/preprocessing/stopwords.py +303 -0
- zvec_db-0.3.0/zvec_db/preprocessing/tokenization.py +46 -0
- zvec_db-0.3.0/zvec_db/rerankers/__init__.py +70 -0
- zvec_db-0.3.0/zvec_db/rerankers/base.py +188 -0
- zvec_db-0.3.0/zvec_db/rerankers/cross_encoder/__init__.py +87 -0
- zvec_db-0.3.0/zvec_db/rerankers/cross_encoder/base.py +200 -0
- zvec_db-0.3.0/zvec_db/rerankers/cross_encoder/classification.py +318 -0
- zvec_db-0.3.0/zvec_db/rerankers/cross_encoder/openai.py +259 -0
- zvec_db-0.3.0/zvec_db/rerankers/cross_encoder/openai_decoder.py +342 -0
- zvec_db-0.3.0/zvec_db/rerankers/cross_encoder/openai_encoder.py +244 -0
- zvec_db-0.3.0/zvec_db/rerankers/cross_encoder/sentence_transformer.py +230 -0
- zvec_db-0.3.0/zvec_db/rerankers/defaults.py +177 -0
- zvec_db-0.3.0/zvec_db/rerankers/diversification/__init__.py +7 -0
- zvec_db-0.3.0/zvec_db/rerankers/diversification/submodular.py +279 -0
- zvec_db-0.3.0/zvec_db/rerankers/fusion/__init__.py +17 -0
- zvec_db-0.3.0/zvec_db/rerankers/fusion/hybrid_fusion.py +267 -0
- zvec_db-0.3.0/zvec_db/rerankers/fusion/multi_field.py +290 -0
- zvec_db-0.3.0/zvec_db/rerankers/fusion/rrf.py +260 -0
- zvec_db-0.3.0/zvec_db/rerankers/fusion/weighted.py +396 -0
- zvec_db-0.3.0/zvec_db/rerankers/utils/__init__.py +13 -0
- zvec_db-0.3.0/zvec_db/rerankers/utils/base_utils.py +108 -0
- zvec_db-0.3.0/zvec_db/rerankers/utils/normalize.py +351 -0
- zvec_db-0.3.0/zvec_db/rerankers/utils/pipeline.py +127 -0
- zvec_db-0.3.0/zvec_db.egg-info/PKG-INFO +720 -0
- zvec_db-0.3.0/zvec_db.egg-info/SOURCES.txt +55 -0
- zvec_db-0.3.0/zvec_db.egg-info/dependency_links.txt +1 -0
- zvec_db-0.3.0/zvec_db.egg-info/requires.txt +36 -0
- zvec_db-0.3.0/zvec_db.egg-info/top_level.txt +1 -0
zvec_db-0.3.0/PKG-INFO
ADDED
|
@@ -0,0 +1,720 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: zvec-db
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Suite d'utilitaires pour la vectorisation sparse et le re-ranking de documents
|
|
5
|
+
Author: Charles Condevaux
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/ccdv-ai/zvec-db
|
|
8
|
+
Project-URL: Repository, https://github.com/ccdv-ai/zvec-db.git
|
|
9
|
+
Project-URL: Issues, https://github.com/ccdv-ai/zvec-db/issues
|
|
10
|
+
Keywords: search,ranking,BM25,TF-IDF,sparse,vectors,embeddings
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Requires-Python: >=3.12
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
Requires-Dist: zvec
|
|
22
|
+
Requires-Dist: scikit-learn
|
|
23
|
+
Requires-Dist: numpy
|
|
24
|
+
Requires-Dist: scipy
|
|
25
|
+
Requires-Dist: httpx
|
|
26
|
+
Requires-Dist: requests
|
|
27
|
+
Requires-Dist: sentence_transformers
|
|
28
|
+
Requires-Dist: openai
|
|
29
|
+
Requires-Dist: aiohttp>=3.9.0
|
|
30
|
+
Requires-Dist: cloudpickle
|
|
31
|
+
Provides-Extra: test
|
|
32
|
+
Requires-Dist: pytest>=7.0.0; extra == "test"
|
|
33
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "test"
|
|
34
|
+
Requires-Dist: pytest-asyncio>=0.23.0; extra == "test"
|
|
35
|
+
Requires-Dist: nltk>=3.8.0; extra == "test"
|
|
36
|
+
Provides-Extra: preprocessing
|
|
37
|
+
Requires-Dist: nltk>=3.8.0; extra == "preprocessing"
|
|
38
|
+
Provides-Extra: docs
|
|
39
|
+
Requires-Dist: sphinx>=7.0.0; extra == "docs"
|
|
40
|
+
Requires-Dist: sphinx-rtd-theme>=2.0.0; extra == "docs"
|
|
41
|
+
Requires-Dist: sphinx-math-dollar>=1.2.0; extra == "docs"
|
|
42
|
+
Provides-Extra: dev
|
|
43
|
+
Requires-Dist: pre-commit; extra == "dev"
|
|
44
|
+
Requires-Dist: black>=25.9.0; extra == "dev"
|
|
45
|
+
Requires-Dist: isort; extra == "dev"
|
|
46
|
+
Requires-Dist: flake8; extra == "dev"
|
|
47
|
+
Requires-Dist: mypy; extra == "dev"
|
|
48
|
+
Provides-Extra: build
|
|
49
|
+
Requires-Dist: build; extra == "build"
|
|
50
|
+
Requires-Dist: twine; extra == "build"
|
|
51
|
+
Requires-Dist: wheel; extra == "build"
|
|
52
|
+
|
|
53
|
+
# zvec-db
|
|
54
|
+
|
|
55
|
+
[PyPI](https://pypi.org/project/zvec-db/)
|
|
56
|
+
[Python](https://www.python.org/downloads/)
|
|
57
|
+
[License: MIT](https://github.com/ccdv-ai/zvec-db/blob/main/LICENSE)
|
|
58
|
+
|
|
59
|
+
Utility suite for sparse/dense vectorization and document re-ranking, designed to work with [zvec](https://github.com/ccdv-ai/zvec).
|
|
60
|
+
|
|
61
|
+
## Table of Contents
|
|
62
|
+
|
|
63
|
+
- [Installation](#installation)
|
|
64
|
+
- [Quick Start](#quick-start)
|
|
65
|
+
- [Sparse Embedders](#sparse-embedders)
|
|
66
|
+
- [Dense Embedders](#dense-embedders)
|
|
67
|
+
- [Re-ranking](#re-ranking)
|
|
68
|
+
- [Preprocessing](#preprocessing)
|
|
69
|
+
- [Model Persistence](#model-persistence)
|
|
70
|
+
- [Evaluation](#evaluation)
|
|
71
|
+
- [Complete Example: Hybrid Search Pipeline](#complete-example-hybrid-search-pipeline)
|
|
72
|
+
- [License](#license)
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## Installation
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
pip install zvec-db
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
**Optional dependencies:**
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
# For preprocessing (stemming, stopwords)
|
|
86
|
+
pip install "zvec-db[preprocessing]"
|
|
87
|
+
|
|
88
|
+
# For development
|
|
89
|
+
pip install "zvec-db[dev,test,docs]"
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
## Quick Start
|
|
95
|
+
|
|
96
|
+
### Hybrid search with zvec (recommended)
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
import zvec
|
|
100
|
+
from zvec_db.embedders import BM25Embedder, OpenAIEmbedder
|
|
101
|
+
from zvec_db.rerankers import NormalizedWeightedReRanker
|
|
102
|
+
|
|
103
|
+
# 1. Create embedders
|
|
104
|
+
bm25 = BM25Embedder(max_features=4096)
|
|
105
|
+
bm25.fit(documents)
|
|
106
|
+
|
|
107
|
+
dense = OpenAIEmbedder(base_url="http://localhost:9300/v1", model="embedding")
|
|
108
|
+
|
|
109
|
+
# 2. Create collection
|
|
110
|
+
schema = zvec.CollectionSchema(
|
|
111
|
+
name="docs",
|
|
112
|
+
vectors=[
|
|
113
|
+
zvec.VectorSchema("sparse", zvec.DataType.SPARSE_FP32, dimension=4096),
|
|
114
|
+
zvec.VectorSchema("dense", zvec.DataType.VECTOR_FP32, dimension=1024),
|
|
115
|
+
]
|
|
116
|
+
)
|
|
117
|
+
collection = zvec.create_and_open("./my_db", schema)
|
|
118
|
+
|
|
119
|
+
# 3. Insert
|
|
120
|
+
for i, doc in enumerate(documents):
|
|
121
|
+
collection.insert(zvec.Doc(
|
|
122
|
+
id=str(i),
|
|
123
|
+
fields={"text": doc},
|
|
124
|
+
vectors={
|
|
125
|
+
"sparse": bm25.embed(doc),
|
|
126
|
+
"dense": dense.embed(doc),
|
|
127
|
+
}
|
|
128
|
+
))
|
|
129
|
+
|
|
130
|
+
# 4. Search with weighted fusion
|
|
131
|
+
# Note: metrics=None because we mix BM25 (arbitrary scores) and dense (COSINE distances)
|
|
132
|
+
results = collection.query(
|
|
133
|
+
vectors=[
|
|
134
|
+
zvec.VectorQuery(field_name="sparse", vector=bm25.embed(query)),
|
|
135
|
+
zvec.VectorQuery(field_name="dense", vector=dense.embed(query)),
|
|
136
|
+
],
|
|
137
|
+
topk=10,
|
|
138
|
+
reranker=NormalizedWeightedReRanker(
|
|
139
|
+
metrics=None, # No automatic conversion (mixed metrics)
|
|
140
|
+
weights={"sparse": 0.4, "dense": 0.6},
|
|
141
|
+
normalizer_configs={"sparse": {"method": "bayes"}},
|
|
142
|
+
),
|
|
143
|
+
)
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
---
|
|
147
|
+
|
|
148
|
+
## Sparse Embedders
|
|
149
|
+
|
|
150
|
+
All sparse embedders return dictionaries `{index: score, ...}` compatible with zvec's `SPARSE_FP32` format.
|
|
151
|
+
|
|
152
|
+
### BM25Embedder (recommended)
|
|
153
|
+
|
|
154
|
+
Standard BM25 scoring - best for general use cases.
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
from zvec_db.embedders import BM25Embedder
|
|
158
|
+
from zvec_db.preprocessing import NormalizationConfig
|
|
159
|
+
|
|
160
|
+
# With automatic preprocessing
|
|
161
|
+
config = NormalizationConfig.aggressive(language="french")
|
|
162
|
+
bm25 = BM25Embedder(
|
|
163
|
+
max_features=4096,
|
|
164
|
+
k1=1.2, # Term frequency saturation (default: 1.2)
|
|
165
|
+
b=0.75, # Length normalization (default: 0.75)
|
|
166
|
+
preprocessing_config=config
|
|
167
|
+
)
|
|
168
|
+
bm25.fit(documents)
|
|
169
|
+
|
|
170
|
+
vector = bm25.embed("search query") # {index: score, ...}
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
### Other sparse embedders
|
|
174
|
+
|
|
175
|
+
| Embedder | Use case |
|
|
176
|
+
|----------|----------|
|
|
177
|
+
| `TfidfEmbedder` | TF-IDF weighting with sublinear TF option |
|
|
178
|
+
| `CountEmbedder` | Simple term counts (binary option available) |
|
|
179
|
+
| `BM25LEmbedder` | Documents with variable lengths |
|
|
180
|
+
| `BM25PlusEmbedder` | Avoid zero scores with delta smoothing |
|
|
181
|
+
| `DisMaxEmbedder` | Multi-field search (takes maximum score) |
|
|
182
|
+
|
|
183
|
+
```python
|
|
184
|
+
from zvec_db.embedders import TfidfEmbedder, CountEmbedder, DisMaxEmbedder
|
|
185
|
+
|
|
186
|
+
tfidf = TfidfEmbedder(max_features=4096, sublinear_tf=True)
|
|
187
|
+
count = CountEmbedder(max_features=4096, binary=True)
|
|
188
|
+
dismax = DisMaxEmbedder(tie_breaker=0.1)
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
---
|
|
192
|
+
|
|
193
|
+
## Dense Embedders
|
|
194
|
+
|
|
195
|
+
### OpenAIEmbedder (API / vLLM)
|
|
196
|
+
|
|
197
|
+
Works with OpenAI API or compatible endpoints (vLLM, local servers).
|
|
198
|
+
|
|
199
|
+
```python
|
|
200
|
+
from zvec_db.embedders import OpenAIEmbedder
|
|
201
|
+
|
|
202
|
+
# OpenAI API
|
|
203
|
+
embedder = OpenAIEmbedder(model="text-embedding-3-small", api_key="sk-...")
|
|
204
|
+
|
|
205
|
+
# Local vLLM
|
|
206
|
+
embedder = OpenAIEmbedder(
|
|
207
|
+
base_url="http://localhost:9300/v1",
|
|
208
|
+
model="embedding",
|
|
209
|
+
max_batch_size=32,
|
|
210
|
+
)
|
|
211
|
+
vector = embedder.embed("search query")
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
### SentenceTransformersEmbedder (local)
|
|
215
|
+
|
|
216
|
+
Run embedding models locally using sentence-transformers.
|
|
217
|
+
|
|
218
|
+
```python
|
|
219
|
+
from zvec_db.embedders import SentenceTransformersEmbedder
|
|
220
|
+
|
|
221
|
+
embedder = SentenceTransformersEmbedder(
|
|
222
|
+
model_name="all-MiniLM-L6-v2", # 384 dims, fast
|
|
223
|
+
device="cpu",
|
|
224
|
+
normalize=True,
|
|
225
|
+
)
|
|
226
|
+
vector = embedder.embed("search query")
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
---
|
|
230
|
+
|
|
231
|
+
## Re-ranking
|
|
232
|
+
|
|
233
|
+
### Understanding distance/similarity metrics
|
|
234
|
+
|
|
235
|
+
**Problem**: Vector databases store **distances** (smaller = more similar), but fusion algorithms assume **similarities** (larger = more relevant).
|
|
236
|
+
|
|
237
|
+
The `metrics` parameter handles conversion:
|
|
238
|
+
|
|
239
|
+
| Metric | Type | Range | Conversion | Usage |
|
|
240
|
+
|--------|------|-------|------------|-------|
|
|
241
|
+
| `COSINE` | Distance | [0, 2] | `1.0 - score/2.0` | Normalized embeddings (Qdrant, zvec) |
|
|
242
|
+
| `L2` | Distance | [0, ∞) | `1 - 2*atan(s)/π` | Euclidean distance |
|
|
243
|
+
| `IP` | Similarity | (-∞, ∞) | None | Inner product (already similarity) |
|
|
244
|
+
| `None` | - | - | None | BM25 scores or already normalized [0, 1] |
|
|
245
|
+
|
|
246
|
+
**Default**: `metrics=MetricType.COSINE` (main use case with zvec/Qdrant).
|
|
247
|
+
|
|
248
|
+
```python
|
|
249
|
+
from zvec_db.rerankers import NormalizedWeightedReRanker, MetricType
|
|
250
|
+
|
|
251
|
+
# COSINE distances from zvec/Qdrant (default)
|
|
252
|
+
reranker = NormalizedWeightedReRanker(topn=10)
|
|
253
|
+
|
|
254
|
+
# BM25 scores (not distances!)
|
|
255
|
+
reranker = NormalizedWeightedReRanker(topn=10, metrics=None)
|
|
256
|
+
|
|
257
|
+
# Hybrid: BM25 + dense with per-source normalization
|
|
258
|
+
reranker = NormalizedWeightedReRanker(
|
|
259
|
+
metrics=None, # No global conversion
|
|
260
|
+
weights={"sparse": 0.4, "dense": 0.6},
|
|
261
|
+
normalizer_configs={
|
|
262
|
+
"sparse": "bayes", # BM25: handles outliers well
|
|
263
|
+
"dense": True, # Dense: standard normalization
|
|
264
|
+
},
|
|
265
|
+
)
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
### Fusion rerankers
|
|
269
|
+
|
|
270
|
+
#### Normalizer configuration
|
|
271
|
+
|
|
272
|
+
The `normalizer_configs` parameter controls how scores are normalized per source:
|
|
273
|
+
|
|
274
|
+
| Value | Effect |
|
|
275
|
+
|-------|--------|
|
|
276
|
+
| `True` | Standard normalization (scales scores to [0, 1]) |
|
|
277
|
+
| `"bayes"`, `"bayesian"`, `"bb25"` | Bayesian sigmoid calibration (robust to outliers). These are aliases for the same method. |
|
|
278
|
+
| `{"method": "bayes", "alpha": 1.0}` | Dict with custom parameters (`alpha`, `beta`) |
|
|
279
|
+
| `None` | Skip normalization (use raw scores after metric conversion) |
|
|
280
|
+
|
|
281
|
+
**Example:**
|
|
282
|
+
|
|
283
|
+
```python
|
|
284
|
+
normalizer_configs={
|
|
285
|
+
"sparse": "bayes", # Bayesian: handles BM25 outliers well
|
|
286
|
+
    "dense": None,        # Optional: cosine similarity is already in [0, 1]
|
|
287
|
+
}
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
#### NormalizedWeightedReRanker (weighted fusion)
|
|
291
|
+
|
|
292
|
+
```python
|
|
293
|
+
from zvec_db.rerankers import NormalizedWeightedReRanker
|
|
294
|
+
|
|
295
|
+
reranker = NormalizedWeightedReRanker(
|
|
296
|
+
topn=10,
|
|
297
|
+
weights={"source1": 0.7, "source2": 0.3},
|
|
298
|
+
normalizer_configs={"source1": "bayes", "source2": True},
|
|
299
|
+
)
|
|
300
|
+
|
|
301
|
+
results = collection.query(vectors=[...], topk=20, reranker=reranker)
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
#### Using `schema` parameter (auto-detect metrics from collection)
|
|
305
|
+
|
|
306
|
+
When working with zvec collections, you can use the `schema` parameter to automatically infer the correct metrics for each vector field:
|
|
307
|
+
|
|
308
|
+
```python
|
|
309
|
+
import zvec
|
|
310
|
+
from zvec_db.rerankers import NormalizedWeightedReRanker
|
|
311
|
+
|
|
312
|
+
# Open existing collection
|
|
313
|
+
collection = zvec.open("./my_collection")
|
|
314
|
+
|
|
315
|
+
# Reranker auto-infers metrics from schema
|
|
316
|
+
# - SPARSE_FP32 fields -> metrics=None (BM25 scores)
|
|
317
|
+
# - VECTOR_FP32 fields with COSINE -> metrics=MetricType.COSINE
|
|
318
|
+
reranker = NormalizedWeightedReRanker(
|
|
319
|
+
topn=10,
|
|
320
|
+
metrics=None, # Will infer from schema
|
|
321
|
+
schema=collection.schema,
|
|
322
|
+
weights={"sparse": 0.4, "dense": 0.6},
|
|
323
|
+
)
|
|
324
|
+
|
|
325
|
+
# No need to manually specify metrics per source!
|
|
326
|
+
results = collection.query(
|
|
327
|
+
vectors=[
|
|
328
|
+
zvec.VectorQuery(field_name="sparse", vector=bm25.embed(query)),
|
|
329
|
+
zvec.VectorQuery(field_name="dense", vector=dense.embed(query)),
|
|
330
|
+
],
|
|
331
|
+
topk=20,
|
|
332
|
+
reranker=reranker,
|
|
333
|
+
)
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
**Manual per-source metrics (alternative):**
|
|
337
|
+
|
|
338
|
+
```python
|
|
339
|
+
from zvec_db.rerankers import NormalizedWeightedReRanker, MetricType
|
|
340
|
+
|
|
341
|
+
# Explicit per-source metrics
|
|
342
|
+
reranker = NormalizedWeightedReRanker(
|
|
343
|
+
topn=10,
|
|
344
|
+
metrics={
|
|
345
|
+
"sparse": None, # BM25 scores (not distances)
|
|
346
|
+
"dense": MetricType.COSINE, # Convert COSINE distance [0,2] -> similarity
|
|
347
|
+
},
|
|
348
|
+
weights={"sparse": 0.4, "dense": 0.6},
|
|
349
|
+
)
|
|
350
|
+
```
|
|
351
|
+
|
|
352
|
+
#### NormalizedRrfReRanker (Reciprocal Rank Fusion)
|
|
353
|
+
|
|
354
|
+
```python
|
|
355
|
+
from zvec_db.rerankers import NormalizedRrfReRanker
|
|
356
|
+
|
|
357
|
+
reranker = NormalizedRrfReRanker(topn=10, rank_constant=60)
|
|
358
|
+
results = reranker.rerank({"bm25": bm25_docs, "dense": dense_docs})
|
|
359
|
+
```
|
|
360
|
+
|
|
361
|
+
#### WeightedReRanker (scores already normalized)
|
|
362
|
+
|
|
363
|
+
Use when scores are already in [0, 1] with "higher=better" orientation.
|
|
364
|
+
|
|
365
|
+
```python
|
|
366
|
+
from zvec_db.rerankers import WeightedReRanker
|
|
367
|
+
|
|
368
|
+
reranker = WeightedReRanker(
|
|
369
|
+
topn=10,
|
|
370
|
+
weights={"source1": 0.7, "source2": 0.3},
|
|
371
|
+
)
|
|
372
|
+
```
|
|
373
|
+
|
|
374
|
+
### Default rerankers (ready-to-use)
|
|
375
|
+
|
|
376
|
+
```python
|
|
377
|
+
from zvec_db.rerankers.defaults import (
|
|
378
|
+
DefaultWeightedReranker,
|
|
379
|
+
DefaultHybridReranker,
|
|
380
|
+
DefaultRrfReranker,
|
|
381
|
+
)
|
|
382
|
+
|
|
383
|
+
# Weighted fusion with Bayesian normalization
|
|
384
|
+
reranker = DefaultWeightedReranker()
|
|
385
|
+
|
|
386
|
+
# Optimized hybrid: dense (60%) + BM25 (40%)
|
|
387
|
+
reranker = DefaultHybridReranker()
|
|
388
|
+
|
|
389
|
+
# RRF with standard parameters
|
|
390
|
+
reranker = DefaultRrfReranker()
|
|
391
|
+
|
|
392
|
+
results = reranker.rerank({"bm25": bm25_docs, "dense": dense_docs})
|
|
393
|
+
```
|
|
394
|
+
|
|
395
|
+
### Cross-Encoder rerankers
|
|
396
|
+
|
|
397
|
+
All cross-encoders require a `query` parameter at initialization.
|
|
398
|
+
|
|
399
|
+
#### SentenceTransformerReranker (local, binary)
|
|
400
|
+
|
|
401
|
+
```python
|
|
402
|
+
from zvec_db.rerankers import SentenceTransformerReranker
|
|
403
|
+
|
|
404
|
+
reranker = SentenceTransformerReranker(
|
|
405
|
+
query="machine learning",
|
|
406
|
+
model_name="cross-encoder/ms-marco-MiniLM-L-6-v2",
|
|
407
|
+
topn=10,
|
|
408
|
+
)
|
|
409
|
+
results = reranker.rerank({"bm25": docs})
|
|
410
|
+
```
|
|
411
|
+
|
|
412
|
+
#### ClassificationReranker (local, multi-class)
|
|
413
|
+
|
|
414
|
+
```python
|
|
415
|
+
from zvec_db.rerankers import ClassificationReranker
|
|
416
|
+
|
|
417
|
+
reranker = ClassificationReranker(
|
|
418
|
+
query="machine learning",
|
|
419
|
+
model_name="your-multi-class-model",
|
|
420
|
+
num_classes=5, # Auto-inferred if not specified
|
|
421
|
+
topn=10,
|
|
422
|
+
)
|
|
423
|
+
results = reranker.rerank({"bm25": docs})
|
|
424
|
+
```
|
|
425
|
+
|
|
426
|
+
#### OpenAIReranker (API)
|
|
427
|
+
|
|
428
|
+
```python
|
|
429
|
+
from zvec_db.rerankers import OpenAIReranker
|
|
430
|
+
|
|
431
|
+
reranker = OpenAIReranker(
|
|
432
|
+
query="machine learning",
|
|
433
|
+
base_url="http://localhost:9400/v1",
|
|
434
|
+
model="BAAI/bge-reranker-v2-m3",
|
|
435
|
+
endpoint="rerank", # or "score"
|
|
436
|
+
topn=10,
|
|
437
|
+
)
|
|
438
|
+
results = reranker.rerank({"bm25": docs})
|
|
439
|
+
```
|
|
440
|
+
|
|
441
|
+
### Diversification
|
|
442
|
+
|
|
443
|
+
#### SubmodularReranker (MMR)
|
|
444
|
+
|
|
445
|
+
Maximize relevance while diversifying results.
|
|
446
|
+
|
|
447
|
+
```python
|
|
448
|
+
from zvec_db.rerankers import SubmodularReranker
|
|
449
|
+
|
|
450
|
+
reranker = SubmodularReranker(
|
|
451
|
+
topn=10,
|
|
452
|
+
lambda_param=0.7, # 70% relevance, 30% diversity
|
|
453
|
+
vector_field="embedding",
|
|
454
|
+
)
|
|
455
|
+
results = reranker.rerank({"source": docs_with_vectors})
|
|
456
|
+
```
|
|
457
|
+
|
|
458
|
+
---
|
|
459
|
+
|
|
460
|
+
## Preprocessing
|
|
461
|
+
|
|
462
|
+
Preprocessing improves sparse embedding quality.
|
|
463
|
+
|
|
464
|
+
### Automatic (recommended)
|
|
465
|
+
|
|
466
|
+
```python
|
|
467
|
+
from zvec_db.embedders import BM25Embedder
|
|
468
|
+
from zvec_db.preprocessing import NormalizationConfig
|
|
469
|
+
|
|
470
|
+
config = NormalizationConfig.aggressive(language="french")
|
|
471
|
+
bm25 = BM25Embedder(max_features=4096, preprocessing_config=config)
|
|
472
|
+
bm25.fit(documents)
|
|
473
|
+
# Preprocessing is automatically applied and saved with the model
|
|
474
|
+
```
|
|
475
|
+
|
|
476
|
+
### Utility functions
|
|
477
|
+
|
|
478
|
+
```python
|
|
479
|
+
from zvec_db.preprocessing import normalize_text, stem_word, remove_stopwords
|
|
480
|
+
|
|
481
|
+
# Full pipeline
|
|
482
|
+
normalize_text(" CHAT MANGEAIT ", lowercase=True, remove_accents=True, stem=True) # "chat mang"
|
|
483
|
+
|
|
484
|
+
# Individual functions
|
|
485
|
+
stem_word("mangeaient", language="french") # "mang"
|
|
486
|
+
remove_stopwords("le chat mange", language="french") # "chat mange"
|
|
487
|
+
```
|
|
488
|
+
|
|
489
|
+
**nltk installation:**
|
|
490
|
+
|
|
491
|
+
```bash
|
|
492
|
+
pip install "zvec-db[preprocessing]"
|
|
493
|
+
```
|
|
494
|
+
|
|
495
|
+
---
|
|
496
|
+
|
|
497
|
+
## Model Persistence
|
|
498
|
+
|
|
499
|
+
```python
|
|
500
|
+
from zvec_db.embedders import BM25Embedder
|
|
501
|
+
|
|
502
|
+
# Save
|
|
503
|
+
bm25 = BM25Embedder(max_features=4096, preprocessing_config=config)
|
|
504
|
+
bm25.fit(documents)
|
|
505
|
+
bm25.save("models/bm25_model.joblib")
|
|
506
|
+
|
|
507
|
+
# Load
|
|
508
|
+
bm25_loaded = BM25Embedder()
|
|
509
|
+
bm25_loaded.load("models/bm25_model.joblib")
|
|
510
|
+
|
|
511
|
+
# Embeddings are identical (preprocessing included)
|
|
512
|
+
assert bm25.embed("query") == bm25_loaded.embed("query")
|
|
513
|
+
```
|
|
514
|
+
|
|
515
|
+
---
|
|
516
|
+
|
|
517
|
+
## Evaluation
|
|
518
|
+
|
|
519
|
+
```python
|
|
520
|
+
from zvec_db.evaluation import evaluate_ranking
|
|
521
|
+
|
|
522
|
+
# Evaluate ranking quality
|
|
523
|
+
metrics = evaluate_ranking(
|
|
524
|
+
ground_truth=[["doc1", "doc2"], ["doc3"]],
|
|
525
|
+
predictions=[["doc2", "doc1"], ["doc3", "doc4"]],
|
|
526
|
+
metrics=["ndcg", "map", "mrr", "recall"],
|
|
527
|
+
)
|
|
528
|
+
```
|
|
529
|
+
|
|
530
|
+
---
|
|
531
|
+
|
|
532
|
+
## Development
|
|
533
|
+
|
|
534
|
+
```bash
|
|
535
|
+
# Clone
|
|
536
|
+
git clone https://github.com/ccdv-ai/zvec-db.git
|
|
537
|
+
cd zvec-db
|
|
538
|
+
|
|
539
|
+
# Install with all dependencies
|
|
540
|
+
make install
|
|
541
|
+
|
|
542
|
+
# Run tests
|
|
543
|
+
make test
|
|
544
|
+
|
|
545
|
+
# Lint
|
|
546
|
+
make lint
|
|
547
|
+
|
|
548
|
+
# Build docs
|
|
549
|
+
make docs
|
|
550
|
+
```
|
|
551
|
+
|
|
552
|
+
---
|
|
553
|
+
|
|
554
|
+
## License
|
|
555
|
+
|
|
556
|
+
MIT License
|
|
557
|
+
|
|
558
|
+
---
|
|
559
|
+
|
|
560
|
+
## Complete Example: Hybrid Search Pipeline
|
|
561
|
+
|
|
562
|
+
This section demonstrates a complete hybrid search pipeline with BM25 + dense embeddings and re-ranking.
|
|
563
|
+
|
|
564
|
+
### Setup
|
|
565
|
+
|
|
566
|
+
```python
|
|
567
|
+
import zvec
|
|
568
|
+
from zvec.model.doc import Doc
|
|
569
|
+
from zvec_db.embedders import BM25Embedder, SentenceTransformersEmbedder
|
|
570
|
+
from zvec_db.rerankers import NormalizedWeightedReRanker, DefaultHybridReranker, MetricType
|
|
571
|
+
|
|
572
|
+
# Sample documents
|
|
573
|
+
documents = [
|
|
574
|
+
"Machine learning is a subset of artificial intelligence",
|
|
575
|
+
"Deep learning uses neural networks with many layers",
|
|
576
|
+
"Natural language processing enables computers to understand text",
|
|
577
|
+
"Computer vision allows machines to interpret images",
|
|
578
|
+
"Reinforcement learning trains agents through rewards",
|
|
579
|
+
]
|
|
580
|
+
|
|
581
|
+
# Initialize embedders
|
|
582
|
+
bm25 = BM25Embedder(max_features=4096, k1=1.2, b=0.75)
|
|
583
|
+
bm25.fit(documents)
|
|
584
|
+
|
|
585
|
+
dense = SentenceTransformersEmbedder(
|
|
586
|
+
model_name="all-MiniLM-L6-v2",
|
|
587
|
+
device="cpu",
|
|
588
|
+
normalize=True,
|
|
589
|
+
)
|
|
590
|
+
```
|
|
591
|
+
|
|
592
|
+
### Create and populate collection
|
|
593
|
+
|
|
594
|
+
```python
|
|
595
|
+
# Create zvec collection
|
|
596
|
+
schema = zvec.CollectionSchema(
|
|
597
|
+
name="docs",
|
|
598
|
+
vectors=[
|
|
599
|
+
zvec.VectorSchema("sparse", zvec.DataType.SPARSE_FP32, dimension=4096),
|
|
600
|
+
zvec.VectorSchema("dense", zvec.DataType.VECTOR_FP32, dimension=384),
|
|
601
|
+
]
|
|
602
|
+
)
|
|
603
|
+
collection = zvec.create_and_open("./my_db", schema)
|
|
604
|
+
|
|
605
|
+
# Index documents
|
|
606
|
+
for i, doc in enumerate(documents):
|
|
607
|
+
collection.insert(zvec.Doc(
|
|
608
|
+
id=str(i),
|
|
609
|
+
fields={"text": doc},
|
|
610
|
+
vectors={
|
|
611
|
+
"sparse": bm25.embed(doc),
|
|
612
|
+
"dense": dense.embed(doc),
|
|
613
|
+
}
|
|
614
|
+
))
|
|
615
|
+
```
|
|
616
|
+
|
|
617
|
+
### Hybrid search with re-ranking
|
|
618
|
+
|
|
619
|
+
```python
|
|
620
|
+
query = "neural networks and deep learning"
|
|
621
|
+
|
|
622
|
+
# Method 1: Using collection.query with built-in reranker
|
|
623
|
+
results = collection.query(
|
|
624
|
+
vectors=[
|
|
625
|
+
zvec.VectorQuery(field_name="sparse", vector=bm25.embed(query)),
|
|
626
|
+
zvec.VectorQuery(field_name="dense", vector=dense.embed(query)),
|
|
627
|
+
],
|
|
628
|
+
topk=20,
|
|
629
|
+
reranker=DefaultHybridReranker(
|
|
630
|
+
weights={"sparse": 0.4, "dense": 0.6},
|
|
631
|
+
),
|
|
632
|
+
)
|
|
633
|
+
|
|
634
|
+
print("Top results:")
|
|
635
|
+
for i, doc in enumerate(results[:5]):
|
|
636
|
+
print(f" {i+1}. {doc.fields['text']} (score: {doc.score:.4f})")
|
|
637
|
+
```
|
|
638
|
+
|
|
639
|
+
### Manual hybrid search (more control)
|
|
640
|
+
|
|
641
|
+
```python
|
|
642
|
+
from zvec.model.doc import Doc
|
|
643
|
+
|
|
644
|
+
# 1. Separate searches
|
|
645
|
+
sparse_results = collection.search(
|
|
646
|
+
vector_name="sparse",
|
|
647
|
+
vector=bm25.embed(query),
|
|
648
|
+
topk=20,
|
|
649
|
+
)
|
|
650
|
+
|
|
651
|
+
dense_results = collection.search(
|
|
652
|
+
vector_name="dense",
|
|
653
|
+
vector=dense.embed(query),
|
|
654
|
+
topk=20,
|
|
655
|
+
)
|
|
656
|
+
|
|
657
|
+
# 2. Re-rank with schema-based auto-detection
|
|
658
|
+
reranker = NormalizedWeightedReRanker(
|
|
659
|
+
topn=10,
|
|
660
|
+
metrics=None, # Infer from schema
|
|
661
|
+
schema=collection.schema,
|
|
662
|
+
weights={"sparse": 0.4, "dense": 0.6},
|
|
663
|
+
normalizer_configs={
|
|
664
|
+
"sparse": "bayes", # Robust to BM25 outliers
|
|
665
|
+
"dense": None, # Optional: COSINE is already in [0, 1]
|
|
666
|
+
},
|
|
667
|
+
)
|
|
668
|
+
|
|
669
|
+
# 3. Combine and re-rank
|
|
670
|
+
final_results = reranker.rerank({
|
|
671
|
+
"sparse": sparse_results,
|
|
672
|
+
"dense": dense_results,
|
|
673
|
+
})
|
|
674
|
+
|
|
675
|
+
print("\nFinal re-ranked results:")
|
|
676
|
+
for i, doc in enumerate(final_results[:5]):
|
|
677
|
+
print(f" {i+1}. {doc.fields['text']} (score: {doc.score:.4f})")
|
|
678
|
+
```
|
|
679
|
+
|
|
680
|
+
### Standalone re-ranking (no zvec collection)
|
|
681
|
+
|
|
682
|
+
```python
|
|
683
|
+
# If you're not using zvec, you can still use the rerankers standalone
|
|
684
|
+
|
|
685
|
+
# Mock search results from different sources
|
|
686
|
+
bm25_results = [
|
|
687
|
+
Doc(id="doc1", score=15.5, fields={"text": "Machine learning..."}),
|
|
688
|
+
Doc(id="doc2", score=12.3, fields={"text": "Deep neural..."}),
|
|
689
|
+
Doc(id="doc3", score=8.7, fields={"text": "AI systems..."}),
|
|
690
|
+
]
|
|
691
|
+
|
|
692
|
+
dense_results = [
|
|
693
|
+
Doc(id="doc2", score=0.92, fields={"text": "Deep neural..."}),
|
|
694
|
+
Doc(id="doc1", score=0.75, fields={"text": "Machine learning..."}),
|
|
695
|
+
Doc(id="doc4", score=0.68, fields={"text": "Data science..."}),
|
|
696
|
+
]
|
|
697
|
+
|
|
698
|
+
# Re-rank with explicit metrics
|
|
699
|
+
reranker = NormalizedWeightedReRanker(
|
|
700
|
+
topn=10,
|
|
701
|
+
metrics={
|
|
702
|
+
"bm25": None, # BM25 scores
|
|
703
|
+
"dense": MetricType.COSINE, # COSINE distances [0, 2]
|
|
704
|
+
},
|
|
705
|
+
weights={"bm25": 0.4, "dense": 0.6},
|
|
706
|
+
)
|
|
707
|
+
|
|
708
|
+
final_results = reranker.rerank({
|
|
709
|
+
"bm25": bm25_results,
|
|
710
|
+
"dense": dense_results,
|
|
711
|
+
})
|
|
712
|
+
```
|
|
713
|
+
|
|
714
|
+
---
|
|
715
|
+
|
|
716
|
+
## Resources
|
|
717
|
+
|
|
718
|
+
- [Full Documentation](https://zvec-db.readthedocs.io/)
|
|
719
|
+
- [Examples](examples/)
|
|
720
|
+
- [Issue Tracker](https://github.com/ccdv-ai/zvec-db/issues)
|