zvec-db 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. zvec_db-0.3.0/PKG-INFO +720 -0
  2. zvec_db-0.3.0/README.md +668 -0
  3. zvec_db-0.3.0/pyproject.toml +76 -0
  4. zvec_db-0.3.0/setup.cfg +36 -0
  5. zvec_db-0.3.0/tests/test_defaults.py +158 -0
  6. zvec_db-0.3.0/tests/test_integration.py +326 -0
  7. zvec_db-0.3.0/tests/test_zvec_compat.py +502 -0
  8. zvec_db-0.3.0/zvec_db/__init__.py +104 -0
  9. zvec_db-0.3.0/zvec_db/embedders/__init__.py +36 -0
  10. zvec_db-0.3.0/zvec_db/embedders/base.py +651 -0
  11. zvec_db-0.3.0/zvec_db/embedders/defaults.py +214 -0
  12. zvec_db-0.3.0/zvec_db/embedders/dense/__init__.py +11 -0
  13. zvec_db-0.3.0/zvec_db/embedders/dense/embedders.py +182 -0
  14. zvec_db-0.3.0/zvec_db/embedders/dense/openai.py +467 -0
  15. zvec_db-0.3.0/zvec_db/embedders/dense/sentence_transformers.py +319 -0
  16. zvec_db-0.3.0/zvec_db/embedders/sparse/__init__.py +17 -0
  17. zvec_db-0.3.0/zvec_db/embedders/sparse/bm25.py +251 -0
  18. zvec_db-0.3.0/zvec_db/embedders/sparse/bm25l.py +261 -0
  19. zvec_db-0.3.0/zvec_db/embedders/sparse/bm25plus.py +295 -0
  20. zvec_db-0.3.0/zvec_db/embedders/sparse/count.py +111 -0
  21. zvec_db-0.3.0/zvec_db/embedders/sparse/dismax.py +361 -0
  22. zvec_db-0.3.0/zvec_db/embedders/sparse/tfidf.py +117 -0
  23. zvec_db-0.3.0/zvec_db/evaluation/__init__.py +71 -0
  24. zvec_db-0.3.0/zvec_db/evaluation/metrics.py +469 -0
  25. zvec_db-0.3.0/zvec_db/preprocessing/__init__.py +90 -0
  26. zvec_db-0.3.0/zvec_db/preprocessing/config.py +350 -0
  27. zvec_db-0.3.0/zvec_db/preprocessing/normalization.py +79 -0
  28. zvec_db-0.3.0/zvec_db/preprocessing/stemming.py +99 -0
  29. zvec_db-0.3.0/zvec_db/preprocessing/stopwords.py +303 -0
  30. zvec_db-0.3.0/zvec_db/preprocessing/tokenization.py +46 -0
  31. zvec_db-0.3.0/zvec_db/rerankers/__init__.py +70 -0
  32. zvec_db-0.3.0/zvec_db/rerankers/base.py +188 -0
  33. zvec_db-0.3.0/zvec_db/rerankers/cross_encoder/__init__.py +87 -0
  34. zvec_db-0.3.0/zvec_db/rerankers/cross_encoder/base.py +200 -0
  35. zvec_db-0.3.0/zvec_db/rerankers/cross_encoder/classification.py +318 -0
  36. zvec_db-0.3.0/zvec_db/rerankers/cross_encoder/openai.py +259 -0
  37. zvec_db-0.3.0/zvec_db/rerankers/cross_encoder/openai_decoder.py +342 -0
  38. zvec_db-0.3.0/zvec_db/rerankers/cross_encoder/openai_encoder.py +244 -0
  39. zvec_db-0.3.0/zvec_db/rerankers/cross_encoder/sentence_transformer.py +230 -0
  40. zvec_db-0.3.0/zvec_db/rerankers/defaults.py +177 -0
  41. zvec_db-0.3.0/zvec_db/rerankers/diversification/__init__.py +7 -0
  42. zvec_db-0.3.0/zvec_db/rerankers/diversification/submodular.py +279 -0
  43. zvec_db-0.3.0/zvec_db/rerankers/fusion/__init__.py +17 -0
  44. zvec_db-0.3.0/zvec_db/rerankers/fusion/hybrid_fusion.py +267 -0
  45. zvec_db-0.3.0/zvec_db/rerankers/fusion/multi_field.py +290 -0
  46. zvec_db-0.3.0/zvec_db/rerankers/fusion/rrf.py +260 -0
  47. zvec_db-0.3.0/zvec_db/rerankers/fusion/weighted.py +396 -0
  48. zvec_db-0.3.0/zvec_db/rerankers/utils/__init__.py +13 -0
  49. zvec_db-0.3.0/zvec_db/rerankers/utils/base_utils.py +108 -0
  50. zvec_db-0.3.0/zvec_db/rerankers/utils/normalize.py +351 -0
  51. zvec_db-0.3.0/zvec_db/rerankers/utils/pipeline.py +127 -0
  52. zvec_db-0.3.0/zvec_db.egg-info/PKG-INFO +720 -0
  53. zvec_db-0.3.0/zvec_db.egg-info/SOURCES.txt +55 -0
  54. zvec_db-0.3.0/zvec_db.egg-info/dependency_links.txt +1 -0
  55. zvec_db-0.3.0/zvec_db.egg-info/requires.txt +36 -0
  56. zvec_db-0.3.0/zvec_db.egg-info/top_level.txt +1 -0
zvec_db-0.3.0/PKG-INFO ADDED
@@ -0,0 +1,720 @@
1
+ Metadata-Version: 2.4
2
+ Name: zvec-db
3
+ Version: 0.3.0
4
+ Summary: Suite d'utilitaires pour la vectorisation sparse et le re-ranking de documents
5
+ Author: Charles Condevaux
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/ccdv-ai/zvec-db
8
+ Project-URL: Repository, https://github.com/ccdv-ai/zvec-db.git
9
+ Project-URL: Issues, https://github.com/ccdv-ai/zvec-db/issues
10
+ Keywords: search,ranking,BM25,TF-IDF,sparse,vectors,embeddings
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Requires-Python: >=3.12
20
+ Description-Content-Type: text/markdown
21
+ Requires-Dist: zvec
22
+ Requires-Dist: scikit-learn
23
+ Requires-Dist: numpy
24
+ Requires-Dist: scipy
25
+ Requires-Dist: httpx
26
+ Requires-Dist: requests
27
+ Requires-Dist: sentence_transformers
28
+ Requires-Dist: openai
29
+ Requires-Dist: aiohttp>=3.9.0
30
+ Requires-Dist: cloudpickle
31
+ Provides-Extra: test
32
+ Requires-Dist: pytest>=7.0.0; extra == "test"
33
+ Requires-Dist: pytest-cov>=4.0.0; extra == "test"
34
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == "test"
35
+ Requires-Dist: nltk>=3.8.0; extra == "test"
36
+ Provides-Extra: preprocessing
37
+ Requires-Dist: nltk>=3.8.0; extra == "preprocessing"
38
+ Provides-Extra: docs
39
+ Requires-Dist: sphinx>=7.0.0; extra == "docs"
40
+ Requires-Dist: sphinx-rtd-theme>=2.0.0; extra == "docs"
41
+ Requires-Dist: sphinx-math-dollar>=1.2.0; extra == "docs"
42
+ Provides-Extra: dev
43
+ Requires-Dist: pre-commit; extra == "dev"
44
+ Requires-Dist: black>=25.9.0; extra == "dev"
45
+ Requires-Dist: isort; extra == "dev"
46
+ Requires-Dist: flake8; extra == "dev"
47
+ Requires-Dist: mypy; extra == "dev"
48
+ Provides-Extra: build
49
+ Requires-Dist: build; extra == "build"
50
+ Requires-Dist: twine; extra == "build"
51
+ Requires-Dist: wheel; extra == "build"
52
+
53
+ # zvec-db
54
+
55
+ [![Version](https://img.shields.io/pypi/v/zvec-db)](https://pypi.org/project/zvec-db/)
56
+ [![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/)
57
+ [![License](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/ccdv-ai/zvec-db/blob/main/LICENSE)
58
+
59
+ Utility suite for sparse/dense vectorization and document re-ranking, designed to work with [zvec](https://github.com/ccdv-ai/zvec).
60
+
61
+ ## Table of Contents
62
+
63
+ - [Installation](#installation)
64
+ - [Quick Start](#quick-start)
65
+ - [Sparse Embedders](#sparse-embedders)
66
+ - [Dense Embedders](#dense-embedders)
67
+ - [Re-ranking](#re-ranking)
68
+ - [Preprocessing](#preprocessing)
69
+ - [Model Persistence](#model-persistence)
70
+ - [Evaluation](#evaluation)
71
+ - [Complete Example: Hybrid Search Pipeline](#complete-example-hybrid-search-pipeline)
72
+ - [License](#license)
73
+
74
+ ---
75
+
76
+ ## Installation
77
+
78
+ ```bash
79
+ pip install zvec-db
80
+ ```
81
+
82
+ **Optional dependencies:**
83
+
84
+ ```bash
85
+ # For preprocessing (stemming, stopwords)
86
+ pip install "zvec-db[preprocessing]"
87
+
88
+ # For development
89
+ pip install "zvec-db[dev,test,docs]"
90
+ ```
91
+
92
+ ---
93
+
94
+ ## Quick Start
95
+
96
+ ### Hybrid search with zvec (recommended)
97
+
98
+ ```python
99
+ import zvec
100
+ from zvec_db.embedders import BM25Embedder, OpenAIEmbedder
101
+ from zvec_db.rerankers import NormalizedWeightedReRanker
102
+
103
+ # 1. Create embedders
104
+ bm25 = BM25Embedder(max_features=4096)
105
+ bm25.fit(documents)
106
+
107
+ dense = OpenAIEmbedder(base_url="http://localhost:9300/v1", model="embedding")
108
+
109
+ # 2. Create collection
110
+ schema = zvec.CollectionSchema(
111
+ name="docs",
112
+ vectors=[
113
+ zvec.VectorSchema("sparse", zvec.DataType.SPARSE_FP32, dimension=4096),
114
+ zvec.VectorSchema("dense", zvec.DataType.VECTOR_FP32, dimension=1024),
115
+ ]
116
+ )
117
+ collection = zvec.create_and_open("./my_db", schema)
118
+
119
+ # 3. Insert
120
+ for i, doc in enumerate(documents):
121
+ collection.insert(zvec.Doc(
122
+ id=str(i),
123
+ fields={"text": doc},
124
+ vectors={
125
+ "sparse": bm25.embed(doc),
126
+ "dense": dense.embed(doc),
127
+ }
128
+ ))
129
+
130
+ # 4. Search with weighted fusion
131
+ # Note: metrics=None because we mix BM25 (arbitrary scores) and dense (COSINE distances)
132
+ results = collection.query(
133
+ vectors=[
134
+ zvec.VectorQuery(field_name="sparse", vector=bm25.embed(query)),
135
+ zvec.VectorQuery(field_name="dense", vector=dense.embed(query)),
136
+ ],
137
+ topk=10,
138
+ reranker=NormalizedWeightedReRanker(
139
+ metrics=None, # No automatic conversion (mixed metrics)
140
+ weights={"sparse": 0.4, "dense": 0.6},
141
+ normalizer_configs={"sparse": {"method": "bayes"}},
142
+ ),
143
+ )
144
+ ```
145
+
146
+ ---
147
+
148
+ ## Sparse Embedders
149
+
150
+ All sparse embedders return dictionaries `{index: score, ...}` compatible with zvec's `SPARSE_FP32` format.
151
+
152
+ ### BM25Embedder (recommended)
153
+
154
+ Standard BM25 scoring - best for general use cases.
155
+
156
+ ```python
157
+ from zvec_db.embedders import BM25Embedder
158
+ from zvec_db.preprocessing import NormalizationConfig
159
+
160
+ # With automatic preprocessing
161
+ config = NormalizationConfig.aggressive(language="french")
162
+ bm25 = BM25Embedder(
163
+ max_features=4096,
164
+ k1=1.2, # Term frequency saturation (default: 1.2)
165
+ b=0.75, # Length normalization (default: 0.75)
166
+ preprocessing_config=config
167
+ )
168
+ bm25.fit(documents)
169
+
170
+ vector = bm25.embed("search query") # {index: score, ...}
171
+ ```
172
+
173
+ ### Other sparse embedders
174
+
175
+ | Embedder | Use case |
176
+ |----------|----------|
177
+ | `TfidfEmbedder` | TF-IDF weighting with sublinear TF option |
178
+ | `CountEmbedder` | Simple term counts (binary option available) |
179
+ | `BM25LEmbedder` | Documents with variable lengths |
180
+ | `BM25PlusEmbedder` | Avoid zero scores with delta smoothing |
181
+ | `DisMaxEmbedder` | Multi-field search (takes maximum score) |
182
+
183
+ ```python
184
+ from zvec_db.embedders import TfidfEmbedder, CountEmbedder, DisMaxEmbedder
185
+
186
+ tfidf = TfidfEmbedder(max_features=4096, sublinear_tf=True)
187
+ count = CountEmbedder(max_features=4096, binary=True)
188
+ dismax = DisMaxEmbedder(tie_breaker=0.1)
189
+ ```
190
+
191
+ ---
192
+
193
+ ## Dense Embedders
194
+
195
+ ### OpenAIEmbedder (API / vLLM)
196
+
197
+ Works with OpenAI API or compatible endpoints (vLLM, local servers).
198
+
199
+ ```python
200
+ from zvec_db.embedders import OpenAIEmbedder
201
+
202
+ # OpenAI API
203
+ embedder = OpenAIEmbedder(model="text-embedding-3-small", api_key="sk-...")
204
+
205
+ # Local vLLM
206
+ embedder = OpenAIEmbedder(
207
+ base_url="http://localhost:9300/v1",
208
+ model="embedding",
209
+ max_batch_size=32,
210
+ )
211
+ vector = embedder.embed("search query")
212
+ ```
213
+
214
+ ### SentenceTransformersEmbedder (local)
215
+
216
+ Run embedding models locally using sentence-transformers.
217
+
218
+ ```python
219
+ from zvec_db.embedders import SentenceTransformersEmbedder
220
+
221
+ embedder = SentenceTransformersEmbedder(
222
+ model_name="all-MiniLM-L6-v2", # 384 dims, fast
223
+ device="cpu",
224
+ normalize=True,
225
+ )
226
+ vector = embedder.embed("search query")
227
+ ```
228
+
229
+ ---
230
+
231
+ ## Re-ranking
232
+
233
+ ### Understanding distance/similarity metrics
234
+
235
+ **Problem**: Vector databases store **distances** (smaller = more similar), but fusion algorithms assume **similarities** (larger = more relevant).
236
+
237
+ The `metrics` parameter handles conversion:
238
+
239
+ | Metric | Type | Range | Conversion | Usage |
240
+ |--------|------|-------|------------|-------|
241
+ | `COSINE` | Distance | [0, 2] | `1.0 - score/2.0` | Normalized embeddings (Qdrant, zvec) |
242
+ | `L2` | Distance | [0, ∞) | `1 - 2*atan(s)/π` | Euclidean distance |
243
+ | `IP` | Similarity | (-∞, ∞) | None | Inner product (already similarity) |
244
+ | `None` | - | - | None | BM25 scores or already normalized [0, 1] |
245
+
246
+ **Default**: `metrics=MetricType.COSINE` (main use case with zvec/Qdrant).
247
+
248
+ ```python
249
+ from zvec_db.rerankers import NormalizedWeightedReRanker, MetricType
250
+
251
+ # COSINE distances from zvec/Qdrant (default)
252
+ reranker = NormalizedWeightedReRanker(topn=10)
253
+
254
+ # BM25 scores (not distances!)
255
+ reranker = NormalizedWeightedReRanker(topn=10, metrics=None)
256
+
257
+ # Hybrid: BM25 + dense with per-source normalization
258
+ reranker = NormalizedWeightedReRanker(
259
+ metrics=None, # No global conversion
260
+ weights={"sparse": 0.4, "dense": 0.6},
261
+ normalizer_configs={
262
+ "sparse": "bayes", # BM25: handles outliers well
263
+ "dense": True, # Dense: standard normalization
264
+ },
265
+ )
266
+ ```
267
+
268
+ ### Fusion rerankers
269
+
270
+ #### Normalizer configuration
271
+
272
+ The `normalizer_configs` parameter controls how scores are normalized per source:
273
+
274
+ | Value | Effect |
275
+ |-------|--------|
276
+ | `True` | Standard normalization (scales scores to [0, 1]) |
277
+ | `"bayes"`, `"bayesian"`, `"bb25"` | Bayesian sigmoid calibration (robust to outliers). These are aliases for the same method. |
278
+ | `{"method": "bayes", "alpha": 1.0}` | Dict with custom parameters (`alpha`, `beta`) |
279
+ | `None` | Skip normalization (use raw scores after metric conversion) |
280
+
281
+ **Example:**
282
+
283
+ ```python
284
+ normalizer_configs={
285
+ "sparse": "bayes", # Bayesian: handles BM25 outliers well
286
+ "dense": None, # Optional: converted cosine scores are already in [0, 1]
287
+ }
288
+ ```
289
+
290
+ #### NormalizedWeightedReRanker (weighted fusion)
291
+
292
+ ```python
293
+ from zvec_db.rerankers import NormalizedWeightedReRanker
294
+
295
+ reranker = NormalizedWeightedReRanker(
296
+ topn=10,
297
+ weights={"source1": 0.7, "source2": 0.3},
298
+ normalizer_configs={"source1": "bayes", "source2": True},
299
+ )
300
+
301
+ results = collection.query(vectors=[...], topk=20, reranker=reranker)
302
+ ```
303
+
304
+ #### Using `schema` parameter (auto-detect metrics from collection)
305
+
306
+ When working with zvec collections, you can use the `schema` parameter to automatically infer the correct metrics for each vector field:
307
+
308
+ ```python
309
+ import zvec
310
+ from zvec_db.rerankers import NormalizedWeightedReRanker
311
+
312
+ # Open existing collection
313
+ collection = zvec.open("./my_collection")
314
+
315
+ # Reranker auto-infers metrics from schema
316
+ # - SPARSE_FP32 fields -> metrics=None (BM25 scores)
317
+ # - VECTOR_FP32 fields with COSINE -> metrics=MetricType.COSINE
318
+ reranker = NormalizedWeightedReRanker(
319
+ topn=10,
320
+ metrics=None, # Will infer from schema
321
+ schema=collection.schema,
322
+ weights={"sparse": 0.4, "dense": 0.6},
323
+ )
324
+
325
+ # No need to manually specify metrics per source!
326
+ results = collection.query(
327
+ vectors=[
328
+ zvec.VectorQuery(field_name="sparse", vector=bm25.embed(query)),
329
+ zvec.VectorQuery(field_name="dense", vector=dense.embed(query)),
330
+ ],
331
+ topk=20,
332
+ reranker=reranker,
333
+ )
334
+ ```
335
+
336
+ **Manual per-source metrics (alternative):**
337
+
338
+ ```python
339
+ from zvec_db.rerankers import NormalizedWeightedReRanker, MetricType
340
+
341
+ # Explicit per-source metrics
342
+ reranker = NormalizedWeightedReRanker(
343
+ topn=10,
344
+ metrics={
345
+ "sparse": None, # BM25 scores (not distances)
346
+ "dense": MetricType.COSINE, # Convert COSINE distance [0,2] -> similarity
347
+ },
348
+ weights={"sparse": 0.4, "dense": 0.6},
349
+ )
350
+ ```
351
+
352
+ #### NormalizedRrfReRanker (Reciprocal Rank Fusion)
353
+
354
+ ```python
355
+ from zvec_db.rerankers import NormalizedRrfReRanker
356
+
357
+ reranker = NormalizedRrfReRanker(topn=10, rank_constant=60)
358
+ results = reranker.rerank({"bm25": bm25_docs, "dense": dense_docs})
359
+ ```
360
+
361
+ #### WeightedReRanker (scores already normalized)
362
+
363
+ Use when scores are already in [0, 1] with "higher=better" orientation.
364
+
365
+ ```python
366
+ from zvec_db.rerankers import WeightedReRanker
367
+
368
+ reranker = WeightedReRanker(
369
+ topn=10,
370
+ weights={"source1": 0.7, "source2": 0.3},
371
+ )
372
+ ```
373
+
374
+ ### Default rerankers (ready-to-use)
375
+
376
+ ```python
377
+ from zvec_db.rerankers.defaults import (
378
+ DefaultWeightedReranker,
379
+ DefaultHybridReranker,
380
+ DefaultRrfReranker,
381
+ )
382
+
383
+ # Weighted fusion with Bayesian normalization
384
+ reranker = DefaultWeightedReranker()
385
+
386
+ # Optimized hybrid: dense (60%) + BM25 (40%)
387
+ reranker = DefaultHybridReranker()
388
+
389
+ # RRF with standard parameters
390
+ reranker = DefaultRrfReranker()
391
+
392
+ results = reranker.rerank({"bm25": bm25_docs, "dense": dense_docs})
393
+ ```
394
+
395
+ ### Cross-Encoder rerankers
396
+
397
+ All cross-encoders require a `query` parameter at initialization.
398
+
399
+ #### SentenceTransformerReranker (local, binary)
400
+
401
+ ```python
402
+ from zvec_db.rerankers import SentenceTransformerReranker
403
+
404
+ reranker = SentenceTransformerReranker(
405
+ query="machine learning",
406
+ model_name="cross-encoder/ms-marco-MiniLM-L-6-v2",
407
+ topn=10,
408
+ )
409
+ results = reranker.rerank({"bm25": docs})
410
+ ```
411
+
412
+ #### ClassificationReranker (local, multi-class)
413
+
414
+ ```python
415
+ from zvec_db.rerankers import ClassificationReranker
416
+
417
+ reranker = ClassificationReranker(
418
+ query="machine learning",
419
+ model_name="your-multi-class-model",
420
+ num_classes=5, # Auto-inferred if not specified
421
+ topn=10,
422
+ )
423
+ results = reranker.rerank({"bm25": docs})
424
+ ```
425
+
426
+ #### OpenAIReranker (API)
427
+
428
+ ```python
429
+ from zvec_db.rerankers import OpenAIReranker
430
+
431
+ reranker = OpenAIReranker(
432
+ query="machine learning",
433
+ base_url="http://localhost:9400/v1",
434
+ model="BAAI/bge-reranker-v2-m3",
435
+ endpoint="rerank", # or "score"
436
+ topn=10,
437
+ )
438
+ results = reranker.rerank({"bm25": docs})
439
+ ```
440
+
441
+ ### Diversification
442
+
443
+ #### SubmodularReranker (MMR)
444
+
445
+ Maximize relevance while diversifying results.
446
+
447
+ ```python
448
+ from zvec_db.rerankers import SubmodularReranker
449
+
450
+ reranker = SubmodularReranker(
451
+ topn=10,
452
+ lambda_param=0.7, # 70% relevance, 30% diversity
453
+ vector_field="embedding",
454
+ )
455
+ results = reranker.rerank({"source": docs_with_vectors})
456
+ ```
457
+
458
+ ---
459
+
460
+ ## Preprocessing
461
+
462
+ Preprocessing improves sparse embedding quality.
463
+
464
+ ### Automatic (recommended)
465
+
466
+ ```python
467
+ from zvec_db.embedders import BM25Embedder
468
+ from zvec_db.preprocessing import NormalizationConfig
469
+
470
+ config = NormalizationConfig.aggressive(language="french")
471
+ bm25 = BM25Embedder(max_features=4096, preprocessing_config=config)
472
+ bm25.fit(documents)
473
+ # Preprocessing is automatically applied and saved with the model
474
+ ```
475
+
476
+ ### Utility functions
477
+
478
+ ```python
479
+ from zvec_db.preprocessing import normalize_text, stem_word, remove_stopwords
480
+
481
+ # Full pipeline
482
+ normalize_text(" CHAT MANGEAIT ", lowercase=True, remove_accents=True, stem=True) # "chat mang"
483
+
484
+ # Individual functions
485
+ stem_word("mangeaient", language="french") # "mang"
486
+ remove_stopwords("le chat mange", language="french") # "chat mange"
487
+ ```
488
+
489
+ **nltk installation:**
490
+
491
+ ```bash
492
+ pip install "zvec-db[preprocessing]"
493
+ ```
494
+
495
+ ---
496
+
497
+ ## Model Persistence
498
+
499
+ ```python
500
+ from zvec_db.embedders import BM25Embedder
501
+
502
+ # Save
503
+ bm25 = BM25Embedder(max_features=4096, preprocessing_config=config)
504
+ bm25.fit(documents)
505
+ bm25.save("models/bm25_model.joblib")
506
+
507
+ # Load
508
+ bm25_loaded = BM25Embedder()
509
+ bm25_loaded.load("models/bm25_model.joblib")
510
+
511
+ # Embeddings are identical (preprocessing included)
512
+ assert bm25.embed("query") == bm25_loaded.embed("query")
513
+ ```
514
+
515
+ ---
516
+
517
+ ## Evaluation
518
+
519
+ ```python
520
+ from zvec_db.evaluation import evaluate_ranking
521
+
522
+ # Evaluate ranking quality
523
+ metrics = evaluate_ranking(
524
+ ground_truth=[["doc1", "doc2"], ["doc3"]],
525
+ predictions=[["doc2", "doc1"], ["doc3", "doc4"]],
526
+ metrics=["ndcg", "map", "mrr", "recall"],
527
+ )
528
+ ```
529
+
530
+ ---
531
+
532
+ ## Development
533
+
534
+ ```bash
535
+ # Clone
536
+ git clone https://github.com/ccdv-ai/zvec-db.git
537
+ cd zvec-db
538
+
539
+ # Install with all dependencies
540
+ make install
541
+
542
+ # Run tests
543
+ make test
544
+
545
+ # Lint
546
+ make lint
547
+
548
+ # Build docs
549
+ make docs
550
+ ```
551
+
552
+ ---
553
+
554
+ ## License
555
+
556
+ MIT License
557
+
558
+ ---
559
+
560
+ ## Complete Example: Hybrid Search Pipeline
561
+
562
+ This section demonstrates a complete hybrid search pipeline with BM25 + dense embeddings and re-ranking.
563
+
564
+ ### Setup
565
+
566
+ ```python
567
+ import zvec
568
+ from zvec.model.doc import Doc
569
+ from zvec_db.embedders import BM25Embedder, SentenceTransformersEmbedder
570
+ from zvec_db.rerankers import NormalizedWeightedReRanker, DefaultHybridReranker
571
+
572
+ # Sample documents
573
+ documents = [
574
+ "Machine learning is a subset of artificial intelligence",
575
+ "Deep learning uses neural networks with many layers",
576
+ "Natural language processing enables computers to understand text",
577
+ "Computer vision allows machines to interpret images",
578
+ "Reinforcement learning trains agents through rewards",
579
+ ]
580
+
581
+ # Initialize embedders
582
+ bm25 = BM25Embedder(max_features=4096, k1=1.2, b=0.75)
583
+ bm25.fit(documents)
584
+
585
+ dense = SentenceTransformersEmbedder(
586
+ model_name="all-MiniLM-L6-v2",
587
+ device="cpu",
588
+ normalize=True,
589
+ )
590
+ ```
591
+
592
+ ### Create and populate collection
593
+
594
+ ```python
595
+ # Create zvec collection
596
+ schema = zvec.CollectionSchema(
597
+ name="docs",
598
+ vectors=[
599
+ zvec.VectorSchema("sparse", zvec.DataType.SPARSE_FP32, dimension=4096),
600
+ zvec.VectorSchema("dense", zvec.DataType.VECTOR_FP32, dimension=384),
601
+ ]
602
+ )
603
+ collection = zvec.create_and_open("./my_db", schema)
604
+
605
+ # Index documents
606
+ for i, doc in enumerate(documents):
607
+ collection.insert(zvec.Doc(
608
+ id=str(i),
609
+ fields={"text": doc},
610
+ vectors={
611
+ "sparse": bm25.embed(doc),
612
+ "dense": dense.embed(doc),
613
+ }
614
+ ))
615
+ ```
616
+
617
+ ### Hybrid search with re-ranking
618
+
619
+ ```python
620
+ query = "neural networks and deep learning"
621
+
622
+ # Method 1: Using collection.query with built-in reranker
623
+ results = collection.query(
624
+ vectors=[
625
+ zvec.VectorQuery(field_name="sparse", vector=bm25.embed(query)),
626
+ zvec.VectorQuery(field_name="dense", vector=dense.embed(query)),
627
+ ],
628
+ topk=20,
629
+ reranker=DefaultHybridReranker(
630
+ weights={"sparse": 0.4, "dense": 0.6},
631
+ ),
632
+ )
633
+
634
+ print("Top results:")
635
+ for i, doc in enumerate(results[:5]):
636
+ print(f" {i+1}. {doc.fields['text']} (score: {doc.score:.4f})")
637
+ ```
638
+
639
+ ### Manual hybrid search (more control)
640
+
641
+ ```python
642
+ from zvec.model.doc import Doc
643
+
644
+ # 1. Separate searches
645
+ sparse_results = collection.search(
646
+ vector_name="sparse",
647
+ vector=bm25.embed(query),
648
+ topk=20,
649
+ )
650
+
651
+ dense_results = collection.search(
652
+ vector_name="dense",
653
+ vector=dense.embed(query),
654
+ topk=20,
655
+ )
656
+
657
+ # 2. Re-rank with schema-based auto-detection
658
+ reranker = NormalizedWeightedReRanker(
659
+ topn=10,
660
+ metrics=None, # Infer from schema
661
+ schema=collection.schema,
662
+ weights={"sparse": 0.4, "dense": 0.6},
663
+ normalizer_configs={
664
+ "sparse": "bayes", # Robust to BM25 outliers
665
+ "dense": None, # Optional: COSINE is already in [0, 1]
666
+ },
667
+ )
668
+
669
+ # 3. Combine and re-rank
670
+ final_results = reranker.rerank({
671
+ "sparse": sparse_results,
672
+ "dense": dense_results,
673
+ })
674
+
675
+ print("\nFinal re-ranked results:")
676
+ for i, doc in enumerate(final_results[:5]):
677
+ print(f" {i+1}. {doc.fields['text']} (score: {doc.score:.4f})")
678
+ ```
679
+
680
+ ### Standalone re-ranking (no zvec collection)
681
+
682
+ ```python
683
+ # Without a zvec collection, the rerankers can still be used standalone on lists of Doc results (Doc is imported from zvec.model.doc, MetricType from zvec_db.rerankers)
684
+
685
+ # Mock search results from different sources
686
+ bm25_results = [
687
+ Doc(id="doc1", score=15.5, fields={"text": "Machine learning..."}),
688
+ Doc(id="doc2", score=12.3, fields={"text": "Deep neural..."}),
689
+ Doc(id="doc3", score=8.7, fields={"text": "AI systems..."}),
690
+ ]
691
+
692
+ dense_results = [
693
+ Doc(id="doc2", score=0.92, fields={"text": "Deep neural..."}),
694
+ Doc(id="doc1", score=0.75, fields={"text": "Machine learning..."}),
695
+ Doc(id="doc4", score=0.68, fields={"text": "Data science..."}),
696
+ ]
697
+
698
+ # Re-rank with explicit metrics
699
+ reranker = NormalizedWeightedReRanker(
700
+ topn=10,
701
+ metrics={
702
+ "bm25": None, # BM25 scores
703
+ "dense": MetricType.COSINE, # COSINE distances [0, 2]
704
+ },
705
+ weights={"bm25": 0.4, "dense": 0.6},
706
+ )
707
+
708
+ final_results = reranker.rerank({
709
+ "bm25": bm25_results,
710
+ "dense": dense_results,
711
+ })
712
+ ```
713
+
714
+ ---
715
+
716
+ ## Resources
717
+
718
+ - [Full Documentation](https://zvec-db.readthedocs.io/)
719
+ - [Examples](examples/)
720
+ - [Issue Tracker](https://github.com/ccdv-ai/zvec-db/issues)