zvec-db 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. zvec_db/__init__.py +104 -0
  2. zvec_db/embedders/__init__.py +36 -0
  3. zvec_db/embedders/base.py +651 -0
  4. zvec_db/embedders/defaults.py +214 -0
  5. zvec_db/embedders/dense/__init__.py +11 -0
  6. zvec_db/embedders/dense/embedders.py +182 -0
  7. zvec_db/embedders/dense/openai.py +467 -0
  8. zvec_db/embedders/dense/sentence_transformers.py +319 -0
  9. zvec_db/embedders/sparse/__init__.py +17 -0
  10. zvec_db/embedders/sparse/bm25.py +251 -0
  11. zvec_db/embedders/sparse/bm25l.py +261 -0
  12. zvec_db/embedders/sparse/bm25plus.py +295 -0
  13. zvec_db/embedders/sparse/count.py +111 -0
  14. zvec_db/embedders/sparse/dismax.py +361 -0
  15. zvec_db/embedders/sparse/tfidf.py +117 -0
  16. zvec_db/evaluation/__init__.py +71 -0
  17. zvec_db/evaluation/metrics.py +469 -0
  18. zvec_db/preprocessing/__init__.py +90 -0
  19. zvec_db/preprocessing/config.py +350 -0
  20. zvec_db/preprocessing/normalization.py +79 -0
  21. zvec_db/preprocessing/stemming.py +99 -0
  22. zvec_db/preprocessing/stopwords.py +303 -0
  23. zvec_db/preprocessing/tokenization.py +46 -0
  24. zvec_db/rerankers/__init__.py +70 -0
  25. zvec_db/rerankers/base.py +188 -0
  26. zvec_db/rerankers/cross_encoder/__init__.py +87 -0
  27. zvec_db/rerankers/cross_encoder/base.py +200 -0
  28. zvec_db/rerankers/cross_encoder/classification.py +318 -0
  29. zvec_db/rerankers/cross_encoder/openai.py +259 -0
  30. zvec_db/rerankers/cross_encoder/openai_decoder.py +342 -0
  31. zvec_db/rerankers/cross_encoder/openai_encoder.py +244 -0
  32. zvec_db/rerankers/cross_encoder/sentence_transformer.py +230 -0
  33. zvec_db/rerankers/defaults.py +177 -0
  34. zvec_db/rerankers/diversification/__init__.py +7 -0
  35. zvec_db/rerankers/diversification/submodular.py +279 -0
  36. zvec_db/rerankers/fusion/__init__.py +17 -0
  37. zvec_db/rerankers/fusion/hybrid_fusion.py +267 -0
  38. zvec_db/rerankers/fusion/multi_field.py +290 -0
  39. zvec_db/rerankers/fusion/rrf.py +260 -0
  40. zvec_db/rerankers/fusion/weighted.py +396 -0
  41. zvec_db/rerankers/utils/__init__.py +13 -0
  42. zvec_db/rerankers/utils/base_utils.py +108 -0
  43. zvec_db/rerankers/utils/normalize.py +351 -0
  44. zvec_db/rerankers/utils/pipeline.py +127 -0
  45. zvec_db-0.3.0.dist-info/METADATA +720 -0
  46. zvec_db-0.3.0.dist-info/RECORD +48 -0
  47. zvec_db-0.3.0.dist-info/WHEEL +5 -0
  48. zvec_db-0.3.0.dist-info/top_level.txt +1 -0
zvec_db/__init__.py ADDED
@@ -0,0 +1,104 @@
1
+ """zvec-db: Utility suite for sparse/dense vectorization and re-ranking.
2
+
3
+ This library provides embedders for sparse and dense vectorization, as well as
4
+ rerankers for post-processing and combining search results. It is designed to
5
+ work with zvec (https://github.com/ccdv-ai/zvec).
6
+
7
+ Example Usage
8
+ -------------
9
+ >>> from zvec_db import BM25Embedder, SentenceTransformersEmbedder
10
+ >>> from zvec_db import NormalizedWeightedReRanker
11
+ >>> from zvec_db import NormalizationConfig
12
+
13
+ # Sparse embedding
14
+ bm25 = BM25Embedder(max_features=4096)
15
+ bm25.fit(documents)
16
+ sparse_vector = bm25("query") # __call__ alias for embed()
17
+
18
+ # Dense embedding
19
+ dense = SentenceTransformersEmbedder(model_name="all-MiniLM-L6-v2")
20
+ dense.fit(documents)
21
+ dense_vector = dense("query")
22
+
23
+ # Re-ranking
24
+ reranker = NormalizedWeightedReRanker(
25
+ weights={"sparse": 0.4, "dense": 0.6},
26
+ normalizer_configs={"sparse": {"method": "bayes"}},
27
+ )
28
+ """
29
+
30
+ from .embedders import ( # Sparse; Dense
31
+ BM25Embedder,
32
+ BM25LEmbedder,
33
+ BM25PlusEmbedder,
34
+ CountEmbedder,
35
+ DisMaxEmbedder,
36
+ OpenAIEmbedder,
37
+ SentenceTransformersEmbedder,
38
+ TfidfEmbedder,
39
+ )
40
+ from .preprocessing import (
41
+ NormalizationConfig,
42
+ normalize_text,
43
+ remove_stopwords,
44
+ stem_word,
45
+ )
46
+ from .rerankers import ( # Fusion; Diversification; Cross-encoder; Utils; Defaults
47
+ BaseCrossEncoderReranker,
48
+ ClassificationReranker,
49
+ DefaultHybridReranker,
50
+ DefaultRrfReranker,
51
+ DefaultWeightedReranker,
52
+ HybridFusionReranker,
53
+ MultiFieldWeightedReranker,
54
+ NormalizedRrfReRanker,
55
+ NormalizedWeightedReRanker,
56
+ OpenAIDecoderReranker,
57
+ OpenAIEncoderReranker,
58
+ OpenAIReranker,
59
+ PipelineReranker,
60
+ RrfReRanker,
61
+ SentenceTransformerReranker,
62
+ SubmodularReranker,
63
+ WeightedReRanker,
64
+ )
65
+
66
+ __all__ = [
67
+ # Sparse Embedders
68
+ "BM25Embedder",
69
+ "BM25LEmbedder",
70
+ "BM25PlusEmbedder",
71
+ "TfidfEmbedder",
72
+ "CountEmbedder",
73
+ "DisMaxEmbedder",
74
+ # Dense Embedders
75
+ "SentenceTransformersEmbedder",
76
+ "OpenAIEmbedder",
77
+ # Fusion Rerankers
78
+ "RrfReRanker",
79
+ "NormalizedRrfReRanker",
80
+ "WeightedReRanker",
81
+ "NormalizedWeightedReRanker",
82
+ "HybridFusionReranker",
83
+ "MultiFieldWeightedReranker",
84
+ # Diversification
85
+ "SubmodularReranker",
86
+ # Cross-encoder Rerankers
87
+ "BaseCrossEncoderReranker",
88
+ "SentenceTransformerReranker",
89
+ "ClassificationReranker",
90
+ "OpenAIReranker",
91
+ "OpenAIEncoderReranker",
92
+ "OpenAIDecoderReranker",
93
+ # Utils
94
+ "PipelineReranker",
95
+ # Defaults
96
+ "DefaultRrfReranker",
97
+ "DefaultWeightedReranker",
98
+ "DefaultHybridReranker",
99
+ # Preprocessing
100
+ "NormalizationConfig",
101
+ "normalize_text",
102
+ "stem_word",
103
+ "remove_stopwords",
104
+ ]
zvec_db/embedders/__init__.py ADDED
@@ -0,0 +1,36 @@
1
+ """Embedders for dense and sparse representations."""
2
+
3
+ from .base import BaseSparseEmbedder
4
+ from .defaults import DefaultDenseEmbedder, DefaultSparseEmbedder
5
+ from .dense import (
6
+ BaseDenseEmbedder,
7
+ OpenAIEmbedder,
8
+ SentenceTransformersEmbedder,
9
+ )
10
+ from .sparse import (
11
+ BM25Embedder,
12
+ BM25LEmbedder,
13
+ BM25PlusEmbedder,
14
+ CountEmbedder,
15
+ DisMaxEmbedder,
16
+ TfidfEmbedder,
17
+ )
18
+
19
+ __all__ = [
20
+ # Base
21
+ "BaseSparseEmbedder",
22
+ # Defaults
23
+ "DefaultSparseEmbedder",
24
+ "DefaultDenseEmbedder",
25
+ # Sparse
26
+ "BM25Embedder",
27
+ "BM25LEmbedder",
28
+ "BM25PlusEmbedder",
29
+ "TfidfEmbedder",
30
+ "CountEmbedder",
31
+ "DisMaxEmbedder",
32
+ # Dense
33
+ "BaseDenseEmbedder",
34
+ "SentenceTransformersEmbedder",
35
+ "OpenAIEmbedder",
36
+ ]