fastembed_bio-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fastembed/__init__.py +24 -0
- fastembed/bio/__init__.py +3 -0
- fastembed/bio/protein_embedding.py +456 -0
- fastembed/common/__init__.py +3 -0
- fastembed/common/model_description.py +52 -0
- fastembed/common/model_management.py +471 -0
- fastembed/common/onnx_model.py +188 -0
- fastembed/common/preprocessor_utils.py +84 -0
- fastembed/common/types.py +27 -0
- fastembed/common/utils.py +69 -0
- fastembed/embedding.py +24 -0
- fastembed/image/__init__.py +3 -0
- fastembed/image/image_embedding.py +135 -0
- fastembed/image/image_embedding_base.py +55 -0
- fastembed/image/onnx_embedding.py +217 -0
- fastembed/image/onnx_image_model.py +156 -0
- fastembed/image/transform/functional.py +221 -0
- fastembed/image/transform/operators.py +499 -0
- fastembed/late_interaction/__init__.py +5 -0
- fastembed/late_interaction/colbert.py +301 -0
- fastembed/late_interaction/jina_colbert.py +58 -0
- fastembed/late_interaction/late_interaction_embedding_base.py +80 -0
- fastembed/late_interaction/late_interaction_text_embedding.py +180 -0
- fastembed/late_interaction/token_embeddings.py +83 -0
- fastembed/late_interaction_multimodal/__init__.py +5 -0
- fastembed/late_interaction_multimodal/colmodernvbert.py +532 -0
- fastembed/late_interaction_multimodal/colpali.py +327 -0
- fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py +189 -0
- fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py +86 -0
- fastembed/late_interaction_multimodal/onnx_multimodal_model.py +291 -0
- fastembed/parallel_processor.py +253 -0
- fastembed/postprocess/__init__.py +3 -0
- fastembed/postprocess/muvera.py +362 -0
- fastembed/py.typed +1 -0
- fastembed/rerank/cross_encoder/__init__.py +3 -0
- fastembed/rerank/cross_encoder/custom_text_cross_encoder.py +47 -0
- fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py +239 -0
- fastembed/rerank/cross_encoder/onnx_text_model.py +204 -0
- fastembed/rerank/cross_encoder/text_cross_encoder.py +178 -0
- fastembed/rerank/cross_encoder/text_cross_encoder_base.py +63 -0
- fastembed/sparse/__init__.py +4 -0
- fastembed/sparse/bm25.py +359 -0
- fastembed/sparse/bm42.py +369 -0
- fastembed/sparse/minicoil.py +372 -0
- fastembed/sparse/sparse_embedding_base.py +90 -0
- fastembed/sparse/sparse_text_embedding.py +143 -0
- fastembed/sparse/splade_pp.py +196 -0
- fastembed/sparse/utils/minicoil_encoder.py +146 -0
- fastembed/sparse/utils/sparse_vectors_converter.py +244 -0
- fastembed/sparse/utils/tokenizer.py +120 -0
- fastembed/sparse/utils/vocab_resolver.py +202 -0
- fastembed/text/__init__.py +3 -0
- fastembed/text/clip_embedding.py +56 -0
- fastembed/text/custom_text_embedding.py +97 -0
- fastembed/text/multitask_embedding.py +109 -0
- fastembed/text/onnx_embedding.py +353 -0
- fastembed/text/onnx_text_model.py +180 -0
- fastembed/text/pooled_embedding.py +136 -0
- fastembed/text/pooled_normalized_embedding.py +164 -0
- fastembed/text/text_embedding.py +228 -0
- fastembed/text/text_embedding_base.py +75 -0
- fastembed_bio-0.1.0.dist-info/METADATA +339 -0
- fastembed_bio-0.1.0.dist-info/RECORD +66 -0
- fastembed_bio-0.1.0.dist-info/WHEEL +4 -0
- fastembed_bio-0.1.0.dist-info/licenses/LICENSE +201 -0
- fastembed_bio-0.1.0.dist-info/licenses/NOTICE +22 -0
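The three hunks that follow match, by their added-line counts, fastembed/sparse/splade_pp.py (+196), fastembed/sparse/utils/minicoil_encoder.py (+146), and fastembed/sparse/utils/sparse_vectors_converter.py (+244) from the file list above. Note that `supported_splade_models` in the first hunk registers the model twice: once as "prithivida/Splade_PP_en_v1" and once under the historical misspelling "prithvida/Splade_PP_en_v1", which looks like a deliberately kept backward-compatible alias. For orientation, a minimal usage sketch of the `SpladePP` class defined there; this is an assumption-laden sketch (it presumes the wheel is installed and the ONNX model can be downloaded from Hugging Face, and the input text is invented):

```python
from fastembed.sparse.splade_pp import SpladePP

# Model name taken from supported_splade_models in the first hunk below
model = SpladePP(model_name="prithivida/Splade_PP_en_v1")

for embedding in model.embed(["a sample document"]):
    # SparseEmbedding carries parallel arrays: non-zero vocab ids and their weights
    print(embedding.indices[:5], embedding.values[:5])
```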
--- /dev/null
+++ fastembed/sparse/splade_pp.py
@@ -0,0 +1,196 @@
+from typing import Any, Iterable, Sequence, Type
+
+import numpy as np
+from fastembed.common import OnnxProvider
+from fastembed.common.onnx_model import OnnxOutputContext
+from fastembed.common.types import Device
+from fastembed.common.utils import define_cache_dir
+from fastembed.sparse.sparse_embedding_base import (
+    SparseEmbedding,
+    SparseTextEmbeddingBase,
+)
+from fastembed.text.onnx_text_model import OnnxTextModel, TextEmbeddingWorker
+from fastembed.common.model_description import SparseModelDescription, ModelSource
+
+supported_splade_models: list[SparseModelDescription] = [
+    SparseModelDescription(
+        model="prithivida/Splade_PP_en_v1",
+        vocab_size=30522,
+        description="Independent Implementation of SPLADE++ Model for English.",
+        license="apache-2.0",
+        size_in_GB=0.532,
+        sources=ModelSource(hf="Qdrant/Splade_PP_en_v1"),
+        model_file="model.onnx",
+    ),
+    SparseModelDescription(
+        model="prithvida/Splade_PP_en_v1",
+        vocab_size=30522,
+        description="Independent Implementation of SPLADE++ Model for English.",
+        license="apache-2.0",
+        size_in_GB=0.532,
+        sources=ModelSource(hf="Qdrant/Splade_PP_en_v1"),
+        model_file="model.onnx",
+    ),
+]
+
+
+class SpladePP(SparseTextEmbeddingBase, OnnxTextModel[SparseEmbedding]):
+    def _post_process_onnx_output(
+        self, output: OnnxOutputContext, **kwargs: Any
+    ) -> Iterable[SparseEmbedding]:
+        if output.attention_mask is None:
+            raise ValueError("attention_mask must be provided for document post-processing")
+
+        relu_log = np.log(1 + np.maximum(output.model_output, 0))
+
+        weighted_log = relu_log * np.expand_dims(output.attention_mask, axis=-1)
+
+        scores = np.max(weighted_log, axis=1)
+
+        # Score matrix of shape (batch_size, vocab_size).
+        # Most of the values are 0; only a few are non-zero.
+        for row_scores in scores:
+            indices = row_scores.nonzero()[0]
+            row_values = row_scores[indices]  # don't shadow the `scores` matrix being iterated
+            yield SparseEmbedding(values=row_values, indices=indices)
+
+    def token_count(
+        self, texts: str | Iterable[str], batch_size: int = 1024, **kwargs: Any
+    ) -> int:
+        return self._token_count(texts, batch_size=batch_size, **kwargs)
+
+    @classmethod
+    def _list_supported_models(cls) -> list[SparseModelDescription]:
+        """Lists the supported models.
+
+        Returns:
+            list[SparseModelDescription]: A list of SparseModelDescription objects containing the model information.
+        """
+        return supported_splade_models
+
+    def __init__(
+        self,
+        model_name: str,
+        cache_dir: str | None = None,
+        threads: int | None = None,
+        providers: Sequence[OnnxProvider] | None = None,
+        cuda: bool | Device = Device.AUTO,
+        device_ids: list[int] | None = None,
+        lazy_load: bool = False,
+        device_id: int | None = None,
+        specific_model_path: str | None = None,
+        **kwargs: Any,
+    ):
+        """
+        Args:
+            model_name (str): The name of the model to use.
+            cache_dir (str, optional): The path to the cache directory.
+                Can be set using the `FASTEMBED_CACHE_PATH` env variable.
+                Defaults to `fastembed_cache` in the system's temp directory.
+            threads (int, optional): The number of threads a single onnxruntime session can use. Defaults to None.
+            providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
+                Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
+            cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers`.
+                Defaults to Device.AUTO.
+            device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
+                workers. Should be used when `cuda` is `True`, `Device.AUTO` or `Device.CUDA`; mutually exclusive
+                with `providers`. Defaults to None.
+            lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
+                Should be set to True when using multiple GPUs and parallel encoding. Defaults to False.
+            device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
+            specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be loaded from somewhere else.
+
+        Raises:
+            ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
+        """
+        super().__init__(model_name, cache_dir, threads, **kwargs)
+        self.providers = providers
+        self.lazy_load = lazy_load
+        self._extra_session_options = self._select_exposed_session_options(kwargs)
+
+        # List of device ids that can be used for data-parallel processing in workers
+        self.device_ids = device_ids
+        self.cuda = cuda
+
+        # This device_id will be used if we need to load the model in the current process
+        self.device_id: int | None = None
+        if device_id is not None:
+            self.device_id = device_id
+        elif self.device_ids is not None:
+            self.device_id = self.device_ids[0]
+
+        self.model_description = self._get_model_description(model_name)
+        self.cache_dir = str(define_cache_dir(cache_dir))
+
+        self._specific_model_path = specific_model_path
+        self._model_dir = self.download_model(
+            self.model_description,
+            self.cache_dir,
+            local_files_only=self._local_files_only,
+            specific_model_path=self._specific_model_path,
+        )
+
+        if not self.lazy_load:
+            self.load_onnx_model()
+
+    def load_onnx_model(self) -> None:
+        self._load_onnx_model(
+            model_dir=self._model_dir,
+            model_file=self.model_description.model_file,
+            threads=self.threads,
+            providers=self.providers,
+            cuda=self.cuda,
+            device_id=self.device_id,
+            extra_session_options=self._extra_session_options,
+        )
+
+    def embed(
+        self,
+        documents: str | Iterable[str],
+        batch_size: int = 256,
+        parallel: int | None = None,
+        **kwargs: Any,
+    ) -> Iterable[SparseEmbedding]:
+        """
+        Encode a list of documents into a list of embeddings.
+        SPLADE++ computes log(1 + ReLU(logits)), masks padding via the attention mask, and max-pools over the sequence, so variable-length inputs are handled.
+
+        Args:
+            documents: Iterator of documents or single document to embed
+            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
+            parallel:
+                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
+                If 0, use all available cores.
+                If None, don't use data-parallel processing, use default onnxruntime threading instead.
+
+        Returns:
+            List of embeddings, one per document
+        """
+        yield from self._embed_documents(
+            model_name=self.model_name,
+            cache_dir=str(self.cache_dir),
+            documents=documents,
+            batch_size=batch_size,
+            parallel=parallel,
+            providers=self.providers,
+            cuda=self.cuda,
+            device_ids=self.device_ids,
+            local_files_only=self._local_files_only,
+            specific_model_path=self._specific_model_path,
+            extra_session_options=self._extra_session_options,
+            **kwargs,
+        )
+
+    @classmethod
+    def _get_worker_class(cls) -> Type[TextEmbeddingWorker[SparseEmbedding]]:
+        return SpladePPEmbeddingWorker
+
+
+class SpladePPEmbeddingWorker(TextEmbeddingWorker[SparseEmbedding]):
+    def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> SpladePP:
+        return SpladePP(
+            model_name=model_name,
+            cache_dir=cache_dir,
+            threads=1,
+            **kwargs,
+        )
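To make the post-processing above concrete, here is a standalone numpy sketch of the same transform with invented shapes and random inputs; only the three lines in the middle mirror `_post_process_onnx_output`:

```python
import numpy as np

# Toy logits: batch of 2 sequences, 3 tokens each, vocab of 5 (all invented)
model_output = np.random.randn(2, 3, 5).astype(np.float32)
attention_mask = np.array([[1, 1, 0], [1, 1, 1]])  # first sequence is padded

relu_log = np.log(1 + np.maximum(model_output, 0))                 # log-saturated ReLU
weighted_log = relu_log * np.expand_dims(attention_mask, axis=-1)  # zero out padding
scores = np.max(weighted_log, axis=1)                              # max-pool tokens -> (2, 5)

for row_scores in scores:
    indices = row_scores.nonzero()[0]    # vocab ids with non-zero weight
    print(indices, row_scores[indices])  # one sparse document vector
```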
--- /dev/null
+++ fastembed/sparse/utils/minicoil_encoder.py
@@ -0,0 +1,146 @@
+"""
+Pure numpy implementation of the encoder model for a single word.
+
+This model is not trainable, and should only be used for inference.
+"""
+
+import numpy as np
+from fastembed.common.types import NumpyArray
+
+
+class Encoder:
+    """
+    Encoder(weights) with weights of shape (vocab_size=10000, input_dim=768, output_dim=4)
+
+    will look like this:
+
+
+        Per-word
+        Encoder Matrix
+    ┌─────────────────────┐
+    │ Token Embedding(768)├──────┐       (10k, 768, 4)
+    └─────────────────────┘      │         ┌─────────┐
+                                 │         │         │
+    ┌─────────────────────┐      │       ┌─┴───────┐ │
+    │                     │      │       │         │ │
+    └─────────────────────┘      │     ┌─┴───────┐ │ │      ┌─────────┐
+                                 └────►│         │ │ ├─────►│ Tanh    │
+    ┌─────────────────────┐            │         │ │ │      └─────────┘
+    │                     │            │         ├─┘ │
+    └─────────────────────┘            │         ├───┘
+                                       │         │
+    ┌─────────────────────┐            └─────────┘
+    │                     │
+    └─────────────────────┘
+
+    Final linear transformation is accompanied by a non-linear activation function: Tanh.
+
38
|
+
Tanh is used to ensure that the output is in the range [-1, 1].
|
|
39
|
+
It would be easier to visually interpret the output of the model, assuming that each dimension
|
|
40
|
+
would need to encode a type of semantic cluster.
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
def __init__(
|
|
44
|
+
self,
|
|
45
|
+
weights: NumpyArray,
|
|
46
|
+
):
|
|
47
|
+
self.weights = weights
|
|
48
|
+
self.vocab_size, self.input_dim, self.output_dim = weights.shape
|
|
49
|
+
|
|
50
|
+
self.encoder_weights: NumpyArray = weights
|
|
51
|
+
|
|
52
|
+
# Activation function
|
|
53
|
+
self.activation = np.tanh
|
|
54
|
+
|
|
55
|
+
@staticmethod
|
|
56
|
+
def convert_vocab_ids(vocab_ids: NumpyArray) -> NumpyArray:
|
|
57
|
+
"""
|
|
58
|
+
Convert vocab_ids of shape (batch_size, seq_len) into (batch_size, seq_len, 2)
|
|
59
|
+
by appending batch_id alongside each vocab_id.
|
|
60
|
+
"""
|
|
61
|
+
batch_size, seq_len = vocab_ids.shape
|
|
62
|
+
batch_ids = np.arange(batch_size, dtype=vocab_ids.dtype).reshape(batch_size, 1)
|
|
63
|
+
batch_ids = np.repeat(batch_ids, seq_len, axis=1)
|
|
64
|
+
# Stack vocab_ids and batch_ids along the last dimension
|
|
65
|
+
combined: NumpyArray = np.stack((vocab_ids, batch_ids), axis=2).astype(np.int32)
|
|
66
|
+
return combined
|
|
67
|
+
|
|
68
|
+
@classmethod
|
|
69
|
+
def avg_by_vocab_ids(
|
|
70
|
+
cls, vocab_ids: NumpyArray, embeddings: NumpyArray
|
|
71
|
+
) -> tuple[NumpyArray, NumpyArray]:
|
|
72
|
+
"""
|
|
73
|
+
Takes:
|
|
74
|
+
vocab_ids: (batch_size, seq_len) int array
|
|
75
|
+
embeddings: (batch_size, seq_len, input_dim) float array
|
|
76
|
+
|
|
77
|
+
Returns:
|
|
78
|
+
unique_flattened_vocab_ids: (total_unique, 2) array of [vocab_id, batch_id]
|
|
79
|
+
unique_flattened_embeddings: (total_unique, input_dim) averaged embeddings
|
|
80
|
+
"""
|
|
81
|
+
input_dim = embeddings.shape[2]
|
|
82
|
+
|
|
83
|
+
# Flatten vocab_ids and embeddings
|
|
84
|
+
# flattened_vocab_ids: (batch_size*seq_len, 2)
|
|
85
|
+
flattened_vocab_ids = cls.convert_vocab_ids(vocab_ids).reshape(-1, 2)
|
|
86
|
+
|
|
87
|
+
# flattened_embeddings: (batch_size*seq_len, input_dim)
|
|
88
|
+
flattened_embeddings = embeddings.reshape(-1, input_dim)
|
|
89
|
+
|
|
90
|
+
# Find unique (vocab_id, batch_id) pairs
|
|
91
|
+
unique_flattened_vocab_ids, inverse_indices = np.unique(
|
|
92
|
+
flattened_vocab_ids, axis=0, return_inverse=True
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
# Prepare arrays to accumulate sums
|
|
96
|
+
unique_count = unique_flattened_vocab_ids.shape[0]
|
|
97
|
+
unique_flattened_embeddings = np.zeros((unique_count, input_dim), dtype=np.float32)
|
|
98
|
+
unique_flattened_count = np.zeros(unique_count, dtype=np.int32)
|
|
99
|
+
|
|
100
|
+
# Use np.add.at to accumulate sums based on inverse indices
|
|
101
|
+
np.add.at(unique_flattened_embeddings, inverse_indices, flattened_embeddings)
|
|
102
|
+
np.add.at(unique_flattened_count, inverse_indices, 1)
|
|
103
|
+
|
|
104
|
+
# Compute averages
|
|
105
|
+
unique_flattened_embeddings /= unique_flattened_count[:, None]
|
|
106
|
+
|
|
107
|
+
return unique_flattened_vocab_ids.astype(np.int32), unique_flattened_embeddings.astype(
|
|
108
|
+
np.float32
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
def forward(
|
|
112
|
+
self, vocab_ids: NumpyArray, embeddings: NumpyArray
|
|
113
|
+
) -> tuple[NumpyArray, NumpyArray]:
|
|
114
|
+
"""
|
|
115
|
+
Args:
|
|
116
|
+
vocab_ids: (batch_size, seq_len) int array
|
|
117
|
+
embeddings: (batch_size, seq_len, input_dim) float array
|
|
118
|
+
|
|
119
|
+
Returns:
|
|
120
|
+
unique_flattened_vocab_ids_and_batch_ids: (total_unique, 2)
|
|
121
|
+
unique_flattened_encoded: (total_unique, output_dim)
|
|
122
|
+
"""
|
|
123
|
+
# Average embeddings for duplicate vocab_ids
|
|
124
|
+
unique_flattened_vocab_ids_and_batch_ids, unique_flattened_embeddings = (
|
|
125
|
+
self.avg_by_vocab_ids(vocab_ids, embeddings)
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
# Select the encoder weights for each unique vocab_id
|
|
129
|
+
unique_flattened_vocab_ids = unique_flattened_vocab_ids_and_batch_ids[:, 0].astype(
|
|
130
|
+
np.int32
|
|
131
|
+
)
|
|
132
|
+
|
|
133
|
+
# unique_encoder_weights: (total_unique, input_dim, output_dim)
|
|
134
|
+
unique_encoder_weights = self.encoder_weights[unique_flattened_vocab_ids]
|
|
135
|
+
|
|
136
|
+
# Compute linear transform: (total_unique, output_dim)
|
|
137
|
+
# Using Einstein summation for matrix multiplication:
|
|
138
|
+
# 'bi,bio->bo' means: for each "b" (batch element), multiply embeddings (b,i) by weights (b,i,o) -> (b,o)
|
|
139
|
+
unique_flattened_encoded = np.einsum(
|
|
140
|
+
"bi,bio->bo", unique_flattened_embeddings, unique_encoder_weights
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
# Apply Tanh activation and ensure float32 type
|
|
144
|
+
unique_flattened_encoded = self.activation(unique_flattened_encoded).astype(np.float32)
|
|
145
|
+
|
|
146
|
+
return unique_flattened_vocab_ids_and_batch_ids.astype(np.int32), unique_flattened_encoded
|
|
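As a quick sanity check of the `Encoder` above, one can drive it with random weights; all dimensions and inputs below are invented, and the import path is inferred from the file list at the top:

```python
import numpy as np
from fastembed.sparse.utils.minicoil_encoder import Encoder

vocab_size, input_dim, output_dim = 100, 8, 4  # toy sizes
weights = np.random.randn(vocab_size, input_dim, output_dim).astype(np.float32)
encoder = Encoder(weights)

vocab_ids = np.array([[3, 7, 3], [5, 5, 9]], dtype=np.int32)  # tokens 3 and 5 repeat
token_embeddings = np.random.randn(2, 3, input_dim).astype(np.float32)

ids_and_batch, encoded = encoder.forward(vocab_ids, token_embeddings)
# Duplicate (vocab_id, batch_id) pairs are averaged first, so 4 unique rows remain,
# each mapped through its own per-word matrix and squashed by tanh into [-1, 1]:
print(ids_and_batch.shape, encoded.shape)  # (4, 2), (4, 4)
```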
--- /dev/null
+++ fastembed/sparse/utils/sparse_vectors_converter.py
@@ -0,0 +1,244 @@
+import copy
+from dataclasses import dataclass
+
+import mmh3
+import numpy as np
+from py_rust_stemmers import SnowballStemmer
+
+from fastembed.common.utils import get_all_punctuation, remove_non_alphanumeric
+from fastembed.sparse.sparse_embedding_base import SparseEmbedding
+
+GAP = 32000
+INT32_MAX = 2**31 - 1
+
+
+@dataclass
+class WordEmbedding:
+    word: str
+    forms: list[str]
+    count: int
+    word_id: int
+    embedding: list[float]
+
+
+class SparseVectorConverter:
+    def __init__(
+        self,
+        stopwords: set[str],
+        stemmer: SnowballStemmer,
+        k: float = 1.2,
+        b: float = 0.75,
+        avg_len: float = 150.0,
+    ):
+        punctuation = set(get_all_punctuation())
+        special_tokens = {"[CLS]", "[SEP]", "[PAD]", "[UNK]", "[MASK]"}
+
+        self.stemmer = stemmer
+        self.unwanted_tokens = punctuation | special_tokens | stopwords
+
+        self.k = k
+        self.b = b
+        self.avg_len = avg_len
+
+    @classmethod
+    def unkn_word_token_id(
+        cls, word: str, shift: int
+    ) -> int:  # 2-3 words can collide in 1 index with this mapping, not considering mm3 collisions
+        token_hash = abs(mmh3.hash(word))
+
+        range_size = INT32_MAX - shift
+        remapped_hash = shift + (token_hash % range_size)
+
+        return remapped_hash
+
+    def bm25_tf(self, num_occurrences: int, sentence_len: int) -> float:
+        res = num_occurrences * (self.k + 1)
+        res /= num_occurrences + self.k * (1 - self.b + self.b * sentence_len / self.avg_len)
+        return res
+
+    @classmethod
+    def normalize_vector(cls, vector: list[float]) -> list[float]:
+        norm = sum([x**2 for x in vector]) ** 0.5
+        if norm < 1e-8:
+            return vector
+        return [x / norm for x in vector]
+
+    def clean_words(
+        self, sentence_embedding: dict[str, WordEmbedding], token_max_length: int = 40
+    ) -> dict[str, WordEmbedding]:
+        """
+        Clean a miniCOIL-produced sentence_embedding, so that tokens unknown to miniCOIL's stemmer
+        fully resemble our BM25 token representation.
+
+        sentence_embedding = {"9°": {"word": "9°", "word_id": -1, "count": 2, "embedding": [1], "forms": ["9°"]},
+            "9": {"word": "9", "word_id": -1, "count": 2, "embedding": [1], "forms": ["9"]},
+            "bat": {"word": "bat", "word_id": 2, "count": 3, "embedding": [0.2, 0.1, -0.2, -0.2], "forms": ["bats", "bat"]},
+            "9°9": {"word": "9°9", "word_id": -1, "count": 1, "embedding": [1], "forms": ["9°9"]},
+            "screech": {"word": "screech", "word_id": -1, "count": 1, "embedding": [1], "forms": ["screech"]},
+            "screeched": {"word": "screeched", "word_id": -1, "count": 1, "embedding": [1], "forms": ["screeched"]}
+        }
+        cleaned_embedding_ground_truth = {
+            "9": {"word": "9", "word_id": -1, "count": 6, "embedding": [1], "forms": ["9°", "9", "9°9", "9°9"]},
+            "bat": {"word": "bat", "word_id": 2, "count": 3, "embedding": [0.2, 0.1, -0.2, -0.2], "forms": ["bats", "bat"]},
+            "screech": {"word": "screech", "word_id": -1, "count": 2, "embedding": [1], "forms": ["screech", "screeched"]}
+        }
+        """
+
+        new_sentence_embedding: dict[str, WordEmbedding] = {}
+
+        for word, embedding in sentence_embedding.items():
+            # embedding = {
+            #     "word": "vector",
+            #     "forms": ["vector", "vectors"],
+            #     "count": 2,
+            #     "word_id": 1231,
+            #     "embedding": [0.1, 0.2, 0.3, 0.4]
+            # }
+            if embedding.word_id > 0:
+                # Known word, no need to clean
+                new_sentence_embedding[word] = embedding
+            else:
+                # Unknown word
+                if word in self.unwanted_tokens:
+                    continue
+
+                # Example complex word split:
+                # word = `word^vec`
+                word_cleaned = remove_non_alphanumeric(word).strip()
+                # word_cleaned = `word vec`
+
+                if len(word_cleaned) > 0:
+                    # Subwords: ['word', 'vec']
+                    for subword in word_cleaned.split():
+                        stemmed_subword: str = self.stemmer.stem_word(subword)
+                        if (
+                            len(stemmed_subword) <= token_max_length
+                            and stemmed_subword not in self.unwanted_tokens
+                        ):
+                            if stemmed_subword not in new_sentence_embedding:
+                                new_sentence_embedding[stemmed_subword] = copy.deepcopy(embedding)
+                                new_sentence_embedding[stemmed_subword].word = stemmed_subword
+                            else:
+                                new_sentence_embedding[stemmed_subword].count += embedding.count
+                                new_sentence_embedding[stemmed_subword].forms += embedding.forms
+
+        return new_sentence_embedding
+
+    def embedding_to_vector(
+        self,
+        sentence_embedding: dict[str, WordEmbedding],
+        embedding_size: int,
+        vocab_size: int,
+    ) -> SparseEmbedding:
+        """
+        Convert miniCOIL sentence embedding to Qdrant sparse vector
+
+        Example input:
+
+        ```
+        {
+            "vector": WordEmbedding({ // Vocabulary word, encoded with miniCOIL normally
+                "word": "vector",
+                "forms": ["vector", "vectors"],
+                "count": 2,
+                "word_id": 1231,
+                "embedding": [0.1, 0.2, 0.3, 0.4]
+            }),
+            "axiotic": WordEmbedding({ // Out-of-vocabulary word, fallback to BM25
+                "word": "axiotic",
+                "forms": ["axiotics"],
+                "count": 1,
+                "word_id": -1,
+            })
+        }
+        ```
+
+        """
+
+        indices: list[int] = []
+        values: list[float] = []
+
+        # Example:
+        # vocab_size = 10000
+        # embedding_size = 4
+        # GAP = 32000
+        #
+        # We want to start the random-words section from a bucket that is guaranteed to not
+        # include any vocab words.
+        # We need (vocab_size * embedding_size) slots for vocab words.
+        # Therefore we need (vocab_size * embedding_size) // GAP + 1 buckets for vocab words.
+        # Therefore, we can start random words from bucket (vocab_size * embedding_size) // GAP + 1 + 1
+
+        # ID at which the scope of OOV words starts
+        unknown_words_shift = ((vocab_size * embedding_size) // GAP + 2) * GAP
+        sentence_embedding_cleaned = self.clean_words(sentence_embedding)
+
+        # Calculate sentence length after cleaning
+        sentence_len = 0
+        for embedding in sentence_embedding_cleaned.values():
+            sentence_len += embedding.count
+
+        for embedding in sentence_embedding_cleaned.values():
+            word_id = embedding.word_id
+            num_occurrences = embedding.count
+            tf = self.bm25_tf(num_occurrences, sentence_len)
+            if (
+                word_id > 0
+            ):  # miniCOIL starts with ID 1; we generally won't have word_id == 0 (UNK), as we don't add
+                # these words to sentence_embedding
+                embedding_values = embedding.embedding
+                normalized_embedding = self.normalize_vector(embedding_values)
+
+                for val_id, value in enumerate(normalized_embedding):
+                    indices.append(
+                        word_id * embedding_size + val_id
+                    )  # since miniCOIL IDs start with 1
+                    values.append(value * tf)
+            else:
+                indices.append(self.unkn_word_token_id(embedding.word, unknown_words_shift))
+                values.append(tf)
+
+        return SparseEmbedding(
+            indices=np.array(indices, dtype=np.int32),
+            values=np.array(values, dtype=np.float32),
+        )
+
+    def embedding_to_vector_query(
+        self,
+        sentence_embedding: dict[str, WordEmbedding],
+        embedding_size: int,
+        vocab_size: int,
+    ) -> SparseEmbedding:
+        """
+        Same as `embedding_to_vector`, but no TF
+        """
+
+        indices: list[int] = []
+        values: list[float] = []
+
+        # ID at which the scope of OOV words starts
+        unknown_words_shift = ((vocab_size * embedding_size) // GAP + 2) * GAP
+
+        sentence_embedding_cleaned = self.clean_words(sentence_embedding)
+
+        for embedding in sentence_embedding_cleaned.values():
+            word_id = embedding.word_id
+            tf = 1.0
+
+            if word_id > 0:  # miniCOIL starts with ID 1
+                embedding_values = embedding.embedding
+                normalized_embedding = self.normalize_vector(embedding_values)
+
+                for val_id, value in enumerate(normalized_embedding):
+                    indices.append(
+                        word_id * embedding_size + val_id
+                    )  # since miniCOIL IDs start with 1
+                    values.append(value * tf)
+            else:
+                indices.append(self.unkn_word_token_id(embedding.word, unknown_words_shift))
+                values.append(tf)
+
+        return SparseEmbedding(
+            indices=np.array(indices, dtype=np.int32),
+            values=np.array(values, dtype=np.float32),
+        )
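A short worked sketch of the index layout and BM25 weighting implemented above; the stopword set and inputs are invented, and the constants are the module's own:

```python
from py_rust_stemmers import SnowballStemmer
from fastembed.sparse.utils.sparse_vectors_converter import GAP, SparseVectorConverter

converter = SparseVectorConverter(stopwords={"the", "a"}, stemmer=SnowballStemmer("english"))

# For vocab_size=10000, embedding_size=4, vocab words occupy indices below roughly 10000 * 4,
# i.e. buckets 0 and 1 of width GAP=32000, so OOV ids start at bucket 3 (bucket 2 is a guard band):
unknown_words_shift = ((10000 * 4) // GAP + 2) * GAP
print(unknown_words_shift)  # 96000

# An out-of-vocabulary word is hashed into [96000, INT32_MAX):
print(converter.unkn_word_token_id("axiotic", unknown_words_shift))

# BM25 term frequency saturates as a word repeats (k=1.2, b=0.75, avg_len=150):
for n in (1, 2, 5, 20):
    print(n, round(converter.bm25_tf(n, sentence_len=10), 3))  # approaches k + 1 = 2.2
```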