fastembed-bio 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fastembed/__init__.py +24 -0
- fastembed/bio/__init__.py +3 -0
- fastembed/bio/protein_embedding.py +456 -0
- fastembed/common/__init__.py +3 -0
- fastembed/common/model_description.py +52 -0
- fastembed/common/model_management.py +471 -0
- fastembed/common/onnx_model.py +188 -0
- fastembed/common/preprocessor_utils.py +84 -0
- fastembed/common/types.py +27 -0
- fastembed/common/utils.py +69 -0
- fastembed/embedding.py +24 -0
- fastembed/image/__init__.py +3 -0
- fastembed/image/image_embedding.py +135 -0
- fastembed/image/image_embedding_base.py +55 -0
- fastembed/image/onnx_embedding.py +217 -0
- fastembed/image/onnx_image_model.py +156 -0
- fastembed/image/transform/functional.py +221 -0
- fastembed/image/transform/operators.py +499 -0
- fastembed/late_interaction/__init__.py +5 -0
- fastembed/late_interaction/colbert.py +301 -0
- fastembed/late_interaction/jina_colbert.py +58 -0
- fastembed/late_interaction/late_interaction_embedding_base.py +80 -0
- fastembed/late_interaction/late_interaction_text_embedding.py +180 -0
- fastembed/late_interaction/token_embeddings.py +83 -0
- fastembed/late_interaction_multimodal/__init__.py +5 -0
- fastembed/late_interaction_multimodal/colmodernvbert.py +532 -0
- fastembed/late_interaction_multimodal/colpali.py +327 -0
- fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py +189 -0
- fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py +86 -0
- fastembed/late_interaction_multimodal/onnx_multimodal_model.py +291 -0
- fastembed/parallel_processor.py +253 -0
- fastembed/postprocess/__init__.py +3 -0
- fastembed/postprocess/muvera.py +362 -0
- fastembed/py.typed +1 -0
- fastembed/rerank/cross_encoder/__init__.py +3 -0
- fastembed/rerank/cross_encoder/custom_text_cross_encoder.py +47 -0
- fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py +239 -0
- fastembed/rerank/cross_encoder/onnx_text_model.py +204 -0
- fastembed/rerank/cross_encoder/text_cross_encoder.py +178 -0
- fastembed/rerank/cross_encoder/text_cross_encoder_base.py +63 -0
- fastembed/sparse/__init__.py +4 -0
- fastembed/sparse/bm25.py +359 -0
- fastembed/sparse/bm42.py +369 -0
- fastembed/sparse/minicoil.py +372 -0
- fastembed/sparse/sparse_embedding_base.py +90 -0
- fastembed/sparse/sparse_text_embedding.py +143 -0
- fastembed/sparse/splade_pp.py +196 -0
- fastembed/sparse/utils/minicoil_encoder.py +146 -0
- fastembed/sparse/utils/sparse_vectors_converter.py +244 -0
- fastembed/sparse/utils/tokenizer.py +120 -0
- fastembed/sparse/utils/vocab_resolver.py +202 -0
- fastembed/text/__init__.py +3 -0
- fastembed/text/clip_embedding.py +56 -0
- fastembed/text/custom_text_embedding.py +97 -0
- fastembed/text/multitask_embedding.py +109 -0
- fastembed/text/onnx_embedding.py +353 -0
- fastembed/text/onnx_text_model.py +180 -0
- fastembed/text/pooled_embedding.py +136 -0
- fastembed/text/pooled_normalized_embedding.py +164 -0
- fastembed/text/text_embedding.py +228 -0
- fastembed/text/text_embedding_base.py +75 -0
- fastembed_bio-0.1.0.dist-info/METADATA +339 -0
- fastembed_bio-0.1.0.dist-info/RECORD +66 -0
- fastembed_bio-0.1.0.dist-info/WHEEL +4 -0
- fastembed_bio-0.1.0.dist-info/licenses/LICENSE +201 -0
- fastembed_bio-0.1.0.dist-info/licenses/NOTICE +22 -0
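The layout mirrors upstream fastembed's module tree, with a new `fastembed/bio` package added for protein embeddings. As a smoke test, the standard text entry point should still work; a minimal sketch, assuming this fork keeps upstream's `TextEmbedding` API and default model (the class name inside `fastembed/bio/protein_embedding.py` is not visible in this listing, so it is not shown):

from fastembed import TextEmbedding

# Assumes the upstream default dense model is still available in this fork.
model = TextEmbedding("BAAI/bge-small-en-v1.5")
vectors = list(model.embed(["smoke test for the fastembed-bio wheel"]))
print(vectors[0].shape)  # (384,) for bge-small-en-v1.5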
fastembed/late_interaction_multimodal/colpali.py
@@ -0,0 +1,327 @@
from typing import Any, Iterable, Sequence, Type

import numpy as np
from tokenizers import Encoding

from fastembed.common import OnnxProvider, ImageInput
from fastembed.common.onnx_model import OnnxOutputContext
from fastembed.common.types import NumpyArray, Device
from fastembed.common.utils import define_cache_dir, iter_batch
from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import (
    LateInteractionMultimodalEmbeddingBase,
)
from fastembed.late_interaction_multimodal.onnx_multimodal_model import (
    OnnxMultimodalModel,
    TextEmbeddingWorker,
    ImageEmbeddingWorker,
)
from fastembed.common.model_description import DenseModelDescription, ModelSource

supported_colpali_models: list[DenseModelDescription] = [
    DenseModelDescription(
        model="Qdrant/colpali-v1.3-fp16",
        dim=128,
        description="Text embeddings, Multimodal (text&image), English, 50 tokens query length truncation, 2024.",
        license="mit",
        size_in_GB=6.5,
        sources=ModelSource(hf="Qdrant/colpali-v1.3-fp16"),
        additional_files=["model.onnx_data"],
        model_file="model.onnx",
    ),
]


class ColPali(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel[NumpyArray]):
    QUERY_PREFIX = "Query: "
    BOS_TOKEN = "<s>"
    PAD_TOKEN = "<pad>"
    QUERY_MARKER_TOKEN_ID = [2, 5098]
    IMAGE_PLACEHOLDER_SIZE = (3, 448, 448)
    EMPTY_TEXT_PLACEHOLDER = np.array(
        [257152] * 1024 + [2, 50721, 573, 2416, 235265, 108]
    )  # Tokenization of '<image>' * 1024 + '<bos>Describe the image.\n', used as a placeholder
    # while processing an image
    EVEN_ATTENTION_MASK = np.array([1] * 1030)

    def __init__(
        self,
        model_name: str,
        cache_dir: str | None = None,
        threads: int | None = None,
        providers: Sequence[OnnxProvider] | None = None,
        cuda: bool | Device = Device.AUTO,
        device_ids: list[int] | None = None,
        lazy_load: bool = False,
        device_id: int | None = None,
        specific_model_path: str | None = None,
        **kwargs: Any,
    ):
        """
        Args:
            model_name (str): The name of the model to use.
            cache_dir (str, optional): The path to the cache directory.
                Can be set using the `FASTEMBED_CACHE_PATH` env variable.
                Defaults to `fastembed_cache` in the system's temp directory.
            threads (int, optional): The number of threads a single onnxruntime session can use. Defaults to None.
            providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
                Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
            cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers`.
                Defaults to Device.AUTO.
            device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
                workers. Should be used with `cuda` equal to `True`, `Device.AUTO` or `Device.CUDA`; mutually exclusive
                with `providers`. Defaults to None.
            lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
                Should be set to True when using multiple GPUs and parallel encoding. Defaults to False.
            device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
            specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should
                be imported from somewhere else. Defaults to None.

        Raises:
            ValueError: If the model_name is not in the format <org>/<model>, e.g. BAAI/bge-base-en.
        """

        super().__init__(model_name, cache_dir, threads, **kwargs)
        self.providers = providers
        self.lazy_load = lazy_load
        self._extra_session_options = self._select_exposed_session_options(kwargs)

        # List of device ids that can be used for data parallel processing in workers
        self.device_ids = device_ids
        self.cuda = cuda

        # This device_id will be used if we need to load the model in the current process
        self.device_id: int | None = None
        if device_id is not None:
            self.device_id = device_id
        elif self.device_ids is not None:
            self.device_id = self.device_ids[0]

        self.model_description = self._get_model_description(model_name)
        self.cache_dir = str(define_cache_dir(cache_dir))

        self._specific_model_path = specific_model_path
        self._model_dir = self.download_model(
            self.model_description,
            self.cache_dir,
            local_files_only=self._local_files_only,
            specific_model_path=self._specific_model_path,
        )
        self.mask_token_id = None
        self.pad_token_id = None

        if not self.lazy_load:
            self.load_onnx_model()

    @classmethod
    def _list_supported_models(cls) -> list[DenseModelDescription]:
        """Lists the supported models.

        Returns:
            list[DenseModelDescription]: A list of DenseModelDescription objects containing the model information.
        """
        return supported_colpali_models

    def load_onnx_model(self) -> None:
        self._load_onnx_model(
            model_dir=self._model_dir,
            model_file=self.model_description.model_file,
            threads=self.threads,
            providers=self.providers,
            cuda=self.cuda,
            device_id=self.device_id,
            extra_session_options=self._extra_session_options,
        )

    def _post_process_onnx_image_output(
        self,
        output: OnnxOutputContext,
    ) -> Iterable[NumpyArray]:
        """
        Post-process the ONNX model output to convert it into a usable format.

        Args:
            output (OnnxOutputContext): The raw output from the ONNX model.

        Returns:
            Iterable[NumpyArray]: Post-processed output as NumPy arrays.
        """
        assert self.model_description.dim is not None, "Model dim is not defined"
        return output.model_output.reshape(
            output.model_output.shape[0], -1, self.model_description.dim
        )

    def _post_process_onnx_text_output(
        self,
        output: OnnxOutputContext,
    ) -> Iterable[NumpyArray]:
        """
        Post-process the ONNX model output to convert it into a usable format.

        Args:
            output (OnnxOutputContext): The raw output from the ONNX model.

        Returns:
            Iterable[NumpyArray]: Post-processed output as NumPy arrays.
        """
        return output.model_output

    def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
        texts_query: list[str] = []
        for query in documents:
            query = self.BOS_TOKEN + self.QUERY_PREFIX + query + self.PAD_TOKEN * 10
            query += "\n"

            texts_query.append(query)
        encoded = self.tokenizer.encode_batch(texts_query)  # type: ignore[union-attr]
        return encoded

    def token_count(
        self,
        texts: str | Iterable[str],
        batch_size: int = 1024,
        include_extension: bool = False,
        **kwargs: Any,
    ) -> int:
        if not hasattr(self, "model") or self.model is None:
            self.load_onnx_model()  # loads the tokenizer as well
        token_num = 0
        texts = [texts] if isinstance(texts, str) else texts
        assert self.tokenizer is not None
        tokenize_func = self.tokenize if include_extension else self.tokenizer.encode_batch
        for batch in iter_batch(texts, batch_size):
            token_num += sum(sum(encoding.attention_mask) for encoding in tokenize_func(batch))
        return token_num

    def _preprocess_onnx_text_input(
        self, onnx_input: dict[str, NumpyArray], **kwargs: Any
    ) -> dict[str, NumpyArray]:
        onnx_input["input_ids"] = np.array(
            [
                self.QUERY_MARKER_TOKEN_ID + input_ids[2:].tolist()  # type: ignore[index]
                for input_ids in onnx_input["input_ids"]
            ]
        )
        empty_image_placeholder: NumpyArray = np.zeros(
            self.IMAGE_PLACEHOLDER_SIZE, dtype=np.float32
        )
        onnx_input["pixel_values"] = np.array(
            [empty_image_placeholder for _ in onnx_input["input_ids"]],
        )
        return onnx_input

    def _preprocess_onnx_image_input(
        self, onnx_input: dict[str, np.ndarray], **kwargs: Any
    ) -> dict[str, NumpyArray]:
        """
        Add placeholders for text input when processing image data for ONNX.

        Args:
            onnx_input (dict[str, NumpyArray]): Preprocessed image inputs.
            **kwargs: Additional arguments.

        Returns:
            dict[str, NumpyArray]: ONNX input with text placeholders.
        """
        onnx_input["input_ids"] = np.array(
            [self.EMPTY_TEXT_PLACEHOLDER for _ in onnx_input["pixel_values"]]
        )
        onnx_input["attention_mask"] = np.array(
            [self.EVEN_ATTENTION_MASK for _ in onnx_input["pixel_values"]]
        )
        return onnx_input

    def embed_text(
        self,
        documents: str | Iterable[str],
        batch_size: int = 256,
        parallel: int | None = None,
        **kwargs: Any,
    ) -> Iterable[NumpyArray]:
        """
        Encode a list of documents into a list of embeddings.

        Args:
            documents: Iterator of documents or single document to embed
            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
            parallel:
                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
                If 0, use all available cores.
                If None, don't use data-parallel processing, use default onnxruntime threading instead.

        Returns:
            List of embeddings, one per document
        """
        yield from self._embed_documents(
            model_name=self.model_name,
            cache_dir=str(self.cache_dir),
            documents=documents,
            batch_size=batch_size,
            parallel=parallel,
            providers=self.providers,
            cuda=self.cuda,
            device_ids=self.device_ids,
            local_files_only=self._local_files_only,
            specific_model_path=self._specific_model_path,
            extra_session_options=self._extra_session_options,
            **kwargs,
        )

    def embed_image(
        self,
        images: ImageInput | Iterable[ImageInput],
        batch_size: int = 16,
        parallel: int | None = None,
        **kwargs: Any,
    ) -> Iterable[NumpyArray]:
        """
        Encode a list of images into a list of embeddings.

        Args:
            images: Iterator of image paths or single image path to embed
            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
            parallel:
                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
                If 0, use all available cores.
                If None, don't use data-parallel processing, use default onnxruntime threading instead.

        Returns:
            List of embeddings, one per image
        """
        yield from self._embed_images(
            model_name=self.model_name,
            cache_dir=str(self.cache_dir),
            images=images,
            batch_size=batch_size,
            parallel=parallel,
            providers=self.providers,
            cuda=self.cuda,
            device_ids=self.device_ids,
            local_files_only=self._local_files_only,
            specific_model_path=self._specific_model_path,
            extra_session_options=self._extra_session_options,
            **kwargs,
        )

    @classmethod
    def _get_text_worker_class(cls) -> Type[TextEmbeddingWorker[NumpyArray]]:
        return ColPaliTextEmbeddingWorker

    @classmethod
    def _get_image_worker_class(cls) -> Type[ImageEmbeddingWorker[NumpyArray]]:
        return ColPaliImageEmbeddingWorker


class ColPaliTextEmbeddingWorker(TextEmbeddingWorker[NumpyArray]):
    def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColPali:
        return ColPali(
            model_name=model_name,
            cache_dir=cache_dir,
            threads=1,
            **kwargs,
        )


class ColPaliImageEmbeddingWorker(ImageEmbeddingWorker[NumpyArray]):
    def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColPali:
        return ColPali(
            model_name=model_name,
            cache_dir=cache_dir,
            threads=1,
            **kwargs,
        )
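The class is driven through `embed_text` and `embed_image`. A minimal usage sketch written against the methods visible above (not taken from the package's docs; the image path is hypothetical, and the first run downloads the ~6.5 GB ONNX model):

from fastembed.late_interaction_multimodal.colpali import ColPali

# lazy_load=True defers ONNX session creation until the first embed call.
model = ColPali(model_name="Qdrant/colpali-v1.3-fp16", lazy_load=True)

# Queries are wrapped as "<s>Query: ...<pad>*10\n" by tokenize() before encoding.
query_vectors = list(model.embed_text(["what does the figure describe?"]))

# Images are paired with the fixed EMPTY_TEXT_PLACEHOLDER prompt (see
# _preprocess_onnx_image_input); "page.png" is a hypothetical local file.
image_vectors = list(model.embed_image(["page.png"]))

# Both are token-level multivectors of width dim=128, suited to late-interaction
# (MaxSim) scoring rather than single-vector cosine similarity.
print(query_vectors[0].shape, image_vectors[0].shape)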
fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py
@@ -0,0 +1,189 @@
from typing import Any, Iterable, Sequence, Type
from dataclasses import asdict

from fastembed.common import OnnxProvider, ImageInput
from fastembed.common.types import NumpyArray, Device
from fastembed.late_interaction_multimodal.colpali import ColPali
from fastembed.late_interaction_multimodal.colmodernvbert import ColModernVBERT

from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import (
    LateInteractionMultimodalEmbeddingBase,
)
from fastembed.common.model_description import DenseModelDescription


class LateInteractionMultimodalEmbedding(LateInteractionMultimodalEmbeddingBase):
    EMBEDDINGS_REGISTRY: list[Type[LateInteractionMultimodalEmbeddingBase]] = [
        ColPali,
        ColModernVBERT,
    ]

    @classmethod
    def list_supported_models(cls) -> list[dict[str, Any]]:
        """
        Lists the supported models.

        Returns:
            list[dict[str, Any]]: A list of dictionaries containing the model information.

        Example:
            ```
            [
                {
                    "model": "Qdrant/colpali-v1.3-fp16",
                    "dim": 128,
                    "description": "Text embeddings, Multimodal (text&image), English, 50 tokens query length truncation, 2024.",
                    "license": "mit",
                    "size_in_GB": 6.5,
                    "sources": {
                        "hf": "Qdrant/colpali-v1.3-fp16",
                    },
                    "additional_files": [
                        "model.onnx_data",
                    ],
                    "model_file": "model.onnx",
                },
            ]
            ```
        """
        return [asdict(model) for model in cls._list_supported_models()]

    @classmethod
    def _list_supported_models(cls) -> list[DenseModelDescription]:
        result: list[DenseModelDescription] = []
        for embedding in cls.EMBEDDINGS_REGISTRY:
            result.extend(embedding._list_supported_models())
        return result

    def __init__(
        self,
        model_name: str,
        cache_dir: str | None = None,
        threads: int | None = None,
        providers: Sequence[OnnxProvider] | None = None,
        cuda: bool | Device = Device.AUTO,
        device_ids: list[int] | None = None,
        lazy_load: bool = False,
        **kwargs: Any,
    ):
        super().__init__(model_name, cache_dir, threads, **kwargs)
        for EMBEDDING_MODEL_TYPE in self.EMBEDDINGS_REGISTRY:
            supported_models = EMBEDDING_MODEL_TYPE._list_supported_models()
            if any(model_name.lower() == model.model.lower() for model in supported_models):
                self.model = EMBEDDING_MODEL_TYPE(
                    model_name,
                    cache_dir,
                    threads=threads,
                    providers=providers,
                    cuda=cuda,
                    device_ids=device_ids,
                    lazy_load=lazy_load,
                    **kwargs,
                )
                return

        raise ValueError(
            f"Model {model_name} is not supported in LateInteractionMultimodalEmbedding. "
            "Please check the supported models using `LateInteractionMultimodalEmbedding.list_supported_models()`"
        )

    @property
    def embedding_size(self) -> int:
        """Get the embedding size of the current model"""
        if self._embedding_size is None:
            self._embedding_size = self.get_embedding_size(self.model_name)
        return self._embedding_size

    @classmethod
    def get_embedding_size(cls, model_name: str) -> int:
        """Get the embedding size of the passed model

        Args:
            model_name (str): The name of the model to get the embedding size for.

        Returns:
            int: The size of the embedding.

        Raises:
            ValueError: If the model name is not found in the supported models.
        """
        descriptions = cls._list_supported_models()
        embedding_size: int | None = None
        for description in descriptions:
            if description.model.lower() == model_name.lower():
                embedding_size = description.dim
                break
        if embedding_size is None:
            model_names = [description.model for description in descriptions]
            raise ValueError(
                f"Embedding size for model {model_name} was None. "
                f"Available model names: {model_names}"
            )
        return embedding_size

    def embed_text(
        self,
        documents: str | Iterable[str],
        batch_size: int = 256,
        parallel: int | None = None,
        **kwargs: Any,
    ) -> Iterable[NumpyArray]:
        """
        Encode a list of documents into a list of embeddings.

        Args:
            documents: Iterator of documents or single document to embed
            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
            parallel:
                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
                If 0, use all available cores.
                If None, don't use data-parallel processing, use default onnxruntime threading instead.

        Returns:
            List of embeddings, one per document
        """
        yield from self.model.embed_text(documents, batch_size, parallel, **kwargs)

    def embed_image(
        self,
        images: ImageInput | Iterable[ImageInput],
        batch_size: int = 16,
        parallel: int | None = None,
        **kwargs: Any,
    ) -> Iterable[NumpyArray]:
        """
        Encode a list of images into a list of embeddings.

        Args:
            images: Iterator of image paths or single image path to embed
            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
            parallel:
                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
                If 0, use all available cores.
                If None, don't use data-parallel processing, use default onnxruntime threading instead.

        Returns:
            List of embeddings, one per image
        """
        yield from self.model.embed_image(images, batch_size, parallel, **kwargs)

    def token_count(
        self,
        texts: str | Iterable[str],
        batch_size: int = 1024,
        include_extension: bool = False,
        **kwargs: Any,
    ) -> int:
        """Returns the number of tokens in the texts.

        Args:
            texts (str | Iterable[str]): The list of texts to embed.
            batch_size (int): Batch size for encoding
            include_extension (bool): Whether to include tokens added by preprocessing

        Returns:
            int: Sum of the number of tokens in the texts.
        """
        return self.model.token_count(
            texts, batch_size=batch_size, include_extension=include_extension, **kwargs
        )
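`LateInteractionMultimodalEmbedding` is thus a thin facade: it resolves the model name case-insensitively against each backend in `EMBEDDINGS_REGISTRY` and delegates all work to the matching class. A short sketch of the class-level helpers shown above (the printed values follow from the ColPali entry registered earlier):

from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding import (
    LateInteractionMultimodalEmbedding,
)

# Enumerate every model the registered backends (ColPali, ColModernVBERT) expose.
for info in LateInteractionMultimodalEmbedding.list_supported_models():
    print(info["model"], info["dim"])

# Resolves case-insensitively; raises ValueError for unknown model names.
print(LateInteractionMultimodalEmbedding.get_embedding_size("Qdrant/colpali-v1.3-fp16"))  # 128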
fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py
@@ -0,0 +1,86 @@
from typing import Iterable, Any


from fastembed.common import ImageInput
from fastembed.common.model_description import DenseModelDescription
from fastembed.common.model_management import ModelManagement
from fastembed.common.types import NumpyArray


class LateInteractionMultimodalEmbeddingBase(ModelManagement[DenseModelDescription]):
    def __init__(
        self,
        model_name: str,
        cache_dir: str | None = None,
        threads: int | None = None,
        **kwargs: Any,
    ):
        self.model_name = model_name
        self.cache_dir = cache_dir
        self.threads = threads
        self._local_files_only = kwargs.pop("local_files_only", False)
        self._embedding_size: int | None = None

    def embed_text(
        self,
        documents: str | Iterable[str],
        batch_size: int = 256,
        parallel: int | None = None,
        **kwargs: Any,
    ) -> Iterable[NumpyArray]:
        """
        Embeds a list of documents into a list of embeddings.

        Args:
            documents (Iterable[str]): The list of texts to embed.
            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
            parallel:
                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
                If 0, use all available cores.
                If None, don't use data-parallel processing, use default onnxruntime threading instead.
            **kwargs: Additional keyword arguments to pass to the embed method.

        Yields:
            Iterable[NumpyArray]: The embeddings.
        """
        raise NotImplementedError()

    def embed_image(
        self,
        images: ImageInput | Iterable[ImageInput],
        batch_size: int = 16,
        parallel: int | None = None,
        **kwargs: Any,
    ) -> Iterable[NumpyArray]:
        """
        Encode a list of images into a list of embeddings.

        Args:
            images: Iterator of image paths or single image path to embed
            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
            parallel:
                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
                If 0, use all available cores.
                If None, don't use data-parallel processing, use default onnxruntime threading instead.

        Returns:
            List of embeddings, one per image
        """
        raise NotImplementedError()

    @classmethod
    def get_embedding_size(cls, model_name: str) -> int:
        """Returns the embedding size of the chosen model."""
        raise NotImplementedError("Subclasses must implement this method")

    @property
    def embedding_size(self) -> int:
        """Returns the embedding size for the current model."""
        raise NotImplementedError("Subclasses must implement this method")

    def token_count(
        self,
        texts: str | Iterable[str],
        **kwargs: Any,
    ) -> int:
        """Returns the number of tokens in the texts."""
        raise NotImplementedError("Subclasses must implement this method")
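The base class defines the contract the facade relies on. A hypothetical sketch (`MyBackend` is not in the package) of what a new backend would implement before being appended to `LateInteractionMultimodalEmbedding.EMBEDDINGS_REGISTRY`:

from typing import Any, Iterable

from fastembed.common import ImageInput
from fastembed.common.types import NumpyArray
from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import (
    LateInteractionMultimodalEmbeddingBase,
)


class MyBackend(LateInteractionMultimodalEmbeddingBase):  # hypothetical example
    def embed_text(
        self,
        documents: str | Iterable[str],
        batch_size: int = 256,
        parallel: int | None = None,
        **kwargs: Any,
    ) -> Iterable[NumpyArray]:
        raise NotImplementedError  # yield one multivector per document

    def embed_image(
        self,
        images: ImageInput | Iterable[ImageInput],
        batch_size: int = 16,
        parallel: int | None = None,
        **kwargs: Any,
    ) -> Iterable[NumpyArray]:
        raise NotImplementedError  # yield one multivector per image

    # The facade's __init__ also calls _list_supported_models() during dispatch,
    # so a real backend must return its DenseModelDescription entries from that
    # classmethod for its model names to be resolvable.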