fastembed-bio 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. fastembed/__init__.py +24 -0
  2. fastembed/bio/__init__.py +3 -0
  3. fastembed/bio/protein_embedding.py +456 -0
  4. fastembed/common/__init__.py +3 -0
  5. fastembed/common/model_description.py +52 -0
  6. fastembed/common/model_management.py +471 -0
  7. fastembed/common/onnx_model.py +188 -0
  8. fastembed/common/preprocessor_utils.py +84 -0
  9. fastembed/common/types.py +27 -0
  10. fastembed/common/utils.py +69 -0
  11. fastembed/embedding.py +24 -0
  12. fastembed/image/__init__.py +3 -0
  13. fastembed/image/image_embedding.py +135 -0
  14. fastembed/image/image_embedding_base.py +55 -0
  15. fastembed/image/onnx_embedding.py +217 -0
  16. fastembed/image/onnx_image_model.py +156 -0
  17. fastembed/image/transform/functional.py +221 -0
  18. fastembed/image/transform/operators.py +499 -0
  19. fastembed/late_interaction/__init__.py +5 -0
  20. fastembed/late_interaction/colbert.py +301 -0
  21. fastembed/late_interaction/jina_colbert.py +58 -0
  22. fastembed/late_interaction/late_interaction_embedding_base.py +80 -0
  23. fastembed/late_interaction/late_interaction_text_embedding.py +180 -0
  24. fastembed/late_interaction/token_embeddings.py +83 -0
  25. fastembed/late_interaction_multimodal/__init__.py +5 -0
  26. fastembed/late_interaction_multimodal/colmodernvbert.py +532 -0
  27. fastembed/late_interaction_multimodal/colpali.py +327 -0
  28. fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py +189 -0
  29. fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py +86 -0
  30. fastembed/late_interaction_multimodal/onnx_multimodal_model.py +291 -0
  31. fastembed/parallel_processor.py +253 -0
  32. fastembed/postprocess/__init__.py +3 -0
  33. fastembed/postprocess/muvera.py +362 -0
  34. fastembed/py.typed +1 -0
  35. fastembed/rerank/cross_encoder/__init__.py +3 -0
  36. fastembed/rerank/cross_encoder/custom_text_cross_encoder.py +47 -0
  37. fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py +239 -0
  38. fastembed/rerank/cross_encoder/onnx_text_model.py +204 -0
  39. fastembed/rerank/cross_encoder/text_cross_encoder.py +178 -0
  40. fastembed/rerank/cross_encoder/text_cross_encoder_base.py +63 -0
  41. fastembed/sparse/__init__.py +4 -0
  42. fastembed/sparse/bm25.py +359 -0
  43. fastembed/sparse/bm42.py +369 -0
  44. fastembed/sparse/minicoil.py +372 -0
  45. fastembed/sparse/sparse_embedding_base.py +90 -0
  46. fastembed/sparse/sparse_text_embedding.py +143 -0
  47. fastembed/sparse/splade_pp.py +196 -0
  48. fastembed/sparse/utils/minicoil_encoder.py +146 -0
  49. fastembed/sparse/utils/sparse_vectors_converter.py +244 -0
  50. fastembed/sparse/utils/tokenizer.py +120 -0
  51. fastembed/sparse/utils/vocab_resolver.py +202 -0
  52. fastembed/text/__init__.py +3 -0
  53. fastembed/text/clip_embedding.py +56 -0
  54. fastembed/text/custom_text_embedding.py +97 -0
  55. fastembed/text/multitask_embedding.py +109 -0
  56. fastembed/text/onnx_embedding.py +353 -0
  57. fastembed/text/onnx_text_model.py +180 -0
  58. fastembed/text/pooled_embedding.py +136 -0
  59. fastembed/text/pooled_normalized_embedding.py +164 -0
  60. fastembed/text/text_embedding.py +228 -0
  61. fastembed/text/text_embedding_base.py +75 -0
  62. fastembed_bio-0.1.0.dist-info/METADATA +339 -0
  63. fastembed_bio-0.1.0.dist-info/RECORD +66 -0
  64. fastembed_bio-0.1.0.dist-info/WHEEL +4 -0
  65. fastembed_bio-0.1.0.dist-info/licenses/LICENSE +201 -0
  66. fastembed_bio-0.1.0.dist-info/licenses/NOTICE +22 -0
fastembed/late_interaction_multimodal/colpali.py
@@ -0,0 +1,327 @@
+ from typing import Any, Iterable, Sequence, Type
+
+ import numpy as np
+ from tokenizers import Encoding
+
+ from fastembed.common import OnnxProvider, ImageInput
+ from fastembed.common.onnx_model import OnnxOutputContext
+ from fastembed.common.types import NumpyArray, Device
+ from fastembed.common.utils import define_cache_dir, iter_batch
+ from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import (
+     LateInteractionMultimodalEmbeddingBase,
+ )
+ from fastembed.late_interaction_multimodal.onnx_multimodal_model import (
+     OnnxMultimodalModel,
+     TextEmbeddingWorker,
+     ImageEmbeddingWorker,
+ )
+ from fastembed.common.model_description import DenseModelDescription, ModelSource
+
+ supported_colpali_models: list[DenseModelDescription] = [
+     DenseModelDescription(
+         model="Qdrant/colpali-v1.3-fp16",
+         dim=128,
+         description="Text embeddings, Multimodal (text&image), English, 50 tokens query length truncation, 2024.",
+         license="mit",
+         size_in_GB=6.5,
+         sources=ModelSource(hf="Qdrant/colpali-v1.3-fp16"),
+         additional_files=["model.onnx_data"],
+         model_file="model.onnx",
+     ),
+ ]
+
+
+ class ColPali(LateInteractionMultimodalEmbeddingBase, OnnxMultimodalModel[NumpyArray]):
+     QUERY_PREFIX = "Query: "
+     BOS_TOKEN = "<s>"
+     PAD_TOKEN = "<pad>"
+     QUERY_MARKER_TOKEN_ID = [2, 5098]
+     IMAGE_PLACEHOLDER_SIZE = (3, 448, 448)
+     EMPTY_TEXT_PLACEHOLDER = np.array(
+         [257152] * 1024 + [2, 50721, 573, 2416, 235265, 108]
+     )  # Tokenization of '<image>' * 1024 + '<bos>Describe the image.\n',
+     # used as a text placeholder while an image is being processed
+     EVEN_ATTENTION_MASK = np.array([1] * 1030)
+
+     def __init__(
+         self,
+         model_name: str,
+         cache_dir: str | None = None,
+         threads: int | None = None,
+         providers: Sequence[OnnxProvider] | None = None,
+         cuda: bool | Device = Device.AUTO,
+         device_ids: list[int] | None = None,
+         lazy_load: bool = False,
+         device_id: int | None = None,
+         specific_model_path: str | None = None,
+         **kwargs: Any,
+     ):
+ """
60
+ Args:
61
+ model_name (str): The name of the model to use.
62
+ cache_dir (str, optional): The path to the cache directory.
63
+ Can be set using the `FASTEMBED_CACHE_PATH` env variable.
64
+ Defaults to `fastembed_cache` in the system's temp directory.
65
+ threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
66
+ providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
67
+ Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
68
+ cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers`
69
+ Defaults to Device.AUTO.
70
+ device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
71
+ workers. Should be used with `cuda` equals to `True`, `Device.AUTO` or `Device.CUDA`, mutually exclusive
72
+ with `providers`. Defaults to None.
73
+ lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
74
+ Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
75
+ device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
76
+
77
+ Raises:
78
+ ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
79
+ """
+
+         super().__init__(model_name, cache_dir, threads, **kwargs)
+         self.providers = providers
+         self.lazy_load = lazy_load
+         self._extra_session_options = self._select_exposed_session_options(kwargs)
+
+         # List of device ids, that can be used for data parallel processing in workers
+         self.device_ids = device_ids
+         self.cuda = cuda
+
+         # This device_id will be used if we need to load model in current process
+         self.device_id: int | None = None
+         if device_id is not None:
+             self.device_id = device_id
+         elif self.device_ids is not None:
+             self.device_id = self.device_ids[0]
+
+         self.model_description = self._get_model_description(model_name)
+         self.cache_dir = str(define_cache_dir(cache_dir))
+
+         self._specific_model_path = specific_model_path
+         self._model_dir = self.download_model(
+             self.model_description,
+             self.cache_dir,
+             local_files_only=self._local_files_only,
+             specific_model_path=self._specific_model_path,
+         )
+         self.mask_token_id = None
+         self.pad_token_id = None
+
+         if not self.lazy_load:
+             self.load_onnx_model()
+
+     @classmethod
+     def _list_supported_models(cls) -> list[DenseModelDescription]:
+         """Lists the supported models.
+
+         Returns:
+             list[DenseModelDescription]: A list of DenseModelDescription objects containing the model information.
+         """
+         return supported_colpali_models
+
+     def load_onnx_model(self) -> None:
+         self._load_onnx_model(
+             model_dir=self._model_dir,
+             model_file=self.model_description.model_file,
+             threads=self.threads,
+             providers=self.providers,
+             cuda=self.cuda,
+             device_id=self.device_id,
+             extra_session_options=self._extra_session_options,
+         )
+
+     def _post_process_onnx_image_output(
+         self,
+         output: OnnxOutputContext,
+     ) -> Iterable[NumpyArray]:
+         """
+         Post-process the ONNX model output to convert it into a usable format.
+
+         Args:
+             output (OnnxOutputContext): The raw output from the ONNX model.
+
+         Returns:
+             Iterable[NumpyArray]: Post-processed output as NumPy arrays.
+         """
+         assert self.model_description.dim is not None, "Model dim is not defined"
+         return output.model_output.reshape(
+             output.model_output.shape[0], -1, self.model_description.dim
+         )
+
+     def _post_process_onnx_text_output(
+         self,
+         output: OnnxOutputContext,
+     ) -> Iterable[NumpyArray]:
+         """
+         Post-process the ONNX model output to convert it into a usable format.
+
+         Args:
+             output (OnnxOutputContext): The raw output from the ONNX model.
+
+         Returns:
+             Iterable[NumpyArray]: Post-processed output as NumPy arrays.
+         """
+         return output.model_output
+
+     def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
+         texts_query: list[str] = []
+         for query in documents:
+             query = self.BOS_TOKEN + self.QUERY_PREFIX + query + self.PAD_TOKEN * 10
+             query += "\n"
+
+             texts_query.append(query)
+         encoded = self.tokenizer.encode_batch(texts_query)  # type: ignore[union-attr]
+         return encoded
+
+     def token_count(
+         self,
+         texts: str | Iterable[str],
+         batch_size: int = 1024,
+         include_extension: bool = False,
+         **kwargs: Any,
+     ) -> int:
+         if not hasattr(self, "model") or self.model is None:
+             self.load_onnx_model()  # loads the tokenizer as well
+         token_num = 0
+         texts = [texts] if isinstance(texts, str) else texts
+         assert self.tokenizer is not None
+         tokenize_func = self.tokenize if include_extension else self.tokenizer.encode_batch
+         for batch in iter_batch(texts, batch_size):
+             token_num += sum([sum(encoding.attention_mask) for encoding in tokenize_func(batch)])
+         return token_num
+
+     def _preprocess_onnx_text_input(
+         self, onnx_input: dict[str, NumpyArray], **kwargs: Any
+     ) -> dict[str, NumpyArray]:
+         onnx_input["input_ids"] = np.array(
+             [
+                 self.QUERY_MARKER_TOKEN_ID + input_ids[2:].tolist()  # type: ignore[index]
+                 for input_ids in onnx_input["input_ids"]
+             ]
+         )
+         empty_image_placeholder: NumpyArray = np.zeros(
+             self.IMAGE_PLACEHOLDER_SIZE, dtype=np.float32
+         )
+         onnx_input["pixel_values"] = np.array(
+             [empty_image_placeholder for _ in onnx_input["input_ids"]],
+         )
+         return onnx_input
+
+     def _preprocess_onnx_image_input(
+         self, onnx_input: dict[str, np.ndarray], **kwargs: Any
+     ) -> dict[str, NumpyArray]:
+         """
+         Add placeholders for text input when processing image data for ONNX.
+         Args:
+             onnx_input (Dict[str, NumpyArray]): Preprocessed image inputs.
+             **kwargs: Additional arguments.
+         Returns:
+             Dict[str, NumpyArray]: ONNX input with text placeholders.
+         """
+         onnx_input["input_ids"] = np.array(
+             [self.EMPTY_TEXT_PLACEHOLDER for _ in onnx_input["pixel_values"]]
+         )
+         onnx_input["attention_mask"] = np.array(
+             [self.EVEN_ATTENTION_MASK for _ in onnx_input["pixel_values"]]
+         )
+         return onnx_input
+
+     def embed_text(
+         self,
+         documents: str | Iterable[str],
+         batch_size: int = 256,
+         parallel: int | None = None,
+         **kwargs: Any,
+     ) -> Iterable[NumpyArray]:
+ """
237
+ Encode a list of documents into list of embeddings.
238
+
239
+ Args:
240
+ documents: Iterator of documents or single document to embed
241
+ batch_size: Batch size for encoding -- higher values will use more memory, but be faster
242
+ parallel:
243
+ If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
244
+ If 0, use all available cores.
245
+ If None, don't use data-parallel processing, use default onnxruntime threading instead.
246
+
247
+ Returns:
248
+ List of embeddings, one per document
249
+ """
+         yield from self._embed_documents(
+             model_name=self.model_name,
+             cache_dir=str(self.cache_dir),
+             documents=documents,
+             batch_size=batch_size,
+             parallel=parallel,
+             providers=self.providers,
+             cuda=self.cuda,
+             device_ids=self.device_ids,
+             local_files_only=self._local_files_only,
+             specific_model_path=self._specific_model_path,
+             extra_session_options=self._extra_session_options,
+             **kwargs,
+         )
+
+     def embed_image(
+         self,
+         images: ImageInput | Iterable[ImageInput],
+         batch_size: int = 16,
+         parallel: int | None = None,
+         **kwargs: Any,
+     ) -> Iterable[NumpyArray]:
+ """
273
+ Encode a list of images into list of embeddings.
274
+
275
+ Args:
276
+ images: Iterator of image paths or single image path to embed
277
+ batch_size: Batch size for encoding -- higher values will use more memory, but be faster
278
+ parallel:
279
+ If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
280
+ If 0, use all available cores.
281
+ If None, don't use data-parallel processing, use default onnxruntime threading instead.
282
+
283
+ Returns:
284
+ List of embeddings, one per document
285
+ """
+         yield from self._embed_images(
+             model_name=self.model_name,
+             cache_dir=str(self.cache_dir),
+             images=images,
+             batch_size=batch_size,
+             parallel=parallel,
+             providers=self.providers,
+             cuda=self.cuda,
+             device_ids=self.device_ids,
+             local_files_only=self._local_files_only,
+             specific_model_path=self._specific_model_path,
+             extra_session_options=self._extra_session_options,
+             **kwargs,
+         )
+
+     @classmethod
+     def _get_text_worker_class(cls) -> Type[TextEmbeddingWorker[NumpyArray]]:
+         return ColPaliTextEmbeddingWorker
+
+     @classmethod
+     def _get_image_worker_class(cls) -> Type[ImageEmbeddingWorker[NumpyArray]]:
+         return ColPaliImageEmbeddingWorker
+
+
+ class ColPaliTextEmbeddingWorker(TextEmbeddingWorker[NumpyArray]):
+     def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColPali:
+         return ColPali(
+             model_name=model_name,
+             cache_dir=cache_dir,
+             threads=1,
+             **kwargs,
+         )
+
+
+ class ColPaliImageEmbeddingWorker(ImageEmbeddingWorker[NumpyArray]):
+     def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ColPali:
+         return ColPali(
+             model_name=model_name,
+             cache_dir=cache_dir,
+             threads=1,
+             **kwargs,
+         )
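Note: `ColPali.embed_text` and `ColPali.embed_image` are generators, each yielding one late-interaction (multi-vector) array per input, with the last dimension equal to the model's `dim` (128). A minimal usage sketch, not part of the package diff; it assumes the `Qdrant/colpali-v1.3-fp16` weights can be downloaded and that `page.png` is a local image file:

```python
# Hypothetical usage sketch for the ColPali class above; the model download and
# the example image path are assumptions, not part of the packaged code.
from fastembed.late_interaction_multimodal.colpali import ColPali

model = ColPali(model_name="Qdrant/colpali-v1.3-fp16")  # loads the ONNX model unless lazy_load=True

# Generators: one multi-vector embedding per document / image
query_vectors = list(model.embed_text(["What does the figure show?"]))
page_vectors = list(model.embed_image(["page.png"], batch_size=1))

# Each element should be an array whose last dimension is 128 (the registry `dim`)
print(query_vectors[0].shape, page_vectors[0].shape)
```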
fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py
@@ -0,0 +1,189 @@
+ from typing import Any, Iterable, Sequence, Type
+ from dataclasses import asdict
+
+ from fastembed.common import OnnxProvider, ImageInput
+ from fastembed.common.types import NumpyArray, Device
+ from fastembed.late_interaction_multimodal.colpali import ColPali
+ from fastembed.late_interaction_multimodal.colmodernvbert import ColModernVBERT
+
+ from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import (
+     LateInteractionMultimodalEmbeddingBase,
+ )
+ from fastembed.common.model_description import DenseModelDescription
+
+
+ class LateInteractionMultimodalEmbedding(LateInteractionMultimodalEmbeddingBase):
+     EMBEDDINGS_REGISTRY: list[Type[LateInteractionMultimodalEmbeddingBase]] = [
+         ColPali,
+         ColModernVBERT,
+     ]
+
+     @classmethod
+     def list_supported_models(cls) -> list[dict[str, Any]]:
+         """
+         Lists the supported models.
+
+         Returns:
+             list[dict[str, Any]]: A list of dictionaries containing the model information.
+
+         Example:
+             ```
+             [
+                 {
+                     "model": "Qdrant/colpali-v1.3-fp16",
+                     "dim": 128,
+                     "description": "Text embeddings, Multimodal (text&image), English, 50 tokens query length truncation, 2024.",
+                     "license": "mit",
+                     "size_in_GB": 6.5,
+                     "sources": {
+                         "hf": "Qdrant/colpali-v1.3-fp16",
+                     },
+                     "additional_files": [
+                         "model.onnx_data",
+                     ],
+                     "model_file": "model.onnx",
+                 },
+             ]
+             ```
+         """
+         return [asdict(model) for model in cls._list_supported_models()]
+
+     @classmethod
+     def _list_supported_models(cls) -> list[DenseModelDescription]:
+         result: list[DenseModelDescription] = []
+         for embedding in cls.EMBEDDINGS_REGISTRY:
+             result.extend(embedding._list_supported_models())
+         return result
+
+     def __init__(
+         self,
+         model_name: str,
+         cache_dir: str | None = None,
+         threads: int | None = None,
+         providers: Sequence[OnnxProvider] | None = None,
+         cuda: bool | Device = Device.AUTO,
+         device_ids: list[int] | None = None,
+         lazy_load: bool = False,
+         **kwargs: Any,
+     ):
+         super().__init__(model_name, cache_dir, threads, **kwargs)
+         for EMBEDDING_MODEL_TYPE in self.EMBEDDINGS_REGISTRY:
+             supported_models = EMBEDDING_MODEL_TYPE._list_supported_models()
+             if any(model_name.lower() == model.model.lower() for model in supported_models):
+                 self.model = EMBEDDING_MODEL_TYPE(
+                     model_name,
+                     cache_dir,
+                     threads=threads,
+                     providers=providers,
+                     cuda=cuda,
+                     device_ids=device_ids,
+                     lazy_load=lazy_load,
+                     **kwargs,
+                 )
+                 return
+
+         raise ValueError(
+             f"Model {model_name} is not supported in LateInteractionMultimodalEmbedding. "
+             "Please check the supported models using `LateInteractionMultimodalEmbedding.list_supported_models()`"
+         )
+
+     @property
+     def embedding_size(self) -> int:
+         """Get the embedding size of the current model"""
+         if self._embedding_size is None:
+             self._embedding_size = self.get_embedding_size(self.model_name)
+         return self._embedding_size
+
+     @classmethod
+     def get_embedding_size(cls, model_name: str) -> int:
+         """Get the embedding size of the passed model
+
+         Args:
+             model_name (str): The name of the model to get embedding size for.
+
+         Returns:
+             int: The size of the embedding.
+
+         Raises:
+             ValueError: If the model name is not found in the supported models.
+         """
+         descriptions = cls._list_supported_models()
+         embedding_size: int | None = None
+         for description in descriptions:
+             if description.model.lower() == model_name.lower():
+                 embedding_size = description.dim
+                 break
+         if embedding_size is None:
+             model_names = [description.model for description in descriptions]
+             raise ValueError(
+                 f"Embedding size for model {model_name} was None. "
+                 f"Available model names: {model_names}"
+             )
+         return embedding_size
+
+     def embed_text(
+         self,
+         documents: str | Iterable[str],
+         batch_size: int = 256,
+         parallel: int | None = None,
+         **kwargs: Any,
+     ) -> Iterable[NumpyArray]:
+         """
+         Encode a list of documents into a list of embeddings.
+
+         Args:
+             documents: Iterator of documents or single document to embed
+             batch_size: Batch size for encoding -- higher values will use more memory, but be faster
+             parallel:
+                 If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
+                 If 0, use all available cores.
+                 If None, don't use data-parallel processing, use default onnxruntime threading instead.
+
+         Returns:
+             List of embeddings, one per document
+         """
+         yield from self.model.embed_text(documents, batch_size, parallel, **kwargs)
+
+     def embed_image(
+         self,
+         images: ImageInput | Iterable[ImageInput],
+         batch_size: int = 16,
+         parallel: int | None = None,
+         **kwargs: Any,
+     ) -> Iterable[NumpyArray]:
+         """
+         Encode a list of images into a list of embeddings.
+
+         Args:
+             images: Iterator of image paths or single image path to embed
+             batch_size: Batch size for encoding -- higher values will use more memory, but be faster
+             parallel:
+                 If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
+                 If 0, use all available cores.
+                 If None, don't use data-parallel processing, use default onnxruntime threading instead.
+
+         Returns:
+             List of embeddings, one per image
+         """
+         yield from self.model.embed_image(images, batch_size, parallel, **kwargs)
+
+     def token_count(
+         self,
+         texts: str | Iterable[str],
+         batch_size: int = 1024,
+         include_extension: bool = False,
+         **kwargs: Any,
+     ) -> int:
+         """Returns the number of tokens in the texts.
+
+         Args:
+             texts (str | Iterable[str]): The list of texts to embed.
+             batch_size (int): Batch size for encoding
+             include_extension (bool): Whether to include tokens added by preprocessing
+
+         Returns:
+             int: Sum of number of tokens in the texts.
+         """
+         return self.model.token_count(
+             texts, batch_size=batch_size, include_extension=include_extension, **kwargs
+         )
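`LateInteractionMultimodalEmbedding` is a thin dispatcher: it scans `EMBEDDINGS_REGISTRY`, matches the requested model name case-insensitively, and delegates `embed_text`, `embed_image`, and `token_count` to the selected backend. A minimal sketch of how it might be called (not part of the diff; assumes the model weights are available for download):

```python
# Hypothetical usage of the dispatcher class above.
from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding import (
    LateInteractionMultimodalEmbedding,
)

# Registry aggregated from ColPali and ColModernVBERT
for entry in LateInteractionMultimodalEmbedding.list_supported_models():
    print(entry["model"], entry["dim"])

# Unknown names raise ValueError; known names instantiate the matching backend
embedder = LateInteractionMultimodalEmbedding(model_name="Qdrant/colpali-v1.3-fp16")
print(LateInteractionMultimodalEmbedding.get_embedding_size("Qdrant/colpali-v1.3-fp16"))  # 128

doc_vectors = list(embedder.embed_text(["late interaction retrieval"]))
```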
fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py
@@ -0,0 +1,86 @@
+ from typing import Iterable, Any
+
+
+ from fastembed.common import ImageInput
+ from fastembed.common.model_description import DenseModelDescription
+ from fastembed.common.model_management import ModelManagement
+ from fastembed.common.types import NumpyArray
+
+
+ class LateInteractionMultimodalEmbeddingBase(ModelManagement[DenseModelDescription]):
+     def __init__(
+         self,
+         model_name: str,
+         cache_dir: str | None = None,
+         threads: int | None = None,
+         **kwargs: Any,
+     ):
+         self.model_name = model_name
+         self.cache_dir = cache_dir
+         self.threads = threads
+         self._local_files_only = kwargs.pop("local_files_only", False)
+         self._embedding_size: int | None = None
+
+     def embed_text(
+         self,
+         documents: str | Iterable[str],
+         batch_size: int = 256,
+         parallel: int | None = None,
+         **kwargs: Any,
+     ) -> Iterable[NumpyArray]:
+         """
+         Embeds a list of documents into a list of embeddings.
+
+         Args:
+             documents (Iterable[str]): The list of texts to embed.
+             batch_size: Batch size for encoding -- higher values will use more memory, but be faster
+             parallel:
+                 If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
+                 If 0, use all available cores.
+                 If None, don't use data-parallel processing, use default onnxruntime threading instead.
+             **kwargs: Additional keyword argument to pass to the embed method.
+
+         Yields:
+             Iterable[NumpyArray]: The embeddings.
+         """
+         raise NotImplementedError()
+
+     def embed_image(
+         self,
+         images: ImageInput | Iterable[ImageInput],
+         batch_size: int = 16,
+         parallel: int | None = None,
+         **kwargs: Any,
+     ) -> Iterable[NumpyArray]:
+         """
+         Encode a list of images into a list of embeddings.
+         Args:
+             images: Iterator of image paths or single image path to embed
+             batch_size: Batch size for encoding -- higher values will use more memory, but be faster
+             parallel:
+                 If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
+                 If 0, use all available cores.
+                 If None, don't use data-parallel processing, use default onnxruntime threading instead.
+
+         Returns:
+             List of embeddings, one per image
+         """
+         raise NotImplementedError()
+
+     @classmethod
+     def get_embedding_size(cls, model_name: str) -> int:
+         """Returns embedding size of the chosen model."""
+         raise NotImplementedError("Subclasses must implement this method")
+
+     @property
+     def embedding_size(self) -> int:
+         """Returns embedding size for the current model"""
+         raise NotImplementedError("Subclasses must implement this method")
+
+     def token_count(
+         self,
+         texts: str | Iterable[str],
+         **kwargs: Any,
+     ) -> int:
+         """Returns the number of tokens in the texts."""
+         raise NotImplementedError("Subclasses must implement this method")