sentence-transformers-haystack 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (18)
  1. haystack_integrations/components/embedders/py.typed +0 -0
  2. haystack_integrations/components/embedders/sentence_transformers/__init__.py +17 -0
  3. haystack_integrations/components/embedders/sentence_transformers/embedding_backend/__init__.py +3 -0
  4. haystack_integrations/components/embedders/sentence_transformers/embedding_backend/backend.py +109 -0
  5. haystack_integrations/components/embedders/sentence_transformers/embedding_backend/sparse_backend.py +118 -0
  6. haystack_integrations/components/embedders/sentence_transformers/sentence_transformers_doc_image_embedder.py +290 -0
  7. haystack_integrations/components/embedders/sentence_transformers/sentence_transformers_document_embedder.py +272 -0
  8. haystack_integrations/components/embedders/sentence_transformers/sentence_transformers_sparse_document_embedder.py +240 -0
  9. haystack_integrations/components/embedders/sentence_transformers/sentence_transformers_sparse_text_embedder.py +200 -0
  10. haystack_integrations/components/embedders/sentence_transformers/sentence_transformers_text_embedder.py +245 -0
  11. haystack_integrations/components/rankers/py.typed +0 -0
  12. haystack_integrations/components/rankers/sentence_transformers/__init__.py +11 -0
  13. haystack_integrations/components/rankers/sentence_transformers/sentence_transformers_diversity.py +432 -0
  14. haystack_integrations/components/rankers/sentence_transformers/sentence_transformers_similarity.py +295 -0
  15. sentence_transformers_haystack-0.1.0.dist-info/METADATA +41 -0
  16. sentence_transformers_haystack-0.1.0.dist-info/RECORD +18 -0
  17. sentence_transformers_haystack-0.1.0.dist-info/WHEEL +4 -0
  18. sentence_transformers_haystack-0.1.0.dist-info/licenses/LICENSE.txt +201 -0
File without changes
@@ -0,0 +1,17 @@
1
+ # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ from .sentence_transformers_doc_image_embedder import SentenceTransformersDocumentImageEmbedder
6
+ from .sentence_transformers_document_embedder import SentenceTransformersDocumentEmbedder
7
+ from .sentence_transformers_sparse_document_embedder import SentenceTransformersSparseDocumentEmbedder
8
+ from .sentence_transformers_sparse_text_embedder import SentenceTransformersSparseTextEmbedder
9
+ from .sentence_transformers_text_embedder import SentenceTransformersTextEmbedder
10
+
11
+ __all__ = [
12
+ "SentenceTransformersDocumentEmbedder",
13
+ "SentenceTransformersDocumentImageEmbedder",
14
+ "SentenceTransformersSparseDocumentEmbedder",
15
+ "SentenceTransformersSparseTextEmbedder",
16
+ "SentenceTransformersTextEmbedder",
17
+ ]
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,109 @@
1
+ # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import json
6
+ from typing import Any, ClassVar, Literal
7
+
8
+ from haystack.utils.auth import Secret
9
+ from PIL.Image import Image
10
+
11
+ from sentence_transformers import SentenceTransformer
12
+
13
+
14
class _SentenceTransformersEmbeddingBackendFactory:
    """
    Builds Sentence Transformers embedding backends and hands out already-built ones.

    Backends are stored in a class-level registry, so asking twice for the same configuration
    returns the same backend object instead of constructing a second one.
    """

    _instances: ClassVar[dict[str, "_SentenceTransformersEmbeddingBackend"]] = {}

    @staticmethod
    def get_embedding_backend(
        *,
        model: str,
        device: str | None = None,
        auth_token: Secret | None = None,
        trust_remote_code: bool = False,
        revision: str | None = None,
        local_files_only: bool = False,
        truncate_dim: int | None = None,
        model_kwargs: dict[str, Any] | None = None,
        tokenizer_kwargs: dict[str, Any] | None = None,
        config_kwargs: dict[str, Any] | None = None,
        backend: Literal["torch", "onnx", "openvino"] = "torch",
    ) -> "_SentenceTransformersEmbeddingBackend":
        """
        Return the backend registered for this configuration, creating it on the first request.

        Every argument is forwarded unchanged to `_SentenceTransformersEmbeddingBackend` and is also
        part of the registry key.

        :returns:
            The registered backend for the given combination of arguments.
        """
        backend_params: dict[str, Any] = {
            "model": model,
            "device": device,
            "auth_token": auth_token,
            "trust_remote_code": trust_remote_code,
            "revision": revision,
            "local_files_only": local_files_only,
            "truncate_dim": truncate_dim,
            "model_kwargs": model_kwargs,
            "tokenizer_kwargs": tokenizer_kwargs,
            "config_kwargs": config_kwargs,
            "backend": backend,
        }

        # The key is the sorted JSON dump of all parameters; values that JSON cannot encode
        # are converted with `str` (that is what `default=str` does).
        registry_key = json.dumps(backend_params, sort_keys=True, default=str)

        registry = _SentenceTransformersEmbeddingBackendFactory._instances
        if registry_key not in registry:
            registry[registry_key] = _SentenceTransformersEmbeddingBackend(**backend_params)
        return registry[registry_key]
71
+
72
+
73
class _SentenceTransformersEmbeddingBackend:
    """
    Wraps a `SentenceTransformer` model and exposes its embeddings as plain Python lists.
    """

    def __init__(
        self,
        *,
        model: str,
        device: str | None = None,
        auth_token: Secret | None = None,
        trust_remote_code: bool = False,
        revision: str | None = None,
        local_files_only: bool = False,
        truncate_dim: int | None = None,
        model_kwargs: dict[str, Any] | None = None,
        tokenizer_kwargs: dict[str, Any] | None = None,
        config_kwargs: dict[str, Any] | None = None,
        backend: Literal["torch", "onnx", "openvino"] = "torch",
    ) -> None:
        """
        Load the `SentenceTransformer` model described by the arguments.

        The arguments are passed through to the `SentenceTransformer` constructor; `auth_token` is
        resolved to its value first and `tokenizer_kwargs` is handed over as `processor_kwargs`.
        """
        if auth_token:
            resolved_token = auth_token.resolve_value()
        else:
            resolved_token = None

        self.model = SentenceTransformer(
            model_name_or_path=model,
            device=device,
            token=resolved_token,
            trust_remote_code=trust_remote_code,
            revision=revision,
            local_files_only=local_files_only,
            truncate_dim=truncate_dim,
            model_kwargs=model_kwargs,
            # `tokenizer_kwargs` was renamed to `processor_kwargs` in sentence-transformers 5.4.0
            processor_kwargs=tokenizer_kwargs,
            config_kwargs=config_kwargs,
            backend=backend,
        )

    def embed(self, data: list[str] | list[Image], **kwargs: Any) -> list[list[float]]:
        """
        Encode `data` with the wrapped model and return the result converted with `.tolist()`.

        :param data: Texts or images to encode.
        :param kwargs: Extra keyword arguments passed to the model's `encode` method.
        """
        encoded = self.model.encode(data, **kwargs)
        return encoded.tolist()
@@ -0,0 +1,118 @@
1
+ # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import json
6
+ from typing import Any, ClassVar, Literal
7
+
8
+ from haystack.dataclasses.sparse_embedding import SparseEmbedding
9
+ from haystack.utils.auth import Secret
10
+
11
+ from sentence_transformers import SparseEncoder
12
+
13
+
14
class _SentenceTransformersSparseEmbeddingBackendFactory:
    """
    Builds Sentence Transformers sparse embedding backends and hands out already-built ones.

    Backends are stored in a class-level registry, so asking twice for the same configuration
    returns the same backend object instead of constructing a second one.
    """

    _instances: ClassVar[dict[str, "_SentenceTransformersSparseEncoderEmbeddingBackend"]] = {}

    @staticmethod
    def get_embedding_backend(
        *,
        model: str,
        device: str | None = None,
        auth_token: Secret | None = None,
        trust_remote_code: bool = False,
        revision: str | None = None,
        local_files_only: bool = False,
        model_kwargs: dict[str, Any] | None = None,
        tokenizer_kwargs: dict[str, Any] | None = None,
        config_kwargs: dict[str, Any] | None = None,
        backend: Literal["torch", "onnx", "openvino"] = "torch",
    ) -> "_SentenceTransformersSparseEncoderEmbeddingBackend":
        """
        Return the sparse backend registered for this configuration, creating it on the first request.

        Every argument is forwarded unchanged to `_SentenceTransformersSparseEncoderEmbeddingBackend`
        and is also part of the registry key.

        :returns:
            The registered backend for the given combination of arguments.
        """
        backend_params: dict[str, Any] = {
            "model": model,
            "device": device,
            "auth_token": auth_token,
            "trust_remote_code": trust_remote_code,
            "revision": revision,
            "local_files_only": local_files_only,
            "model_kwargs": model_kwargs,
            "tokenizer_kwargs": tokenizer_kwargs,
            "config_kwargs": config_kwargs,
            "backend": backend,
        }

        # The key is the sorted JSON dump of all parameters; values that JSON cannot encode
        # are converted with `str` (that is what `default=str` does).
        registry_key = json.dumps(backend_params, sort_keys=True, default=str)

        registry = _SentenceTransformersSparseEmbeddingBackendFactory._instances
        if registry_key not in registry:
            registry[registry_key] = _SentenceTransformersSparseEncoderEmbeddingBackend(**backend_params)
        return registry[registry_key]
68
+
69
+
70
class _SentenceTransformersSparseEncoderEmbeddingBackend:
    """
    Wraps a `SparseEncoder` model and converts its output into `SparseEmbedding` objects.
    """

    def __init__(
        self,
        *,
        model: str,
        device: str | None = None,
        auth_token: Secret | None = None,
        trust_remote_code: bool = False,
        revision: str | None = None,
        local_files_only: bool = False,
        model_kwargs: dict[str, Any] | None = None,
        tokenizer_kwargs: dict[str, Any] | None = None,
        config_kwargs: dict[str, Any] | None = None,
        backend: Literal["torch", "onnx", "openvino"] = "torch",
    ) -> None:
        """
        Load the `SparseEncoder` model described by the arguments.

        The arguments are passed through to the `SparseEncoder` constructor; `auth_token` is
        resolved to its value first and `tokenizer_kwargs` is handed over as `processor_kwargs`.
        """
        if auth_token:
            resolved_token = auth_token.resolve_value()
        else:
            resolved_token = None

        self.model = SparseEncoder(
            model_name_or_path=model,
            device=device,
            token=resolved_token,
            trust_remote_code=trust_remote_code,
            revision=revision,
            local_files_only=local_files_only,
            model_kwargs=model_kwargs,
            # `tokenizer_kwargs` was renamed to `processor_kwargs` in sentence-transformers 5.4.0
            processor_kwargs=tokenizer_kwargs,
            config_kwargs=config_kwargs,
            backend=backend,
        )

    @staticmethod
    def _to_sparse_embedding(embedding_tensor: Any) -> SparseEmbedding:
        """Convert one sparse tensor returned by the encoder into a `SparseEmbedding`."""
        merged = embedding_tensor.coalesce()
        return SparseEmbedding(
            indices=merged.indices()[0].tolist(),  # Only column indices
            values=merged.values().tolist(),
        )

    def embed(self, *, data: list[str], **kwargs: Any) -> list[SparseEmbedding]:
        """
        Encode `data` and return one `SparseEmbedding` per encoder output, in order.

        :param data: Texts to encode.
        :param kwargs: Extra keyword arguments passed to the model's `encode` method.
        """
        encoded = self.model.encode(
            data,
            convert_to_tensor=False,  # output is a list of individual tensors
            convert_to_sparse_tensor=True,
            **kwargs,
        )
        return [self._to_sparse_embedding(embedding_tensor) for embedding_tensor in encoded]
@@ -0,0 +1,290 @@
1
+ # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ from dataclasses import replace
6
+ from typing import Any, Literal
7
+
8
+ from haystack import Document, component, default_from_dict, default_to_dict
9
+ from haystack.components.converters.image.image_utils import (
10
+ _batch_convert_pdf_pages_to_images,
11
+ _extract_image_sources_info,
12
+ _PDFPageInfo,
13
+ )
14
+ from haystack.utils.auth import Secret
15
+ from haystack.utils.device import ComponentDevice
16
+ from haystack.utils.hf import deserialize_hf_model_kwargs, serialize_hf_model_kwargs
17
+ from PIL import Image
18
+
19
+ from haystack_integrations.components.embedders.sentence_transformers.embedding_backend.backend import (
20
+ _SentenceTransformersEmbeddingBackend,
21
+ _SentenceTransformersEmbeddingBackendFactory,
22
+ )
23
+
24
+
25
@component
class SentenceTransformersDocumentImageEmbedder:
    """
    A component for computing Document embeddings based on images using Sentence Transformers models.

    The embedding of each Document is stored in the `embedding` field of the Document.

    ### Usage example

    ```python
    from haystack import Document
    from haystack_integrations.components.embedders.sentence_transformers import (
        SentenceTransformersDocumentImageEmbedder,
    )

    embedder = SentenceTransformersDocumentImageEmbedder(model="sentence-transformers/clip-ViT-B-32")

    documents = [
        Document(content="A photo of a cat", meta={"file_path": "cat.jpg"}),
        Document(content="A photo of a dog", meta={"file_path": "dog.jpg"}),
    ]

    result = embedder.run(documents=documents)
    documents_with_embeddings = result["documents"]
    print(documents_with_embeddings)

    # [Document(id=...,
    #           content='A photo of a cat',
    #           meta={'file_path': 'cat.jpg',
    #                 'embedding_source': {'type': 'image', 'file_path_meta_field': 'file_path'}},
    #           embedding=vector of size 512),
    #  ...]
    ```
    """

    def __init__(
        self,
        *,
        file_path_meta_field: str = "file_path",
        root_path: str | None = None,
        model: str = "sentence-transformers/clip-ViT-B-32",
        device: ComponentDevice | None = None,
        token: Secret | None = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
        batch_size: int = 32,
        progress_bar: bool = True,
        normalize_embeddings: bool = False,
        trust_remote_code: bool = False,
        local_files_only: bool = False,
        model_kwargs: dict[str, Any] | None = None,
        tokenizer_kwargs: dict[str, Any] | None = None,
        config_kwargs: dict[str, Any] | None = None,
        precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = "float32",
        encode_kwargs: dict[str, Any] | None = None,
        backend: Literal["torch", "onnx", "openvino"] = "torch",
    ) -> None:
        """
        Creates a SentenceTransformersDocumentImageEmbedder component.

        :param file_path_meta_field: The metadata field in the Document that contains the file path to the image or PDF.
        :param root_path: The root directory path where document files are located. If provided, file paths in
            document metadata will be resolved relative to this path. If None, file paths are treated as absolute paths.
        :param model:
            The Sentence Transformers model to use for calculating embeddings. Pass a local path or ID of the model on
            Hugging Face. To be used with this component, the model must be able to embed images and text into the same
            vector space. Compatible models include:
            - "sentence-transformers/clip-ViT-B-32"
            - "sentence-transformers/clip-ViT-L-14"
            - "sentence-transformers/clip-ViT-B-16"
            - "sentence-transformers/clip-ViT-B-32-multilingual-v1"
            - "jinaai/jina-embeddings-v4"
            - "jinaai/jina-clip-v1"
            - "jinaai/jina-clip-v2".
        :param device:
            The device to use for loading the model.
            Overrides the default device.
        :param token:
            The API token to download private models from Hugging Face.
        :param batch_size:
            Number of documents to embed at once.
        :param progress_bar:
            If `True`, shows a progress bar when embedding documents.
        :param normalize_embeddings:
            If `True`, the embeddings are normalized using L2 normalization, so that each embedding has a norm of 1.
        :param trust_remote_code:
            If `False`, allows only Hugging Face verified model architectures.
            If `True`, allows custom models and scripts.
        :param local_files_only:
            If `True`, does not attempt to download the model from Hugging Face Hub and only looks at local files.
        :param model_kwargs:
            Additional keyword arguments forwarded to the `SentenceTransformer` constructor as `model_kwargs`
            when loading the model. Refer to specific model documentation for available kwargs.
        :param tokenizer_kwargs:
            Additional keyword arguments forwarded to the `SentenceTransformer` constructor as `processor_kwargs`
            when loading the model. If a truthy `model_max_length` entry is present, `warm_up` also applies it as
            the model's `max_seq_length`. Refer to specific model documentation for available kwargs.
        :param config_kwargs:
            Additional keyword arguments for `AutoConfig.from_pretrained` when loading the model configuration.
        :param precision:
            The precision to use for the embeddings.
            All non-float32 precisions are quantized embeddings.
            Quantized embeddings are smaller and faster to compute, but may have a lower accuracy.
            They are useful for reducing the size of the embeddings of a corpus for semantic search, among other tasks.
        :param encode_kwargs:
            Additional keyword arguments for `SentenceTransformer.encode` when embedding documents.
            This parameter is provided for fine customization. Be careful not to clash with already set parameters and
            avoid passing parameters that change the output type.
        :param backend:
            The backend to use for the Sentence Transformers model. Choose from "torch", "onnx", or "openvino".
            Refer to the [Sentence Transformers documentation](https://sbert.net/docs/sentence_transformer/usage/efficiency.html)
            for more information on acceleration and quantization options.
        """
        self.file_path_meta_field = file_path_meta_field
        # A missing root path is stored as an empty string.
        self.root_path = root_path or ""
        self.model = model
        self.device = ComponentDevice.resolve_device(device)
        self.token = token
        self.batch_size = batch_size
        self.progress_bar = progress_bar
        self.normalize_embeddings = normalize_embeddings
        self.trust_remote_code = trust_remote_code
        self.local_files_only = local_files_only
        self.model_kwargs = model_kwargs
        self.tokenizer_kwargs = tokenizer_kwargs
        self.config_kwargs = config_kwargs
        self.encode_kwargs = encode_kwargs
        self.precision = precision
        self.backend = backend
        # Created lazily by `warm_up`.
        self._embedding_backend: _SentenceTransformersEmbeddingBackend | None = None

    def to_dict(self) -> dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        serialization_dict = default_to_dict(
            self,
            file_path_meta_field=self.file_path_meta_field,
            root_path=self.root_path,
            model=self.model,
            device=self.device,
            token=self.token,
            batch_size=self.batch_size,
            progress_bar=self.progress_bar,
            normalize_embeddings=self.normalize_embeddings,
            trust_remote_code=self.trust_remote_code,
            local_files_only=self.local_files_only,
            model_kwargs=self.model_kwargs,
            tokenizer_kwargs=self.tokenizer_kwargs,
            config_kwargs=self.config_kwargs,
            precision=self.precision,
            encode_kwargs=self.encode_kwargs,
            backend=self.backend,
        )
        if serialization_dict["init_parameters"].get("model_kwargs") is not None:
            serialize_hf_model_kwargs(serialization_dict["init_parameters"]["model_kwargs"])
        return serialization_dict

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "SentenceTransformersDocumentImageEmbedder":
        """
        Deserializes the component from a dictionary.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized component.
        """
        init_params = data["init_parameters"]
        if init_params.get("model_kwargs") is not None:
            deserialize_hf_model_kwargs(init_params["model_kwargs"])
        return default_from_dict(cls, data)

    def warm_up(self) -> None:
        """
        Initializes the component.

        Obtains the embedding backend from the backend factory on the first call; later calls do nothing.
        """
        if self._embedding_backend is None:
            self._embedding_backend = _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend(
                model=self.model,
                device=self.device.to_torch_str(),
                auth_token=self.token,
                trust_remote_code=self.trust_remote_code,
                local_files_only=self.local_files_only,
                model_kwargs=self.model_kwargs,
                tokenizer_kwargs=self.tokenizer_kwargs,
                config_kwargs=self.config_kwargs,
                backend=self.backend,
            )
            if self.tokenizer_kwargs and self.tokenizer_kwargs.get("model_max_length"):
                self._embedding_backend.model.max_seq_length = self.tokenizer_kwargs["model_max_length"]

    def _load_images(self, documents: list[Document]) -> list:
        """
        Return one PIL image per document, in document order.

        Image files are opened directly; PDF pages are converted in a single batch.

        :raises RuntimeError:
            If no image could be produced for one or more documents.
        """
        images_source_info = _extract_image_sources_info(
            documents=documents, file_path_meta_field=self.file_path_meta_field, root_path=self.root_path
        )

        images_to_embed: list = [None] * len(documents)
        pdf_page_infos: list[_PDFPageInfo] = []

        for doc_idx, image_source_info in enumerate(images_source_info):
            if image_source_info["mime_type"] == "application/pdf":
                # Store PDF documents for later processing
                page_number = image_source_info.get("page_number")
                assert page_number is not None  # noqa: S101 # checked in _extract_image_sources_info but mypy doesn't know that
                pdf_page_info: _PDFPageInfo = {
                    "doc_idx": doc_idx,
                    "path": image_source_info["path"],
                    "page_number": page_number,
                }
                pdf_page_infos.append(pdf_page_info)
            else:
                # `Image.open` is lazy and keeps the file open until the pixel data is read.
                # Load the data and close the handle right away so that a large input list
                # does not hold one open file per document until encoding starts.
                with Image.open(image_source_info["path"]) as image:
                    image.load()
                images_to_embed[doc_idx] = image

        pdf_images_by_doc_idx = _batch_convert_pdf_pages_to_images(pdf_page_infos=pdf_page_infos, return_base64=False)
        for doc_idx, pil_image in pdf_images_by_doc_idx.items():
            images_to_embed[doc_idx] = pil_image

        none_images_doc_ids = [documents[doc_idx].id for doc_idx, image in enumerate(images_to_embed) if image is None]
        if none_images_doc_ids:
            msg = f"Conversion failed for some documents. Document IDs: {none_images_doc_ids}."
            raise RuntimeError(msg)

        return images_to_embed

    @component.output_types(documents=list[Document])
    def run(self, documents: list[Document]) -> dict[str, list[Document]]:
        """
        Embed a list of documents.

        :param documents:
            Documents to embed.

        :returns:
            A dictionary with the following keys:
            - `documents`: Documents with embeddings.

        :raises TypeError:
            If `documents` is not a list, or its first element is not a `Document`.
        :raises RuntimeError:
            If no image could be produced for one or more documents.
        """
        if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
            msg = (
                "SentenceTransformersDocumentImageEmbedder expects a list of Documents as input. "
                "In case you want to embed a string, please use the SentenceTransformersTextEmbedder."
            )
            raise TypeError(msg)
        if self._embedding_backend is None:
            self.warm_up()

        images_to_embed = self._load_images(documents)

        # mypy doesn't know this is set in warm_up
        embeddings = self._embedding_backend.embed(  # type: ignore[union-attr]
            data=images_to_embed,
            batch_size=self.batch_size,
            show_progress_bar=self.progress_bar,
            normalize_embeddings=self.normalize_embeddings,
            precision=self.precision,
            **(self.encode_kwargs if self.encode_kwargs else {}),
        )

        docs_with_embeddings = []
        for doc, emb in zip(documents, embeddings, strict=True):
            # we store this information for later inspection
            new_meta = {
                **doc.meta,
                "embedding_source": {"type": "image", "file_path_meta_field": self.file_path_meta_field},
            }
            # `replace` builds a new Document; the input documents are left unmodified.
            new_doc = replace(doc, meta=new_meta, embedding=emb)
            docs_with_embeddings.append(new_doc)

        return {"documents": docs_with_embeddings}