sentence-transformers-haystack 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- haystack_integrations/components/embedders/py.typed +0 -0
- haystack_integrations/components/embedders/sentence_transformers/__init__.py +17 -0
- haystack_integrations/components/embedders/sentence_transformers/embedding_backend/__init__.py +3 -0
- haystack_integrations/components/embedders/sentence_transformers/embedding_backend/backend.py +109 -0
- haystack_integrations/components/embedders/sentence_transformers/embedding_backend/sparse_backend.py +118 -0
- haystack_integrations/components/embedders/sentence_transformers/sentence_transformers_doc_image_embedder.py +290 -0
- haystack_integrations/components/embedders/sentence_transformers/sentence_transformers_document_embedder.py +272 -0
- haystack_integrations/components/embedders/sentence_transformers/sentence_transformers_sparse_document_embedder.py +240 -0
- haystack_integrations/components/embedders/sentence_transformers/sentence_transformers_sparse_text_embedder.py +200 -0
- haystack_integrations/components/embedders/sentence_transformers/sentence_transformers_text_embedder.py +245 -0
- haystack_integrations/components/rankers/py.typed +0 -0
- haystack_integrations/components/rankers/sentence_transformers/__init__.py +11 -0
- haystack_integrations/components/rankers/sentence_transformers/sentence_transformers_diversity.py +432 -0
- haystack_integrations/components/rankers/sentence_transformers/sentence_transformers_similarity.py +295 -0
- sentence_transformers_haystack-0.1.0.dist-info/METADATA +41 -0
- sentence_transformers_haystack-0.1.0.dist-info/RECORD +18 -0
- sentence_transformers_haystack-0.1.0.dist-info/WHEEL +4 -0
- sentence_transformers_haystack-0.1.0.dist-info/licenses/LICENSE.txt +201 -0
|
File without changes
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
from .sentence_transformers_doc_image_embedder import SentenceTransformersDocumentImageEmbedder
|
|
6
|
+
from .sentence_transformers_document_embedder import SentenceTransformersDocumentEmbedder
|
|
7
|
+
from .sentence_transformers_sparse_document_embedder import SentenceTransformersSparseDocumentEmbedder
|
|
8
|
+
from .sentence_transformers_sparse_text_embedder import SentenceTransformersSparseTextEmbedder
|
|
9
|
+
from .sentence_transformers_text_embedder import SentenceTransformersTextEmbedder
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"SentenceTransformersDocumentEmbedder",
|
|
13
|
+
"SentenceTransformersDocumentImageEmbedder",
|
|
14
|
+
"SentenceTransformersSparseDocumentEmbedder",
|
|
15
|
+
"SentenceTransformersSparseTextEmbedder",
|
|
16
|
+
"SentenceTransformersTextEmbedder",
|
|
17
|
+
]
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Any, ClassVar, Literal
|
|
7
|
+
|
|
8
|
+
from haystack.utils.auth import Secret
|
|
9
|
+
from PIL.Image import Image
|
|
10
|
+
|
|
11
|
+
from sentence_transformers import SentenceTransformer
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class _SentenceTransformersEmbeddingBackendFactory:
    """
    Factory that builds Sentence Transformers embedding backends and reuses them.

    Every backend that gets created is stored in a class-level registry. The registry key is the JSON
    serialization of all construction parameters, so asking twice for the same configuration returns
    the very same backend object instead of loading the model again.
    """

    # Registry of already constructed backends, keyed by their serialized construction parameters.
    _instances: ClassVar[dict[str, "_SentenceTransformersEmbeddingBackend"]] = {}

    @staticmethod
    def get_embedding_backend(
        *,
        model: str,
        device: str | None = None,
        auth_token: Secret | None = None,
        trust_remote_code: bool = False,
        revision: str | None = None,
        local_files_only: bool = False,
        truncate_dim: int | None = None,
        model_kwargs: dict[str, Any] | None = None,
        tokenizer_kwargs: dict[str, Any] | None = None,
        config_kwargs: dict[str, Any] | None = None,
        backend: Literal["torch", "onnx", "openvino"] = "torch",
    ) -> "_SentenceTransformersEmbeddingBackend":
        """
        Return the backend matching the given parameters, creating it on first request.

        All parameters are forwarded unchanged, as keyword arguments, to
        `_SentenceTransformersEmbeddingBackend`. They also all take part in the cache key, so changing
        any of them yields a distinct backend.

        :returns:
            The cached backend if one exists for this exact configuration, otherwise a new backend
            that has been added to the cache.
        """
        init_kwargs: dict[str, Any] = {
            "model": model,
            "device": device,
            "auth_token": auth_token,
            "trust_remote_code": trust_remote_code,
            "revision": revision,
            "local_files_only": local_files_only,
            "truncate_dim": truncate_dim,
            "model_kwargs": model_kwargs,
            "tokenizer_kwargs": tokenizer_kwargs,
            "config_kwargs": config_kwargs,
            "backend": backend,
        }

        # Values JSON cannot encode natively are turned into text with `str` to build the key.
        cache_key = json.dumps(init_kwargs, sort_keys=True, default=str)

        registry = _SentenceTransformersEmbeddingBackendFactory._instances
        if cache_key not in registry:
            registry[cache_key] = _SentenceTransformersEmbeddingBackend(**init_kwargs)
        return registry[cache_key]
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class _SentenceTransformersEmbeddingBackend:
    """
    Wrapper that loads a `SentenceTransformer` model and exposes it through `embed`.
    """

    def __init__(
        self,
        *,
        model: str,
        device: str | None = None,
        auth_token: Secret | None = None,
        trust_remote_code: bool = False,
        revision: str | None = None,
        local_files_only: bool = False,
        truncate_dim: int | None = None,
        model_kwargs: dict[str, Any] | None = None,
        tokenizer_kwargs: dict[str, Any] | None = None,
        config_kwargs: dict[str, Any] | None = None,
        backend: Literal["torch", "onnx", "openvino"] = "torch",
    ) -> None:
        """
        Load the model and store it on `self.model`.

        :param model: Passed to `SentenceTransformer` as `model_name_or_path`.
        :param auth_token: Optional secret; its resolved value is passed as `token`. When it is not
            set, `token` is `None`.
        :param tokenizer_kwargs: Passed to `SentenceTransformer` as `processor_kwargs`.

        The remaining parameters are passed to `SentenceTransformer` under their own names.
        """
        # The model constructor receives the resolved token value rather than the `Secret` wrapper.
        resolved_token = auth_token.resolve_value() if auth_token else None

        load_options: dict[str, Any] = {
            "model_name_or_path": model,
            "device": device,
            "token": resolved_token,
            "trust_remote_code": trust_remote_code,
            "revision": revision,
            "local_files_only": local_files_only,
            "truncate_dim": truncate_dim,
            "model_kwargs": model_kwargs,
            # `tokenizer_kwargs` was renamed to `processor_kwargs` in sentence-transformers 5.4.0
            "processor_kwargs": tokenizer_kwargs,
            "config_kwargs": config_kwargs,
            "backend": backend,
        }
        self.model = SentenceTransformer(**load_options)

    def embed(self, data: list[str] | list[Image], **kwargs: Any) -> list[list[float]]:
        """
        Encode `data` with the loaded model and return the result converted with `tolist()`.

        :param data: Texts or images to encode.
        :param kwargs: Extra keyword arguments forwarded to the model's `encode` method.
        :returns: The encoded output as nested Python lists.
        """
        encoded = self.model.encode(data, **kwargs)
        return encoded.tolist()
|
haystack_integrations/components/embedders/sentence_transformers/embedding_backend/sparse_backend.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Any, ClassVar, Literal
|
|
7
|
+
|
|
8
|
+
from haystack.dataclasses.sparse_embedding import SparseEmbedding
|
|
9
|
+
from haystack.utils.auth import Secret
|
|
10
|
+
|
|
11
|
+
from sentence_transformers import SparseEncoder
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class _SentenceTransformersSparseEmbeddingBackendFactory:
    """
    Factory that builds Sentence Transformers sparse embedding backends and reuses them.

    Every backend that gets created is stored in a class-level registry. The registry key is the JSON
    serialization of all construction parameters, so asking twice for the same configuration returns
    the very same backend object instead of loading the model again.
    """

    # Registry of already constructed sparse backends, keyed by their serialized parameters.
    _instances: ClassVar[dict[str, "_SentenceTransformersSparseEncoderEmbeddingBackend"]] = {}

    @staticmethod
    def get_embedding_backend(
        *,
        model: str,
        device: str | None = None,
        auth_token: Secret | None = None,
        trust_remote_code: bool = False,
        revision: str | None = None,
        local_files_only: bool = False,
        model_kwargs: dict[str, Any] | None = None,
        tokenizer_kwargs: dict[str, Any] | None = None,
        config_kwargs: dict[str, Any] | None = None,
        backend: Literal["torch", "onnx", "openvino"] = "torch",
    ) -> "_SentenceTransformersSparseEncoderEmbeddingBackend":
        """
        Return the sparse backend matching the given parameters, creating it on first request.

        All parameters are forwarded unchanged, as keyword arguments, to
        `_SentenceTransformersSparseEncoderEmbeddingBackend`. They also all take part in the cache
        key, so changing any of them yields a distinct backend.

        :returns:
            The cached backend if one exists for this exact configuration, otherwise a new backend
            that has been added to the cache.
        """
        init_kwargs: dict[str, Any] = {
            "model": model,
            "device": device,
            "auth_token": auth_token,
            "trust_remote_code": trust_remote_code,
            "revision": revision,
            "local_files_only": local_files_only,
            "model_kwargs": model_kwargs,
            "tokenizer_kwargs": tokenizer_kwargs,
            "config_kwargs": config_kwargs,
            "backend": backend,
        }

        # Values JSON cannot encode natively are turned into text with `str` to build the key.
        cache_key = json.dumps(init_kwargs, sort_keys=True, default=str)

        registry = _SentenceTransformersSparseEmbeddingBackendFactory._instances
        if cache_key not in registry:
            registry[cache_key] = _SentenceTransformersSparseEncoderEmbeddingBackend(**init_kwargs)
        return registry[cache_key]
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class _SentenceTransformersSparseEncoderEmbeddingBackend:
    """
    Wrapper that loads a `SparseEncoder` model and turns its output into `SparseEmbedding` objects.
    """

    def __init__(
        self,
        *,
        model: str,
        device: str | None = None,
        auth_token: Secret | None = None,
        trust_remote_code: bool = False,
        revision: str | None = None,
        local_files_only: bool = False,
        model_kwargs: dict[str, Any] | None = None,
        tokenizer_kwargs: dict[str, Any] | None = None,
        config_kwargs: dict[str, Any] | None = None,
        backend: Literal["torch", "onnx", "openvino"] = "torch",
    ) -> None:
        """
        Load the sparse encoder and store it on `self.model`.

        :param model: Passed to `SparseEncoder` as `model_name_or_path`.
        :param auth_token: Optional secret; its resolved value is passed as `token`. When it is not
            set, `token` is `None`.
        :param tokenizer_kwargs: Passed to `SparseEncoder` as `processor_kwargs`.

        The remaining parameters are passed to `SparseEncoder` under their own names.
        """
        # The model constructor receives the resolved token value rather than the `Secret` wrapper.
        resolved_token = auth_token.resolve_value() if auth_token else None

        load_options: dict[str, Any] = {
            "model_name_or_path": model,
            "device": device,
            "token": resolved_token,
            "trust_remote_code": trust_remote_code,
            "revision": revision,
            "local_files_only": local_files_only,
            "model_kwargs": model_kwargs,
            # `tokenizer_kwargs` was renamed to `processor_kwargs` in sentence-transformers 5.4.0
            "processor_kwargs": tokenizer_kwargs,
            "config_kwargs": config_kwargs,
            "backend": backend,
        }
        self.model = SparseEncoder(**load_options)

    def embed(self, *, data: list[str], **kwargs: Any) -> list[SparseEmbedding]:
        """
        Encode `data` and convert every resulting sparse tensor into a `SparseEmbedding`.

        :param data: Texts to encode.
        :param kwargs: Extra keyword arguments forwarded to the model's `encode` method.
        :returns: One `SparseEmbedding` per encoded item, in the order returned by the model.
        """
        encoded = self.model.encode(
            data,
            convert_to_tensor=False,  # output is a list of individual tensors
            convert_to_sparse_tensor=True,
            **kwargs,
        )

        # Tensors are coalesced lazily, one at a time, right before they are converted.
        coalesced_tensors = (tensor.coalesce() for tensor in encoded)
        return [
            SparseEmbedding(
                indices=coalesced.indices()[0].tolist(),  # Only column indices
                values=coalesced.values().tolist(),
            )
            for coalesced in coalesced_tensors
        ]
|
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
from dataclasses import replace
|
|
6
|
+
from typing import Any, Literal
|
|
7
|
+
|
|
8
|
+
from haystack import Document, component, default_from_dict, default_to_dict
|
|
9
|
+
from haystack.components.converters.image.image_utils import (
|
|
10
|
+
_batch_convert_pdf_pages_to_images,
|
|
11
|
+
_extract_image_sources_info,
|
|
12
|
+
_PDFPageInfo,
|
|
13
|
+
)
|
|
14
|
+
from haystack.utils.auth import Secret
|
|
15
|
+
from haystack.utils.device import ComponentDevice
|
|
16
|
+
from haystack.utils.hf import deserialize_hf_model_kwargs, serialize_hf_model_kwargs
|
|
17
|
+
from PIL import Image
|
|
18
|
+
|
|
19
|
+
from haystack_integrations.components.embedders.sentence_transformers.embedding_backend.backend import (
|
|
20
|
+
_SentenceTransformersEmbeddingBackend,
|
|
21
|
+
_SentenceTransformersEmbeddingBackendFactory,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@component
class SentenceTransformersDocumentImageEmbedder:
    """
    A component for computing Document embeddings based on images using Sentence Transformers models.

    The embedding of each Document is stored in the `embedding` field of the Document.

    ### Usage example

    ```python
    from haystack import Document
    from haystack_integrations.components.embedders.sentence_transformers import (
        SentenceTransformersDocumentImageEmbedder,
    )

    embedder = SentenceTransformersDocumentImageEmbedder(model="sentence-transformers/clip-ViT-B-32")

    documents = [
        Document(content="A photo of a cat", meta={"file_path": "cat.jpg"}),
        Document(content="A photo of a dog", meta={"file_path": "dog.jpg"}),
    ]

    result = embedder.run(documents=documents)
    documents_with_embeddings = result["documents"]
    print(documents_with_embeddings)

    # [Document(id=...,
    #           content='A photo of a cat',
    #           meta={'file_path': 'cat.jpg',
    #                 'embedding_source': {'type': 'image', 'file_path_meta_field': 'file_path'}},
    #           embedding=vector of size 512),
    #  ...]
    ```
    """

    def __init__(
        self,
        *,
        file_path_meta_field: str = "file_path",
        root_path: str | None = None,
        model: str = "sentence-transformers/clip-ViT-B-32",
        device: ComponentDevice | None = None,
        token: Secret | None = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
        batch_size: int = 32,
        progress_bar: bool = True,
        normalize_embeddings: bool = False,
        trust_remote_code: bool = False,
        local_files_only: bool = False,
        model_kwargs: dict[str, Any] | None = None,
        tokenizer_kwargs: dict[str, Any] | None = None,
        config_kwargs: dict[str, Any] | None = None,
        precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = "float32",
        encode_kwargs: dict[str, Any] | None = None,
        backend: Literal["torch", "onnx", "openvino"] = "torch",
    ) -> None:
        """
        Creates a SentenceTransformersDocumentImageEmbedder component.

        :param file_path_meta_field: The metadata field in the Document that contains the file path to the image or PDF.
        :param root_path: The root directory path where document files are located. If provided, file paths in
            document metadata will be resolved relative to this path. If None, file paths are treated as absolute paths.
        :param model:
            The Sentence Transformers model to use for calculating embeddings. Pass a local path or ID of the model on
            Hugging Face. To be used with this component, the model must be able to embed images and text into the same
            vector space. Compatible models include:
            - "sentence-transformers/clip-ViT-B-32"
            - "sentence-transformers/clip-ViT-L-14"
            - "sentence-transformers/clip-ViT-B-16"
            - "sentence-transformers/clip-ViT-B-32-multilingual-v1"
            - "jinaai/jina-embeddings-v4"
            - "jinaai/jina-clip-v1"
            - "jinaai/jina-clip-v2".
        :param device:
            The device to use for loading the model.
            Overrides the default device.
        :param token:
            The API token to download private models from Hugging Face.
        :param batch_size:
            Number of documents to embed at once.
        :param progress_bar:
            If `True`, shows a progress bar when embedding documents.
        :param normalize_embeddings:
            If `True`, the embeddings are normalized using L2 normalization, so that each embedding has a norm of 1.
        :param trust_remote_code:
            If `False`, allows only Hugging Face verified model architectures.
            If `True`, allows custom models and scripts.
        :param local_files_only:
            If `True`, does not attempt to download the model from Hugging Face Hub and only looks at local files.
        :param model_kwargs:
            Additional keyword arguments passed to the underlying Hugging Face Transformers model
            when loading the model. Refer to specific model documentation for available kwargs.
        :param tokenizer_kwargs:
            Additional keyword arguments for `AutoTokenizer.from_pretrained` when loading the tokenizer.
            Refer to specific model documentation for available kwargs.
            If it contains a truthy `model_max_length`, that value is also applied as the model's
            `max_seq_length` when the component is warmed up.
        :param config_kwargs:
            Additional keyword arguments for `AutoConfig.from_pretrained` when loading the model configuration.
        :param precision:
            The precision to use for the embeddings.
            All non-float32 precisions are quantized embeddings.
            Quantized embeddings are smaller and faster to compute, but may have a lower accuracy.
            They are useful for reducing the size of the embeddings of a corpus for semantic search, among other tasks.
        :param encode_kwargs:
            Additional keyword arguments for `SentenceTransformer.encode` when embedding documents.
            This parameter is provided for fine customization. Be careful not to clash with already set parameters and
            avoid passing parameters that change the output type.
        :param backend:
            The backend to use for the Sentence Transformers model. Choose from "torch", "onnx", or "openvino".
            Refer to the [Sentence Transformers documentation](https://sbert.net/docs/sentence_transformer/usage/efficiency.html)
            for more information on acceleration and quantization options.
        """
        self.file_path_meta_field = file_path_meta_field
        # `None` is normalized to an empty string.
        self.root_path = root_path or ""
        self.model = model
        self.device = ComponentDevice.resolve_device(device)
        self.token = token
        self.batch_size = batch_size
        self.progress_bar = progress_bar
        self.normalize_embeddings = normalize_embeddings
        self.trust_remote_code = trust_remote_code
        self.local_files_only = local_files_only
        self.model_kwargs = model_kwargs
        self.tokenizer_kwargs = tokenizer_kwargs
        self.config_kwargs = config_kwargs
        self.encode_kwargs = encode_kwargs
        self.precision = precision
        self.backend = backend
        # The backend (and therefore the model) is created lazily in `warm_up`.
        self._embedding_backend: _SentenceTransformersEmbeddingBackend | None = None

    def to_dict(self) -> dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        serialization_dict = default_to_dict(
            self,
            file_path_meta_field=self.file_path_meta_field,
            root_path=self.root_path,
            model=self.model,
            device=self.device,
            token=self.token,
            batch_size=self.batch_size,
            progress_bar=self.progress_bar,
            normalize_embeddings=self.normalize_embeddings,
            trust_remote_code=self.trust_remote_code,
            local_files_only=self.local_files_only,
            model_kwargs=self.model_kwargs,
            tokenizer_kwargs=self.tokenizer_kwargs,
            config_kwargs=self.config_kwargs,
            precision=self.precision,
            encode_kwargs=self.encode_kwargs,
            backend=self.backend,
        )
        # The helper works on the `model_kwargs` dictionary in place; its return value is not used.
        if serialization_dict["init_parameters"].get("model_kwargs") is not None:
            serialize_hf_model_kwargs(serialization_dict["init_parameters"]["model_kwargs"])
        return serialization_dict

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "SentenceTransformersDocumentImageEmbedder":
        """
        Deserializes the component from a dictionary.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized component.
        """
        init_params = data["init_parameters"]
        # The helper works on the `model_kwargs` dictionary in place; its return value is not used.
        if init_params.get("model_kwargs") is not None:
            deserialize_hf_model_kwargs(init_params["model_kwargs"])
        return default_from_dict(cls, data)

    def warm_up(self) -> None:
        """
        Initializes the component.

        Creates the embedding backend on the first call; later calls do nothing.
        """
        if self._embedding_backend is None:
            self._embedding_backend = _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend(
                model=self.model,
                device=self.device.to_torch_str(),
                auth_token=self.token,
                trust_remote_code=self.trust_remote_code,
                local_files_only=self.local_files_only,
                model_kwargs=self.model_kwargs,
                tokenizer_kwargs=self.tokenizer_kwargs,
                config_kwargs=self.config_kwargs,
                backend=self.backend,
            )
            # Mirror an explicit tokenizer limit onto the model's own sequence length setting.
            if self.tokenizer_kwargs and self.tokenizer_kwargs.get("model_max_length"):
                self._embedding_backend.model.max_seq_length = self.tokenizer_kwargs["model_max_length"]

    @component.output_types(documents=list[Document])
    def run(self, documents: list[Document]) -> dict[str, list[Document]]:
        """
        Embed a list of documents.

        :param documents:
            Documents to embed.

        :returns:
            A dictionary with the following keys:
            - `documents`: Documents with embeddings.

        :raises TypeError:
            If `documents` is not a list, or its first element is not a `Document`.
        :raises RuntimeError:
            If no image could be obtained for one or more documents.
        """
        if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
            msg = (
                "SentenceTransformersDocumentImageEmbedder expects a list of Documents as input. "
                "In case you want to embed a string, please use the SentenceTransformersTextEmbedder."
            )
            raise TypeError(msg)
        if self._embedding_backend is None:
            self.warm_up()

        images_source_info = _extract_image_sources_info(
            documents=documents, file_path_meta_field=self.file_path_meta_field, root_path=self.root_path
        )

        # One slot per document; a slot that is still `None` afterwards marks a failed conversion.
        images_to_embed: list = [None] * len(documents)
        pdf_page_infos: list[_PDFPageInfo] = []

        for doc_idx, image_source_info in enumerate(images_source_info):
            if image_source_info["mime_type"] == "application/pdf":
                # Store PDF documents for later processing
                page_number = image_source_info.get("page_number")
                assert page_number is not None  # noqa: S101 # checked in _extract_image_sources_info but mypy doesn't know that
                pdf_page_info: _PDFPageInfo = {
                    "doc_idx": doc_idx,
                    "path": image_source_info["path"],
                    "page_number": page_number,
                }
                pdf_page_infos.append(pdf_page_info)
            else:
                # `Image.open` is lazy and keeps the file open until the pixel data is read. Decode
                # the image inside a context manager so the file handle is released immediately
                # rather than keeping one open handle per document until encoding starts.
                with Image.open(image_source_info["path"]) as image:
                    image.load()
                images_to_embed[doc_idx] = image

        pdf_images_by_doc_idx = _batch_convert_pdf_pages_to_images(pdf_page_infos=pdf_page_infos, return_base64=False)
        for doc_idx, pil_image in pdf_images_by_doc_idx.items():
            images_to_embed[doc_idx] = pil_image

        none_images_doc_ids = [documents[doc_idx].id for doc_idx, image in enumerate(images_to_embed) if image is None]
        if none_images_doc_ids:
            msg = f"Conversion failed for some documents. Document IDs: {none_images_doc_ids}."
            raise RuntimeError(msg)

        # mypy doesn't know this is set in warm_up
        embeddings = self._embedding_backend.embed(  # type: ignore[union-attr]
            data=images_to_embed,
            batch_size=self.batch_size,
            show_progress_bar=self.progress_bar,
            normalize_embeddings=self.normalize_embeddings,
            precision=self.precision,
            **(self.encode_kwargs if self.encode_kwargs else {}),
        )

        docs_with_embeddings = []
        for doc, emb in zip(documents, embeddings, strict=True):
            # we store this information for later inspection
            new_meta = {
                **doc.meta,
                "embedding_source": {"type": "image", "file_path_meta_field": self.file_path_meta_field},
            }
            # `replace` builds a new Document, so the input documents are left unmodified.
            new_doc = replace(doc, meta=new_meta, embedding=emb)
            docs_with_embeddings.append(new_doc)

        return {"documents": docs_with_embeddings}
|