amazon-bedrock-haystack 5.2.0__tar.gz → 5.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/CHANGELOG.md +24 -7
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/PKG-INFO +1 -1
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/pyproject.toml +3 -3
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/components/downloaders/s3/s3_downloader.py +10 -10
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/components/embedders/amazon_bedrock/document_embedder.py +27 -35
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/components/embedders/amazon_bedrock/document_image_embedder.py +31 -30
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/components/embedders/amazon_bedrock/text_embedder.py +23 -29
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/components/generators/amazon_bedrock/adapters.py +33 -33
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/components/generators/amazon_bedrock/chat/chat_generator.py +17 -17
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/components/generators/amazon_bedrock/chat/utils.py +22 -22
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/components/generators/amazon_bedrock/generator.py +10 -10
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/components/rankers/amazon_bedrock/ranker.py +7 -7
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/tests/test_chat_generator.py +2 -2
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/tests/test_document_embedder.py +35 -5
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/tests/test_document_image_embedder.py +15 -7
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/tests/test_generator.py +5 -5
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/tests/test_s3_downloader.py +3 -3
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/tests/test_text_embedder.py +27 -2
- amazon_bedrock_haystack-5.2.0/pydoc/config.yml +0 -42
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/.gitignore +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/LICENSE.txt +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/README.md +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/examples/bedrock_ranker_example.py +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/examples/chatgenerator_example.py +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/examples/embedders_generator_with_rag_example.py +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/examples/s3_downloader_example.py +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/pydoc/config_docusaurus.yml +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/common/amazon_bedrock/__init__.py +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/common/amazon_bedrock/errors.py +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/common/amazon_bedrock/utils.py +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/common/py.typed +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/common/s3/__init__.py +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/common/s3/errors.py +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/common/s3/utils.py +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/components/downloaders/py.typed +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/components/downloaders/s3/__init__.py +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/components/embedders/amazon_bedrock/__init__.py +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/components/embedders/py.typed +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/components/generators/amazon_bedrock/__init__.py +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/components/generators/amazon_bedrock/chat/__init__.py +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/components/generators/py.typed +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/components/rankers/amazon_bedrock/__init__.py +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/src/haystack_integrations/components/rankers/py.typed +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/tests/__init__.py +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/tests/conftest.py +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/tests/test_chat_generator_utils.py +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/tests/test_files/apple.jpg +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/tests/test_files/haystack-logo.png +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/tests/test_files/sample_pdf_1.pdf +0 -0
- {amazon_bedrock_haystack-5.2.0 → amazon_bedrock_haystack-5.3.1}/tests/test_ranker.py +0 -0
|
@@ -1,32 +1,49 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
-
## [integrations/amazon_bedrock-v5.
|
|
3
|
+
## [integrations/amazon_bedrock-v5.3.0] - 2025-12-17
|
|
4
4
|
|
|
5
5
|
### 🚀 Features
|
|
6
6
|
|
|
7
|
-
-
|
|
7
|
+
- `AmazonBedrockChatGenerator` update tools param to ToolsType (#2415)
|
|
8
|
+
- Cohere Embed v4 support in Bedrock (#2612)
|
|
8
9
|
|
|
9
|
-
|
|
10
|
+
### 📚 Documentation
|
|
11
|
+
|
|
12
|
+
- Add pydoc configurations for Docusaurus (#2411)
|
|
13
|
+
|
|
14
|
+
### ⚙️ CI
|
|
15
|
+
|
|
16
|
+
- Change pytest command (#2475)
|
|
10
17
|
|
|
11
18
|
### 🧹 Chores
|
|
12
19
|
|
|
13
|
-
-
|
|
20
|
+
- Remove Readme API CI workflow and configs (#2573)
|
|
21
|
+
|
|
22
|
+
### 🌀 Miscellaneous
|
|
14
23
|
|
|
15
|
-
|
|
24
|
+
- Adopt PEP 585 type hinting (part 2) (#2508)
|
|
25
|
+
|
|
26
|
+
## [integrations/amazon_bedrock-v5.1.0] - 2025-09-29
|
|
16
27
|
|
|
17
28
|
### 🚀 Features
|
|
18
29
|
|
|
19
|
-
-
|
|
30
|
+
- S3Downloader - add `s3_key_generation_function` param to customize S3 key generation (#2343)
|
|
31
|
+
|
|
20
32
|
|
|
21
|
-
## [integrations/amazon_bedrock-
|
|
33
|
+
## [integrations/amazon_bedrock-v5.0.0] - 2025-09-22
|
|
22
34
|
|
|
23
35
|
### 🚀 Features
|
|
24
36
|
|
|
25
37
|
- Support AWS Bedrock Guardrails in `AmazonBedrockChatGenerator` (#2284)
|
|
38
|
+
- Add a new `S3Downloader` component (#2192)
|
|
39
|
+
|
|
40
|
+
### 📚 Documentation
|
|
41
|
+
|
|
26
42
|
|
|
27
43
|
### 🧹 Chores
|
|
28
44
|
|
|
29
45
|
- Bedrock - remove unused `stop_words` init parameter (#2275)
|
|
46
|
+
- [**breaking**] Remove deprecated `BedrockRanker` (use `AmazonBedrockRanker` instead) (#2287)
|
|
30
47
|
|
|
31
48
|
### 🌀 Miscellaneous
|
|
32
49
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: amazon-bedrock-haystack
|
|
3
|
-
Version: 5.2.0
|
|
3
|
+
Version: 5.3.1
|
|
4
4
|
Summary: An integration of AWS S3 and Bedrock as a Downloader and Generator components.
|
|
5
5
|
Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/amazon_bedrock#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
|
|
@@ -46,7 +46,7 @@ installer = "uv"
|
|
|
46
46
|
dependencies = ["haystack-pydoc-tools", "ruff"]
|
|
47
47
|
|
|
48
48
|
[tool.hatch.envs.default.scripts]
|
|
49
|
-
docs = ["pydoc-markdown pydoc/config.yml"]
|
|
49
|
+
docs = ["pydoc-markdown pydoc/config_docusaurus.yml"]
|
|
50
50
|
fmt = "ruff check --fix {args} && ruff format {args}"
|
|
51
51
|
fmt-check = "ruff check {args} && ruff format --check {args}"
|
|
52
52
|
|
|
@@ -66,7 +66,7 @@ dependencies = [
|
|
|
66
66
|
unit = 'pytest -m "not integration" {args:tests}'
|
|
67
67
|
integration = 'pytest -m "integration" {args:tests}'
|
|
68
68
|
all = 'pytest {args:tests}'
|
|
69
|
-
cov-retry = '
|
|
69
|
+
cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x {args:tests}'
|
|
70
70
|
|
|
71
71
|
types = """mypy -p haystack_integrations.common.amazon_bedrock \
|
|
72
72
|
-p haystack_integrations.components.embedders.amazon_bedrock \
|
|
@@ -92,7 +92,7 @@ module = [
|
|
|
92
92
|
ignore_missing_imports = true
|
|
93
93
|
|
|
94
94
|
[tool.ruff]
|
|
95
|
-
target-version = "
|
|
95
|
+
target-version = "py39"
|
|
96
96
|
line-length = 120
|
|
97
97
|
|
|
98
98
|
[tool.ruff.lint]
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
import os
|
|
6
6
|
from concurrent.futures import ThreadPoolExecutor
|
|
7
7
|
from pathlib import Path
|
|
8
|
-
from typing import Any, Callable,
|
|
8
|
+
from typing import Any, Callable, Optional
|
|
9
9
|
|
|
10
10
|
from botocore.config import Config
|
|
11
11
|
from haystack import component, default_from_dict, default_to_dict, logging
|
|
@@ -36,9 +36,9 @@ class S3Downloader:
|
|
|
36
36
|
aws_session_token: Optional[Secret] = Secret.from_env_var("AWS_SESSION_TOKEN", strict=False), # noqa: B008
|
|
37
37
|
aws_region_name: Optional[Secret] = Secret.from_env_var("AWS_DEFAULT_REGION", strict=False), # noqa: B008
|
|
38
38
|
aws_profile_name: Optional[Secret] = Secret.from_env_var("AWS_PROFILE", strict=False), # noqa: B008
|
|
39
|
-
boto3_config: Optional[
|
|
39
|
+
boto3_config: Optional[dict[str, Any]] = None,
|
|
40
40
|
file_root_path: Optional[str] = None,
|
|
41
|
-
file_extensions: Optional[
|
|
41
|
+
file_extensions: Optional[list[str]] = None,
|
|
42
42
|
file_name_meta_key: str = "file_name",
|
|
43
43
|
max_workers: int = 32,
|
|
44
44
|
max_cache_size: int = 100,
|
|
@@ -126,11 +126,11 @@ class S3Downloader:
|
|
|
126
126
|
self.file_root_path.mkdir(parents=True, exist_ok=True)
|
|
127
127
|
self._storage = S3Storage.from_env(session=self._session, config=self._config)
|
|
128
128
|
|
|
129
|
-
@component.output_types(documents=
|
|
129
|
+
@component.output_types(documents=list[Document])
|
|
130
130
|
def run(
|
|
131
131
|
self,
|
|
132
|
-
documents:
|
|
133
|
-
) ->
|
|
132
|
+
documents: list[Document],
|
|
133
|
+
) -> dict[str, list[Document]]:
|
|
134
134
|
"""Download files from AWS S3 Buckets to local filesystem.
|
|
135
135
|
|
|
136
136
|
Return enriched `Document`s with the path of the downloaded file.
|
|
@@ -160,7 +160,7 @@ class S3Downloader:
|
|
|
160
160
|
downloaded_documents = [d for d in iterable if d is not None]
|
|
161
161
|
return {"documents": downloaded_documents}
|
|
162
162
|
|
|
163
|
-
def _filter_documents_by_extensions(self, documents:
|
|
163
|
+
def _filter_documents_by_extensions(self, documents: list[Document]) -> list[Document]:
|
|
164
164
|
"""Filter documents by file extensions."""
|
|
165
165
|
if not self.file_extensions:
|
|
166
166
|
return documents
|
|
@@ -202,7 +202,7 @@ class S3Downloader:
|
|
|
202
202
|
document.meta["file_path"] = str(file_path)
|
|
203
203
|
return document
|
|
204
204
|
|
|
205
|
-
def _cleanup_cache(self, documents:
|
|
205
|
+
def _cleanup_cache(self, documents: list[Document]) -> None:
|
|
206
206
|
"""
|
|
207
207
|
Remove least-recently-accessed cache files when cache exceeds `max_cache_size`.
|
|
208
208
|
|
|
@@ -224,7 +224,7 @@ class S3Downloader:
|
|
|
224
224
|
except Exception as error:
|
|
225
225
|
logger.warning("Failed to remove cache file at {path} with error: {e}", path=p, e=error)
|
|
226
226
|
|
|
227
|
-
def to_dict(self) ->
|
|
227
|
+
def to_dict(self) -> dict[str, Any]:
|
|
228
228
|
"""Serialize the component to a dictionary."""
|
|
229
229
|
|
|
230
230
|
s3_key_generation_function_name = (
|
|
@@ -247,7 +247,7 @@ class S3Downloader:
|
|
|
247
247
|
)
|
|
248
248
|
|
|
249
249
|
@classmethod
|
|
250
|
-
def from_dict(cls, data:
|
|
250
|
+
def from_dict(cls, data: dict[str, Any]) -> "S3Downloader":
|
|
251
251
|
"""
|
|
252
252
|
Deserializes the component from a dictionary.
|
|
253
253
|
:param data:
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import json
|
|
2
|
-
from typing import Any,
|
|
2
|
+
from typing import Any, Optional
|
|
3
3
|
|
|
4
4
|
from botocore.config import Config
|
|
5
5
|
from botocore.exceptions import ClientError
|
|
@@ -16,14 +16,6 @@ from haystack_integrations.common.amazon_bedrock.utils import get_aws_session
|
|
|
16
16
|
|
|
17
17
|
logger = logging.getLogger(__name__)
|
|
18
18
|
|
|
19
|
-
SUPPORTED_EMBEDDING_MODELS = [
|
|
20
|
-
"amazon.titan-embed-text-v1",
|
|
21
|
-
"cohere.embed-english-v3",
|
|
22
|
-
"cohere.embed-multilingual-v3",
|
|
23
|
-
"amazon.titan-embed-text-v2:0",
|
|
24
|
-
"amazon.titan-embed-image-v1",
|
|
25
|
-
]
|
|
26
|
-
|
|
27
19
|
|
|
28
20
|
@component
|
|
29
21
|
class AmazonBedrockDocumentEmbedder:
|
|
@@ -57,13 +49,7 @@ class AmazonBedrockDocumentEmbedder:
|
|
|
57
49
|
|
|
58
50
|
def __init__(
|
|
59
51
|
self,
|
|
60
|
-
model:
|
|
61
|
-
"amazon.titan-embed-text-v1",
|
|
62
|
-
"cohere.embed-english-v3",
|
|
63
|
-
"cohere.embed-multilingual-v3",
|
|
64
|
-
"amazon.titan-embed-text-v2:0",
|
|
65
|
-
"amazon.titan-embed-image-v1",
|
|
66
|
-
],
|
|
52
|
+
model: str,
|
|
67
53
|
aws_access_key_id: Optional[Secret] = Secret.from_env_var("AWS_ACCESS_KEY_ID", strict=False), # noqa: B008
|
|
68
54
|
aws_secret_access_key: Optional[Secret] = Secret.from_env_var( # noqa: B008
|
|
69
55
|
"AWS_SECRET_ACCESS_KEY", strict=False
|
|
@@ -73,9 +59,9 @@ class AmazonBedrockDocumentEmbedder:
|
|
|
73
59
|
aws_profile_name: Optional[Secret] = Secret.from_env_var("AWS_PROFILE", strict=False), # noqa: B008
|
|
74
60
|
batch_size: int = 32,
|
|
75
61
|
progress_bar: bool = True,
|
|
76
|
-
meta_fields_to_embed: Optional[
|
|
62
|
+
meta_fields_to_embed: Optional[list[str]] = None,
|
|
77
63
|
embedding_separator: str = "\n",
|
|
78
|
-
boto3_config: Optional[
|
|
64
|
+
boto3_config: Optional[dict[str, Any]] = None,
|
|
79
65
|
**kwargs: Any,
|
|
80
66
|
) -> None:
|
|
81
67
|
"""
|
|
@@ -88,8 +74,13 @@ class AmazonBedrockDocumentEmbedder:
|
|
|
88
74
|
constructor. Aside from model, three required parameters are `aws_access_key_id`, `aws_secret_access_key`,
|
|
89
75
|
and `aws_region_name`.
|
|
90
76
|
|
|
91
|
-
:param model: The embedding model to use.
|
|
92
|
-
|
|
77
|
+
:param model: The embedding model to use.
|
|
78
|
+
Amazon Titan and Cohere embedding models are supported, for example:
|
|
79
|
+
"amazon.titan-embed-text-v1", "amazon.titan-embed-text-v2:0", "amazon.titan-embed-image-v1",
|
|
80
|
+
"cohere.embed-english-v3", "cohere.embed-multilingual-v3", "cohere.embed-v4:0".
|
|
81
|
+
To find all supported models, refer to the Amazon Bedrock
|
|
82
|
+
[documentation](https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html) and
|
|
83
|
+
filter for "embedding", then select models from the Amazon Titan and Cohere series.
|
|
93
84
|
:param aws_access_key_id: AWS access key ID.
|
|
94
85
|
:param aws_secret_access_key: AWS secret access key.
|
|
95
86
|
:param aws_session_token: AWS session token.
|
|
@@ -107,11 +98,8 @@ class AmazonBedrockDocumentEmbedder:
|
|
|
107
98
|
:raises ValueError: If the model is not supported.
|
|
108
99
|
:raises AmazonBedrockConfigurationError: If the AWS environment is not configured correctly.
|
|
109
100
|
"""
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
msg = "Please provide a valid model from the list of supported models: " + ", ".join(
|
|
113
|
-
SUPPORTED_EMBEDDING_MODELS
|
|
114
|
-
)
|
|
101
|
+
if "titan" not in model and "cohere" not in model:
|
|
102
|
+
msg = f"Model {model} is not supported. Only Amazon Titan and Cohere embedding models are supported."
|
|
115
103
|
raise ValueError(msg)
|
|
116
104
|
|
|
117
105
|
self.model = model
|
|
@@ -149,7 +137,7 @@ class AmazonBedrockDocumentEmbedder:
|
|
|
149
137
|
)
|
|
150
138
|
raise AmazonBedrockConfigurationError(msg) from exception
|
|
151
139
|
|
|
152
|
-
def _prepare_texts_to_embed(self, documents:
|
|
140
|
+
def _prepare_texts_to_embed(self, documents: list[Document]) -> list[str]:
|
|
153
141
|
"""
|
|
154
142
|
Prepare the texts to embed by concatenating the Document text with the metadata fields to embed.
|
|
155
143
|
"""
|
|
@@ -162,7 +150,7 @@ class AmazonBedrockDocumentEmbedder:
|
|
|
162
150
|
texts_to_embed.append(text_to_embed)
|
|
163
151
|
return texts_to_embed
|
|
164
152
|
|
|
165
|
-
def _embed_cohere(self, documents:
|
|
153
|
+
def _embed_cohere(self, documents: list[Document]) -> list[Document]:
|
|
166
154
|
"""
|
|
167
155
|
Internal method to embed Documents using Cohere models.
|
|
168
156
|
Batch inference is supported.
|
|
@@ -191,15 +179,19 @@ class AmazonBedrockDocumentEmbedder:
|
|
|
191
179
|
msg = f"Could not perform inference for Amazon Bedrock model {self.model} due to:\n{exception}"
|
|
192
180
|
raise AmazonBedrockInferenceError(msg) from exception
|
|
193
181
|
|
|
194
|
-
|
|
195
|
-
|
|
182
|
+
cohere_embeddings = json.loads(response.get("body").read())["embeddings"]
|
|
183
|
+
# depending on the model, Cohere returns a dict with the embedding types as keys or a list of lists
|
|
184
|
+
embeddings_list = (
|
|
185
|
+
next(iter(cohere_embeddings.values())) if isinstance(cohere_embeddings, dict) else cohere_embeddings
|
|
186
|
+
)
|
|
187
|
+
all_embeddings.extend(embeddings_list)
|
|
196
188
|
|
|
197
189
|
for doc, emb in zip(documents, all_embeddings):
|
|
198
190
|
doc.embedding = emb
|
|
199
191
|
|
|
200
192
|
return documents
|
|
201
193
|
|
|
202
|
-
def _embed_titan(self, documents:
|
|
194
|
+
def _embed_titan(self, documents: list[Document]) -> list[Document]:
|
|
203
195
|
"""
|
|
204
196
|
Internal method to embed Documents using Amazon Titan models.
|
|
205
197
|
NOTE: Batch inference is not supported, so embeddings are created one by one.
|
|
@@ -227,8 +219,8 @@ class AmazonBedrockDocumentEmbedder:
|
|
|
227
219
|
|
|
228
220
|
return documents
|
|
229
221
|
|
|
230
|
-
@component.output_types(documents=
|
|
231
|
-
def run(self, documents:
|
|
222
|
+
@component.output_types(documents=list[Document])
|
|
223
|
+
def run(self, documents: list[Document]) -> dict[str, list[Document]]:
|
|
232
224
|
"""Embed the provided `Document`s using the specified model.
|
|
233
225
|
|
|
234
226
|
:param documents: The `Document`s to embed.
|
|
@@ -248,12 +240,12 @@ class AmazonBedrockDocumentEmbedder:
|
|
|
248
240
|
elif "titan" in self.model:
|
|
249
241
|
documents_with_embeddings = self._embed_titan(documents=documents)
|
|
250
242
|
else:
|
|
251
|
-
msg = f"Model {self.model} is not supported.
|
|
243
|
+
msg = f"Model {self.model} is not supported. Only Amazon Titan and Cohere embedding models are supported."
|
|
252
244
|
raise ValueError(msg)
|
|
253
245
|
|
|
254
246
|
return {"documents": documents_with_embeddings}
|
|
255
247
|
|
|
256
|
-
def to_dict(self) ->
|
|
248
|
+
def to_dict(self) -> dict[str, Any]:
|
|
257
249
|
"""
|
|
258
250
|
Serializes the component to a dictionary.
|
|
259
251
|
|
|
@@ -277,7 +269,7 @@ class AmazonBedrockDocumentEmbedder:
|
|
|
277
269
|
)
|
|
278
270
|
|
|
279
271
|
@classmethod
|
|
280
|
-
def from_dict(cls, data:
|
|
272
|
+
def from_dict(cls, data: dict[str, Any]) -> "AmazonBedrockDocumentEmbedder":
|
|
281
273
|
"""
|
|
282
274
|
Deserializes the component from a dictionary.
|
|
283
275
|
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
import json
|
|
6
6
|
from dataclasses import replace
|
|
7
|
-
from typing import Any,
|
|
7
|
+
from typing import Any, Optional
|
|
8
8
|
|
|
9
9
|
from botocore.config import Config
|
|
10
10
|
from botocore.exceptions import ClientError
|
|
@@ -27,8 +27,6 @@ from haystack_integrations.common.amazon_bedrock.utils import get_aws_session
|
|
|
27
27
|
|
|
28
28
|
logger = logging.getLogger(__name__)
|
|
29
29
|
|
|
30
|
-
SUPPORTED_EMBEDDING_MODELS = ["amazon.titan-embed-image-v1", "cohere.embed-english-v3", "cohere.embed-multilingual-v3"]
|
|
31
|
-
|
|
32
30
|
|
|
33
31
|
@component
|
|
34
32
|
class AmazonBedrockDocumentImageEmbedder:
|
|
@@ -69,7 +67,7 @@ class AmazonBedrockDocumentImageEmbedder:
|
|
|
69
67
|
def __init__(
|
|
70
68
|
self,
|
|
71
69
|
*,
|
|
72
|
-
model:
|
|
70
|
+
model: str,
|
|
73
71
|
aws_access_key_id: Optional[Secret] = Secret.from_env_var("AWS_ACCESS_KEY_ID", strict=False), # noqa: B008
|
|
74
72
|
aws_secret_access_key: Optional[Secret] = Secret.from_env_var( # noqa: B008
|
|
75
73
|
"AWS_SECRET_ACCESS_KEY", strict=False
|
|
@@ -79,20 +77,21 @@ class AmazonBedrockDocumentImageEmbedder:
|
|
|
79
77
|
aws_profile_name: Optional[Secret] = Secret.from_env_var("AWS_PROFILE", strict=False), # noqa: B008
|
|
80
78
|
file_path_meta_field: str = "file_path",
|
|
81
79
|
root_path: Optional[str] = None,
|
|
82
|
-
image_size: Optional[
|
|
80
|
+
image_size: Optional[tuple[int, int]] = None,
|
|
83
81
|
progress_bar: bool = True,
|
|
84
|
-
boto3_config: Optional[
|
|
82
|
+
boto3_config: Optional[dict[str, Any]] = None,
|
|
85
83
|
**kwargs: Any,
|
|
86
84
|
) -> None:
|
|
87
85
|
"""
|
|
88
86
|
Creates a AmazonBedrockDocumentImageEmbedder component.
|
|
89
87
|
|
|
90
|
-
:param model:
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
-
|
|
88
|
+
:param model: The embedding model to use.
|
|
89
|
+
Amazon Titan and Cohere multimodal embedding models are supported, for example:
|
|
90
|
+
"amazon.titan-embed-image-v1", "cohere.embed-english-v3", "cohere.embed-multilingual-v3",
|
|
91
|
+
"cohere.embed-v4:0".
|
|
92
|
+
To find all supported models, refer to the Amazon Bedrock
|
|
93
|
+
[documentation](https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html) and
|
|
94
|
+
filter for "embedding", then select multimodal models from the Amazon Titan and Cohere series.
|
|
96
95
|
:param aws_access_key_id: AWS access key ID.
|
|
97
96
|
:param aws_secret_access_key: AWS secret access key.
|
|
98
97
|
:param aws_session_token: AWS session token.
|
|
@@ -114,9 +113,10 @@ class AmazonBedrockDocumentImageEmbedder:
|
|
|
114
113
|
:raises ValueError: If the model is not supported.
|
|
115
114
|
:raises AmazonBedrockConfigurationError: If the AWS environment is not configured correctly.
|
|
116
115
|
"""
|
|
117
|
-
if not model
|
|
118
|
-
msg =
|
|
119
|
-
|
|
116
|
+
if "titan" not in model and "cohere" not in model:
|
|
117
|
+
msg = (
|
|
118
|
+
f"Model {model} is not supported. "
|
|
119
|
+
"Only Amazon Titan and Cohere multimodal embedding models are supported."
|
|
120
120
|
)
|
|
121
121
|
raise ValueError(msg)
|
|
122
122
|
|
|
@@ -135,14 +135,14 @@ class AmazonBedrockDocumentImageEmbedder:
|
|
|
135
135
|
self.kwargs = kwargs
|
|
136
136
|
self.embedding_types = None
|
|
137
137
|
|
|
138
|
-
if
|
|
139
|
-
if len(
|
|
138
|
+
if embedding_types := self.kwargs.get("embedding_types"):
|
|
139
|
+
if len(embedding_types) > 1:
|
|
140
140
|
msg = (
|
|
141
141
|
"You have provided multiple embedding_types for Cohere model. "
|
|
142
142
|
"AmazonBedrockDocumentImageEmbedder only supports one embedding_type at a time."
|
|
143
143
|
)
|
|
144
144
|
raise ValueError(msg)
|
|
145
|
-
self.embedding_types =
|
|
145
|
+
self.embedding_types = embedding_types
|
|
146
146
|
|
|
147
147
|
def resolve_secret(secret: Optional[Secret]) -> Optional[str]:
|
|
148
148
|
return secret.resolve_value() if secret else None
|
|
@@ -280,7 +280,10 @@ class AmazonBedrockDocumentImageEmbedder:
|
|
|
280
280
|
elif "titan" in self.model:
|
|
281
281
|
embeddings = self._embed_titan(images=images_to_embed)
|
|
282
282
|
else:
|
|
283
|
-
msg =
|
|
283
|
+
msg = (
|
|
284
|
+
f"Model {self.model} is not supported. "
|
|
285
|
+
"Only Amazon Titan and Cohere multimodal embedding models are supported."
|
|
286
|
+
)
|
|
284
287
|
raise ValueError(msg)
|
|
285
288
|
|
|
286
289
|
docs_with_embeddings = []
|
|
@@ -296,7 +299,7 @@ class AmazonBedrockDocumentImageEmbedder:
|
|
|
296
299
|
|
|
297
300
|
return {"documents": docs_with_embeddings}
|
|
298
301
|
|
|
299
|
-
def _embed_titan(self, images:
|
|
302
|
+
def _embed_titan(self, images: list[str]) -> list[list[float]]:
|
|
300
303
|
"""
|
|
301
304
|
Internal method to embed base64 images using Amazon Titan models.
|
|
302
305
|
|
|
@@ -326,7 +329,7 @@ class AmazonBedrockDocumentImageEmbedder:
|
|
|
326
329
|
|
|
327
330
|
return all_embeddings
|
|
328
331
|
|
|
329
|
-
def _embed_cohere(self, image_uris:
|
|
332
|
+
def _embed_cohere(self, image_uris: list[str]) -> list[list[float]]:
|
|
330
333
|
"""
|
|
331
334
|
Internal method to embed base64 images using Cohere models.
|
|
332
335
|
|
|
@@ -351,15 +354,13 @@ class AmazonBedrockDocumentImageEmbedder:
|
|
|
351
354
|
raise AmazonBedrockInferenceError(msg) from exception
|
|
352
355
|
|
|
353
356
|
response_body = json.loads(response.get("body").read())
|
|
354
|
-
|
|
357
|
+
cohere_embeddings = response_body["embeddings"]
|
|
355
358
|
|
|
356
|
-
#
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
# a nested list of float embeddings
|
|
363
|
-
all_embeddings.append(embeddings[0])
|
|
359
|
+
# depending on the model and embedding_types, Cohere returns a dict with the embedding types as keys
|
|
360
|
+
# or a list of lists
|
|
361
|
+
embeddings_list = (
|
|
362
|
+
next(iter(cohere_embeddings.values())) if isinstance(cohere_embeddings, dict) else cohere_embeddings
|
|
363
|
+
)
|
|
364
|
+
all_embeddings.extend(embeddings_list)
|
|
364
365
|
|
|
365
366
|
return all_embeddings
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import json
|
|
2
|
-
from typing import Any,
|
|
2
|
+
from typing import Any, Optional
|
|
3
3
|
|
|
4
4
|
from botocore.config import Config
|
|
5
5
|
from botocore.exceptions import ClientError
|
|
@@ -14,14 +14,6 @@ from haystack_integrations.common.amazon_bedrock.utils import get_aws_session
|
|
|
14
14
|
|
|
15
15
|
logger = logging.getLogger(__name__)
|
|
16
16
|
|
|
17
|
-
SUPPORTED_EMBEDDING_MODELS = [
|
|
18
|
-
"amazon.titan-embed-text-v1",
|
|
19
|
-
"cohere.embed-english-v3",
|
|
20
|
-
"cohere.embed-multilingual-v3",
|
|
21
|
-
"amazon.titan-embed-text-v2:0",
|
|
22
|
-
"amazon.titan-embed-image-v1",
|
|
23
|
-
]
|
|
24
|
-
|
|
25
17
|
|
|
26
18
|
@component
|
|
27
19
|
class AmazonBedrockTextEmbedder:
|
|
@@ -50,13 +42,7 @@ class AmazonBedrockTextEmbedder:
|
|
|
50
42
|
|
|
51
43
|
def __init__(
|
|
52
44
|
self,
|
|
53
|
-
model:
|
|
54
|
-
"amazon.titan-embed-text-v1",
|
|
55
|
-
"cohere.embed-english-v3",
|
|
56
|
-
"cohere.embed-multilingual-v3",
|
|
57
|
-
"amazon.titan-embed-text-v2:0",
|
|
58
|
-
"amazon.titan-embed-image-v1",
|
|
59
|
-
],
|
|
45
|
+
model: str,
|
|
60
46
|
aws_access_key_id: Optional[Secret] = Secret.from_env_var("AWS_ACCESS_KEY_ID", strict=False), # noqa: B008
|
|
61
47
|
aws_secret_access_key: Optional[Secret] = Secret.from_env_var( # noqa: B008
|
|
62
48
|
"AWS_SECRET_ACCESS_KEY", strict=False
|
|
@@ -64,7 +50,7 @@ class AmazonBedrockTextEmbedder:
|
|
|
64
50
|
aws_session_token: Optional[Secret] = Secret.from_env_var("AWS_SESSION_TOKEN", strict=False), # noqa: B008
|
|
65
51
|
aws_region_name: Optional[Secret] = Secret.from_env_var("AWS_DEFAULT_REGION", strict=False), # noqa: B008
|
|
66
52
|
aws_profile_name: Optional[Secret] = Secret.from_env_var("AWS_PROFILE", strict=False), # noqa: B008
|
|
67
|
-
boto3_config: Optional[
|
|
53
|
+
boto3_config: Optional[dict[str, Any]] = None,
|
|
68
54
|
**kwargs: Any,
|
|
69
55
|
) -> None:
|
|
70
56
|
"""
|
|
@@ -77,8 +63,13 @@ class AmazonBedrockTextEmbedder:
|
|
|
77
63
|
constructor. Aside from model, three required parameters are `aws_access_key_id`, `aws_secret_access_key`,
|
|
78
64
|
and `aws_region_name`.
|
|
79
65
|
|
|
80
|
-
:param model: The embedding model to use.
|
|
81
|
-
|
|
66
|
+
:param model: The embedding model to use.
|
|
67
|
+
Amazon Titan and Cohere embedding models are supported, for example:
|
|
68
|
+
"amazon.titan-embed-text-v1", "amazon.titan-embed-text-v2:0", "amazon.titan-embed-image-v1",
|
|
69
|
+
"cohere.embed-english-v3", "cohere.embed-multilingual-v3", "cohere.embed-v4:0".
|
|
70
|
+
To find all supported models, refer to the Amazon Bedrock
|
|
71
|
+
[documentation](https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html) and
|
|
72
|
+
filter for "embedding", then select models from the Amazon Titan and Cohere series.
|
|
82
73
|
:param aws_access_key_id: AWS access key ID.
|
|
83
74
|
:param aws_secret_access_key: AWS secret access key.
|
|
84
75
|
:param aws_session_token: AWS session token.
|
|
@@ -90,10 +81,8 @@ class AmazonBedrockTextEmbedder:
|
|
|
90
81
|
:raises ValueError: If the model is not supported.
|
|
91
82
|
:raises AmazonBedrockConfigurationError: If the AWS environment is not configured correctly.
|
|
92
83
|
"""
|
|
93
|
-
if not model
|
|
94
|
-
msg = "
|
|
95
|
-
SUPPORTED_EMBEDDING_MODELS
|
|
96
|
-
)
|
|
84
|
+
if "titan" not in model and "cohere" not in model:
|
|
85
|
+
msg = f"Model {model} is not supported. Only Amazon Titan and Cohere embedding models are supported."
|
|
97
86
|
raise ValueError(msg)
|
|
98
87
|
|
|
99
88
|
self.model = model
|
|
@@ -127,8 +116,8 @@ class AmazonBedrockTextEmbedder:
|
|
|
127
116
|
)
|
|
128
117
|
raise AmazonBedrockConfigurationError(msg) from exception
|
|
129
118
|
|
|
130
|
-
@component.output_types(embedding=
|
|
131
|
-
def run(self, text: str) ->
|
|
119
|
+
@component.output_types(embedding=list[float])
|
|
120
|
+
def run(self, text: str) -> dict[str, list[float]]:
|
|
132
121
|
"""Embeds the input text using the Amazon Bedrock model.
|
|
133
122
|
|
|
134
123
|
:param text: The input text to embed.
|
|
@@ -168,16 +157,21 @@ class AmazonBedrockTextEmbedder:
|
|
|
168
157
|
response_body = json.loads(response.get("body").read())
|
|
169
158
|
|
|
170
159
|
if "cohere" in self.model:
|
|
171
|
-
|
|
160
|
+
cohere_embeddings = response_body["embeddings"]
|
|
161
|
+
# depending on the model, Cohere returns a dict with the embedding types as keys or a list of lists
|
|
162
|
+
embeddings_list = (
|
|
163
|
+
next(iter(cohere_embeddings.values())) if isinstance(cohere_embeddings, dict) else cohere_embeddings
|
|
164
|
+
)
|
|
165
|
+
embedding = embeddings_list[0]
|
|
172
166
|
elif "titan" in self.model:
|
|
173
167
|
embedding = response_body["embedding"]
|
|
174
168
|
else:
|
|
175
|
-
msg = f"
|
|
169
|
+
msg = f"Model {self.model} is not supported. Only Amazon Titan and Cohere embedding models are supported."
|
|
176
170
|
raise ValueError(msg)
|
|
177
171
|
|
|
178
172
|
return {"embedding": embedding}
|
|
179
173
|
|
|
180
|
-
def to_dict(self) ->
|
|
174
|
+
def to_dict(self) -> dict[str, Any]:
|
|
181
175
|
"""
|
|
182
176
|
Serializes the component to a dictionary.
|
|
183
177
|
|
|
@@ -197,7 +191,7 @@ class AmazonBedrockTextEmbedder:
|
|
|
197
191
|
)
|
|
198
192
|
|
|
199
193
|
@classmethod
|
|
200
|
-
def from_dict(cls, data:
|
|
194
|
+
def from_dict(cls, data: dict[str, Any]) -> "AmazonBedrockTextEmbedder":
|
|
201
195
|
"""
|
|
202
196
|
Deserializes the component from a dictionary.
|
|
203
197
|
|