amazon-bedrock-haystack 4.0.0__tar.gz → 4.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/CHANGELOG.md +32 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/PKG-INFO +2 -2
- amazon_bedrock_haystack-4.2.0/examples/s3_downloader_example.py +83 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/pydoc/config.yml +3 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/pyproject.toml +5 -2
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/common/amazon_bedrock/errors.py +5 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/common/amazon_bedrock/utils.py +4 -0
- amazon_bedrock_haystack-4.2.0/src/haystack_integrations/common/s3/errors.py +15 -0
- amazon_bedrock_haystack-4.2.0/src/haystack_integrations/common/s3/utils.py +120 -0
- amazon_bedrock_haystack-4.2.0/src/haystack_integrations/components/downloaders/s3/__init__.py +7 -0
- amazon_bedrock_haystack-4.2.0/src/haystack_integrations/components/downloaders/s3/s3_downloader.py +246 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/components/generators/amazon_bedrock/chat/chat_generator.py +29 -8
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/components/generators/amazon_bedrock/chat/utils.py +53 -15
- amazon_bedrock_haystack-4.2.0/src/haystack_integrations/components/rankers/py.typed +0 -0
- amazon_bedrock_haystack-4.2.0/tests/__init__.py +3 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/tests/test_chat_generator.py +46 -6
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/tests/test_chat_generator_utils.py +174 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/tests/test_document_image_embedder.py +1 -1
- amazon_bedrock_haystack-4.2.0/tests/test_s3_downloader.py +229 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/.gitignore +0 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/LICENSE.txt +0 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/README.md +0 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/examples/bedrock_ranker_example.py +0 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/examples/chatgenerator_example.py +0 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/examples/embedders_generator_with_rag_example.py +0 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/common/amazon_bedrock/__init__.py +0 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/common/py.typed +0 -0
- {amazon_bedrock_haystack-4.0.0/src/haystack_integrations/components/generators/amazon_bedrock/chat → amazon_bedrock_haystack-4.2.0/src/haystack_integrations/common/s3}/__init__.py +0 -0
- {amazon_bedrock_haystack-4.0.0/src/haystack_integrations/components/embedders → amazon_bedrock_haystack-4.2.0/src/haystack_integrations/components/downloaders}/py.typed +0 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/components/embedders/amazon_bedrock/__init__.py +0 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/components/embedders/amazon_bedrock/document_embedder.py +0 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/components/embedders/amazon_bedrock/document_image_embedder.py +0 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/components/embedders/amazon_bedrock/text_embedder.py +0 -0
- {amazon_bedrock_haystack-4.0.0/src/haystack_integrations/components/generators → amazon_bedrock_haystack-4.2.0/src/haystack_integrations/components/embedders}/py.typed +0 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/components/generators/amazon_bedrock/__init__.py +0 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/components/generators/amazon_bedrock/adapters.py +0 -0
- {amazon_bedrock_haystack-4.0.0/tests → amazon_bedrock_haystack-4.2.0/src/haystack_integrations/components/generators/amazon_bedrock/chat}/__init__.py +0 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/components/generators/amazon_bedrock/generator.py +0 -0
- {amazon_bedrock_haystack-4.0.0/src/haystack_integrations/components/rankers → amazon_bedrock_haystack-4.2.0/src/haystack_integrations/components/generators}/py.typed +0 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/components/rankers/amazon_bedrock/__init__.py +0 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/components/rankers/amazon_bedrock/ranker.py +0 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/tests/conftest.py +0 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/tests/test_document_embedder.py +0 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/tests/test_files/apple.jpg +0 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/tests/test_files/haystack-logo.png +0 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/tests/test_files/sample_pdf_1.pdf +0 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/tests/test_generator.py +0 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/tests/test_ranker.py +0 -0
- {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/tests/test_text_embedder.py +0 -0
|
@@ -1,11 +1,43 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [integrations/amazon_bedrock-v4.1.0] - 2025-09-19
|
|
4
|
+
|
|
5
|
+
### 🚀 Features
|
|
6
|
+
|
|
7
|
+
- Support AWS Bedrock Guardrails in `AmazonBedrockChatGenerator` (#2284)
|
|
8
|
+
|
|
9
|
+
### 🧹 Chores
|
|
10
|
+
|
|
11
|
+
- Bedrock - remove unused `stop_words` init parameter (#2275)
|
|
12
|
+
|
|
13
|
+
### 🌀 Miscellaneous
|
|
14
|
+
|
|
15
|
+
- Chore: Fix linting aws bedrock (#2249)
|
|
16
|
+
|
|
17
|
+
## [integrations/amazon_bedrock-v4.0.0] - 2025-08-29
|
|
18
|
+
|
|
19
|
+
### 🚀 Features
|
|
20
|
+
|
|
21
|
+
- [**breaking**] Update AmazonBedrockChatGenerator to use the new fields in `StreamingChunk` (#2216)
|
|
22
|
+
- [**breaking**] Use `ReasoningContent` to store reasoning content in `ChatMessage` instead of `ChatMessage.meta` (#2226)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
### 🧹 Chores
|
|
26
|
+
|
|
27
|
+
- Standardize readmes - part 2 (#2205)
|
|
28
|
+
|
|
3
29
|
## [integrations/amazon_bedrock-v3.11.0] - 2025-08-21
|
|
4
30
|
|
|
5
31
|
### 🚀 Features
|
|
6
32
|
|
|
7
33
|
- Add `AmazonBedrockDocumentImageEmbedder` component (#2185)
|
|
8
34
|
|
|
35
|
+
### 🧹 Chores
|
|
36
|
+
|
|
37
|
+
- Add framework name into UserAgent header for bedrock integration (#2168)
|
|
38
|
+
- Standardize readmes - part 1 (#2202)
|
|
39
|
+
|
|
40
|
+
|
|
9
41
|
## [integrations/amazon_bedrock-v3.10.0] - 2025-08-06
|
|
10
42
|
|
|
11
43
|
### 🚀 Features
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: amazon-bedrock-haystack
|
|
3
|
-
Version: 4.0.0
|
|
4
|
-
Summary: An integration of
|
|
3
|
+
Version: 4.2.0
|
|
4
|
+
Summary: An integration of AWS S3 and Bedrock as a Downloader and Generator components.
|
|
5
5
|
Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/amazon_bedrock#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
|
|
7
7
|
Project-URL: Source, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/amazon_bedrock
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# To run this example, you will need to
|
|
2
|
+
# 1) set `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` and `AWS_DEFAULT_REGION` environment variables
|
|
3
|
+
# 2) enable access to the selected S3 bucket
|
|
4
|
+
# 3) set the `S3_DOWNLOADER_BUCKET` environment variable to the name of the S3 bucket.
|
|
5
|
+
|
|
6
|
+
# The example shows how to use the S3Downloader component in a query pipeline to download files from an S3 bucket.
|
|
7
|
+
# To run this example, set the file_name in docs to your files in the S3 bucket.
|
|
8
|
+
# The files are then downloaded, converted to images and used to answer a question.
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from uuid import uuid4
|
|
13
|
+
|
|
14
|
+
from haystack import Pipeline
|
|
15
|
+
from haystack.components.builders import ChatPromptBuilder
|
|
16
|
+
from haystack.components.converters.image import DocumentToImageContent
|
|
17
|
+
from haystack.components.routers import DocumentTypeRouter
|
|
18
|
+
from haystack.dataclasses import Document
|
|
19
|
+
|
|
20
|
+
from haystack_integrations.components.downloaders.s3.s3_downloader import S3Downloader
|
|
21
|
+
from haystack_integrations.components.generators.amazon_bedrock import AmazonBedrockChatGenerator
|
|
22
|
+
|
|
23
|
+
# Documents that only carry metadata: `file_name` is the key the S3Downloader looks up
# (its default `file_name_meta_key` is "file_name").
docs = [
    Document(meta={"file_id": str(uuid4()), "file_name": "text-sample.txt"}),
    Document(meta={"file_id": str(uuid4()), "file_name": "document-sample.pdf", "page_number": 1}),
]

# Prompt with a system message and a user message; the user message lists the text
# documents, the question, and any image parts produced from the downloaded files.
chat_prompt_builder = ChatPromptBuilder(
    required_variables=["question"],
    template="""{% message role="system" %}
You are a friendly assistant that answers questions based on provided documents and images.
{% endmessage %}

{%- message role="user" -%}
Only provide an answer to the question using the images and text passages provided.

These are the text-only documents:
{%- if documents|length > 0 %}
{%- for doc in documents %}
Text Document [{{ loop.index }}] :
{{ doc.content }}
{% endfor -%}
{%- else %}
No relevant text documents were found.
{% endif %}
End of text documents.

Question: {{ question }}
Answer:

Images:
{%- if image_contents|length > 0 %}
{%- for img in image_contents -%}
{{ img | templatize_part }}
{%- endfor -%}
{% endif %}
{%- endmessage -%}
""",
)

pipe = Pipeline()
# Files are written below ./s3_downloads; with `file_extensions=[".pdf"]` only documents
# whose file name ends in ".pdf" are downloaded, so "text-sample.txt" is filtered out.
pipe.add_component(
    "s3_downloader", S3Downloader(file_root_path=str(Path.cwd() / "s3_downloads"), file_extensions=[".pdf"])
)
# Routes on the `file_path` meta field that the downloader adds to each document.
pipe.add_component(
    "doc_type_router", DocumentTypeRouter(file_path_meta_field="file_path", mime_types=["application/pdf"])
)
pipe.add_component("doc_to_image", DocumentToImageContent(detail="auto"))
pipe.add_component("chat_prompt_builder", chat_prompt_builder)
pipe.add_component("llm", AmazonBedrockChatGenerator(model="anthropic.claude-3-haiku-20240307-v1:0"))

# Wiring: downloader -> router -> image converter -> prompt builder -> LLM.
pipe.connect("s3_downloader.documents", "doc_type_router.documents")
pipe.connect("doc_type_router.application/pdf", "doc_to_image.documents")
pipe.connect("doc_to_image.image_contents", "chat_prompt_builder.image_contents")
# The downloaded documents are also passed to the prompt as the "text-only documents".
pipe.connect("s3_downloader.documents", "chat_prompt_builder.documents")
pipe.connect("chat_prompt_builder.prompt", "llm.messages")

# NOTE(review): `result` is assigned but never printed or inspected in this example.
result = pipe.run(
    data={
        "s3_downloader": {"documents": docs},
        "chat_prompt_builder": {"question": "What is the main topic of the document?"},
    }
)
|
|
@@ -11,6 +11,9 @@ loaders:
|
|
|
11
11
|
"haystack_integrations.common.amazon_bedrock.errors",
|
|
12
12
|
"haystack_integrations.components.generators.amazon_bedrock.chat.chat_generator",
|
|
13
13
|
"haystack_integrations.components.rankers.amazon_bedrock.ranker",
|
|
14
|
+
"haystack_integrations.components.downloaders.s3.s3_downloader",
|
|
15
|
+
"haystack_integrations.common.s3.utils",
|
|
16
|
+
"haystack_integrations.common.s3.errors",
|
|
14
17
|
]
|
|
15
18
|
ignore_when_discovered: ["__init__"]
|
|
16
19
|
processors:
|
|
@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "amazon-bedrock-haystack"
|
|
7
7
|
dynamic = ["version"]
|
|
8
|
-
description = 'An integration of
|
|
8
|
+
description = 'An integration of AWS S3 and Bedrock as a Downloader and Generator components.'
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
11
11
|
license = "Apache-2.0"
|
|
@@ -71,7 +71,10 @@ cov-retry = 'all --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x'
|
|
|
71
71
|
types = """mypy -p haystack_integrations.common.amazon_bedrock \
|
|
72
72
|
-p haystack_integrations.components.embedders.amazon_bedrock \
|
|
73
73
|
-p haystack_integrations.components.generators.amazon_bedrock \
|
|
74
|
-
-p haystack_integrations.components.rankers.amazon_bedrock
|
|
74
|
+
-p haystack_integrations.components.rankers.amazon_bedrock \
|
|
75
|
+
-p haystack_integrations.components.downloaders.s3 \
|
|
76
|
+
-p haystack_integrations.common.s3 {args}"""
|
|
77
|
+
|
|
75
78
|
|
|
76
79
|
[tool.mypy]
|
|
77
80
|
install_types = true
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class S3Error(Exception):
    """Base class for every error raised by the S3-based components."""


class S3ConfigurationError(S3Error):
    """Raised when an S3 component is misconfigured (for example, bad or missing credentials)."""


class S3StorageError(S3Error):
    """Raised when an operation on an `S3Storage` object fails."""
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from http import HTTPStatus
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
from boto3.session import Session
|
|
12
|
+
from botocore.config import Config
|
|
13
|
+
from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError
|
|
14
|
+
|
|
15
|
+
from haystack_integrations.common.s3.errors import S3ConfigurationError, S3StorageError
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# NOTE(review): the class declares no annotated fields and defines its own `__init__`, so
# `@dataclass` contributes no fields here. It is kept only so the class interface stays unchanged.
@dataclass
class S3Storage:
    """Download files from a single AWS S3 bucket through a boto3 client."""

    def __init__(
        self,
        s3_bucket: str,
        session: Session,
        s3_prefix: Optional[str] = None,
        endpoint_url: Optional[str] = None,
        config: Optional[Config] = None,
    ) -> None:
        """
        Initializes the S3Storage object with the provided parameters.

        :param s3_bucket: The name of the S3 bucket to download files from.
        :param session: The session to use for the S3 client.
        :param s3_prefix: The optional prefix of the files in the S3 bucket.
            Can be used to specify folder or naming structure.
            For example, if the file is in the folder "folder/subfolder/file.txt",
            the s3_prefix should be "folder/subfolder/". If the file is in the root of the S3 bucket,
            the s3_prefix should be None. The prefix is prepended to the key verbatim, so it
            must include its own trailing separator.
        :param endpoint_url: The endpoint URL of the S3 bucket to download files from.
        :param config: The configuration to use for the S3 client.
        :raises S3ConfigurationError: If the S3 client cannot be created from the session.
        """

        self.s3_bucket = s3_bucket
        self.s3_prefix = s3_prefix
        self.endpoint_url = endpoint_url
        self.session = session
        self.config = config

        try:
            self._client = self.session.client("s3", endpoint_url=self.endpoint_url, config=self.config)
        except Exception as e:
            msg = f"Failed to create S3 session client: {e}"
            raise S3ConfigurationError(msg) from e

    @staticmethod
    def _http_status(error: ClientError) -> Optional[int]:
        """
        Extract an HTTP status code from a `ClientError`.

        The `Error.Code` entry is used when it is numeric (e.g. "404"). Symbolic codes such as
        "NoSuchKey" cannot be converted to an int, so the `ResponseMetadata.HTTPStatusCode`
        entry is used as a fallback.

        :param error: The client error raised by the S3 client.
        :returns: The HTTP status code, or None if the response carries none.
        """
        response = getattr(error, "response", None) or {}
        code = str(response.get("Error", {}).get("Code", ""))
        if code.isdigit():
            return int(code)
        status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
        return status if isinstance(status, int) else None

    def download(self, key: str, local_file_path: Path) -> None:
        """Download a file from S3.

        :param key: The key of the file to download. `s3_prefix`, when set, is prepended to it.
        :param local_file_path: The local path the file is written to.
            Its parent folder is created if it does not exist.
        :raises S3ConfigurationError: If AWS credentials are missing or incomplete,
            or if access to the bucket is forbidden.
        :raises S3StorageError: If the file does not exist in the S3 bucket
            or the file cannot be downloaded.
        """

        s3_key = f"{self.s3_prefix}{key}" if self.s3_prefix else key

        # Keys may contain "/" separators; make sure the destination folder exists first.
        Path(local_file_path).parent.mkdir(parents=True, exist_ok=True)

        try:
            self._client.download_file(self.s3_bucket, s3_key, str(local_file_path))

        except (NoCredentialsError, PartialCredentialsError) as e:
            msg = (
                f"Missing AWS credentials. Please check your AWS credentials (access key, secret key, region). "
                f"Error: {e}"
            )
            raise S3ConfigurationError(msg) from e

        except ClientError as e:
            # Do not call int() on the raw error code: S3 also reports symbolic codes
            # ("NoSuchKey", "AccessDenied", ...), which would raise ValueError here.
            status = self._http_status(e)

            if status == HTTPStatus.FORBIDDEN:
                msg = (
                    f"Failed to access S3 bucket {self.s3_bucket!r}. "
                    f"Please check your AWS credentials (access key, secret key, region) and ensure "
                    f"they have the necessary S3 permissions. "
                    f"Error: {e}"
                )
                raise S3ConfigurationError(msg) from e

            if status == HTTPStatus.NOT_FOUND:
                msg = f"The object {s3_key!r} does not exist in the S3 bucket {self.s3_bucket!r}. \n Error: {e}"
                raise S3StorageError(msg) from e

            msg = f"Failed to download file {s3_key!r} from S3. Error: {e}"
            raise S3StorageError(msg) from e

    @classmethod
    def from_env(cls, *, session: Session, config: Config) -> "S3Storage":
        """
        Create a S3Storage object from environment variables.

        Reads `S3_DOWNLOADER_BUCKET` (required), `S3_DOWNLOADER_PREFIX` (optional) and
        `AWS_ENDPOINT_URL` (optional). Empty optional values are treated as unset.

        :param session: The session to use for the S3 client.
        :param config: The configuration to use for the S3 client.
        :returns: The configured S3Storage object.
        :raises ValueError: If `S3_DOWNLOADER_BUCKET` is not set or is empty.
        """
        s3_bucket = os.getenv("S3_DOWNLOADER_BUCKET")
        if not s3_bucket:
            msg = (
                "Missing environment variable S3_DOWNLOADER_BUCKET. "
                "Please set it to the name of the S3 bucket to download files from."
            )
            raise ValueError(msg)
        s3_prefix = os.getenv("S3_DOWNLOADER_PREFIX") or None
        endpoint_url = os.getenv("AWS_ENDPOINT_URL") or None
        return cls(
            s3_bucket=s3_bucket,
            s3_prefix=s3_prefix,
            endpoint_url=endpoint_url,
            session=session,
            config=config,
        )
|
amazon_bedrock_haystack-4.2.0/src/haystack_integrations/components/downloaders/s3/s3_downloader.py
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, Dict, List, Optional
|
|
9
|
+
|
|
10
|
+
from botocore.config import Config
|
|
11
|
+
from haystack import component, default_from_dict, default_to_dict, logging
|
|
12
|
+
from haystack.dataclasses import Document
|
|
13
|
+
from haystack.utils.auth import Secret, deserialize_secrets_inplace
|
|
14
|
+
|
|
15
|
+
from haystack_integrations.common.amazon_bedrock.utils import get_aws_session
|
|
16
|
+
from haystack_integrations.common.s3.utils import S3Storage
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@component
class S3Downloader:
    """
    A component for downloading files from AWS S3 Buckets to local filesystem.

    Supports filtering by file extensions. Downloaded files are kept under `file_root_path`
    and reused on later runs; the least-recently-accessed files are removed once the folder
    holds more than `max_cache_size` files.
    """

    def __init__(
        self,
        *,
        aws_access_key_id: Optional[Secret] = Secret.from_env_var("AWS_ACCESS_KEY_ID", strict=False),  # noqa: B008
        aws_secret_access_key: Optional[Secret] = Secret.from_env_var(  # noqa: B008
            "AWS_SECRET_ACCESS_KEY", strict=False
        ),
        aws_session_token: Optional[Secret] = Secret.from_env_var("AWS_SESSION_TOKEN", strict=False),  # noqa: B008
        aws_region_name: Optional[Secret] = Secret.from_env_var("AWS_DEFAULT_REGION", strict=False),  # noqa: B008
        aws_profile_name: Optional[Secret] = Secret.from_env_var("AWS_PROFILE", strict=False),  # noqa: B008
        boto3_config: Optional[Dict[str, Any]] = None,
        file_root_path: Optional[str] = None,
        file_extensions: Optional[List[str]] = None,
        file_name_meta_key: str = "file_name",
        max_workers: int = 32,
        max_cache_size: int = 100,
    ) -> None:
        """
        Initializes the `S3Downloader` with the provided parameters.

        Note that the AWS credentials are not required if the AWS environment is configured correctly. These are loaded
        automatically from the environment or the AWS configuration file and do not need to be provided explicitly via
        the constructor. If the AWS environment is not configured users need to provide the AWS credentials via the
        constructor. Three required parameters are `aws_access_key_id`, `aws_secret_access_key`,
        and `aws_region_name`.

        :param aws_access_key_id: AWS access key ID.
        :param aws_secret_access_key: AWS secret access key.
        :param aws_session_token: AWS session token.
        :param aws_region_name: AWS region name.
        :param aws_profile_name: AWS profile name.
        :param boto3_config: The configuration for the boto3 client.
        :param file_root_path: The path where the file will be downloaded.
            Can be set through this parameter or the `FILE_ROOT_PATH` environment variable.
            If none of them is set, a `ValueError` is raised.
        :param file_extensions: The file extensions that are permitted to be downloaded.
            By default, all file extensions are allowed. Matching is case-insensitive.
        :param max_workers: The maximum number of workers to use for concurrent downloads.
        :param max_cache_size: The maximum number of files to cache.
        :param file_name_meta_key: The name of the meta key that contains the file name to download.
            By default, the `Document.meta["file_name"]` is used. If you want to use a
            different key in `Document.meta`, you can set it here.
        :raises ValueError: If the `file_root_path` is not set through
            the constructor or the `FILE_ROOT_PATH` environment variable.

        """

        # Set up download directory
        file_root_path = file_root_path or os.getenv("FILE_ROOT_PATH")

        if file_root_path is None:
            msg = (
                "The path where files will be downloaded is not set. Please set the "
                "`file_root_path` init parameter or the `FILE_ROOT_PATH` environment variable."
            )
            raise ValueError(msg)

        self.file_root_path = Path(file_root_path)

        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.aws_region_name = aws_region_name
        self.aws_session_token = aws_session_token
        self.aws_profile_name = aws_profile_name
        self.boto3_config = boto3_config
        self.file_extensions = [e.lower() for e in file_extensions] if file_extensions else None
        self.max_workers = max_workers
        self.max_cache_size = max_cache_size
        self.file_name_meta_key = file_name_meta_key

        # Created lazily in warm_up().
        self._storage: Optional[S3Storage] = None

        def resolve_secret(secret: Optional[Secret]) -> Optional[str]:
            return secret.resolve_value() if secret else None

        self._session = get_aws_session(
            aws_access_key_id=resolve_secret(aws_access_key_id),
            aws_secret_access_key=resolve_secret(aws_secret_access_key),
            aws_session_token=resolve_secret(aws_session_token),
            aws_region_name=resolve_secret(aws_region_name),
            aws_profile_name=resolve_secret(aws_profile_name),
        )
        self._config = Config(
            user_agent_extra="x-client-framework:haystack", **(self.boto3_config if self.boto3_config else {})
        )

    def warm_up(self) -> None:
        """Warm up the component by creating the download folder and the S3 storage."""
        if self._storage is None:
            self.file_root_path.mkdir(parents=True, exist_ok=True)
            self._storage = S3Storage.from_env(session=self._session, config=self._config)

    @component.output_types(documents=List[Document])
    def run(
        self,
        documents: List[Document],
    ) -> Dict[str, List[Document]]:
        """Download files from AWS S3 Buckets to local filesystem.

        Return enriched `Document`s with the path of the downloaded file.
        :param documents: Document containing the name of the file to download in the meta field.
        :returns: A dictionary with:
            - `documents`: The downloaded `Document`s; each has `meta['file_path']`.
        :raises S3Error: If a download attempt fails or the file does not exist in the S3 bucket.
        :raises RuntimeError: If the component was not warmed up.
        """

        if self._storage is None:
            msg = f"The component {self.__class__.__name__} was not warmed up. Call 'warm_up()' before calling run()."
            raise RuntimeError(msg)

        filtered_documents = self._filter_documents_by_extensions(documents) if self.file_extensions else documents

        if not filtered_documents:
            return {"documents": []}

        # No point in starting more threads than there are files to fetch.
        max_workers = min(self.max_workers, len(filtered_documents))
        try:
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                # Collect results while the pool is alive; a failed download re-raises here.
                results = list(executor.map(self._download_file, filtered_documents))
        finally:
            # Enforce the cache limit even when a download failed.
            self._cleanup_cache(filtered_documents)

        downloaded_documents = [d for d in results if d is not None]
        return {"documents": downloaded_documents}

    def _filter_documents_by_extensions(self, documents: List[Document]) -> List[Document]:
        """
        Keep only the documents whose file name has one of the permitted extensions.

        Documents without a file name (missing key or empty/None value) are dropped
        when an extension filter is configured.
        """
        if not self.file_extensions:
            return documents
        allowed = set(self.file_extensions)
        return [
            doc
            for doc in documents
            # `or ""` guards against the key being present with a None value.
            if Path(str(doc.meta.get(self.file_name_meta_key) or "")).suffix.lower() in allowed
        ]

    def _download_file(self, document: Document) -> Optional[Document]:
        """
        Download a single file from AWS S3 Bucket to local filesystem.

        :param document: `Document` with the name of the file to download in the meta field.
        :returns:
            The same `Document` with `meta` containing the `file_path` of the
            downloaded file, or None if the document has no file name.
        :raises S3Error: If the download fails or the file does not exist in the S3 bucket.
        """

        file_name = document.meta.get(self.file_name_meta_key)
        if not file_name:
            logger.warning(
                "Document missing required file name metadata key '{key}'. Skipping download.",
                key=self.file_name_meta_key,
            )
            return None

        file_path = self.file_root_path / Path(file_name)

        if file_path.is_file():
            # set access and modification time to now without redownloading the file
            file_path.touch()

        else:
            # we know that _storage is not None after warm_up() is called, but mypy does not know that
            self._storage.download(key=file_name, local_file_path=file_path)  # type: ignore[union-attr]

        document.meta["file_path"] = str(file_path)
        return document

    def _cleanup_cache(self, documents: List[Document]) -> None:
        """
        Remove least-recently-accessed cache files when cache exceeds `max_cache_size`.

        Files requested by `documents` are never removed, so the `file_path` values returned
        by the current run stay valid. Only files directly inside `file_root_path` are considered.

        :param documents: List of Document objects being used in the current run.
        """
        # Files are stored under their file name (see `_download_file`), so protect those paths.
        requested_paths = set()
        for doc in documents:
            file_name = doc.meta.get(self.file_name_meta_key)
            if file_name:
                requested_paths.add(self.file_root_path / Path(file_name))

        all_files = [p for p in self.file_root_path.iterdir() if p.is_file()]
        overflow = len(all_files) - self.max_cache_size
        if overflow <= 0:
            return

        misses = [p for p in all_files if p not in requested_paths]
        misses.sort(key=lambda p: p.stat().st_atime)
        for p in misses[:overflow]:
            try:
                p.unlink()
            except OSError as error:
                # Best effort: a file that cannot be removed must not fail the run.
                logger.warning("Failed to remove cache file at {path} with error: {e}", path=p, e=error)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the component to a dictionary."""
        return default_to_dict(
            self,
            aws_access_key_id=self.aws_access_key_id.to_dict() if self.aws_access_key_id else None,
            aws_secret_access_key=self.aws_secret_access_key.to_dict() if self.aws_secret_access_key else None,
            aws_session_token=self.aws_session_token.to_dict() if self.aws_session_token else None,
            aws_region_name=self.aws_region_name.to_dict() if self.aws_region_name else None,
            aws_profile_name=self.aws_profile_name.to_dict() if self.aws_profile_name else None,
            # Without this entry a to_dict()/from_dict() round trip would drop the boto3 settings.
            boto3_config=self.boto3_config,
            file_root_path=str(self.file_root_path),
            max_workers=self.max_workers,
            max_cache_size=self.max_cache_size,
            file_extensions=self.file_extensions,
            file_name_meta_key=self.file_name_meta_key,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "S3Downloader":
        """
        Deserializes the component from a dictionary.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized component.
        """
        deserialize_secrets_inplace(
            data["init_parameters"],
            ["aws_access_key_id", "aws_secret_access_key", "aws_session_token", "aws_region_name", "aws_profile_name"],
        )
        return default_from_dict(cls, data)
|