amazon-bedrock-haystack 4.0.0__tar.gz → 4.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/CHANGELOG.md +32 -0
  2. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/PKG-INFO +2 -2
  3. amazon_bedrock_haystack-4.2.0/examples/s3_downloader_example.py +83 -0
  4. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/pydoc/config.yml +3 -0
  5. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/pyproject.toml +5 -2
  6. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/common/amazon_bedrock/errors.py +5 -0
  7. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/common/amazon_bedrock/utils.py +4 -0
  8. amazon_bedrock_haystack-4.2.0/src/haystack_integrations/common/s3/errors.py +15 -0
  9. amazon_bedrock_haystack-4.2.0/src/haystack_integrations/common/s3/utils.py +120 -0
  10. amazon_bedrock_haystack-4.2.0/src/haystack_integrations/components/downloaders/s3/__init__.py +7 -0
  11. amazon_bedrock_haystack-4.2.0/src/haystack_integrations/components/downloaders/s3/s3_downloader.py +246 -0
  12. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/components/generators/amazon_bedrock/chat/chat_generator.py +29 -8
  13. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/components/generators/amazon_bedrock/chat/utils.py +53 -15
  14. amazon_bedrock_haystack-4.2.0/src/haystack_integrations/components/rankers/py.typed +0 -0
  15. amazon_bedrock_haystack-4.2.0/tests/__init__.py +3 -0
  16. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/tests/test_chat_generator.py +46 -6
  17. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/tests/test_chat_generator_utils.py +174 -0
  18. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/tests/test_document_image_embedder.py +1 -1
  19. amazon_bedrock_haystack-4.2.0/tests/test_s3_downloader.py +229 -0
  20. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/.gitignore +0 -0
  21. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/LICENSE.txt +0 -0
  22. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/README.md +0 -0
  23. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/examples/bedrock_ranker_example.py +0 -0
  24. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/examples/chatgenerator_example.py +0 -0
  25. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/examples/embedders_generator_with_rag_example.py +0 -0
  26. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/common/amazon_bedrock/__init__.py +0 -0
  27. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/common/py.typed +0 -0
  28. {amazon_bedrock_haystack-4.0.0/src/haystack_integrations/components/generators/amazon_bedrock/chat → amazon_bedrock_haystack-4.2.0/src/haystack_integrations/common/s3}/__init__.py +0 -0
  29. {amazon_bedrock_haystack-4.0.0/src/haystack_integrations/components/embedders → amazon_bedrock_haystack-4.2.0/src/haystack_integrations/components/downloaders}/py.typed +0 -0
  30. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/components/embedders/amazon_bedrock/__init__.py +0 -0
  31. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/components/embedders/amazon_bedrock/document_embedder.py +0 -0
  32. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/components/embedders/amazon_bedrock/document_image_embedder.py +0 -0
  33. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/components/embedders/amazon_bedrock/text_embedder.py +0 -0
  34. {amazon_bedrock_haystack-4.0.0/src/haystack_integrations/components/generators → amazon_bedrock_haystack-4.2.0/src/haystack_integrations/components/embedders}/py.typed +0 -0
  35. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/components/generators/amazon_bedrock/__init__.py +0 -0
  36. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/components/generators/amazon_bedrock/adapters.py +0 -0
  37. {amazon_bedrock_haystack-4.0.0/tests → amazon_bedrock_haystack-4.2.0/src/haystack_integrations/components/generators/amazon_bedrock/chat}/__init__.py +0 -0
  38. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/components/generators/amazon_bedrock/generator.py +0 -0
  39. {amazon_bedrock_haystack-4.0.0/src/haystack_integrations/components/rankers → amazon_bedrock_haystack-4.2.0/src/haystack_integrations/components/generators}/py.typed +0 -0
  40. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/components/rankers/amazon_bedrock/__init__.py +0 -0
  41. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/src/haystack_integrations/components/rankers/amazon_bedrock/ranker.py +0 -0
  42. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/tests/conftest.py +0 -0
  43. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/tests/test_document_embedder.py +0 -0
  44. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/tests/test_files/apple.jpg +0 -0
  45. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/tests/test_files/haystack-logo.png +0 -0
  46. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/tests/test_files/sample_pdf_1.pdf +0 -0
  47. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/tests/test_generator.py +0 -0
  48. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/tests/test_ranker.py +0 -0
  49. {amazon_bedrock_haystack-4.0.0 → amazon_bedrock_haystack-4.2.0}/tests/test_text_embedder.py +0 -0
CHANGELOG.md
@@ -1,11 +1,43 @@
  # Changelog
 
+ ## [integrations/amazon_bedrock-v4.1.0] - 2025-09-19
+
+ ### 🚀 Features
+
+ - Support AWS Bedrock Guardrails in `AmazonBedrockChatGenerator` (#2284)
+
+ ### 🧹 Chores
+
+ - Bedrock - remove unused `stop_words` init parameter (#2275)
+
+ ### 🌀 Miscellaneous
+
+ - Chore: Fix linting aws bedrock (#2249)
+
+ ## [integrations/amazon_bedrock-v4.0.0] - 2025-08-29
+
+ ### 🚀 Features
+
+ - [**breaking**] Update AmazonBedrockChatGenerator to use the new fields in `StreamingChunk` (#2216)
+ - [**breaking**] Use `ReasoningContent` to store reasoning content in `ChatMessage` instead of `ChatMessage.meta` (#2226)
+
+
+ ### 🧹 Chores
+
+ - Standardize readmes - part 2 (#2205)
+
  ## [integrations/amazon_bedrock-v3.11.0] - 2025-08-21
 
  ### 🚀 Features
 
  - Add `AmazonBedrockDocumentImageEmbedder` component (#2185)
 
+ ### 🧹 Chores
+
+ - Add framework name into UserAgent header for bedrock integration (#2168)
+ - Standardize readmes - part 1 (#2202)
+
+
  ## [integrations/amazon_bedrock-v3.10.0] - 2025-08-06
 
  ### 🚀 Features
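Not part of the diff itself: the #2284 entry above adds AWS Bedrock Guardrails support to `AmazonBedrockChatGenerator`. A rough sketch of how that might be used, assuming guardrail settings are forwarded to the Bedrock Converse API via `generation_kwargs`; the identifier and version below are placeholders, and the exact parameter shape is an assumption, not confirmed by this diff:

```python
from haystack.dataclasses import ChatMessage

from haystack_integrations.components.generators.amazon_bedrock import AmazonBedrockChatGenerator

# Hypothetical guardrail configuration; guardrailConfig follows the Bedrock Converse API shape,
# but passing it through generation_kwargs here is an assumption, not confirmed by this diff.
generator = AmazonBedrockChatGenerator(
    model="anthropic.claude-3-haiku-20240307-v1:0",
    generation_kwargs={
        "guardrailConfig": {
            "guardrailIdentifier": "my-guardrail-id",  # placeholder
            "guardrailVersion": "1",  # placeholder
        }
    },
)
result = generator.run(messages=[ChatMessage.from_user("Summarize our refund policy.")])
print(result["replies"][0].text)
```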
PKG-INFO
@@ -1,7 +1,7 @@
  Metadata-Version: 2.4
  Name: amazon-bedrock-haystack
- Version: 4.0.0
- Summary: An integration of Amazon Bedrock as an AmazonBedrockGenerator component.
+ Version: 4.2.0
+ Summary: An integration of AWS S3 and Bedrock as a Downloader and Generator components.
  Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/amazon_bedrock#readme
  Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
  Project-URL: Source, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/amazon_bedrock
examples/s3_downloader_example.py
@@ -0,0 +1,83 @@
+ # To run this example, you will need to
+ # 1) set `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` and `AWS_DEFAULT_REGION` environment variables
+ # 2) enabled access to the selected S3 bucket
+ # 3) `S3_DOWNLOADER_BUCKET` environment variable should be set to the name of the S3 bucket.
+
+ # The example shows how to use the S3Downloader component in a query pipeline to download files from an S3 bucket.
+ # To run this example, set the file_name in docs to your files in the S3 bucket.
+ # The files are then downloaded, converted to images and used to answer a question.
+
+
+ from pathlib import Path
+ from uuid import uuid4
+
+ from haystack import Pipeline
+ from haystack.components.builders import ChatPromptBuilder
+ from haystack.components.converters.image import DocumentToImageContent
+ from haystack.components.routers import DocumentTypeRouter
+ from haystack.dataclasses import Document
+
+ from haystack_integrations.components.downloaders.s3.s3_downloader import S3Downloader
+ from haystack_integrations.components.generators.amazon_bedrock import AmazonBedrockChatGenerator
+
+ docs = [
+     Document(meta={"file_id": str(uuid4()), "file_name": "text-sample.txt"}),
+     Document(meta={"file_id": str(uuid4()), "file_name": "document-sample.pdf", "page_number": 1}),
+ ]
+
+ chat_prompt_builder = ChatPromptBuilder(
+     required_variables=["question"],
+     template="""{% message role="system" %}
+ You are a friendly assistant that answers questions based on provided documents and images.
+ {% endmessage %}
+
+ {%- message role="user" -%}
+ Only provide an answer to the question using the images and text passages provided.
+
+ These are the text-only documents:
+ {%- if documents|length > 0 %}
+ {%- for doc in documents %}
+ Text Document [{{ loop.index }}] :
+ {{ doc.content }}
+ {% endfor -%}
+ {%- else %}
+ No relevant text documents were found.
+ {% endif %}
+ End of text documents.
+
+ Question: {{ question }}
+ Answer:
+
+ Images:
+ {%- if image_contents|length > 0 %}
+ {%- for img in image_contents -%}
+ {{ img | templatize_part }}
+ {%- endfor -%}
+ {% endif %}
+ {%- endmessage -%}
+ """,
+ )
+
+ pipe = Pipeline()
+ pipe.add_component(
+     "s3_downloader", S3Downloader(file_root_path=str(Path.cwd() / "s3_downloads"), file_extensions=[".pdf"])
+ )
+ pipe.add_component(
+     "doc_type_router", DocumentTypeRouter(file_path_meta_field="file_path", mime_types=["application/pdf"])
+ )
+ pipe.add_component("doc_to_image", DocumentToImageContent(detail="auto"))
+ pipe.add_component("chat_prompt_builder", chat_prompt_builder)
+ pipe.add_component("llm", AmazonBedrockChatGenerator(model="anthropic.claude-3-haiku-20240307-v1:0"))
+
+ pipe.connect("s3_downloader.documents", "doc_type_router.documents")
+ pipe.connect("doc_type_router.application/pdf", "doc_to_image.documents")
+ pipe.connect("doc_to_image.image_contents", "chat_prompt_builder.image_contents")
+ pipe.connect("s3_downloader.documents", "chat_prompt_builder.documents")
+ pipe.connect("chat_prompt_builder.prompt", "llm.messages")
+
+ result = pipe.run(
+     data={
+         "s3_downloader": {"documents": docs},
+         "chat_prompt_builder": {"question": "What is the main topic of the document?"},
+     }
+ )
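Not part of the packaged example: as a rough follow-up, the pipeline result above can be inspected through the chat generator's `replies` output, for example:

```python
# Inspect the generated answer; `result` is the dict returned by `pipe.run(...)` in the example above.
reply = result["llm"]["replies"][0]
print(reply.text)
```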
pydoc/config.yml
@@ -11,6 +11,9 @@ loaders:
  "haystack_integrations.common.amazon_bedrock.errors",
  "haystack_integrations.components.generators.amazon_bedrock.chat.chat_generator",
  "haystack_integrations.components.rankers.amazon_bedrock.ranker",
+ "haystack_integrations.components.downloaders.s3.s3_downloader",
+ "haystack_integrations.common.s3.utils",
+ "haystack_integrations.common.s3.errors",
  ]
  ignore_when_discovered: ["__init__"]
  processors:
pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
  [project]
  name = "amazon-bedrock-haystack"
  dynamic = ["version"]
- description = 'An integration of Amazon Bedrock as an AmazonBedrockGenerator component.'
+ description = 'An integration of AWS S3 and Bedrock as a Downloader and Generator components.'
  readme = "README.md"
  requires-python = ">=3.9"
  license = "Apache-2.0"
@@ -71,7 +71,10 @@ cov-retry = 'all --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x'
  types = """mypy -p haystack_integrations.common.amazon_bedrock \
  -p haystack_integrations.components.embedders.amazon_bedrock \
  -p haystack_integrations.components.generators.amazon_bedrock \
- -p haystack_integrations.components.rankers.amazon_bedrock {args}"""
+ -p haystack_integrations.components.rankers.amazon_bedrock \
+ -p haystack_integrations.components.downloaders.s3 \
+ -p haystack_integrations.common.s3 {args}"""
+
 
  [tool.mypy]
  install_types = true
src/haystack_integrations/common/amazon_bedrock/errors.py
@@ -1,3 +1,8 @@
+ # SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
+ #
+ # SPDX-License-Identifier: Apache-2.0
+
+
  class AmazonBedrockError(Exception):
      """
      Any error generated by the Amazon Bedrock integration.
src/haystack_integrations/common/amazon_bedrock/utils.py
@@ -1,3 +1,7 @@
+ # SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
+ #
+ # SPDX-License-Identifier: Apache-2.0
+
  from typing import Any, Optional, Union
 
  import aioboto3
src/haystack_integrations/common/s3/errors.py
@@ -0,0 +1,15 @@
+ # SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
+ #
+ # SPDX-License-Identifier: Apache-2.0
+
+
+ class S3Error(Exception):
+     """Exception for issues that occur in the S3 based components"""
+
+
+ class S3ConfigurationError(S3Error):
+     """Exception raised when AmazonS3 node is not configured correctly"""
+
+
+ class S3StorageError(S3Error):
+     """This exception is raised when an error occurs while interacting with a S3Storage object."""
src/haystack_integrations/common/s3/utils.py
@@ -0,0 +1,120 @@
+ # SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
+ #
+ # SPDX-License-Identifier: Apache-2.0
+
+ import os
+ from dataclasses import dataclass
+ from http import HTTPStatus
+ from pathlib import Path
+ from typing import Optional
+
+ from boto3.session import Session
+ from botocore.config import Config
+ from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError
+
+ from haystack_integrations.common.s3.errors import S3ConfigurationError, S3StorageError
+
+
+ @dataclass
+ class S3Storage:
+     """This class provides a storage class for downloading files from an AWS S3 bucket."""
+
+     def __init__(
+         self,
+         s3_bucket: str,
+         session: Session,
+         s3_prefix: Optional[str] = None,
+         endpoint_url: Optional[str] = None,
+         config: Optional[Config] = None,
+     ) -> None:
+         """
+         Initializes the S3Storage object with the provided parameters.
+
+         :param s3_bucket: The name of the S3 bucket to download files from.
+         :param session: The session to use for the S3 client.
+         :param s3_prefix: The optional prefix of the files in the S3 bucket.
+             Can be used to specify folder or naming structure.
+             For example, if the file is in the folder "folder/subfolder/file.txt",
+             the s3_prefix should be "folder/subfolder/". If the file is in the root of the S3 bucket,
+             the s3_prefix should be None.
+         :param endpoint_url: The endpoint URL of the S3 bucket to download files from.
+         :param config: The configuration to use for the S3 client.
+         """
+
+         self.s3_bucket = s3_bucket
+         self.s3_prefix = s3_prefix
+         self.endpoint_url = endpoint_url
+         self.session = session
+         self.config = config
+
+         try:
+             self._client = self.session.client("s3", endpoint_url=self.endpoint_url, config=self.config)
+         except Exception as e:
+             msg = f"Failed to create S3 session client: {e}"
+             raise S3ConfigurationError(msg) from e
+
+     def download(self, key: str, local_file_path: Path) -> None:
+         """Download a file from S3.
+
+         :param key: The key of the file to download.
+         :param local_file_path: The folder path to download the file to.
+             It will be created if it does not exist. The file will be downloaded to
+             the folder with the same name as the key.
+         :raises S3ConfigurationError: If the S3 session client cannot be created.
+         :raises S3StorageError: If the file does not exist in the S3 bucket
+             or the file cannot be downloaded.
+         """
+
+         if self.s3_prefix:
+             s3_key = f"{self.s3_prefix}{key}"
+         else:
+             s3_key = key
+
+         try:
+             self._client.download_file(self.s3_bucket, s3_key, str(local_file_path))
+
+         except (NoCredentialsError, PartialCredentialsError) as e:
+             msg = (
+                 f"Missing AWS credentials. Please check your AWS credentials (access key, secret key, region)."
+                 f"Error: {e}"
+             )
+             raise S3ConfigurationError(msg) from e
+
+         except ClientError as e:
+             error_code = int(e.response["Error"]["Code"])
+
+             if error_code == HTTPStatus.FORBIDDEN:
+                 msg = (
+                     f"Failed to access S3 bucket {self.s3_bucket!r}. "
+                     f"Please check your AWS credentials (access key, secret key, region) and ensure "
+                     f"they have the necessary S3 permissions. "
+                     f"Error: {e}"
+                 )
+                 raise S3ConfigurationError(msg) from e
+
+             elif error_code == HTTPStatus.NOT_FOUND:
+                 msg = f"The object {s3_key!r} does not exist in the S3 bucket {self.s3_bucket!r}. \n Error: {e}"
+                 raise S3StorageError(msg) from e
+             else:
+                 msg = f"Failed to download file {s3_key!r} from S3. Error: {e}"
+                 raise S3StorageError(msg) from e
+
+     @classmethod
+     def from_env(cls, *, session: Session, config: Config) -> "S3Storage":
+         """Create a S3Storage object from environment variables."""
+         s3_bucket = os.getenv("S3_DOWNLOADER_BUCKET")
+         if not s3_bucket:
+             msg = (
+                 "Missing environment variable S3_DOWNLOADER_BUCKET."
+                 "Please set it to the name of the S3 bucket to download files from."
+             )
+             raise ValueError(msg)
+         s3_prefix = os.getenv("S3_DOWNLOADER_PREFIX") or None
+         endpoint_url = os.getenv("AWS_ENDPOINT_URL") or None
+         return cls(
+             s3_bucket=s3_bucket,
+             s3_prefix=s3_prefix,
+             endpoint_url=endpoint_url,
+             session=session,
+             config=config,
+         )
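For orientation, a minimal usage sketch of the `S3Storage` helper added above, assuming `S3_DOWNLOADER_BUCKET` is set and default boto3 credentials are available; the object key and local path below are placeholders:

```python
from pathlib import Path

from boto3.session import Session
from botocore.config import Config

from haystack_integrations.common.s3.errors import S3ConfigurationError, S3StorageError
from haystack_integrations.common.s3.utils import S3Storage

# Build the storage from the environment (reads S3_DOWNLOADER_BUCKET, and optionally
# S3_DOWNLOADER_PREFIX and AWS_ENDPOINT_URL), then download a single object.
storage = S3Storage.from_env(session=Session(), config=Config())

try:
    storage.download(key="report.pdf", local_file_path=Path("/tmp/report.pdf"))  # placeholder key and path
except S3ConfigurationError as err:
    print(f"Credential or bucket access problem: {err}")
except S3StorageError as err:
    print(f"Object missing or download failed: {err}")
```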
src/haystack_integrations/components/downloaders/s3/__init__.py
@@ -0,0 +1,7 @@
+ # SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
+ #
+ # SPDX-License-Identifier: Apache-2.0
+
+ from .s3_downloader import S3Downloader
+
+ __all__ = ["S3Downloader"]
src/haystack_integrations/components/downloaders/s3/s3_downloader.py
@@ -0,0 +1,246 @@
+ # SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
+ #
+ # SPDX-License-Identifier: Apache-2.0
+
+ import os
+ from concurrent.futures import ThreadPoolExecutor
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ from botocore.config import Config
+ from haystack import component, default_from_dict, default_to_dict, logging
+ from haystack.dataclasses import Document
+ from haystack.utils.auth import Secret, deserialize_secrets_inplace
+
+ from haystack_integrations.common.amazon_bedrock.utils import get_aws_session
+ from haystack_integrations.common.s3.utils import S3Storage
+
+ logger = logging.getLogger(__name__)
+
+
+ @component
+ class S3Downloader:
+     """
+     A component for downloading files from AWS S3 Buckets to local filesystem.
+     Supports filtering by file extensions.
+     """
+
+     def __init__(
+         self,
+         *,
+         aws_access_key_id: Optional[Secret] = Secret.from_env_var("AWS_ACCESS_KEY_ID", strict=False),  # noqa: B008
+         aws_secret_access_key: Optional[Secret] = Secret.from_env_var(  # noqa: B008
+             "AWS_SECRET_ACCESS_KEY", strict=False
+         ),
+         aws_session_token: Optional[Secret] = Secret.from_env_var("AWS_SESSION_TOKEN", strict=False),  # noqa: B008
+         aws_region_name: Optional[Secret] = Secret.from_env_var("AWS_DEFAULT_REGION", strict=False),  # noqa: B008
+         aws_profile_name: Optional[Secret] = Secret.from_env_var("AWS_PROFILE", strict=False),  # noqa: B008
+         boto3_config: Optional[Dict[str, Any]] = None,
+         file_root_path: Optional[str] = None,
+         file_extensions: Optional[List[str]] = None,
+         file_name_meta_key: str = "file_name",
+         max_workers: int = 32,
+         max_cache_size: int = 100,
+     ) -> None:
+         """
+         Initializes the `S3Downloader` with the provided parameters.
+
+         Note that the AWS credentials are not required if the AWS environment is configured correctly. These are loaded
+         automatically from the environment or the AWS configuration file and do not need to be provided explicitly via
+         the constructor. If the AWS environment is not configured users need to provide the AWS credentials via the
+         constructor. Three required parameters are `aws_access_key_id`, `aws_secret_access_key`,
+         and `aws_region_name`.
+
+         :param aws_access_key_id: AWS access key ID.
+         :param aws_secret_access_key: AWS secret access key.
+         :param aws_session_token: AWS session token.
+         :param aws_region_name: AWS region name.
+         :param aws_profile_name: AWS profile name.
+         :param boto3_config: The configuration for the boto3 client.
+         :param file_root_path: The path where the file will be downloaded.
+             Can be set through this parameter or the `FILE_ROOT_PATH` environment variable.
+             If none of them is set, a `ValueError` is raised.
+         :param file_extensions: The file extensions that are permitted to be downloaded.
+             By default, all file extensions are allowed.
+         :param max_workers: The maximum number of workers to use for concurrent downloads.
+         :param max_cache_size: The maximum number of files to cache.
+         :param file_name_meta_key: The name of the meta key that contains the file name to download.
+             By default, the `Document.meta["file_name"]` is used. If you want to use a
+             different key in `Document.meta`, you can set it here.
+         :raises ValueError: If the `file_root_path` is not set through
+             the constructor or the `FILE_ROOT_PATH` environment variable.
+
+         """
+
+         # Set up download directory
+         file_root_path = file_root_path or os.getenv("FILE_ROOT_PATH")
+
+         if file_root_path is None:
+             msg = (
+                 "The path where files will be downloaded is not set. Please set the "
+                 "`file_root_path` init parameter or the `FILE_ROOT_PATH` environment variable."
+             )
+             raise ValueError(msg)
+
+         self.file_root_path = Path(file_root_path)
+
+         self.aws_access_key_id = aws_access_key_id
+         self.aws_secret_access_key = aws_secret_access_key
+         self.aws_region_name = aws_region_name
+         self.aws_session_token = aws_session_token
+         self.aws_profile_name = aws_profile_name
+         self.boto3_config = boto3_config
+         self.file_extensions = [e.lower() for e in file_extensions] if file_extensions else None
+         self.max_workers = max_workers
+         self.max_cache_size = max_cache_size
+         self.file_name_meta_key = file_name_meta_key
+
+         self._storage: Optional[S3Storage] = None
+
+         def resolve_secret(secret: Optional[Secret]) -> Optional[str]:
+             return secret.resolve_value() if secret else None
+
+         self._session = get_aws_session(
+             aws_access_key_id=resolve_secret(aws_access_key_id),
+             aws_secret_access_key=resolve_secret(aws_secret_access_key),
+             aws_session_token=resolve_secret(aws_session_token),
+             aws_region_name=resolve_secret(aws_region_name),
+             aws_profile_name=resolve_secret(aws_profile_name),
+         )
+         self._config = Config(
+             user_agent_extra="x-client-framework:haystack", **(self.boto3_config if self.boto3_config else {})
+         )
+
+     def warm_up(self) -> None:
+         """Warm up the component by initializing the settings and storage."""
+         if self._storage is None:
+             self.file_root_path.mkdir(parents=True, exist_ok=True)
+             self._storage = S3Storage.from_env(session=self._session, config=self._config)
+
+     @component.output_types(documents=List[Document])
+     def run(
+         self,
+         documents: List[Document],
+     ) -> Dict[str, List[Document]]:
+         """Download files from AWS S3 Buckets to local filesystem.
+
+         Return enriched `Document`s with the path of the downloaded file.
+         :param documents: Document containing the name of the file to download in the meta field.
+         :returns: A dictionary with:
+             - `documents`: The downloaded `Document`s; each has `meta['file_path']`.
+         :raises S3Error: If a download attempt fails or the file does not exist in the S3 bucket.
+         :raises ValueError: If the path where files will be downloaded is not set.
+         """
+
+         if self._storage is None:
+             msg = f"The component {self.__class__.__name__} was not warmed up. Call 'warm_up()' before calling run()."
+             raise RuntimeError(msg)
+
+         filtered_documents = self._filter_documents_by_extensions(documents) if self.file_extensions else documents
+
+         if not filtered_documents:
+             return {"documents": []}
+
+         try:
+             max_workers = min(self.max_workers, len(filtered_documents) if filtered_documents else self.max_workers)
+             with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                 iterable = executor.map(self._download_file, filtered_documents)
+         finally:
+             self._cleanup_cache(filtered_documents)
+
+         downloaded_documents = [d for d in iterable if d is not None]
+         return {"documents": downloaded_documents}
+
+     def _filter_documents_by_extensions(self, documents: List[Document]) -> List[Document]:
+         """Filter documents by file extensions."""
+         if not self.file_extensions:
+             return documents
+         return [
+             doc
+             for doc in documents
+             if Path(doc.meta.get(self.file_name_meta_key, "")).suffix.lower() in self.file_extensions
+         ]
+
+     def _download_file(self, document: Document) -> Optional[Document]:
+         """
+         Download a single file from AWS S3 Bucket to local filesystem.
+
+         :param document: `Document` with the name of the file to download in the meta field.
+         :returns:
+             The same `Document` with `meta` containing the `file_path` of the
+             downloaded file.
+         :raises S3Error: If the download or head request fails or the file does not exist in the S3 bucket.
+         """
+
+         file_name = document.meta.get(self.file_name_meta_key)
+         if not file_name:
+             logger.warning(
+                 f"Document missing required file name metadata key '{self.file_name_meta_key}'. Skipping download."
+             )
+             return None
+
+         file_path = self.file_root_path / Path(file_name)
+
+         if file_path.is_file():
+             # set access and modification time to now without redownloading the file
+             file_path.touch()
+
+         else:
+             # we know that _storage is not None after warm_up() is called, but mypy does not know that
+             self._storage.download(key=file_name, local_file_path=file_path)  # type: ignore[union-attr]
+
+         document.meta["file_path"] = str(file_path)
+         return document
+
+     def _cleanup_cache(self, documents: List[Document]) -> None:
+         """
+         Remove least-recently-accessed cache files when cache exceeds `max_cache_size`.
+
+         :param documents: List of Document objects being used containing `cache_id` metadata.
+         """
+         requested_ids = {
+             str(abs(hash(str(doc.meta.get("cache_id", ""))))) for doc in documents if doc.meta.get("cache_id")
+         }
+
+         all_files = [p for p in self.file_root_path.iterdir() if p.is_file()]
+         misses = [p for p in all_files if p.stem not in requested_ids]
+
+         overflow = len(misses) + len(requested_ids) - self.max_cache_size
+         if overflow > 0:
+             misses.sort(key=lambda p: p.stat().st_atime)
+             for p in misses[:overflow]:
+                 try:
+                     p.unlink()
+                 except Exception as error:
+                     logger.warning("Failed to remove cache file at {path} with error: {e}", path=p, e=error)
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Serialize the component to a dictionary."""
+         return default_to_dict(
+             self,
+             aws_access_key_id=self.aws_access_key_id.to_dict() if self.aws_access_key_id else None,
+             aws_secret_access_key=self.aws_secret_access_key.to_dict() if self.aws_secret_access_key else None,
+             aws_session_token=self.aws_session_token.to_dict() if self.aws_session_token else None,
+             aws_region_name=self.aws_region_name.to_dict() if self.aws_region_name else None,
+             aws_profile_name=self.aws_profile_name.to_dict() if self.aws_profile_name else None,
+             file_root_path=str(self.file_root_path),
+             max_workers=self.max_workers,
+             max_cache_size=self.max_cache_size,
+             file_extensions=self.file_extensions,
+             file_name_meta_key=self.file_name_meta_key,
+         )
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> "S3Downloader":
+         """
+         Deserializes the component from a dictionary.
+         :param data:
+             Dictionary to deserialize from.
+         :returns:
+             Deserialized component.
+         """
+         deserialize_secrets_inplace(
+             data["init_parameters"],
+             ["aws_access_key_id", "aws_secret_access_key", "aws_session_token", "aws_region_name", "aws_profile_name"],
+         )
+         return default_from_dict(cls, data)
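For orientation, a minimal standalone sketch of the `S3Downloader` component added above, assuming `S3_DOWNLOADER_BUCKET` and AWS credentials are configured in the environment; the file name below is a placeholder:

```python
from haystack.dataclasses import Document

from haystack_integrations.components.downloaders.s3 import S3Downloader

# file_root_path is where downloads land; warm_up() creates it and builds the S3 client.
downloader = S3Downloader(file_root_path="./s3_downloads")
downloader.warm_up()

docs = [Document(meta={"file_name": "document-sample.pdf"})]  # placeholder object key
result = downloader.run(documents=docs)

for doc in result["documents"]:
    print(doc.meta["file_path"])  # local path of the downloaded file
```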