amazon-bedrock-haystack 4.0.0__py3-none-any.whl → 4.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {amazon_bedrock_haystack-4.0.0.dist-info → amazon_bedrock_haystack-4.2.0.dist-info}/METADATA +2 -2
- {amazon_bedrock_haystack-4.0.0.dist-info → amazon_bedrock_haystack-4.2.0.dist-info}/RECORD +14 -8
- haystack_integrations/common/amazon_bedrock/errors.py +5 -0
- haystack_integrations/common/amazon_bedrock/utils.py +4 -0
- haystack_integrations/common/s3/__init__.py +3 -0
- haystack_integrations/common/s3/errors.py +15 -0
- haystack_integrations/common/s3/utils.py +120 -0
- haystack_integrations/components/downloaders/py.typed +0 -0
- haystack_integrations/components/downloaders/s3/__init__.py +7 -0
- haystack_integrations/components/downloaders/s3/s3_downloader.py +246 -0
- haystack_integrations/components/generators/amazon_bedrock/chat/chat_generator.py +29 -8
- haystack_integrations/components/generators/amazon_bedrock/chat/utils.py +53 -15
- {amazon_bedrock_haystack-4.0.0.dist-info → amazon_bedrock_haystack-4.2.0.dist-info}/WHEEL +0 -0
- {amazon_bedrock_haystack-4.0.0.dist-info → amazon_bedrock_haystack-4.2.0.dist-info}/licenses/LICENSE.txt +0 -0
{amazon_bedrock_haystack-4.0.0.dist-info → amazon_bedrock_haystack-4.2.0.dist-info}/METADATA
RENAMED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: amazon-bedrock-haystack
|
|
3
|
-
Version: 4.0.0
|
|
4
|
-
Summary: An integration of
|
|
3
|
+
Version: 4.2.0
|
|
4
|
+
Summary: An integration of AWS S3 and Bedrock as a Downloader and Generator components.
|
|
5
5
|
Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/amazon_bedrock#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
|
|
7
7
|
Project-URL: Source, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/amazon_bedrock
|
|
@@ -1,7 +1,13 @@
|
|
|
1
1
|
haystack_integrations/common/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
haystack_integrations/common/amazon_bedrock/__init__.py,sha256=6GZ8Y3Lw0rLOsOAqi6Tu5mZC977UzQvgDxKpOWr8IQw,110
|
|
3
|
-
haystack_integrations/common/amazon_bedrock/errors.py,sha256=
|
|
4
|
-
haystack_integrations/common/amazon_bedrock/utils.py,sha256=
|
|
3
|
+
haystack_integrations/common/amazon_bedrock/errors.py,sha256=47w_rg3JTGJ5QfsUELrjYBeuLajxUmQ7-zOFvdz4mT8,856
|
|
4
|
+
haystack_integrations/common/amazon_bedrock/utils.py,sha256=UIJVl1e_hlP9mQr_YvVTFklbc_kqOgI7DnSxcAuSLx0,2846
|
|
5
|
+
haystack_integrations/common/s3/__init__.py,sha256=6GZ8Y3Lw0rLOsOAqi6Tu5mZC977UzQvgDxKpOWr8IQw,110
|
|
6
|
+
haystack_integrations/common/s3/errors.py,sha256=BrTDLdhQvAuQutyg35cFyP5h8PNkDEieLwehi58UqAU,452
|
|
7
|
+
haystack_integrations/common/s3/utils.py,sha256=OJupFj54aQmg6S8VuVq6Lc2qpFZyyJajRVIpwe3_6iA,4744
|
|
8
|
+
haystack_integrations/components/downloaders/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
+
haystack_integrations/components/downloaders/s3/__init__.py,sha256=2BOd3_N0kGqRJGH-ENrTJqOqzqHryRYaSuNqpLYKMFo,179
|
|
10
|
+
haystack_integrations/components/downloaders/s3/s3_downloader.py,sha256=kptTCSry_uEYtNAca_pU7zQJs_LJwwJKYjuYDVJrZRE,11220
|
|
5
11
|
haystack_integrations/components/embedders/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
12
|
haystack_integrations/components/embedders/amazon_bedrock/__init__.py,sha256=7GlhHJ4jFHCxq5QN5losGuGtrGNjvEx2dSQvEYD2yG0,408
|
|
7
13
|
haystack_integrations/components/embedders/amazon_bedrock/document_embedder.py,sha256=DD34-HAGwGwTU7KWGqKXXlFdwIs21JavBRDHrBqC-m4,13060
|
|
@@ -12,12 +18,12 @@ haystack_integrations/components/generators/amazon_bedrock/__init__.py,sha256=lv
|
|
|
12
18
|
haystack_integrations/components/generators/amazon_bedrock/adapters.py,sha256=yBC-3YwV6qAwSXMtdZiLSYh2lUpPQIDy7Efl7w-Cu-k,19640
|
|
13
19
|
haystack_integrations/components/generators/amazon_bedrock/generator.py,sha256=Brzw0XvtPJhz2kR2I3liAqWHRmDR6p5HzJerEAPhoJU,14743
|
|
14
20
|
haystack_integrations/components/generators/amazon_bedrock/chat/__init__.py,sha256=6GZ8Y3Lw0rLOsOAqi6Tu5mZC977UzQvgDxKpOWr8IQw,110
|
|
15
|
-
haystack_integrations/components/generators/amazon_bedrock/chat/chat_generator.py,sha256=
|
|
16
|
-
haystack_integrations/components/generators/amazon_bedrock/chat/utils.py,sha256=
|
|
21
|
+
haystack_integrations/components/generators/amazon_bedrock/chat/chat_generator.py,sha256=qArwfXcforWnPzLXrAW-1hkPFpMy3NSdDyJ5GOta25w,26068
|
|
22
|
+
haystack_integrations/components/generators/amazon_bedrock/chat/utils.py,sha256=1M_k8CG2WH23Yz-sB7a1kiIqVh2QB8Pqi0zbWXyMUL8,27255
|
|
17
23
|
haystack_integrations/components/rankers/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
24
|
haystack_integrations/components/rankers/amazon_bedrock/__init__.py,sha256=Zrc3BSVkEaXYpliEi6hKG9bqW4J7DNk93p50SuoyT1Q,107
|
|
19
25
|
haystack_integrations/components/rankers/amazon_bedrock/ranker.py,sha256=enAjf2QyDwfpidKkFCdLz954cx-Tjh9emrOS3vINJDg,12344
|
|
20
|
-
amazon_bedrock_haystack-4.
|
|
21
|
-
amazon_bedrock_haystack-4.
|
|
22
|
-
amazon_bedrock_haystack-4.
|
|
23
|
-
amazon_bedrock_haystack-4.
|
|
26
|
+
amazon_bedrock_haystack-4.2.0.dist-info/METADATA,sha256=HM32juznvZMTZgwNyS34vsLTZeBsLxaru7sUmGz4xQA,2228
|
|
27
|
+
amazon_bedrock_haystack-4.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
28
|
+
amazon_bedrock_haystack-4.2.0.dist-info/licenses/LICENSE.txt,sha256=B05uMshqTA74s-0ltyHKI6yoPfJ3zYgQbvcXfDVGFf8,10280
|
|
29
|
+
amazon_bedrock_haystack-4.2.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class S3Error(Exception):
    """Base class for every error raised by the S3-backed components."""
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class S3ConfigurationError(S3Error):
    """Raised when the S3 integration is misconfigured (for example, bad or missing credentials)."""
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class S3StorageError(S3Error):
    """Raised when an operation on an `S3Storage` object fails (for example, a download)."""
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from http import HTTPStatus
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
from boto3.session import Session
|
|
12
|
+
from botocore.config import Config
|
|
13
|
+
from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError
|
|
14
|
+
|
|
15
|
+
from haystack_integrations.common.s3.errors import S3ConfigurationError, S3StorageError
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# NOTE(review): `@dataclass` contributes no fields here (the class declares no annotated
# class attributes and writes its own `__init__`). It is kept only so the generated
# `__repr__`/`__eq__` behaviour stays unchanged for existing callers - consider dropping it.
@dataclass
class S3Storage:
    """Download files from a single AWS S3 bucket through a boto3 S3 client."""

    def __init__(
        self,
        s3_bucket: str,
        session: Session,
        s3_prefix: Optional[str] = None,
        endpoint_url: Optional[str] = None,
        config: Optional[Config] = None,
    ) -> None:
        """
        Initializes the S3Storage object and creates the underlying S3 client.

        :param s3_bucket: The name of the S3 bucket to download files from.
        :param session: The session to use for the S3 client.
        :param s3_prefix: The optional prefix of the files in the S3 bucket.
            It is prepended verbatim to every key, so it must include any trailing separator.
            For example, if the file is in the folder "folder/subfolder/file.txt",
            the s3_prefix should be "folder/subfolder/". If the file is in the root of the S3 bucket,
            the s3_prefix should be None.
        :param endpoint_url: The endpoint URL of the S3 bucket to download files from.
        :param config: The configuration to use for the S3 client.
        :raises S3ConfigurationError: If the S3 client cannot be created from the session.
        """

        self.s3_bucket = s3_bucket
        self.s3_prefix = s3_prefix
        self.endpoint_url = endpoint_url
        self.session = session
        self.config = config

        try:
            self._client = self.session.client("s3", endpoint_url=self.endpoint_url, config=self.config)
        except Exception as e:
            msg = f"Failed to create S3 session client: {e}"
            raise S3ConfigurationError(msg) from e

    @staticmethod
    def _http_status(error: ClientError) -> Optional[int]:
        """Return the HTTP status of a `ClientError`, or None if it cannot be determined.

        The `Error.Code` field is not always numeric (S3 also reports symbolic codes such as
        "AccessDenied" or "NoSuchKey"), so a plain `int(...)` conversion is not safe. When the
        code is not numeric, fall back to `ResponseMetadata.HTTPStatusCode`.
        """
        response = error.response or {}
        code = str(response.get("Error", {}).get("Code", ""))
        if code.isdigit():
            return int(code)
        status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
        return status if isinstance(status, int) else None

    def download(self, key: str, local_file_path: Path) -> None:
        """Download a file from S3.

        :param key: The key of the file to download. `s3_prefix`, when set, is prepended to it.
        :param local_file_path: The local path of the file to write.
            Its parent directory is created if it does not exist.
        :raises S3ConfigurationError: If AWS credentials are missing or incomplete,
            or access to the bucket is forbidden.
        :raises S3StorageError: If the file does not exist in the S3 bucket
            or the file cannot be downloaded.
        """

        s3_key = f"{self.s3_prefix}{key}" if self.s3_prefix else key

        # boto3 does not create missing directories for the target file.
        Path(local_file_path).parent.mkdir(parents=True, exist_ok=True)

        try:
            self._client.download_file(self.s3_bucket, s3_key, str(local_file_path))

        except (NoCredentialsError, PartialCredentialsError) as e:
            msg = (
                "Missing AWS credentials. Please check your AWS credentials (access key, secret key, region). "
                f"Error: {e}"
            )
            raise S3ConfigurationError(msg) from e

        except ClientError as e:
            status = self._http_status(e)

            if status == HTTPStatus.FORBIDDEN:
                msg = (
                    f"Failed to access S3 bucket {self.s3_bucket!r}. "
                    f"Please check your AWS credentials (access key, secret key, region) and ensure "
                    f"they have the necessary S3 permissions. "
                    f"Error: {e}"
                )
                raise S3ConfigurationError(msg) from e

            if status == HTTPStatus.NOT_FOUND:
                msg = f"The object {s3_key!r} does not exist in the S3 bucket {self.s3_bucket!r}. \n Error: {e}"
                raise S3StorageError(msg) from e

            msg = f"Failed to download file {s3_key!r} from S3. Error: {e}"
            raise S3StorageError(msg) from e

    @classmethod
    def from_env(cls, *, session: Session, config: Optional[Config] = None) -> "S3Storage":
        """Create a S3Storage object from environment variables.

        Reads `S3_DOWNLOADER_BUCKET` (required), `S3_DOWNLOADER_PREFIX` (optional) and
        `AWS_ENDPOINT_URL` (optional). Empty values of the optional variables are treated as unset.

        :param session: The session to use for the S3 client.
        :param config: The configuration to use for the S3 client.
        :raises ValueError: If `S3_DOWNLOADER_BUCKET` is not set or is empty.
        """
        s3_bucket = os.getenv("S3_DOWNLOADER_BUCKET")
        if not s3_bucket:
            msg = (
                "Missing environment variable S3_DOWNLOADER_BUCKET. "
                "Please set it to the name of the S3 bucket to download files from."
            )
            raise ValueError(msg)
        s3_prefix = os.getenv("S3_DOWNLOADER_PREFIX") or None
        endpoint_url = os.getenv("AWS_ENDPOINT_URL") or None
        return cls(
            s3_bucket=s3_bucket,
            s3_prefix=s3_prefix,
            endpoint_url=endpoint_url,
            session=session,
            config=config,
        )
|
|
File without changes
|
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, Dict, List, Optional
|
|
9
|
+
|
|
10
|
+
from botocore.config import Config
|
|
11
|
+
from haystack import component, default_from_dict, default_to_dict, logging
|
|
12
|
+
from haystack.dataclasses import Document
|
|
13
|
+
from haystack.utils.auth import Secret, deserialize_secrets_inplace
|
|
14
|
+
|
|
15
|
+
from haystack_integrations.common.amazon_bedrock.utils import get_aws_session
|
|
16
|
+
from haystack_integrations.common.s3.utils import S3Storage
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@component
class S3Downloader:
    """
    A component for downloading files from AWS S3 Buckets to local filesystem.

    Supports filtering by file extensions.

    The bucket is not an init parameter: `warm_up()` builds the storage with
    `S3Storage.from_env`, which reads the `S3_DOWNLOADER_BUCKET` (required),
    `S3_DOWNLOADER_PREFIX` and `AWS_ENDPOINT_URL` environment variables.
    """

    def __init__(
        self,
        *,
        aws_access_key_id: Optional[Secret] = Secret.from_env_var("AWS_ACCESS_KEY_ID", strict=False),  # noqa: B008
        aws_secret_access_key: Optional[Secret] = Secret.from_env_var(  # noqa: B008
            "AWS_SECRET_ACCESS_KEY", strict=False
        ),
        aws_session_token: Optional[Secret] = Secret.from_env_var("AWS_SESSION_TOKEN", strict=False),  # noqa: B008
        aws_region_name: Optional[Secret] = Secret.from_env_var("AWS_DEFAULT_REGION", strict=False),  # noqa: B008
        aws_profile_name: Optional[Secret] = Secret.from_env_var("AWS_PROFILE", strict=False),  # noqa: B008
        boto3_config: Optional[Dict[str, Any]] = None,
        file_root_path: Optional[str] = None,
        file_extensions: Optional[List[str]] = None,
        file_name_meta_key: str = "file_name",
        max_workers: int = 32,
        max_cache_size: int = 100,
    ) -> None:
        """
        Initializes the `S3Downloader` with the provided parameters.

        Note that the AWS credentials are not required if the AWS environment is configured correctly. These are loaded
        automatically from the environment or the AWS configuration file and do not need to be provided explicitly via
        the constructor. If the AWS environment is not configured users need to provide the AWS credentials via the
        constructor. Three required parameters are `aws_access_key_id`, `aws_secret_access_key`,
        and `aws_region_name`.

        :param aws_access_key_id: AWS access key ID.
        :param aws_secret_access_key: AWS secret access key.
        :param aws_session_token: AWS session token.
        :param aws_region_name: AWS region name.
        :param aws_profile_name: AWS profile name.
        :param boto3_config: The configuration for the boto3 client.
        :param file_root_path: The path where the file will be downloaded.
            Can be set through this parameter or the `FILE_ROOT_PATH` environment variable.
            If none of them is set, a `ValueError` is raised.
        :param file_extensions: The file extensions that are permitted to be downloaded.
            They are compared case-insensitively against `Path(file_name).suffix`, so each entry
            must include the leading dot (for example ".pdf").
            By default, all file extensions are allowed.
        :param max_workers: The maximum number of workers to use for concurrent downloads.
        :param max_cache_size: The maximum number of files to cache.
        :param file_name_meta_key: The name of the meta key that contains the file name to download.
            By default, the `Document.meta["file_name"]` is used. If you want to use a
            different key in `Document.meta`, you can set it here.
        :raises ValueError: If the `file_root_path` is not set through
            the constructor or the `FILE_ROOT_PATH` environment variable.

        """

        # Set up download directory
        file_root_path = file_root_path or os.getenv("FILE_ROOT_PATH")

        if file_root_path is None:
            msg = (
                "The path where files will be downloaded is not set. Please set the "
                "`file_root_path` init parameter or the `FILE_ROOT_PATH` environment variable."
            )
            raise ValueError(msg)

        self.file_root_path = Path(file_root_path)

        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.aws_region_name = aws_region_name
        self.aws_session_token = aws_session_token
        self.aws_profile_name = aws_profile_name
        self.boto3_config = boto3_config
        self.file_extensions = [e.lower() for e in file_extensions] if file_extensions else None
        self.max_workers = max_workers
        self.max_cache_size = max_cache_size
        self.file_name_meta_key = file_name_meta_key

        # Created lazily in warm_up().
        self._storage: Optional[S3Storage] = None

        def resolve_secret(secret: Optional[Secret]) -> Optional[str]:
            return secret.resolve_value() if secret else None

        self._session = get_aws_session(
            aws_access_key_id=resolve_secret(aws_access_key_id),
            aws_secret_access_key=resolve_secret(aws_secret_access_key),
            aws_session_token=resolve_secret(aws_session_token),
            aws_region_name=resolve_secret(aws_region_name),
            aws_profile_name=resolve_secret(aws_profile_name),
        )
        self._config = Config(
            user_agent_extra="x-client-framework:haystack", **(self.boto3_config if self.boto3_config else {})
        )

    def warm_up(self) -> None:
        """Warm up the component by creating the download directory and the S3 storage.

        Does nothing if the storage has already been created.
        """
        if self._storage is None:
            self.file_root_path.mkdir(parents=True, exist_ok=True)
            self._storage = S3Storage.from_env(session=self._session, config=self._config)

    @component.output_types(documents=List[Document])
    def run(
        self,
        documents: List[Document],
    ) -> Dict[str, List[Document]]:
        """Download files from AWS S3 Buckets to local filesystem.

        Return enriched `Document`s with the path of the downloaded file.
        :param documents: Document containing the name of the file to download in the meta field.
        :returns: A dictionary with:
            - `documents`: The downloaded `Document`s; each has `meta['file_path']`.
              Documents rejected by the extension filter or lacking a file name are omitted.
        :raises S3Error: If a download attempt fails or the file does not exist in the S3 bucket.
        :raises RuntimeError: If the component has not been warmed up.
        """

        if self._storage is None:
            msg = f"The component {self.__class__.__name__} was not warmed up. Call 'warm_up()' before calling run()."
            raise RuntimeError(msg)

        filtered_documents = self._filter_documents_by_extensions(documents) if self.file_extensions else documents

        if not filtered_documents:
            return {"documents": []}

        try:
            # filtered_documents is non-empty here, so the pool always gets at least one worker
            # (provided max_workers itself is positive).
            max_workers = min(self.max_workers, len(filtered_documents))
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                results = list(executor.map(self._download_file, filtered_documents))
        finally:
            # Trim the cache even when a download failed.
            self._cleanup_cache(filtered_documents)

        return {"documents": [d for d in results if d is not None]}

    def _filter_documents_by_extensions(self, documents: List[Document]) -> List[Document]:
        """Keep only the documents whose file name suffix is in `file_extensions`.

        Returns `documents` unchanged when no extensions are configured.
        """
        if not self.file_extensions:
            return documents
        return [
            doc
            for doc in documents
            if Path(doc.meta.get(self.file_name_meta_key, "")).suffix.lower() in self.file_extensions
        ]

    def _download_file(self, document: Document) -> Optional[Document]:
        """
        Download a single file from AWS S3 Bucket to local filesystem.

        A file that already exists under `file_root_path` is not downloaded again; it is only
        touched so that it counts as recently used.

        :param document: `Document` with the name of the file to download in the meta field.
        :returns:
            The same `Document` (mutated in place) with `meta` containing the `file_path` of the
            downloaded file, or `None` if the document has no file name.
        :raises S3Error: If the download fails or the file does not exist in the S3 bucket.
        """

        file_name = document.meta.get(self.file_name_meta_key)
        if not file_name:
            logger.warning(
                "Document missing required file name metadata key '{key}'. Skipping download.",
                key=self.file_name_meta_key,
            )
            return None

        file_path = self.file_root_path / Path(file_name)

        if file_path.is_file():
            # set access and modification time to now without redownloading the file
            file_path.touch()

        else:
            # we know that _storage is not None after warm_up() is called, but mypy does not know that
            self._storage.download(key=file_name, local_file_path=file_path)  # type: ignore[union-attr]

        document.meta["file_path"] = str(file_path)
        return document

    def _cleanup_cache(self, documents: List[Document]) -> None:
        """
        Remove least-recently-accessed cache files when cache exceeds `max_cache_size`.

        Only files directly inside `file_root_path` are counted. Files referenced by
        `documents` are never removed, so a run does not evict what it has just produced.

        :param documents: List of Document objects being used; their file names
            (read from `file_name_meta_key`) identify the files to keep.
        """
        protected = set()
        for doc in documents:
            name = doc.meta.get(self.file_name_meta_key)
            if name:
                protected.add(self.file_root_path / Path(name))

        cached_files = [p for p in self.file_root_path.iterdir() if p.is_file()]
        overflow = len(cached_files) - self.max_cache_size
        if overflow <= 0:
            return

        evictable = [p for p in cached_files if p not in protected]
        evictable.sort(key=lambda p: p.stat().st_atime)
        for p in evictable[:overflow]:
            try:
                p.unlink()
            except OSError as error:
                logger.warning("Failed to remove cache file at {path} with error: {e}", path=p, e=error)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the component to a dictionary."""
        return default_to_dict(
            self,
            aws_access_key_id=self.aws_access_key_id.to_dict() if self.aws_access_key_id else None,
            aws_secret_access_key=self.aws_secret_access_key.to_dict() if self.aws_secret_access_key else None,
            aws_session_token=self.aws_session_token.to_dict() if self.aws_session_token else None,
            aws_region_name=self.aws_region_name.to_dict() if self.aws_region_name else None,
            aws_profile_name=self.aws_profile_name.to_dict() if self.aws_profile_name else None,
            # Without this the boto3 client configuration is lost on a to_dict/from_dict round-trip.
            boto3_config=self.boto3_config,
            file_root_path=str(self.file_root_path),
            max_workers=self.max_workers,
            max_cache_size=self.max_cache_size,
            file_extensions=self.file_extensions,
            file_name_meta_key=self.file_name_meta_key,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "S3Downloader":
        """
        Deserializes the component from a dictionary.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized component.
        """
        deserialize_secrets_inplace(
            data["init_parameters"],
            ["aws_access_key_id", "aws_secret_access_key", "aws_session_token", "aws_region_name", "aws_profile_name"],
        )
        return default_from_dict(cls, data)
|
|
@@ -27,6 +27,7 @@ from haystack_integrations.components.generators.amazon_bedrock.chat.utils impor
|
|
|
27
27
|
_parse_completion_response,
|
|
28
28
|
_parse_streaming_response,
|
|
29
29
|
_parse_streaming_response_async,
|
|
30
|
+
_validate_guardrail_config,
|
|
30
31
|
)
|
|
31
32
|
|
|
32
33
|
logger = logging.getLogger(__name__)
|
|
@@ -154,10 +155,11 @@ class AmazonBedrockChatGenerator:
|
|
|
154
155
|
aws_region_name: Optional[Secret] = Secret.from_env_var(["AWS_DEFAULT_REGION"], strict=False), # noqa: B008
|
|
155
156
|
aws_profile_name: Optional[Secret] = Secret.from_env_var(["AWS_PROFILE"], strict=False), # noqa: B008
|
|
156
157
|
generation_kwargs: Optional[Dict[str, Any]] = None,
|
|
157
|
-
stop_words: Optional[List[str]] = None,
|
|
158
158
|
streaming_callback: Optional[StreamingCallbackT] = None,
|
|
159
159
|
boto3_config: Optional[Dict[str, Any]] = None,
|
|
160
160
|
tools: Optional[Union[List[Tool], Toolset]] = None,
|
|
161
|
+
*,
|
|
162
|
+
guardrail_config: Optional[Dict[str, str]] = None,
|
|
161
163
|
) -> None:
|
|
162
164
|
"""
|
|
163
165
|
Initializes the `AmazonBedrockChatGenerator` with the provided parameters. The parameters are passed to the
|
|
@@ -179,10 +181,6 @@ class AmazonBedrockChatGenerator:
|
|
|
179
181
|
:param generation_kwargs: Keyword arguments sent to the model. These parameters are specific to a model.
|
|
180
182
|
You can find the model specific arguments in the AWS Bedrock API
|
|
181
183
|
[documentation](https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters.html).
|
|
182
|
-
:param stop_words: A list of stop words that stop the model from generating more text
|
|
183
|
-
when encountered. You can provide them using this parameter or using the model's `generation_kwargs`
|
|
184
|
-
under a model's specific key for stop words.
|
|
185
|
-
For example, you can provide stop words for Anthropic Claude in the `stop_sequences` key.
|
|
186
184
|
:param streaming_callback: A callback function called when a new token is received from the stream.
|
|
187
185
|
By default, the model is not set up for streaming. To enable streaming, set this parameter to a callback
|
|
188
186
|
function that handles the streaming chunks. The callback function receives a
|
|
@@ -190,6 +188,19 @@ class AmazonBedrockChatGenerator:
|
|
|
190
188
|
the streaming mode on.
|
|
191
189
|
:param boto3_config: The configuration for the boto3 client.
|
|
192
190
|
:param tools: A list of Tool objects or a Toolset that the model can use. Each tool should have a unique name.
|
|
191
|
+
:param guardrail_config: Optional configuration for a guardrail that has been created in Amazon Bedrock.
|
|
192
|
+
This must be provided as a dictionary matching either
|
|
193
|
+
[GuardrailConfiguration](https://docs.aws.amazon.com/bedrock/latest/APIReference/API_GuardrailConfiguration.html).
|
|
194
|
+
or, in streaming mode (when `streaming_callback` is set),
|
|
195
|
+
[GuardrailStreamConfiguration](https://docs.aws.amazon.com/bedrock/latest/APIReference/API_runtime_GuardrailStreamConfiguration.html).
|
|
196
|
+
If `trace` is set to `enabled`, the guardrail trace will be included under the `trace` key in the `meta`
|
|
197
|
+
attribute of the resulting `ChatMessage`.
|
|
198
|
+
Note: Enabling guardrails in streaming mode may introduce additional latency.
|
|
199
|
+
To manage this, you can adjust the `streamProcessingMode` parameter.
|
|
200
|
+
See the
|
|
201
|
+
[Guardrails Streaming documentation](https://docs.aws.amazon.com/bedrock/latest/userguide/guardrails-streaming.html)
|
|
202
|
+
for more information.
|
|
203
|
+
|
|
193
204
|
|
|
194
205
|
:raises ValueError: If the model name is empty or None.
|
|
195
206
|
:raises AmazonBedrockConfigurationError: If the AWS environment is not configured correctly or the model is
|
|
@@ -204,12 +215,15 @@ class AmazonBedrockChatGenerator:
|
|
|
204
215
|
self.aws_session_token = aws_session_token
|
|
205
216
|
self.aws_region_name = aws_region_name
|
|
206
217
|
self.aws_profile_name = aws_profile_name
|
|
207
|
-
self.stop_words = stop_words or []
|
|
208
218
|
self.streaming_callback = streaming_callback
|
|
209
219
|
self.boto3_config = boto3_config
|
|
220
|
+
|
|
210
221
|
_check_duplicate_tool_names(list(tools or [])) # handles Toolset as well
|
|
211
222
|
self.tools = tools
|
|
212
223
|
|
|
224
|
+
_validate_guardrail_config(guardrail_config=guardrail_config, streaming=streaming_callback is not None)
|
|
225
|
+
self.guardrail_config = guardrail_config
|
|
226
|
+
|
|
213
227
|
def resolve_secret(secret: Optional[Secret]) -> Optional[str]:
|
|
214
228
|
return secret.resolve_value() if secret else None
|
|
215
229
|
|
|
@@ -237,7 +251,6 @@ class AmazonBedrockChatGenerator:
|
|
|
237
251
|
raise AmazonBedrockConfigurationError(msg) from exception
|
|
238
252
|
|
|
239
253
|
self.generation_kwargs = generation_kwargs or {}
|
|
240
|
-
self.stop_words = stop_words or []
|
|
241
254
|
self.async_session: Optional[aioboto3.Session] = None
|
|
242
255
|
|
|
243
256
|
def _get_async_session(self) -> aioboto3.Session:
|
|
@@ -291,11 +304,11 @@ class AmazonBedrockChatGenerator:
|
|
|
291
304
|
aws_region_name=self.aws_region_name.to_dict() if self.aws_region_name else None,
|
|
292
305
|
aws_profile_name=self.aws_profile_name.to_dict() if self.aws_profile_name else None,
|
|
293
306
|
model=self.model,
|
|
294
|
-
stop_words=self.stop_words,
|
|
295
307
|
generation_kwargs=self.generation_kwargs,
|
|
296
308
|
streaming_callback=callback_name,
|
|
297
309
|
boto3_config=self.boto3_config,
|
|
298
310
|
tools=serialize_tools_or_toolset(self.tools),
|
|
311
|
+
guardrail_config=self.guardrail_config,
|
|
299
312
|
)
|
|
300
313
|
|
|
301
314
|
@classmethod
|
|
@@ -308,6 +321,12 @@ class AmazonBedrockChatGenerator:
|
|
|
308
321
|
Instance of `AmazonBedrockChatGenerator`.
|
|
309
322
|
"""
|
|
310
323
|
init_params = data.get("init_parameters", {})
|
|
324
|
+
|
|
325
|
+
stop_words = init_params.pop("stop_words", None)
|
|
326
|
+
msg = "stop_words parameter will be ignored. Use the `stopSequences` key in `generation_kwargs` instead."
|
|
327
|
+
if stop_words:
|
|
328
|
+
logger.warning(msg)
|
|
329
|
+
|
|
311
330
|
serialized_callback_handler = init_params.get("streaming_callback")
|
|
312
331
|
if serialized_callback_handler:
|
|
313
332
|
data["init_parameters"]["streaming_callback"] = deserialize_callable(serialized_callback_handler)
|
|
@@ -387,6 +406,8 @@ class AmazonBedrockChatGenerator:
|
|
|
387
406
|
params["toolConfig"] = tool_config
|
|
388
407
|
if additional_fields:
|
|
389
408
|
params["additionalModelRequestFields"] = additional_fields
|
|
409
|
+
if self.guardrail_config:
|
|
410
|
+
params["guardrailConfig"] = self.guardrail_config
|
|
390
411
|
|
|
391
412
|
# overloads that exhaust finite Literals(bool) not treated as exhaustive
|
|
392
413
|
# see https://github.com/python/mypy/issues/14764
|
|
@@ -273,6 +273,7 @@ def _parse_completion_response(response_body: Dict[str, Any], model: str) -> Lis
|
|
|
273
273
|
:param model: The model ID used for generation, included in message metadata.
|
|
274
274
|
:returns: List of ChatMessage objects containing the assistant's response(s) with appropriate metadata.
|
|
275
275
|
"""
|
|
276
|
+
|
|
276
277
|
replies = []
|
|
277
278
|
if "output" in response_body and "message" in response_body["output"]:
|
|
278
279
|
message = response_body["output"]["message"]
|
|
@@ -280,7 +281,7 @@ def _parse_completion_response(response_body: Dict[str, Any], model: str) -> Lis
|
|
|
280
281
|
content_blocks = message["content"]
|
|
281
282
|
|
|
282
283
|
# Common meta information
|
|
283
|
-
|
|
284
|
+
meta = {
|
|
284
285
|
"model": model,
|
|
285
286
|
"index": 0,
|
|
286
287
|
"finish_reason": FINISH_REASON_MAPPING.get(response_body.get("stopReason", "")),
|
|
@@ -291,6 +292,9 @@ def _parse_completion_response(response_body: Dict[str, Any], model: str) -> Lis
|
|
|
291
292
|
"total_tokens": response_body.get("usage", {}).get("totalTokens", 0),
|
|
292
293
|
},
|
|
293
294
|
}
|
|
295
|
+
# guardrail trace
|
|
296
|
+
if "trace" in response_body:
|
|
297
|
+
meta["trace"] = response_body["trace"]
|
|
294
298
|
|
|
295
299
|
# Process all content blocks and combine them into a single message
|
|
296
300
|
text_content = []
|
|
@@ -329,7 +333,7 @@ def _parse_completion_response(response_body: Dict[str, Any], model: str) -> Lis
|
|
|
329
333
|
ChatMessage.from_assistant(
|
|
330
334
|
" ".join(text_content),
|
|
331
335
|
tool_calls=tool_calls,
|
|
332
|
-
meta=
|
|
336
|
+
meta=meta,
|
|
333
337
|
reasoning=ReasoningContent(
|
|
334
338
|
reasoning_text=reasoning_text, extra={"reasoning_contents": reasoning_contents}
|
|
335
339
|
)
|
|
@@ -355,6 +359,7 @@ def _convert_event_to_streaming_chunk(
|
|
|
355
359
|
:param component_info: ComponentInfo object
|
|
356
360
|
:returns: StreamingChunk object containing the content and metadata extracted from the event.
|
|
357
361
|
"""
|
|
362
|
+
|
|
358
363
|
# Initialize an empty StreamingChunk to return if no relevant event is found
|
|
359
364
|
# (e.g. for messageStart and contentBlockStop)
|
|
360
365
|
base_meta = {"model": model, "received_at": datetime.now(timezone.utc).isoformat()}
|
|
@@ -426,19 +431,23 @@ def _convert_event_to_streaming_chunk(
|
|
|
426
431
|
meta=base_meta,
|
|
427
432
|
)
|
|
428
433
|
|
|
429
|
-
elif "metadata" in event
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
434
|
+
elif "metadata" in event:
|
|
435
|
+
event_meta = event["metadata"]
|
|
436
|
+
chunk_meta: Dict[str, Any] = {**base_meta}
|
|
437
|
+
|
|
438
|
+
if "usage" in event_meta:
|
|
439
|
+
usage = event_meta["usage"]
|
|
440
|
+
chunk_meta["usage"] = {
|
|
441
|
+
"prompt_tokens": usage.get("inputTokens", 0),
|
|
442
|
+
"completion_tokens": usage.get("outputTokens", 0),
|
|
443
|
+
"total_tokens": usage.get("totalTokens", 0),
|
|
444
|
+
}
|
|
445
|
+
if "trace" in event_meta:
|
|
446
|
+
chunk_meta["trace"] = event_meta["trace"]
|
|
447
|
+
|
|
448
|
+
# Only create chunk if we added usage or trace data
|
|
449
|
+
if len(chunk_meta) > len(base_meta):
|
|
450
|
+
streaming_chunk = StreamingChunk(content="", meta=chunk_meta)
|
|
442
451
|
|
|
443
452
|
streaming_chunk.component_info = component_info
|
|
444
453
|
|
|
@@ -547,8 +556,15 @@ def _parse_streaming_response(
|
|
|
547
556
|
content_block_idxs.add(content_block_idx)
|
|
548
557
|
streaming_callback(streaming_chunk)
|
|
549
558
|
chunks.append(streaming_chunk)
|
|
559
|
+
|
|
550
560
|
reply = _convert_streaming_chunks_to_chat_message(chunks=chunks)
|
|
561
|
+
|
|
562
|
+
# both the reasoning content and the trace are ignored in _convert_streaming_chunks_to_chat_message
|
|
563
|
+
# so we need to process them separately
|
|
551
564
|
reasoning_content = _process_reasoning_contents(chunks=chunks)
|
|
565
|
+
if chunks[-1].meta and "trace" in chunks[-1].meta:
|
|
566
|
+
reply.meta["trace"] = chunks[-1].meta["trace"]
|
|
567
|
+
|
|
552
568
|
reply = ChatMessage.from_assistant(
|
|
553
569
|
text=reply.text,
|
|
554
570
|
meta=reply.meta,
|
|
@@ -556,6 +572,7 @@ def _parse_streaming_response(
|
|
|
556
572
|
tool_calls=reply.tool_calls,
|
|
557
573
|
reasoning=reasoning_content,
|
|
558
574
|
)
|
|
575
|
+
|
|
559
576
|
return [reply]
|
|
560
577
|
|
|
561
578
|
|
|
@@ -594,3 +611,24 @@ async def _parse_streaming_response_async(
|
|
|
594
611
|
reasoning=reasoning_content,
|
|
595
612
|
)
|
|
596
613
|
return [reply]
|
|
614
|
+
|
|
615
|
+
|
|
616
|
+
def _validate_guardrail_config(guardrail_config: Optional[Dict[str, str]] = None, streaming: bool = False) -> None:
|
|
617
|
+
"""
|
|
618
|
+
Validate the guardrail configuration.
|
|
619
|
+
|
|
620
|
+
:param guardrail_config: The guardrail configuration.
|
|
621
|
+
:param streaming: Whether the streaming is enabled.
|
|
622
|
+
|
|
623
|
+
:raises ValueError: If the guardrail configuration is invalid.
|
|
624
|
+
"""
|
|
625
|
+
if guardrail_config is None:
|
|
626
|
+
return
|
|
627
|
+
|
|
628
|
+
required_fields = {"guardrailIdentifier", "guardrailVersion"}
|
|
629
|
+
if not required_fields.issubset(guardrail_config):
|
|
630
|
+
msg = "`guardrailIdentifier` and `guardrailVersion` fields are required in guardrail configuration."
|
|
631
|
+
raise ValueError(msg)
|
|
632
|
+
if not streaming and "streamProcessingMode" in guardrail_config:
|
|
633
|
+
msg = "`streamProcessingMode` field is only supported for streaming (when `streaming_callback` is not None)."
|
|
634
|
+
raise ValueError(msg)
|
|
File without changes
|
|
File without changes
|