apache-airflow-providers-amazon 9.14.0__py3-none-any.whl → 9.18.0rc2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- airflow/providers/amazon/__init__.py +3 -3
- airflow/providers/amazon/aws/auth_manager/aws_auth_manager.py +106 -5
- airflow/providers/amazon/aws/auth_manager/routes/login.py +7 -1
- airflow/providers/amazon/aws/executors/aws_lambda/docker/app.py +5 -1
- airflow/providers/amazon/aws/executors/aws_lambda/lambda_executor.py +1 -1
- airflow/providers/amazon/aws/hooks/athena.py +6 -2
- airflow/providers/amazon/aws/hooks/athena_sql.py +2 -2
- airflow/providers/amazon/aws/hooks/base_aws.py +2 -2
- airflow/providers/amazon/aws/hooks/batch_client.py +4 -6
- airflow/providers/amazon/aws/hooks/batch_waiters.py +0 -1
- airflow/providers/amazon/aws/hooks/chime.py +1 -1
- airflow/providers/amazon/aws/hooks/datasync.py +3 -3
- airflow/providers/amazon/aws/hooks/firehose.py +56 -0
- airflow/providers/amazon/aws/hooks/glue.py +7 -1
- airflow/providers/amazon/aws/hooks/kinesis.py +31 -13
- airflow/providers/amazon/aws/hooks/mwaa.py +38 -7
- airflow/providers/amazon/aws/hooks/redshift_sql.py +20 -6
- airflow/providers/amazon/aws/hooks/s3.py +41 -11
- airflow/providers/amazon/aws/hooks/sagemaker_unified_studio.py +1 -1
- airflow/providers/amazon/aws/hooks/ses.py +76 -10
- airflow/providers/amazon/aws/hooks/sns.py +74 -18
- airflow/providers/amazon/aws/hooks/sqs.py +64 -11
- airflow/providers/amazon/aws/hooks/ssm.py +34 -6
- airflow/providers/amazon/aws/hooks/step_function.py +1 -1
- airflow/providers/amazon/aws/links/base_aws.py +1 -1
- airflow/providers/amazon/aws/notifications/ses.py +139 -0
- airflow/providers/amazon/aws/notifications/sns.py +16 -1
- airflow/providers/amazon/aws/notifications/sqs.py +17 -1
- airflow/providers/amazon/aws/operators/base_aws.py +2 -2
- airflow/providers/amazon/aws/operators/bedrock.py +2 -0
- airflow/providers/amazon/aws/operators/cloud_formation.py +2 -2
- airflow/providers/amazon/aws/operators/datasync.py +2 -1
- airflow/providers/amazon/aws/operators/emr.py +44 -33
- airflow/providers/amazon/aws/operators/mwaa.py +12 -3
- airflow/providers/amazon/aws/operators/sagemaker_unified_studio.py +1 -1
- airflow/providers/amazon/aws/operators/ssm.py +122 -17
- airflow/providers/amazon/aws/secrets/secrets_manager.py +3 -4
- airflow/providers/amazon/aws/sensors/base_aws.py +2 -2
- airflow/providers/amazon/aws/sensors/mwaa.py +14 -1
- airflow/providers/amazon/aws/sensors/s3.py +27 -13
- airflow/providers/amazon/aws/sensors/sagemaker_unified_studio.py +1 -1
- airflow/providers/amazon/aws/sensors/ssm.py +33 -17
- airflow/providers/amazon/aws/transfers/azure_blob_to_s3.py +3 -3
- airflow/providers/amazon/aws/transfers/base.py +5 -5
- airflow/providers/amazon/aws/transfers/dynamodb_to_s3.py +4 -4
- airflow/providers/amazon/aws/transfers/exasol_to_s3.py +1 -1
- airflow/providers/amazon/aws/transfers/ftp_to_s3.py +1 -1
- airflow/providers/amazon/aws/transfers/gcs_to_s3.py +48 -5
- airflow/providers/amazon/aws/transfers/glacier_to_gcs.py +1 -1
- airflow/providers/amazon/aws/transfers/google_api_to_s3.py +2 -5
- airflow/providers/amazon/aws/transfers/hive_to_dynamodb.py +1 -1
- airflow/providers/amazon/aws/transfers/http_to_s3.py +1 -1
- airflow/providers/amazon/aws/transfers/imap_attachment_to_s3.py +1 -1
- airflow/providers/amazon/aws/transfers/local_to_s3.py +1 -1
- airflow/providers/amazon/aws/transfers/mongo_to_s3.py +1 -1
- airflow/providers/amazon/aws/transfers/redshift_to_s3.py +6 -6
- airflow/providers/amazon/aws/transfers/s3_to_dynamodb.py +1 -1
- airflow/providers/amazon/aws/transfers/s3_to_ftp.py +1 -1
- airflow/providers/amazon/aws/transfers/s3_to_redshift.py +6 -6
- airflow/providers/amazon/aws/transfers/s3_to_sftp.py +1 -1
- airflow/providers/amazon/aws/transfers/s3_to_sql.py +1 -1
- airflow/providers/amazon/aws/transfers/salesforce_to_s3.py +1 -1
- airflow/providers/amazon/aws/transfers/sftp_to_s3.py +1 -1
- airflow/providers/amazon/aws/transfers/sql_to_s3.py +4 -5
- airflow/providers/amazon/aws/triggers/bedrock.py +1 -1
- airflow/providers/amazon/aws/triggers/s3.py +29 -2
- airflow/providers/amazon/aws/triggers/ssm.py +17 -1
- airflow/providers/amazon/aws/utils/connection_wrapper.py +2 -5
- airflow/providers/amazon/aws/utils/mixins.py +1 -1
- airflow/providers/amazon/aws/utils/waiter.py +2 -2
- airflow/providers/amazon/aws/waiters/emr.json +6 -6
- airflow/providers/amazon/get_provider_info.py +19 -1
- airflow/providers/amazon/version_compat.py +19 -16
- {apache_airflow_providers_amazon-9.14.0.dist-info → apache_airflow_providers_amazon-9.18.0rc2.dist-info}/METADATA +25 -19
- {apache_airflow_providers_amazon-9.14.0.dist-info → apache_airflow_providers_amazon-9.18.0rc2.dist-info}/RECORD +79 -76
- apache_airflow_providers_amazon-9.18.0rc2.dist-info/licenses/NOTICE +5 -0
- {apache_airflow_providers_amazon-9.14.0.dist-info → apache_airflow_providers_amazon-9.18.0rc2.dist-info}/WHEEL +0 -0
- {apache_airflow_providers_amazon-9.14.0.dist-info → apache_airflow_providers_amazon-9.18.0rc2.dist-info}/entry_points.txt +0 -0
- {airflow/providers/amazon → apache_airflow_providers_amazon-9.18.0rc2.dist-info/licenses}/LICENSE +0 -0
airflow/providers/amazon/aws/sensors/s3.py

@@ -36,7 +36,7 @@ from airflow.providers.amazon.aws.hooks.s3 import S3Hook
 from airflow.providers.amazon.aws.sensors.base_aws import AwsBaseSensor
 from airflow.providers.amazon.aws.triggers.s3 import S3KeysUnchangedTrigger, S3KeyTrigger
 from airflow.providers.amazon.aws.utils.mixins import aws_template_fields
-from airflow.
+from airflow.providers.common.compat.sdk import poke_mode_only
 
 
 class S3KeySensor(AwsBaseSensor[S3Hook]):
@@ -122,8 +122,19 @@ class S3KeySensor(AwsBaseSensor[S3Hook]):
         """
         if self.wildcard_match:
             prefix = re.split(r"[\[*?]", key, 1)[0]
-
-            key_matches
+
+            key_matches: list[str] = []
+
+            # Is check_fn is None, then we can return True without having to iterate through each value in
+            # yielded by iter_file_metadata. Otherwise, we'll check for a match, and add all matches to the
+            # key_matches list
+            for k in self.hook.iter_file_metadata(prefix, bucket_name):
+                if fnmatch.fnmatch(k["Key"], key):
+                    if self.check_fn is None:
+                        # This will only wait for a single match, and will immediately return
+                        return True
+                    key_matches.append(k)
+
             if not key_matches:
                 return False
 
@@ -132,21 +143,23 @@ class S3KeySensor(AwsBaseSensor[S3Hook]):
             for f in key_matches:
                 metadata = {}
                 if "*" in self.metadata_keys:
-                    metadata = self.hook.head_object(f["Key"], bucket_name)
+                    metadata = self.hook.head_object(f["Key"], bucket_name)  # type: ignore[index]
                 else:
-                    for
+                    for mk in self.metadata_keys:
                         try:
-                            metadata[
+                            metadata[mk] = f[mk]  # type: ignore[index]
                         except KeyError:
                             # supplied key might be from head_object response
-                            self.log.info("Key %s not found in response, performing head_object",
-                            metadata[
+                            self.log.info("Key %s not found in response, performing head_object", mk)
+                            metadata[mk] = self.hook.head_object(f["Key"], bucket_name).get(mk, None)  # type: ignore[index]
                 files.append(metadata)
+
         elif self.use_regex:
-
-
-
-
+            for k in self.hook.iter_file_metadata("", bucket_name):
+                if re.match(pattern=key, string=k["Key"]):
+                    return True
+            return False
+
         else:
             obj = self.hook.head_object(key, bucket_name)
             if obj is None:
@@ -202,6 +215,7 @@ class S3KeySensor(AwsBaseSensor[S3Hook]):
                 poke_interval=self.poke_interval,
                 should_check_fn=bool(self.check_fn),
                 use_regex=self.use_regex,
+                metadata_keys=self.metadata_keys,
             ),
             method_name="execute_complete",
         )
@@ -213,7 +227,7 @@ class S3KeySensor(AwsBaseSensor[S3Hook]):
         Relies on trigger to throw an exception, otherwise it assumes execution was successful.
         """
         if event["status"] == "running":
-            found_keys = self.check_fn(event["files"])  # type: ignore[misc]
+            found_keys = self.check_fn(event["files"], **context)  # type: ignore[misc]
             if not found_keys:
                 self._defer()
         elif event["status"] == "error":
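The S3KeySensor hunks above switch wildcard matching to the hook's iter_file_metadata generator, return as soon as one key matches when no check_fn is supplied, forward metadata_keys to the trigger, and pass the task context into check_fn when a deferred run resumes. A hedged usage sketch of the updated sensor; the bucket, key pattern, and task id are placeholders, not taken from the diff:

    from airflow.providers.amazon.aws.sensors.s3 import S3KeySensor

    def at_least_two_files(files: list[dict], **context) -> bool:
        # `files` carries one metadata dict per matched key; the resumed deferrable
        # path now also passes the task context as keyword arguments.
        return len(files) >= 2

    wait_for_exports = S3KeySensor(
        task_id="wait_for_exports",
        bucket_name="example-bucket",          # placeholder
        bucket_key="exports/*.csv",            # placeholder wildcard pattern
        wildcard_match=True,
        metadata_keys=["Key", "Size"],         # now forwarded to S3KeyTrigger when deferring
        check_fn=at_least_two_files,           # omit check_fn and the sensor succeeds on the first match
        deferrable=True,
    )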
airflow/providers/amazon/aws/sensors/sagemaker_unified_studio.py

@@ -25,7 +25,7 @@ from airflow.exceptions import AirflowException
 from airflow.providers.amazon.aws.hooks.sagemaker_unified_studio import (
     SageMakerNotebookHook,
 )
-from airflow.providers.
+from airflow.providers.common.compat.sdk import BaseSensorOperator
 
 if TYPE_CHECKING:
     from airflow.utils.context import Context
airflow/providers/amazon/aws/sensors/ssm.py

@@ -21,7 +21,6 @@ from collections.abc import Sequence
 from typing import TYPE_CHECKING, Any
 
 from airflow.configuration import conf
-from airflow.exceptions import AirflowException
 from airflow.providers.amazon.aws.hooks.ssm import SsmHook
 from airflow.providers.amazon.aws.sensors.base_aws import AwsBaseSensor
 from airflow.providers.amazon.aws.triggers.ssm import SsmRunCommandTrigger
@@ -34,32 +33,45 @@ if TYPE_CHECKING:
 
 class SsmRunCommandCompletedSensor(AwsBaseSensor[SsmHook]):
     """
-    Poll the state of an AWS SSM Run Command until
+    Poll the state of an AWS SSM Run Command until completion.
+
+    Waits until all instance jobs reach a terminal state. Fails if any
+    instance job ends in a failed state.
 
     .. seealso::
-        For more information on how to use this sensor, take a look at the
+        For more information on how to use this sensor, take a look at the
+        guide:
         :ref:`howto/sensor:SsmRunCommandCompletedSensor`
 
     :param command_id: The ID of the AWS SSM Run Command.
-
-
-
-
-    :param poke_interval: Polling period in seconds to check for the status
-
+    :param deferrable: If True, the sensor will operate in deferrable mode.
+        This mode requires aiobotocore module to be installed.
+        (default: False, but can be overridden in config file by setting
+        default_deferrable to True)
+    :param poke_interval: Polling period in seconds to check for the status
+        of the job. (default: 120)
+    :param max_retries: Number of times before returning the current state.
+        (default: 75)
     :param aws_conn_id: The Airflow connection used for AWS credentials.
-        If this is ``None`` or empty then the default boto3 behaviour is used.
-        running Airflow in a distributed manner and aws_conn_id is None or
+        If this is ``None`` or empty then the default boto3 behaviour is used.
+        If running Airflow in a distributed manner and aws_conn_id is None or
         empty, then default boto3 configuration would be used (and must be
         maintained on each worker node).
-    :param region_name: AWS region_name. If not specified then the default
+    :param region_name: AWS region_name. If not specified then the default
+        boto3 behaviour is used.
    :param verify: Whether or not to verify SSL certificates. See:
         https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html
-    :param botocore_config: Configuration dictionary (key-values) for botocore
+    :param botocore_config: Configuration dictionary (key-values) for botocore
+        client. See:
         https://botocore.amazonaws.com/v1/documentation/api/latest/reference/config.html
     """
 
-    INTERMEDIATE_STATES: tuple[str, ...] = (
+    INTERMEDIATE_STATES: tuple[str, ...] = (
+        "Pending",
+        "Delayed",
+        "InProgress",
+        "Cancelling",
+    )
     FAILURE_STATES: tuple[str, ...] = ("Cancelled", "TimedOut", "Failed")
     SUCCESS_STATES: tuple[str, ...] = ("Success",)
     FAILURE_MESSAGE = "SSM run command sensor failed."
@@ -89,14 +101,18 @@ class SsmRunCommandCompletedSensor(AwsBaseSensor[SsmHook]):
         command_invocations = response.get("CommandInvocations", [])
 
         if not command_invocations:
-            self.log.info(
+            self.log.info(
+                "No command invocations found",
+                "command_id=%s yet, waiting...",
+                self.command_id,
+            )
             return False
 
         for invocation in command_invocations:
             state = invocation["Status"]
 
             if state in self.FAILURE_STATES:
-                raise
+                raise RuntimeError(self.FAILURE_MESSAGE)
 
             if state in self.INTERMEDIATE_STATES:
                 return False
@@ -122,6 +138,6 @@ class SsmRunCommandCompletedSensor(AwsBaseSensor[SsmHook]):
         event = validate_execute_complete_event(event)
 
         if event["status"] != "success":
-            raise
+            raise RuntimeError(f"Error while running run command: {event}")
 
         self.log.info("SSM run command `%s` completed.", event["command_id"])
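Since the rewritten docstring above is where the deferrable, poke_interval, and max_retries parameters are now spelled out, here is a hedged sketch of the sensor in use; the command id is a placeholder (it would normally come from an upstream SSM run-command task's XCom) and the timings simply restate the documented defaults:

    from airflow.providers.amazon.aws.sensors.ssm import SsmRunCommandCompletedSensor

    wait_for_command = SsmRunCommandCompletedSensor(
        task_id="wait_for_command",
        command_id="11111111-2222-3333-4444-555555555555",  # placeholder CommandId
        deferrable=True,      # requires aiobotocore, per the docstring above
        poke_interval=120,    # documented default
        max_retries=75,       # documented default
    )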
airflow/providers/amazon/aws/transfers/azure_blob_to_s3.py

@@ -23,7 +23,7 @@ from collections.abc import Sequence
 from typing import TYPE_CHECKING
 
 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.
+from airflow.providers.common.compat.sdk import BaseOperator
 
 try:
     from airflow.providers.microsoft.azure.hooks.wasb import WasbHook
@@ -49,12 +49,12 @@ class AzureBlobStorageToS3Operator(BaseOperator):
     :param prefix: Prefix string which filters objects whose name begin with
         this prefix. (templated)
     :param delimiter: The delimiter by which you want to filter the objects. (templated)
-        For e.g to lists the CSV files from in a directory in GCS you would use
+        For e.g. to lists the CSV files from in a directory in GCS you would use
         delimiter='.csv'.
     :param aws_conn_id: Connection id of the S3 connection to use
     :param dest_s3_key: The base S3 key to be used to store the files. (templated)
     :param dest_verify: Whether or not to verify SSL certificates for S3 connection.
-        By default SSL certificates are verified.
+        By default, SSL certificates are verified.
         You can provide the following values:
 
         - ``False``: do not validate SSL certificates. SSL will still be used
airflow/providers/amazon/aws/transfers/base.py

@@ -22,8 +22,8 @@ from __future__ import annotations
 from collections.abc import Sequence
 
 from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook
-from airflow.providers.amazon.version_compat import
-from airflow.
+from airflow.providers.amazon.version_compat import NOTSET, ArgNotSet, is_arg_set
+from airflow.providers.common.compat.sdk import BaseOperator
 
 
 class AwsToAwsBaseOperator(BaseOperator):
@@ -55,7 +55,7 @@ class AwsToAwsBaseOperator(BaseOperator):
         self.source_aws_conn_id = source_aws_conn_id
         self.dest_aws_conn_id = dest_aws_conn_id
         self.source_aws_conn_id = source_aws_conn_id
-        if
-            self.dest_aws_conn_id = self.source_aws_conn_id
-        else:
+        if is_arg_set(dest_aws_conn_id):
             self.dest_aws_conn_id = dest_aws_conn_id
+        else:
+            self.dest_aws_conn_id = self.source_aws_conn_id
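The AwsToAwsBaseOperator change above replaces the old sentinel check with is_arg_set from version_compat, so an unset dest_aws_conn_id falls back to the source connection while an explicit value (even None) is respected. A minimal standalone sketch of that sentinel pattern, using simplified stand-ins rather than the provider's actual implementation:

    class _ArgNotSet:
        """Stand-in sentinel type meaning 'argument was never passed'."""

    _NOTSET = _ArgNotSet()

    def _is_arg_set(value) -> bool:
        # Distinguishes 'not passed at all' from explicitly passing None or "".
        return not isinstance(value, _ArgNotSet)

    def resolve_dest_conn_id(source_conn_id, dest_conn_id=_NOTSET):
        # Mirrors the operator: fall back to the source connection only when the
        # destination connection was not supplied.
        return dest_conn_id if _is_arg_set(dest_conn_id) else source_conn_id

    print(resolve_dest_conn_id("aws_src"))              # aws_src (fallback)
    print(resolve_dest_conn_id("aws_src", None))        # None (explicitly cleared)
    print(resolve_dest_conn_id("aws_src", "aws_dst"))   # aws_dst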
airflow/providers/amazon/aws/transfers/dynamodb_to_s3.py

@@ -36,8 +36,8 @@ from airflow.providers.amazon.aws.transfers.base import AwsToAwsBaseOperator
 from airflow.utils.helpers import prune_dict
 
 if TYPE_CHECKING:
-    from airflow.
-    from airflow.
+    from airflow.providers.amazon.version_compat import ArgNotSet
+    from airflow.sdk import Context
 
 
 class JSONEncoder(json.JSONEncoder):
@@ -216,9 +216,9 @@ class DynamoDBToS3Operator(AwsToAwsBaseOperator):
         scan_kwargs = copy(self.dynamodb_scan_kwargs) if self.dynamodb_scan_kwargs else {}
         err = None
         f: IO[Any]
-        with NamedTemporaryFile() as
+        with NamedTemporaryFile() as f_tmp:
             try:
-                f = self._scan_dynamodb_and_upload_to_s3(
+                f = self._scan_dynamodb_and_upload_to_s3(f_tmp, scan_kwargs, table)
             except Exception as e:
                 err = e
                 raise e
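The rename to f_tmp above avoids shadowing: the scan helper may rotate to a fresh temporary file once its upload-size threshold is reached, so the handle it returns is not necessarily the one opened by the with block. A hedged, self-contained sketch of that rotation pattern (function name and threshold are illustrative, not the provider's code):

    from tempfile import NamedTemporaryFile

    def scan_and_spool(f, chunks, max_bytes=64):
        for chunk in chunks:
            if f.tell() >= max_bytes:
                f.close()                 # pretend the full spool was uploaded to S3
                f = NamedTemporaryFile()  # rotate to a new temporary file
            f.write(chunk)
        return f                          # may differ from the file we started with

    with NamedTemporaryFile() as f_tmp:
        f = scan_and_spool(f_tmp, [b"x" * 40, b"y" * 40, b"z" * 40])
        print(f is f_tmp)                 # False once a rotation has happened
        f.close()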
airflow/providers/amazon/aws/transfers/exasol_to_s3.py

@@ -24,7 +24,7 @@ from tempfile import NamedTemporaryFile
 from typing import TYPE_CHECKING
 
 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.
+from airflow.providers.common.compat.sdk import BaseOperator
 from airflow.providers.exasol.hooks.exasol import ExasolHook
 
 if TYPE_CHECKING:
airflow/providers/amazon/aws/transfers/ftp_to_s3.py

@@ -22,7 +22,7 @@ from tempfile import NamedTemporaryFile
 from typing import TYPE_CHECKING
 
 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.
+from airflow.providers.common.compat.sdk import BaseOperator
 from airflow.providers.ftp.hooks.ftp import FTPHook
 
 if TYPE_CHECKING:
airflow/providers/amazon/aws/transfers/gcs_to_s3.py

@@ -27,7 +27,7 @@ from packaging.version import Version
 
 from airflow.exceptions import AirflowException
 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.
+from airflow.providers.common.compat.sdk import BaseOperator
 from airflow.providers.google.cloud.hooks.gcs import GCSHook
 
 if TYPE_CHECKING:
@@ -39,6 +39,11 @@ class GCSToS3Operator(BaseOperator):
     """
     Synchronizes a Google Cloud Storage bucket with an S3 bucket.
 
+    .. note::
+        When flatten_structure=True, it takes precedence over keep_directory_structure.
+        For example, with flatten_structure=True, "folder/subfolder/file.txt" becomes "file.txt"
+        regardless of the keep_directory_structure setting.
+
     .. seealso::
         For more information on how to use this operator, take a look at the guide:
         :ref:`howto/operator:GCSToS3Operator`
@@ -79,6 +84,9 @@ class GCSToS3Operator(BaseOperator):
         object to be uploaded in S3
     :param keep_directory_structure: (Optional) When set to False the path of the file
         on the bucket is recreated within path passed in dest_s3_key.
+    :param flatten_structure: (Optional) When set to True, places all files directly
+        in the dest_s3_key directory without preserving subdirectory structure.
+        Takes precedence over keep_directory_structure when enabled.
     :param match_glob: (Optional) filters objects based on the glob pattern given by the string
         (e.g, ``'**/*/.json'``)
     :param gcp_user_project: (Optional) The identifier of the Google Cloud project to bill for this request.
@@ -108,6 +116,7 @@ class GCSToS3Operator(BaseOperator):
         dest_s3_extra_args: dict | None = None,
         s3_acl_policy: str | None = None,
         keep_directory_structure: bool = True,
+        flatten_structure: bool = False,
         match_glob: str | None = None,
         gcp_user_project: str | None = None,
         **kwargs,
@@ -124,6 +133,10 @@ class GCSToS3Operator(BaseOperator):
         self.dest_s3_extra_args = dest_s3_extra_args or {}
         self.s3_acl_policy = s3_acl_policy
         self.keep_directory_structure = keep_directory_structure
+        self.flatten_structure = flatten_structure
+
+        if self.flatten_structure and self.keep_directory_structure:
+            self.log.warning("flatten_structure=True takes precedence over keep_directory_structure=True")
         try:
             from airflow.providers.google import __version__ as _GOOGLE_PROVIDER_VERSION
 
@@ -140,6 +153,17 @@ class GCSToS3Operator(BaseOperator):
         self.match_glob = match_glob
         self.gcp_user_project = gcp_user_project
 
+    def _transform_file_path(self, file_path: str) -> str:
+        """
+        Transform the GCS file path according to the specified options.
+
+        :param file_path: The original GCS file path
+        :return: The transformed file path for S3 destination
+        """
+        if self.flatten_structure:
+            return os.path.basename(file_path)
+        return file_path
+
     def execute(self, context: Context) -> list[str]:
         # list all files in an Google Cloud Storage bucket
         gcs_hook = GCSHook(
@@ -167,7 +191,7 @@ class GCSToS3Operator(BaseOperator):
             aws_conn_id=self.dest_aws_conn_id, verify=self.dest_verify, extra_args=self.dest_s3_extra_args
         )
 
-        if not self.keep_directory_structure and self.prefix:
+        if not self.keep_directory_structure and self.prefix and not self.flatten_structure:
             self.dest_s3_key = os.path.join(self.dest_s3_key, self.prefix)
 
         if not self.replace:
@@ -187,15 +211,34 @@ class GCSToS3Operator(BaseOperator):
             existing_files = existing_files or []
             # remove the prefix for the existing files to allow the match
             existing_files = [file.replace(prefix, "", 1) for file in existing_files]
-
+
+            # Transform GCS files for comparison and filter out existing ones
+            existing_files_set = set(existing_files)
+            filtered_files = []
+            seen_transformed = set()
+
+            for file in gcs_files:
+                transformed = self._transform_file_path(file)
+                if transformed not in existing_files_set and transformed not in seen_transformed:
+                    filtered_files.append(file)
+                    seen_transformed.add(transformed)
+                elif transformed in seen_transformed:
+                    self.log.warning(
+                        "Skipping duplicate file %s (transforms to %s)",
+                        file,
+                        transformed,
+                    )
+
+            gcs_files = filtered_files
 
         if gcs_files:
             for file in gcs_files:
                 with gcs_hook.provide_file(
                     object_name=file, bucket_name=str(self.gcs_bucket), user_project=self.gcp_user_project
                 ) as local_tmp_file:
-
-
+                    transformed_path = self._transform_file_path(file)
+                    dest_key = os.path.join(self.dest_s3_key, transformed_path)
+                    self.log.info("Saving file from %s to %s", file, dest_key)
                     s3_hook.load_file(
                         filename=local_tmp_file.name,
                         key=dest_key,
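The flatten_structure additions above reduce every GCS object name to its basename and then de-duplicate against keys already present in S3. A standalone sketch of that filtering step, with made-up file names and without the operator or hook plumbing:

    import os

    def transform(path: str, flatten: bool) -> str:
        # flatten_structure=True keeps only the basename, like _transform_file_path.
        return os.path.basename(path) if flatten else path

    def filter_new_files(gcs_files, existing_s3_keys, flatten=True):
        existing = set(existing_s3_keys)
        seen, selected = set(), []
        for f in gcs_files:
            t = transform(f, flatten)
            if t not in existing and t not in seen:
                selected.append(f)
                seen.add(t)
        return selected

    files = ["a/x.csv", "b/x.csv", "b/y.csv"]
    print(filter_new_files(files, existing_s3_keys=["y.csv"]))
    # ['a/x.csv']  -- b/x.csv collapses to the same basename, y.csv already exists in S3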
airflow/providers/amazon/aws/transfers/glacier_to_gcs.py

@@ -22,7 +22,7 @@ from collections.abc import Sequence
 from typing import TYPE_CHECKING
 
 from airflow.providers.amazon.aws.hooks.glacier import GlacierHook
-from airflow.providers.
+from airflow.providers.common.compat.sdk import BaseOperator
 from airflow.providers.google.cloud.hooks.gcs import GCSHook
 
 if TYPE_CHECKING:
airflow/providers/amazon/aws/transfers/google_api_to_s3.py

@@ -26,14 +26,11 @@ from typing import TYPE_CHECKING
 
 from airflow.models.xcom import XCOM_RETURN_KEY
 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.
+from airflow.providers.common.compat.sdk import BaseOperator
 from airflow.providers.google.common.hooks.discovery_api import GoogleDiscoveryApiHook
 
 if TYPE_CHECKING:
-
-        from airflow.sdk.types import RuntimeTaskInstanceProtocol
-    except ImportError:
-        from airflow.models import TaskInstance as RuntimeTaskInstanceProtocol  # type: ignore[assignment]
+    from airflow.providers.common.compat.sdk import RuntimeTaskInstanceProtocol
     from airflow.utils.context import Context
 
 # MAX XCOM Size is 48KB
airflow/providers/amazon/aws/transfers/hive_to_dynamodb.py

@@ -24,8 +24,8 @@ from collections.abc import Callable, Sequence
 from typing import TYPE_CHECKING, Literal
 
 from airflow.providers.amazon.aws.hooks.dynamodb import DynamoDBHook
-from airflow.providers.amazon.version_compat import BaseOperator
 from airflow.providers.apache.hive.hooks.hive import HiveServer2Hook
+from airflow.providers.common.compat.sdk import BaseOperator
 
 if TYPE_CHECKING:
     from airflow.utils.context import Context
airflow/providers/amazon/aws/transfers/http_to_s3.py

@@ -23,7 +23,7 @@ from functools import cached_property
 from typing import TYPE_CHECKING, Any
 
 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.
+from airflow.providers.common.compat.sdk import BaseOperator
 from airflow.providers.http.hooks.http import HttpHook
 
 if TYPE_CHECKING:
airflow/providers/amazon/aws/transfers/imap_attachment_to_s3.py

@@ -23,7 +23,7 @@ from collections.abc import Sequence
 from typing import TYPE_CHECKING
 
 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.
+from airflow.providers.common.compat.sdk import BaseOperator
 from airflow.providers.imap.hooks.imap import ImapHook
 
 if TYPE_CHECKING:
airflow/providers/amazon/aws/transfers/local_to_s3.py

@@ -21,7 +21,7 @@ from collections.abc import Sequence
 from typing import TYPE_CHECKING
 
 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.
+from airflow.providers.common.compat.sdk import BaseOperator
 
 if TYPE_CHECKING:
     from airflow.utils.context import Context
airflow/providers/amazon/aws/transfers/mongo_to_s3.py

@@ -24,7 +24,7 @@ from typing import TYPE_CHECKING, Any, cast
 from bson import json_util
 
 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.
+from airflow.providers.common.compat.sdk import BaseOperator
 from airflow.providers.mongo.hooks.mongo import MongoHook
 
 if TYPE_CHECKING:
airflow/providers/amazon/aws/transfers/redshift_to_s3.py

@@ -28,8 +28,8 @@ from airflow.providers.amazon.aws.hooks.redshift_data import RedshiftDataHook
 from airflow.providers.amazon.aws.hooks.redshift_sql import RedshiftSQLHook
 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
 from airflow.providers.amazon.aws.utils.redshift import build_credentials_block
-from airflow.providers.amazon.version_compat import
-from airflow.
+from airflow.providers.amazon.version_compat import NOTSET, ArgNotSet, is_arg_set
+from airflow.providers.common.compat.sdk import BaseOperator
 
 if TYPE_CHECKING:
     from airflow.utils.context import Context
@@ -131,12 +131,12 @@ class RedshiftToS3Operator(BaseOperator):
         # actually provide a connection note that, because we don't want to let the exception bubble up in
         # that case (since we're silently injecting a connection on their behalf).
         self._aws_conn_id: str | None
-        if
-            self.conn_set = False
-            self._aws_conn_id = "aws_default"
-        else:
+        if is_arg_set(aws_conn_id):
             self.conn_set = True
             self._aws_conn_id = aws_conn_id
+        else:
+            self.conn_set = False
+            self._aws_conn_id = "aws_default"
 
     def _build_unload_query(
         self, credentials_block: str, select_query: str, s3_key: str, unload_options: str
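The constructor change above lets RedshiftToS3Operator tell "aws_conn_id was never passed" apart from an explicit value: when it is left unset, conn_set stays False and the operator quietly falls back to aws_default so a missing default connection does not bubble up. A hedged usage sketch; the bucket, table, and connection ids are placeholders:

    from airflow.providers.amazon.aws.transfers.redshift_to_s3 import RedshiftToS3Operator

    unload_orders = RedshiftToS3Operator(
        task_id="unload_orders",
        s3_bucket="example-bucket",            # placeholder
        s3_key="exports/orders",               # placeholder
        schema="public",
        table="orders",
        redshift_conn_id="redshift_default",
        # aws_conn_id deliberately not passed: conn_set stays False and the operator
        # falls back to "aws_default", tolerating that connection being absent.
        unload_options=["FORMAT AS PARQUET"],
    )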
airflow/providers/amazon/aws/transfers/s3_to_dynamodb.py

@@ -24,7 +24,7 @@ from botocore.exceptions import ClientError, WaiterError
 
 from airflow.exceptions import AirflowException
 from airflow.providers.amazon.aws.hooks.dynamodb import DynamoDBHook
-from airflow.providers.
+from airflow.providers.common.compat.sdk import BaseOperator
 
 if TYPE_CHECKING:
     from airflow.utils.context import Context
airflow/providers/amazon/aws/transfers/s3_to_ftp.py

@@ -22,7 +22,7 @@ from tempfile import NamedTemporaryFile
 from typing import TYPE_CHECKING
 
 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.
+from airflow.providers.common.compat.sdk import BaseOperator
 from airflow.providers.ftp.hooks.ftp import FTPHook
 
 if TYPE_CHECKING:
airflow/providers/amazon/aws/transfers/s3_to_redshift.py

@@ -24,8 +24,8 @@ from airflow.providers.amazon.aws.hooks.redshift_data import RedshiftDataHook
 from airflow.providers.amazon.aws.hooks.redshift_sql import RedshiftSQLHook
 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
 from airflow.providers.amazon.aws.utils.redshift import build_credentials_block
-from airflow.providers.amazon.version_compat import
-from airflow.
+from airflow.providers.amazon.version_compat import NOTSET, ArgNotSet, is_arg_set
+from airflow.providers.common.compat.sdk import BaseOperator
 
 if TYPE_CHECKING:
     from airflow.utils.context import Context
@@ -122,12 +122,12 @@ class S3ToRedshiftOperator(BaseOperator):
         # actually provide a connection note that, because we don't want to let the exception bubble up in
         # that case (since we're silently injecting a connection on their behalf).
         self._aws_conn_id: str | None
-        if
-            self.conn_set = False
-            self._aws_conn_id = "aws_default"
-        else:
+        if is_arg_set(aws_conn_id):
             self.conn_set = True
             self._aws_conn_id = aws_conn_id
+        else:
+            self.conn_set = False
+            self._aws_conn_id = "aws_default"
 
         if self.redshift_data_api_kwargs:
             for arg in ["sql", "parameters"]:
airflow/providers/amazon/aws/transfers/s3_to_sftp.py

@@ -23,7 +23,7 @@ from typing import TYPE_CHECKING
 from urllib.parse import urlsplit
 
 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.
+from airflow.providers.common.compat.sdk import BaseOperator
 from airflow.providers.ssh.hooks.ssh import SSHHook
 
 if TYPE_CHECKING:
airflow/providers/amazon/aws/transfers/s3_to_sql.py

@@ -23,7 +23,7 @@ from typing import TYPE_CHECKING
 
 from airflow.exceptions import AirflowException
 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.
+from airflow.providers.common.compat.sdk import BaseHook, BaseOperator
 
 if TYPE_CHECKING:
     from airflow.utils.context import Context
airflow/providers/amazon/aws/transfers/salesforce_to_s3.py

@@ -22,7 +22,7 @@ from collections.abc import Sequence
 from typing import TYPE_CHECKING
 
 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.
+from airflow.providers.common.compat.sdk import BaseOperator
 from airflow.providers.salesforce.hooks.salesforce import SalesforceHook
 
 if TYPE_CHECKING:
airflow/providers/amazon/aws/transfers/sftp_to_s3.py

@@ -23,7 +23,7 @@ from typing import TYPE_CHECKING
 from urllib.parse import urlsplit
 
 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.
+from airflow.providers.common.compat.sdk import BaseOperator
 from airflow.providers.ssh.hooks.ssh import SSHHook
 
 if TYPE_CHECKING:
airflow/providers/amazon/aws/transfers/sql_to_s3.py

@@ -27,7 +27,7 @@ from typing import TYPE_CHECKING, Any, Literal, cast
 
 from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning
 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.
+from airflow.providers.common.compat.sdk import BaseHook, BaseOperator
 
 if TYPE_CHECKING:
     import pandas as pd
@@ -304,12 +304,11 @@ class SqlToS3Operator(BaseOperator):
                     group_df.reset_index(drop=True),
                 )
         elif isinstance(df, pl.DataFrame):
-            for group_label,
-                if random_column_name
-                    group_df = group_df.drop(random_column_name)
+            for group_label, group_df_in in df.group_by(**self.groupby_kwargs):  # type: ignore[assignment]
+                group_df2 = group_df_in.drop(random_column_name) if random_column_name else group_df_in
                 yield (
                     cast("str", group_label[0] if isinstance(group_label, tuple) else group_label),
-
+                    group_df2,
                 )
 
     def _get_hook(self) -> DbApiHook:
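The polars branch above iterates df.group_by(...) and drops the helper grouping column before yielding each frame; recent polars releases return the group label as a tuple, hence the isinstance check. A minimal polars sketch of that pattern (assumes polars is installed; the column names are made up):

    import polars as pl

    df = pl.DataFrame({"city": ["A", "A", "B"], "value": [1, 2, 3]})
    # Helper column standing in for the operator's random_column_name.
    df = df.with_columns(pl.lit(0).alias("_grp_helper"))

    for group_label, group_df in df.group_by("city", maintain_order=True):
        label = group_label[0] if isinstance(group_label, tuple) else group_label
        group_df = group_df.drop("_grp_helper")   # drop the helper before handing the frame on
        print(label, group_df.to_dicts())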
airflow/providers/amazon/aws/triggers/bedrock.py

@@ -20,7 +20,7 @@ from typing import TYPE_CHECKING
 
 from airflow.providers.amazon.aws.hooks.bedrock import BedrockAgentHook, BedrockHook
 from airflow.providers.amazon.aws.triggers.base import AwsBaseWaiterTrigger
-from airflow.
+from airflow.providers.amazon.version_compat import NOTSET, ArgNotSet
 
 if TYPE_CHECKING:
     from airflow.providers.amazon.aws.hooks.base_aws import AwsGenericHook