apache-airflow-providers-amazon 9.14.0__py3-none-any.whl → 9.18.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. airflow/providers/amazon/__init__.py +3 -3
  2. airflow/providers/amazon/aws/auth_manager/aws_auth_manager.py +106 -5
  3. airflow/providers/amazon/aws/auth_manager/routes/login.py +7 -1
  4. airflow/providers/amazon/aws/executors/aws_lambda/docker/app.py +5 -1
  5. airflow/providers/amazon/aws/executors/aws_lambda/lambda_executor.py +1 -1
  6. airflow/providers/amazon/aws/hooks/athena.py +6 -2
  7. airflow/providers/amazon/aws/hooks/athena_sql.py +2 -2
  8. airflow/providers/amazon/aws/hooks/base_aws.py +2 -2
  9. airflow/providers/amazon/aws/hooks/batch_client.py +4 -6
  10. airflow/providers/amazon/aws/hooks/batch_waiters.py +0 -1
  11. airflow/providers/amazon/aws/hooks/chime.py +1 -1
  12. airflow/providers/amazon/aws/hooks/datasync.py +3 -3
  13. airflow/providers/amazon/aws/hooks/firehose.py +56 -0
  14. airflow/providers/amazon/aws/hooks/glue.py +7 -1
  15. airflow/providers/amazon/aws/hooks/kinesis.py +31 -13
  16. airflow/providers/amazon/aws/hooks/mwaa.py +38 -7
  17. airflow/providers/amazon/aws/hooks/redshift_sql.py +20 -6
  18. airflow/providers/amazon/aws/hooks/s3.py +41 -11
  19. airflow/providers/amazon/aws/hooks/sagemaker_unified_studio.py +1 -1
  20. airflow/providers/amazon/aws/hooks/ses.py +76 -10
  21. airflow/providers/amazon/aws/hooks/sns.py +74 -18
  22. airflow/providers/amazon/aws/hooks/sqs.py +64 -11
  23. airflow/providers/amazon/aws/hooks/ssm.py +34 -6
  24. airflow/providers/amazon/aws/hooks/step_function.py +1 -1
  25. airflow/providers/amazon/aws/links/base_aws.py +1 -1
  26. airflow/providers/amazon/aws/notifications/ses.py +139 -0
  27. airflow/providers/amazon/aws/notifications/sns.py +16 -1
  28. airflow/providers/amazon/aws/notifications/sqs.py +17 -1
  29. airflow/providers/amazon/aws/operators/base_aws.py +2 -2
  30. airflow/providers/amazon/aws/operators/bedrock.py +2 -0
  31. airflow/providers/amazon/aws/operators/cloud_formation.py +2 -2
  32. airflow/providers/amazon/aws/operators/datasync.py +2 -1
  33. airflow/providers/amazon/aws/operators/emr.py +44 -33
  34. airflow/providers/amazon/aws/operators/mwaa.py +12 -3
  35. airflow/providers/amazon/aws/operators/sagemaker_unified_studio.py +1 -1
  36. airflow/providers/amazon/aws/operators/ssm.py +122 -17
  37. airflow/providers/amazon/aws/secrets/secrets_manager.py +3 -4
  38. airflow/providers/amazon/aws/sensors/base_aws.py +2 -2
  39. airflow/providers/amazon/aws/sensors/mwaa.py +14 -1
  40. airflow/providers/amazon/aws/sensors/s3.py +27 -13
  41. airflow/providers/amazon/aws/sensors/sagemaker_unified_studio.py +1 -1
  42. airflow/providers/amazon/aws/sensors/ssm.py +33 -17
  43. airflow/providers/amazon/aws/transfers/azure_blob_to_s3.py +3 -3
  44. airflow/providers/amazon/aws/transfers/base.py +5 -5
  45. airflow/providers/amazon/aws/transfers/dynamodb_to_s3.py +4 -4
  46. airflow/providers/amazon/aws/transfers/exasol_to_s3.py +1 -1
  47. airflow/providers/amazon/aws/transfers/ftp_to_s3.py +1 -1
  48. airflow/providers/amazon/aws/transfers/gcs_to_s3.py +48 -5
  49. airflow/providers/amazon/aws/transfers/glacier_to_gcs.py +1 -1
  50. airflow/providers/amazon/aws/transfers/google_api_to_s3.py +2 -5
  51. airflow/providers/amazon/aws/transfers/hive_to_dynamodb.py +1 -1
  52. airflow/providers/amazon/aws/transfers/http_to_s3.py +1 -1
  53. airflow/providers/amazon/aws/transfers/imap_attachment_to_s3.py +1 -1
  54. airflow/providers/amazon/aws/transfers/local_to_s3.py +1 -1
  55. airflow/providers/amazon/aws/transfers/mongo_to_s3.py +1 -1
  56. airflow/providers/amazon/aws/transfers/redshift_to_s3.py +6 -6
  57. airflow/providers/amazon/aws/transfers/s3_to_dynamodb.py +1 -1
  58. airflow/providers/amazon/aws/transfers/s3_to_ftp.py +1 -1
  59. airflow/providers/amazon/aws/transfers/s3_to_redshift.py +6 -6
  60. airflow/providers/amazon/aws/transfers/s3_to_sftp.py +1 -1
  61. airflow/providers/amazon/aws/transfers/s3_to_sql.py +1 -1
  62. airflow/providers/amazon/aws/transfers/salesforce_to_s3.py +1 -1
  63. airflow/providers/amazon/aws/transfers/sftp_to_s3.py +1 -1
  64. airflow/providers/amazon/aws/transfers/sql_to_s3.py +4 -5
  65. airflow/providers/amazon/aws/triggers/bedrock.py +1 -1
  66. airflow/providers/amazon/aws/triggers/s3.py +29 -2
  67. airflow/providers/amazon/aws/triggers/ssm.py +17 -1
  68. airflow/providers/amazon/aws/utils/connection_wrapper.py +2 -5
  69. airflow/providers/amazon/aws/utils/mixins.py +1 -1
  70. airflow/providers/amazon/aws/utils/waiter.py +2 -2
  71. airflow/providers/amazon/aws/waiters/emr.json +6 -6
  72. airflow/providers/amazon/get_provider_info.py +19 -1
  73. airflow/providers/amazon/version_compat.py +19 -16
  74. {apache_airflow_providers_amazon-9.14.0.dist-info → apache_airflow_providers_amazon-9.18.0rc2.dist-info}/METADATA +25 -19
  75. {apache_airflow_providers_amazon-9.14.0.dist-info → apache_airflow_providers_amazon-9.18.0rc2.dist-info}/RECORD +79 -76
  76. apache_airflow_providers_amazon-9.18.0rc2.dist-info/licenses/NOTICE +5 -0
  77. {apache_airflow_providers_amazon-9.14.0.dist-info → apache_airflow_providers_amazon-9.18.0rc2.dist-info}/WHEEL +0 -0
  78. {apache_airflow_providers_amazon-9.14.0.dist-info → apache_airflow_providers_amazon-9.18.0rc2.dist-info}/entry_points.txt +0 -0
  79. {airflow/providers/amazon → apache_airflow_providers_amazon-9.18.0rc2.dist-info/licenses}/LICENSE +0 -0
@@ -36,7 +36,7 @@ from airflow.providers.amazon.aws.hooks.s3 import S3Hook
 from airflow.providers.amazon.aws.sensors.base_aws import AwsBaseSensor
 from airflow.providers.amazon.aws.triggers.s3 import S3KeysUnchangedTrigger, S3KeyTrigger
 from airflow.providers.amazon.aws.utils.mixins import aws_template_fields
-from airflow.sensors.base import poke_mode_only
+from airflow.providers.common.compat.sdk import poke_mode_only


 class S3KeySensor(AwsBaseSensor[S3Hook]):
@@ -122,8 +122,19 @@ class S3KeySensor(AwsBaseSensor[S3Hook]):
         """
         if self.wildcard_match:
             prefix = re.split(r"[\[*?]", key, 1)[0]
-            keys = self.hook.get_file_metadata(prefix, bucket_name)
-            key_matches = [k for k in keys if fnmatch.fnmatch(k["Key"], key)]
+
+            key_matches: list[str] = []
+
+            # If check_fn is None, then we can return True without having to iterate through each value
+            # yielded by iter_file_metadata. Otherwise, we'll check for a match, and add all matches to the
+            # key_matches list
+            for k in self.hook.iter_file_metadata(prefix, bucket_name):
+                if fnmatch.fnmatch(k["Key"], key):
+                    if self.check_fn is None:
+                        # This will only wait for a single match, and will immediately return
+                        return True
+                    key_matches.append(k)
+
             if not key_matches:
                 return False

@@ -132,21 +143,23 @@ class S3KeySensor(AwsBaseSensor[S3Hook]):
             for f in key_matches:
                 metadata = {}
                 if "*" in self.metadata_keys:
-                    metadata = self.hook.head_object(f["Key"], bucket_name)
+                    metadata = self.hook.head_object(f["Key"], bucket_name)  # type: ignore[index]
                 else:
-                    for key in self.metadata_keys:
+                    for mk in self.metadata_keys:
                         try:
-                            metadata[key] = f[key]
+                            metadata[mk] = f[mk]  # type: ignore[index]
                         except KeyError:
                             # supplied key might be from head_object response
-                            self.log.info("Key %s not found in response, performing head_object", key)
-                            metadata[key] = self.hook.head_object(f["Key"], bucket_name).get(key, None)
+                            self.log.info("Key %s not found in response, performing head_object", mk)
+                            metadata[mk] = self.hook.head_object(f["Key"], bucket_name).get(mk, None)  # type: ignore[index]
                 files.append(metadata)
+
             elif self.use_regex:
-                keys = self.hook.get_file_metadata("", bucket_name)
-                key_matches = [k for k in keys if re.match(pattern=key, string=k["Key"])]
-                if not key_matches:
-                    return False
+                for k in self.hook.iter_file_metadata("", bucket_name):
+                    if re.match(pattern=key, string=k["Key"]):
+                        return True
+                return False
+
             else:
                 obj = self.hook.head_object(key, bucket_name)
                 if obj is None:
@@ -202,6 +215,7 @@ class S3KeySensor(AwsBaseSensor[S3Hook]):
                 poke_interval=self.poke_interval,
                 should_check_fn=bool(self.check_fn),
                 use_regex=self.use_regex,
+                metadata_keys=self.metadata_keys,
             ),
             method_name="execute_complete",
         )
@@ -213,7 +227,7 @@ class S3KeySensor(AwsBaseSensor[S3Hook]):
         Relies on trigger to throw an exception, otherwise it assumes execution was successful.
         """
         if event["status"] == "running":
-            found_keys = self.check_fn(event["files"])  # type: ignore[misc]
+            found_keys = self.check_fn(event["files"], **context)  # type: ignore[misc]
             if not found_keys:
                 self._defer()
         elif event["status"] == "error":
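The hunks above show `check_fn` being called with the matched objects' metadata and, in deferrable mode, with the task context as keyword arguments. A minimal usage sketch (not part of the diff; the task id, bucket, and size threshold are invented, and whether a given metadata key such as "Size" appears depends on what metadata_keys requests):

```python
from __future__ import annotations

from airflow.providers.amazon.aws.sensors.s3 import S3KeySensor


def at_least_1_kb(files: list[dict], **context) -> bool:
    # "files" carries one metadata dict per matched key (e.g. "Size" when it is
    # listed in metadata_keys); **context absorbs the task context that the
    # deferrable path now forwards.
    return bool(files) and all(f.get("Size", 0) > 1024 for f in files)


# Inside a DAG definition (hypothetical names):
wait_for_exports = S3KeySensor(
    task_id="wait_for_exports",
    bucket_name="my-example-bucket",
    bucket_key="exports/*.csv",
    wildcard_match=True,
    metadata_keys=["Size"],
    check_fn=at_least_1_kb,
    deferrable=True,
)
```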
@@ -25,7 +25,7 @@ from airflow.exceptions import AirflowException
 from airflow.providers.amazon.aws.hooks.sagemaker_unified_studio import (
     SageMakerNotebookHook,
 )
-from airflow.providers.amazon.version_compat import BaseSensorOperator
+from airflow.providers.common.compat.sdk import BaseSensorOperator

 if TYPE_CHECKING:
     from airflow.utils.context import Context
@@ -21,7 +21,6 @@ from collections.abc import Sequence
 from typing import TYPE_CHECKING, Any

 from airflow.configuration import conf
-from airflow.exceptions import AirflowException
 from airflow.providers.amazon.aws.hooks.ssm import SsmHook
 from airflow.providers.amazon.aws.sensors.base_aws import AwsBaseSensor
 from airflow.providers.amazon.aws.triggers.ssm import SsmRunCommandTrigger
@@ -34,32 +33,45 @@ if TYPE_CHECKING:

 class SsmRunCommandCompletedSensor(AwsBaseSensor[SsmHook]):
     """
-    Poll the state of an AWS SSM Run Command until all instance jobs reach a terminal state. Fails if any instance job ends in a failed state.
+    Poll the state of an AWS SSM Run Command until completion.
+
+    Waits until all instance jobs reach a terminal state. Fails if any
+    instance job ends in a failed state.

     .. seealso::
-        For more information on how to use this sensor, take a look at the guide:
+        For more information on how to use this sensor, take a look at the
+        guide:
         :ref:`howto/sensor:SsmRunCommandCompletedSensor`

     :param command_id: The ID of the AWS SSM Run Command.
-
-    :param deferrable: If True, the sensor will operate in deferrable mode. This mode requires aiobotocore
-        module to be installed.
-        (default: False, but can be overridden in config file by setting default_deferrable to True)
-    :param poke_interval: Polling period in seconds to check for the status of the job. (default: 120)
-    :param max_retries: Number of times before returning the current state. (default: 75)
+    :param deferrable: If True, the sensor will operate in deferrable mode.
+        This mode requires aiobotocore module to be installed.
+        (default: False, but can be overridden in config file by setting
+        default_deferrable to True)
+    :param poke_interval: Polling period in seconds to check for the status
+        of the job. (default: 120)
+    :param max_retries: Number of times before returning the current state.
+        (default: 75)
     :param aws_conn_id: The Airflow connection used for AWS credentials.
-        If this is ``None`` or empty then the default boto3 behaviour is used. If
-        running Airflow in a distributed manner and aws_conn_id is None or
+        If this is ``None`` or empty then the default boto3 behaviour is used.
+        If running Airflow in a distributed manner and aws_conn_id is None or
         empty, then default boto3 configuration would be used (and must be
        maintained on each worker node).
-    :param region_name: AWS region_name. If not specified then the default boto3 behaviour is used.
+    :param region_name: AWS region_name. If not specified then the default
+        boto3 behaviour is used.
     :param verify: Whether or not to verify SSL certificates. See:
         https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html
-    :param botocore_config: Configuration dictionary (key-values) for botocore client. See:
+    :param botocore_config: Configuration dictionary (key-values) for botocore
+        client. See:
         https://botocore.amazonaws.com/v1/documentation/api/latest/reference/config.html
     """

-    INTERMEDIATE_STATES: tuple[str, ...] = ("Pending", "Delayed", "InProgress", "Cancelling")
+    INTERMEDIATE_STATES: tuple[str, ...] = (
+        "Pending",
+        "Delayed",
+        "InProgress",
+        "Cancelling",
+    )
     FAILURE_STATES: tuple[str, ...] = ("Cancelled", "TimedOut", "Failed")
     SUCCESS_STATES: tuple[str, ...] = ("Success",)
     FAILURE_MESSAGE = "SSM run command sensor failed."
@@ -89,14 +101,18 @@ class SsmRunCommandCompletedSensor(AwsBaseSensor[SsmHook]):
         command_invocations = response.get("CommandInvocations", [])

         if not command_invocations:
-            self.log.info("No command invocations found for command_id=%s yet, waiting...", self.command_id)
+            self.log.info(
+                "No command invocations found for "
+                "command_id=%s yet, waiting...",
+                self.command_id,
+            )
             return False

         for invocation in command_invocations:
             state = invocation["Status"]

             if state in self.FAILURE_STATES:
-                raise AirflowException(self.FAILURE_MESSAGE)
+                raise RuntimeError(self.FAILURE_MESSAGE)

             if state in self.INTERMEDIATE_STATES:
                 return False
@@ -122,6 +138,6 @@ class SsmRunCommandCompletedSensor(AwsBaseSensor[SsmHook]):
         event = validate_execute_complete_event(event)

         if event["status"] != "success":
-            raise AirflowException(f"Error while running run command: {event}")
+            raise RuntimeError(f"Error while running run command: {event}")

         self.log.info("SSM run command `%s` completed.", event["command_id"])
@@ -23,7 +23,7 @@ from collections.abc import Sequence
 from typing import TYPE_CHECKING

 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.amazon.version_compat import BaseOperator
+from airflow.providers.common.compat.sdk import BaseOperator

 try:
     from airflow.providers.microsoft.azure.hooks.wasb import WasbHook
@@ -49,12 +49,12 @@ class AzureBlobStorageToS3Operator(BaseOperator):
     :param prefix: Prefix string which filters objects whose name begin with
         this prefix. (templated)
     :param delimiter: The delimiter by which you want to filter the objects. (templated)
-        For e.g to lists the CSV files from in a directory in GCS you would use
+        For e.g. to lists the CSV files from in a directory in GCS you would use
         delimiter='.csv'.
     :param aws_conn_id: Connection id of the S3 connection to use
     :param dest_s3_key: The base S3 key to be used to store the files. (templated)
     :param dest_verify: Whether or not to verify SSL certificates for S3 connection.
-        By default SSL certificates are verified.
+        By default, SSL certificates are verified.
         You can provide the following values:

         - ``False``: do not validate SSL certificates. SSL will still be used
@@ -22,8 +22,8 @@ from __future__ import annotations
 from collections.abc import Sequence

 from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook
-from airflow.providers.amazon.version_compat import BaseOperator
-from airflow.utils.types import NOTSET, ArgNotSet
+from airflow.providers.amazon.version_compat import NOTSET, ArgNotSet, is_arg_set
+from airflow.providers.common.compat.sdk import BaseOperator


 class AwsToAwsBaseOperator(BaseOperator):
@@ -55,7 +55,7 @@ class AwsToAwsBaseOperator(BaseOperator):
         self.source_aws_conn_id = source_aws_conn_id
         self.dest_aws_conn_id = dest_aws_conn_id
         self.source_aws_conn_id = source_aws_conn_id
-        if isinstance(dest_aws_conn_id, ArgNotSet):
-            self.dest_aws_conn_id = self.source_aws_conn_id
-        else:
+        if is_arg_set(dest_aws_conn_id):
             self.dest_aws_conn_id = dest_aws_conn_id
+        else:
+            self.dest_aws_conn_id = self.source_aws_conn_id
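The `is_arg_set` helper imported above replaces the earlier `isinstance(..., ArgNotSet)` checks. A self-contained sketch of the sentinel pattern it implements (the sentinel and helper below are re-implemented for illustration only, not the provider's actual objects):

```python
from __future__ import annotations


class _ArgNotSet:
    """Sentinel type meaning 'the caller did not pass this argument at all'."""


NOTSET = _ArgNotSet()


def is_arg_set(value: object) -> bool:
    # True when the caller actually supplied a value, including an explicit None.
    return not isinstance(value, _ArgNotSet)


def resolve_dest_conn(
    dest_aws_conn_id: str | None | _ArgNotSet = NOTSET,
    source_aws_conn_id: str | None = "aws_default",
) -> str | None:
    # Mirrors the AwsToAwsBaseOperator logic above: fall back to the source
    # connection only when the destination connection was omitted.
    return dest_aws_conn_id if is_arg_set(dest_aws_conn_id) else source_aws_conn_id


assert resolve_dest_conn() == "aws_default"              # omitted -> falls back
assert resolve_dest_conn(dest_aws_conn_id=None) is None  # explicit None is kept
```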
@@ -36,8 +36,8 @@ from airflow.providers.amazon.aws.transfers.base import AwsToAwsBaseOperator
 from airflow.utils.helpers import prune_dict

 if TYPE_CHECKING:
-    from airflow.utils.context import Context
-    from airflow.utils.types import ArgNotSet
+    from airflow.providers.amazon.version_compat import ArgNotSet
+    from airflow.sdk import Context


 class JSONEncoder(json.JSONEncoder):
@@ -216,9 +216,9 @@ class DynamoDBToS3Operator(AwsToAwsBaseOperator):
         scan_kwargs = copy(self.dynamodb_scan_kwargs) if self.dynamodb_scan_kwargs else {}
         err = None
         f: IO[Any]
-        with NamedTemporaryFile() as f:
+        with NamedTemporaryFile() as f_tmp:
             try:
-                f = self._scan_dynamodb_and_upload_to_s3(f, scan_kwargs, table)
+                f = self._scan_dynamodb_and_upload_to_s3(f_tmp, scan_kwargs, table)
             except Exception as e:
                 err = e
                 raise e
@@ -24,7 +24,7 @@ from tempfile import NamedTemporaryFile
 from typing import TYPE_CHECKING

 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.amazon.version_compat import BaseOperator
+from airflow.providers.common.compat.sdk import BaseOperator
 from airflow.providers.exasol.hooks.exasol import ExasolHook

 if TYPE_CHECKING:
@@ -22,7 +22,7 @@ from tempfile import NamedTemporaryFile
 from typing import TYPE_CHECKING

 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.amazon.version_compat import BaseOperator
+from airflow.providers.common.compat.sdk import BaseOperator
 from airflow.providers.ftp.hooks.ftp import FTPHook

 if TYPE_CHECKING:
@@ -27,7 +27,7 @@ from packaging.version import Version

 from airflow.exceptions import AirflowException
 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.amazon.version_compat import BaseOperator
+from airflow.providers.common.compat.sdk import BaseOperator
 from airflow.providers.google.cloud.hooks.gcs import GCSHook

 if TYPE_CHECKING:
@@ -39,6 +39,11 @@ class GCSToS3Operator(BaseOperator):
     """
     Synchronizes a Google Cloud Storage bucket with an S3 bucket.

+    .. note::
+        When flatten_structure=True, it takes precedence over keep_directory_structure.
+        For example, with flatten_structure=True, "folder/subfolder/file.txt" becomes "file.txt"
+        regardless of the keep_directory_structure setting.
+
     .. seealso::
         For more information on how to use this operator, take a look at the guide:
         :ref:`howto/operator:GCSToS3Operator`
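A minimal usage sketch of the option described in the note above (not part of the diff; bucket names, prefix, and task id are placeholders, and constructor parameters other than flatten_structure are assumed from the operator's existing signature):

```python
from airflow.providers.amazon.aws.transfers.gcs_to_s3 import GCSToS3Operator

# Hypothetical example: copy every object under "exports/" into a single flat
# S3 prefix. With flatten_structure=True, "exports/2024/01/report.csv" would be
# uploaded as "s3://my-bucket/landing/report.csv" (subdirectories dropped).
sync_reports = GCSToS3Operator(
    task_id="sync_reports",
    gcs_bucket="my-gcs-bucket",
    prefix="exports/",
    dest_s3_key="s3://my-bucket/landing/",
    dest_aws_conn_id="aws_default",
    flatten_structure=True,
    replace=False,
)
```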
@@ -79,6 +84,9 @@ class GCSToS3Operator(BaseOperator):
         object to be uploaded in S3
     :param keep_directory_structure: (Optional) When set to False the path of the file
         on the bucket is recreated within path passed in dest_s3_key.
+    :param flatten_structure: (Optional) When set to True, places all files directly
+        in the dest_s3_key directory without preserving subdirectory structure.
+        Takes precedence over keep_directory_structure when enabled.
     :param match_glob: (Optional) filters objects based on the glob pattern given by the string
         (e.g, ``'**/*/.json'``)
     :param gcp_user_project: (Optional) The identifier of the Google Cloud project to bill for this request.
@@ -108,6 +116,7 @@ class GCSToS3Operator(BaseOperator):
         dest_s3_extra_args: dict | None = None,
         s3_acl_policy: str | None = None,
         keep_directory_structure: bool = True,
+        flatten_structure: bool = False,
         match_glob: str | None = None,
         gcp_user_project: str | None = None,
         **kwargs,
@@ -124,6 +133,10 @@ class GCSToS3Operator(BaseOperator):
         self.dest_s3_extra_args = dest_s3_extra_args or {}
         self.s3_acl_policy = s3_acl_policy
         self.keep_directory_structure = keep_directory_structure
+        self.flatten_structure = flatten_structure
+
+        if self.flatten_structure and self.keep_directory_structure:
+            self.log.warning("flatten_structure=True takes precedence over keep_directory_structure=True")
         try:
             from airflow.providers.google import __version__ as _GOOGLE_PROVIDER_VERSION

@@ -140,6 +153,17 @@ class GCSToS3Operator(BaseOperator):
         self.match_glob = match_glob
         self.gcp_user_project = gcp_user_project

+    def _transform_file_path(self, file_path: str) -> str:
+        """
+        Transform the GCS file path according to the specified options.
+
+        :param file_path: The original GCS file path
+        :return: The transformed file path for S3 destination
+        """
+        if self.flatten_structure:
+            return os.path.basename(file_path)
+        return file_path
+
     def execute(self, context: Context) -> list[str]:
         # list all files in an Google Cloud Storage bucket
         gcs_hook = GCSHook(
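A quick worked example of the helper added above (standalone re-statement for illustration; not the provider's code):

```python
import os


def transform(file_path: str, flatten_structure: bool) -> str:
    # Same rule as _transform_file_path: keep only the basename when flattening.
    return os.path.basename(file_path) if flatten_structure else file_path


print(transform("folder/subfolder/file.txt", flatten_structure=True))   # file.txt
print(transform("folder/subfolder/file.txt", flatten_structure=False))  # folder/subfolder/file.txt
```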
@@ -167,7 +191,7 @@
             aws_conn_id=self.dest_aws_conn_id, verify=self.dest_verify, extra_args=self.dest_s3_extra_args
         )

-        if not self.keep_directory_structure and self.prefix:
+        if not self.keep_directory_structure and self.prefix and not self.flatten_structure:
             self.dest_s3_key = os.path.join(self.dest_s3_key, self.prefix)

         if not self.replace:
@@ -187,15 +211,34 @@
             existing_files = existing_files or []
             # remove the prefix for the existing files to allow the match
             existing_files = [file.replace(prefix, "", 1) for file in existing_files]
-            gcs_files = list(set(gcs_files) - set(existing_files))
+
+            # Transform GCS files for comparison and filter out existing ones
+            existing_files_set = set(existing_files)
+            filtered_files = []
+            seen_transformed = set()
+
+            for file in gcs_files:
+                transformed = self._transform_file_path(file)
+                if transformed not in existing_files_set and transformed not in seen_transformed:
+                    filtered_files.append(file)
+                    seen_transformed.add(transformed)
+                elif transformed in seen_transformed:
+                    self.log.warning(
+                        "Skipping duplicate file %s (transforms to %s)",
+                        file,
+                        transformed,
+                    )
+
+            gcs_files = filtered_files

         if gcs_files:
             for file in gcs_files:
                 with gcs_hook.provide_file(
                     object_name=file, bucket_name=str(self.gcs_bucket), user_project=self.gcp_user_project
                 ) as local_tmp_file:
-                    dest_key = os.path.join(self.dest_s3_key, file)
-                    self.log.info("Saving file to %s", dest_key)
+                    transformed_path = self._transform_file_path(file)
+                    dest_key = os.path.join(self.dest_s3_key, transformed_path)
+                    self.log.info("Saving file from %s to %s", file, dest_key)
                     s3_hook.load_file(
                         filename=local_tmp_file.name,
                         key=dest_key,
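When flattening, distinct GCS objects can collapse onto the same S3 key, which is why the block above dedupes on the transformed name. A standalone sketch of that transform-and-dedup step (illustration only; the function and variable names are not the provider's):

```python
import os


def plan_uploads(gcs_files: list[str], existing_keys: set[str], flatten: bool) -> list[str]:
    # Mimic the filtering in the diff: skip objects whose destination key already
    # exists in S3, and skip later objects that flatten to an already-seen key.
    planned, seen = [], set()
    for path in gcs_files:
        key = os.path.basename(path) if flatten else path
        if key not in existing_keys and key not in seen:
            planned.append(path)
            seen.add(key)
    return planned


files = ["exports/a/report.csv", "exports/b/report.csv", "exports/c/summary.csv"]
print(plan_uploads(files, existing_keys=set(), flatten=True))
# -> ['exports/a/report.csv', 'exports/c/summary.csv']  (the second report.csv is skipped)
```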
@@ -22,7 +22,7 @@ from collections.abc import Sequence
 from typing import TYPE_CHECKING

 from airflow.providers.amazon.aws.hooks.glacier import GlacierHook
-from airflow.providers.amazon.version_compat import BaseOperator
+from airflow.providers.common.compat.sdk import BaseOperator
 from airflow.providers.google.cloud.hooks.gcs import GCSHook

 if TYPE_CHECKING:
@@ -26,14 +26,11 @@ from typing import TYPE_CHECKING

 from airflow.models.xcom import XCOM_RETURN_KEY
 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.amazon.version_compat import BaseOperator
+from airflow.providers.common.compat.sdk import BaseOperator
 from airflow.providers.google.common.hooks.discovery_api import GoogleDiscoveryApiHook

 if TYPE_CHECKING:
-    try:
-        from airflow.sdk.types import RuntimeTaskInstanceProtocol
-    except ImportError:
-        from airflow.models import TaskInstance as RuntimeTaskInstanceProtocol  # type: ignore[assignment]
+    from airflow.providers.common.compat.sdk import RuntimeTaskInstanceProtocol
     from airflow.utils.context import Context

 # MAX XCOM Size is 48KB
@@ -24,8 +24,8 @@ from collections.abc import Callable, Sequence
 from typing import TYPE_CHECKING, Literal

 from airflow.providers.amazon.aws.hooks.dynamodb import DynamoDBHook
-from airflow.providers.amazon.version_compat import BaseOperator
 from airflow.providers.apache.hive.hooks.hive import HiveServer2Hook
+from airflow.providers.common.compat.sdk import BaseOperator

 if TYPE_CHECKING:
     from airflow.utils.context import Context
@@ -23,7 +23,7 @@ from functools import cached_property
 from typing import TYPE_CHECKING, Any

 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.amazon.version_compat import BaseOperator
+from airflow.providers.common.compat.sdk import BaseOperator
 from airflow.providers.http.hooks.http import HttpHook

 if TYPE_CHECKING:
@@ -23,7 +23,7 @@ from collections.abc import Sequence
 from typing import TYPE_CHECKING

 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.amazon.version_compat import BaseOperator
+from airflow.providers.common.compat.sdk import BaseOperator
 from airflow.providers.imap.hooks.imap import ImapHook

 if TYPE_CHECKING:
@@ -21,7 +21,7 @@ from collections.abc import Sequence
 from typing import TYPE_CHECKING

 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.amazon.version_compat import BaseOperator
+from airflow.providers.common.compat.sdk import BaseOperator

 if TYPE_CHECKING:
     from airflow.utils.context import Context
@@ -24,7 +24,7 @@ from typing import TYPE_CHECKING, Any, cast
 from bson import json_util

 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.amazon.version_compat import BaseOperator
+from airflow.providers.common.compat.sdk import BaseOperator
 from airflow.providers.mongo.hooks.mongo import MongoHook

 if TYPE_CHECKING:
@@ -28,8 +28,8 @@ from airflow.providers.amazon.aws.hooks.redshift_data import RedshiftDataHook
 from airflow.providers.amazon.aws.hooks.redshift_sql import RedshiftSQLHook
 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
 from airflow.providers.amazon.aws.utils.redshift import build_credentials_block
-from airflow.providers.amazon.version_compat import BaseOperator
-from airflow.utils.types import NOTSET, ArgNotSet
+from airflow.providers.amazon.version_compat import NOTSET, ArgNotSet, is_arg_set
+from airflow.providers.common.compat.sdk import BaseOperator

 if TYPE_CHECKING:
     from airflow.utils.context import Context
@@ -131,12 +131,12 @@ class RedshiftToS3Operator(BaseOperator):
         # actually provide a connection note that, because we don't want to let the exception bubble up in
         # that case (since we're silently injecting a connection on their behalf).
         self._aws_conn_id: str | None
-        if isinstance(aws_conn_id, ArgNotSet):
-            self.conn_set = False
-            self._aws_conn_id = "aws_default"
-        else:
+        if is_arg_set(aws_conn_id):
             self.conn_set = True
             self._aws_conn_id = aws_conn_id
+        else:
+            self.conn_set = False
+            self._aws_conn_id = "aws_default"

     def _build_unload_query(
         self, credentials_block: str, select_query: str, s3_key: str, unload_options: str
@@ -24,7 +24,7 @@ from botocore.exceptions import ClientError, WaiterError

 from airflow.exceptions import AirflowException
 from airflow.providers.amazon.aws.hooks.dynamodb import DynamoDBHook
-from airflow.providers.amazon.version_compat import BaseOperator
+from airflow.providers.common.compat.sdk import BaseOperator

 if TYPE_CHECKING:
     from airflow.utils.context import Context
@@ -22,7 +22,7 @@ from tempfile import NamedTemporaryFile
 from typing import TYPE_CHECKING

 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.amazon.version_compat import BaseOperator
+from airflow.providers.common.compat.sdk import BaseOperator
 from airflow.providers.ftp.hooks.ftp import FTPHook

 if TYPE_CHECKING:
@@ -24,8 +24,8 @@ from airflow.providers.amazon.aws.hooks.redshift_data import RedshiftDataHook
 from airflow.providers.amazon.aws.hooks.redshift_sql import RedshiftSQLHook
 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
 from airflow.providers.amazon.aws.utils.redshift import build_credentials_block
-from airflow.providers.amazon.version_compat import BaseOperator
-from airflow.utils.types import NOTSET, ArgNotSet
+from airflow.providers.amazon.version_compat import NOTSET, ArgNotSet, is_arg_set
+from airflow.providers.common.compat.sdk import BaseOperator

 if TYPE_CHECKING:
     from airflow.utils.context import Context
@@ -122,12 +122,12 @@ class S3ToRedshiftOperator(BaseOperator):
         # actually provide a connection note that, because we don't want to let the exception bubble up in
         # that case (since we're silently injecting a connection on their behalf).
         self._aws_conn_id: str | None
-        if isinstance(aws_conn_id, ArgNotSet):
-            self.conn_set = False
-            self._aws_conn_id = "aws_default"
-        else:
+        if is_arg_set(aws_conn_id):
             self.conn_set = True
             self._aws_conn_id = aws_conn_id
+        else:
+            self.conn_set = False
+            self._aws_conn_id = "aws_default"

         if self.redshift_data_api_kwargs:
             for arg in ["sql", "parameters"]:
@@ -23,7 +23,7 @@ from typing import TYPE_CHECKING
 from urllib.parse import urlsplit

 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.amazon.version_compat import BaseOperator
+from airflow.providers.common.compat.sdk import BaseOperator
 from airflow.providers.ssh.hooks.ssh import SSHHook

 if TYPE_CHECKING:
@@ -23,7 +23,7 @@ from typing import TYPE_CHECKING

 from airflow.exceptions import AirflowException
 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.amazon.version_compat import BaseHook, BaseOperator
+from airflow.providers.common.compat.sdk import BaseHook, BaseOperator

 if TYPE_CHECKING:
     from airflow.utils.context import Context
@@ -22,7 +22,7 @@ from collections.abc import Sequence
 from typing import TYPE_CHECKING

 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.amazon.version_compat import BaseOperator
+from airflow.providers.common.compat.sdk import BaseOperator
 from airflow.providers.salesforce.hooks.salesforce import SalesforceHook

 if TYPE_CHECKING:
@@ -23,7 +23,7 @@ from typing import TYPE_CHECKING
 from urllib.parse import urlsplit

 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.amazon.version_compat import BaseOperator
+from airflow.providers.common.compat.sdk import BaseOperator
 from airflow.providers.ssh.hooks.ssh import SSHHook

 if TYPE_CHECKING:
@@ -27,7 +27,7 @@ from typing import TYPE_CHECKING, Any, Literal, cast

 from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning
 from airflow.providers.amazon.aws.hooks.s3 import S3Hook
-from airflow.providers.amazon.version_compat import BaseHook, BaseOperator
+from airflow.providers.common.compat.sdk import BaseHook, BaseOperator

 if TYPE_CHECKING:
     import pandas as pd
@@ -304,12 +304,11 @@
                     group_df.reset_index(drop=True),
                 )
         elif isinstance(df, pl.DataFrame):
-            for group_label, group_df in df.group_by(**self.groupby_kwargs):  # type: ignore[assignment]
-                if random_column_name:
-                    group_df = group_df.drop(random_column_name)
+            for group_label, group_df_in in df.group_by(**self.groupby_kwargs):  # type: ignore[assignment]
+                group_df2 = group_df_in.drop(random_column_name) if random_column_name else group_df_in
                 yield (
                     cast("str", group_label[0] if isinstance(group_label, tuple) else group_label),
-                    group_df,
+                    group_df2,
                 )

     def _get_hook(self) -> DbApiHook:
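For context on the polars branch above, a small standalone sketch of iterating `DataFrame.group_by` and dropping a helper column (illustration only; the column names are made up):

```python
import polars as pl

df = pl.DataFrame(
    {
        "team": ["a", "a", "b"],
        "value": [1, 2, 3],
        "_rand": [0.1, 0.5, 0.9],  # stand-in for the random helper column dropped above
    }
)

for group_label, group_df in df.group_by("team", maintain_order=True):
    group_df = group_df.drop("_rand")
    # Recent polars versions yield the group key as a tuple, older ones as a scalar,
    # which is why the diff unwraps group_label[0].
    label = group_label[0] if isinstance(group_label, tuple) else group_label
    print(label, group_df.shape)
```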
@@ -20,7 +20,7 @@ from typing import TYPE_CHECKING

 from airflow.providers.amazon.aws.hooks.bedrock import BedrockAgentHook, BedrockHook
 from airflow.providers.amazon.aws.triggers.base import AwsBaseWaiterTrigger
-from airflow.utils.types import NOTSET, ArgNotSet
+from airflow.providers.amazon.version_compat import NOTSET, ArgNotSet

 if TYPE_CHECKING:
     from airflow.providers.amazon.aws.hooks.base_aws import AwsGenericHook