apache-airflow-providers-amazon 9.9.0rc1__py3-none-any.whl → 9.10.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airflow/providers/amazon/__init__.py +1 -1
- airflow/providers/amazon/aws/auth_manager/avp/facade.py +8 -1
- airflow/providers/amazon/aws/auth_manager/aws_auth_manager.py +0 -55
- airflow/providers/amazon/aws/bundles/__init__.py +16 -0
- airflow/providers/amazon/aws/bundles/s3.py +152 -0
- airflow/providers/amazon/aws/executors/batch/batch_executor.py +51 -0
- airflow/providers/amazon/aws/executors/ecs/utils.py +2 -2
- airflow/providers/amazon/aws/executors/utils/exponential_backoff_retry.py +1 -1
- airflow/providers/amazon/aws/fs/s3.py +2 -1
- airflow/providers/amazon/aws/hooks/athena_sql.py +12 -2
- airflow/providers/amazon/aws/hooks/base_aws.py +29 -17
- airflow/providers/amazon/aws/hooks/batch_client.py +2 -1
- airflow/providers/amazon/aws/hooks/batch_waiters.py +2 -1
- airflow/providers/amazon/aws/hooks/chime.py +5 -1
- airflow/providers/amazon/aws/hooks/ec2.py +2 -1
- airflow/providers/amazon/aws/hooks/eks.py +1 -2
- airflow/providers/amazon/aws/hooks/glue.py +82 -7
- airflow/providers/amazon/aws/hooks/rds.py +2 -1
- airflow/providers/amazon/aws/hooks/s3.py +86 -3
- airflow/providers/amazon/aws/hooks/sagemaker.py +2 -2
- airflow/providers/amazon/aws/hooks/sagemaker_unified_studio.py +1 -1
- airflow/providers/amazon/aws/links/base_aws.py +2 -10
- airflow/providers/amazon/aws/operators/base_aws.py +1 -1
- airflow/providers/amazon/aws/operators/batch.py +6 -22
- airflow/providers/amazon/aws/operators/ecs.py +1 -1
- airflow/providers/amazon/aws/operators/glue.py +23 -8
- airflow/providers/amazon/aws/operators/redshift_data.py +1 -1
- airflow/providers/amazon/aws/operators/sagemaker.py +2 -2
- airflow/providers/amazon/aws/operators/sagemaker_unified_studio.py +1 -1
- airflow/providers/amazon/aws/sensors/base_aws.py +1 -1
- airflow/providers/amazon/aws/sensors/glue.py +57 -12
- airflow/providers/amazon/aws/sensors/s3.py +2 -2
- airflow/providers/amazon/aws/sensors/sagemaker_unified_studio.py +1 -1
- airflow/providers/amazon/aws/transfers/azure_blob_to_s3.py +1 -1
- airflow/providers/amazon/aws/transfers/base.py +1 -1
- airflow/providers/amazon/aws/transfers/dynamodb_to_s3.py +2 -2
- airflow/providers/amazon/aws/transfers/exasol_to_s3.py +1 -1
- airflow/providers/amazon/aws/transfers/ftp_to_s3.py +1 -1
- airflow/providers/amazon/aws/transfers/gcs_to_s3.py +1 -1
- airflow/providers/amazon/aws/transfers/glacier_to_gcs.py +1 -1
- airflow/providers/amazon/aws/transfers/google_api_to_s3.py +6 -2
- airflow/providers/amazon/aws/transfers/hive_to_dynamodb.py +3 -3
- airflow/providers/amazon/aws/transfers/http_to_s3.py +1 -1
- airflow/providers/amazon/aws/transfers/imap_attachment_to_s3.py +1 -1
- airflow/providers/amazon/aws/transfers/local_to_s3.py +1 -1
- airflow/providers/amazon/aws/transfers/mongo_to_s3.py +1 -1
- airflow/providers/amazon/aws/transfers/redshift_to_s3.py +1 -1
- airflow/providers/amazon/aws/transfers/s3_to_dynamodb.py +1 -1
- airflow/providers/amazon/aws/transfers/s3_to_ftp.py +1 -1
- airflow/providers/amazon/aws/transfers/s3_to_redshift.py +1 -1
- airflow/providers/amazon/aws/transfers/s3_to_sftp.py +1 -1
- airflow/providers/amazon/aws/transfers/s3_to_sql.py +3 -4
- airflow/providers/amazon/aws/transfers/salesforce_to_s3.py +1 -1
- airflow/providers/amazon/aws/transfers/sftp_to_s3.py +1 -1
- airflow/providers/amazon/aws/transfers/sql_to_s3.py +2 -5
- airflow/providers/amazon/aws/triggers/base.py +0 -1
- airflow/providers/amazon/aws/triggers/glue.py +37 -24
- airflow/providers/amazon/aws/utils/connection_wrapper.py +10 -1
- airflow/providers/amazon/aws/utils/suppress.py +2 -1
- airflow/providers/amazon/aws/utils/waiter.py +1 -1
- airflow/providers/amazon/aws/waiters/glue.json +55 -0
- airflow/providers/amazon/version_compat.py +24 -0
- {apache_airflow_providers_amazon-9.9.0rc1.dist-info → apache_airflow_providers_amazon-9.10.0rc1.dist-info}/METADATA +8 -9
- {apache_airflow_providers_amazon-9.9.0rc1.dist-info → apache_airflow_providers_amazon-9.10.0rc1.dist-info}/RECORD +66 -64
- {apache_airflow_providers_amazon-9.9.0rc1.dist-info → apache_airflow_providers_amazon-9.10.0rc1.dist-info}/WHEEL +0 -0
- {apache_airflow_providers_amazon-9.9.0rc1.dist-info → apache_airflow_providers_amazon-9.10.0rc1.dist-info}/entry_points.txt +0 -0
@@ -24,6 +24,14 @@ from functools import cached_property
 from typing import Any
 
 from botocore.exceptions import ClientError
+from tenacity import (
+    AsyncRetrying,
+    Retrying,
+    before_sleep_log,
+    retry_if_exception,
+    stop_after_attempt,
+    wait_exponential,
+)
 
 from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning
 from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook

@@ -46,11 +54,11 @@ class GlueJobHook(AwsBaseHook):
     :param script_location: path to etl script on s3
     :param retry_limit: Maximum number of times to retry this job if it fails
     :param num_of_dpus: Number of AWS Glue DPUs to allocate to this Job
-    :param region_name: aws region name (example: us-east-1)
     :param iam_role_name: AWS IAM Role for Glue Job Execution. If set `iam_role_arn` must equal None.
     :param iam_role_arn: AWS IAM Role ARN for Glue Job Execution, If set `iam_role_name` must equal None.
     :param create_job_kwargs: Extra arguments for Glue Job Creation
     :param update_config: Update job configuration on Glue (default: False)
+    :param api_retry_args: An optional dictionary with arguments passed to ``tenacity.Retrying`` & ``tenacity.AsyncRetrying`` classes.
 
     Additional arguments (such as ``aws_conn_id``) may be specified and
     are passed down to the underlying AwsBaseHook.

@@ -80,6 +88,7 @@ class GlueJobHook(AwsBaseHook):
         create_job_kwargs: dict | None = None,
         update_config: bool = False,
         job_poll_interval: int | float = 6,
+        api_retry_args: dict[Any, Any] | None = None,
         *args,
         **kwargs,
     ):

@@ -96,6 +105,17 @@ class GlueJobHook(AwsBaseHook):
         self.update_config = update_config
         self.job_poll_interval = job_poll_interval
 
+        self.retry_config: dict[str, Any] = {
+            "retry": retry_if_exception(self._should_retry_on_error),
+            "wait": wait_exponential(multiplier=1, min=1, max=60),
+            "stop": stop_after_attempt(5),
+            "before_sleep": before_sleep_log(self.log, log_level=20),
+            "reraise": True,
+        }
+
+        if api_retry_args:
+            self.retry_config.update(api_retry_args)
+
         worker_type_exists = "WorkerType" in self.create_job_kwargs
         num_workers_exists = "NumberOfWorkers" in self.create_job_kwargs
 
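The defaults above are merged with whatever is passed through the new ``api_retry_args`` argument, so individual knobs can be overridden selectively. A minimal usage sketch (the connection id and job name are placeholders, not taken from this diff):

```python
from tenacity import stop_after_attempt, wait_exponential

from airflow.providers.amazon.aws.hooks.glue import GlueJobHook

# Keys given here are applied via retry_config.update(), replacing only the
# defaults they name (here: stop and wait) and keeping the rest.
hook = GlueJobHook(
    aws_conn_id="aws_default",      # placeholder connection id
    job_name="example-glue-job",    # placeholder job name
    api_retry_args={
        "stop": stop_after_attempt(10),
        "wait": wait_exponential(multiplier=2, min=2, max=120),
    },
)
```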
@@ -116,6 +136,29 @@
         kwargs["client_type"] = "glue"
         super().__init__(*args, **kwargs)
 
+    def _should_retry_on_error(self, exception: BaseException) -> bool:
+        """
+        Determine if an exception should trigger a retry.
+
+        :param exception: The exception that occurred
+        :return: True if the exception should trigger a retry, False otherwise
+        """
+        if isinstance(exception, ClientError):
+            error_code = exception.response.get("Error", {}).get("Code", "")
+            retryable_errors = {
+                "ThrottlingException",
+                "RequestLimitExceeded",
+                "ServiceUnavailable",
+                "InternalFailure",
+                "InternalServerError",
+                "TooManyRequestsException",
+                "RequestTimeout",
+                "RequestTimeoutException",
+                "HttpTimeoutException",
+            }
+            return error_code in retryable_errors
+        return False
+
     def create_glue_job_config(self) -> dict:
         default_command = {
             "Name": "glueetl",
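For readers unfamiliar with tenacity, a small standalone illustration (not part of the provider) of how ``retry_if_exception`` drives loops like the ones below: the predicate receives the raised exception and returns True to retry or False to re-raise immediately.

```python
from botocore.exceptions import ClientError
from tenacity import Retrying, retry_if_exception, stop_after_attempt, wait_fixed


def is_throttling(exc: BaseException) -> bool:
    # Same shape as _should_retry_on_error above, reduced to a single error code.
    return (
        isinstance(exc, ClientError)
        and exc.response.get("Error", {}).get("Code") == "ThrottlingException"
    )


for attempt in Retrying(
    retry=retry_if_exception(is_throttling),
    stop=stop_after_attempt(3),
    wait=wait_fixed(1),
    reraise=True,  # re-raise the last exception instead of wrapping it in RetryError
):
    with attempt:
        pass  # call the AWS API here; a ThrottlingException would be retried twice more
```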
@@ -217,8 +260,21 @@ class GlueJobHook(AwsBaseHook):
         :param run_id: The job-run ID of the predecessor job run
         :return: State of the Glue job
         """
-        job_run = self.conn.get_job_run(JobName=job_name, RunId=run_id, PredecessorsIncluded=True)
-        return job_run["JobRun"]["JobRunState"]
+        for attempt in Retrying(**self.retry_config):
+            with attempt:
+                try:
+                    job_run = self.conn.get_job_run(JobName=job_name, RunId=run_id, PredecessorsIncluded=True)
+                    return job_run["JobRun"]["JobRunState"]
+                except ClientError as e:
+                    self.log.error("Failed to get job state for job %s run %s: %s", job_name, run_id, e)
+                    raise
+                except Exception as e:
+                    self.log.error(
+                        "Unexpected error getting job state for job %s run %s: %s", job_name, run_id, e
+                    )
+                    raise
+        # This should never be reached due to reraise=True, but mypy needs it
+        raise RuntimeError("Unexpected end of retry loop")
 
     async def async_get_job_state(self, job_name: str, run_id: str) -> str:
         """

@@ -226,9 +282,22 @@ class GlueJobHook(AwsBaseHook):
 
         The async version of get_job_state.
         """
-        async with await self.get_async_conn() as client:
-            job_run = await client.get_job_run(JobName=job_name, RunId=run_id)
-            return job_run["JobRun"]["JobRunState"]
+        async for attempt in AsyncRetrying(**self.retry_config):
+            with attempt:
+                try:
+                    async with await self.get_async_conn() as client:
+                        job_run = await client.get_job_run(JobName=job_name, RunId=run_id)
+                        return job_run["JobRun"]["JobRunState"]
+                except ClientError as e:
+                    self.log.error("Failed to get job state for job %s run %s: %s", job_name, run_id, e)
+                    raise
+                except Exception as e:
+                    self.log.error(
+                        "Unexpected error getting job state for job %s run %s: %s", job_name, run_id, e
+                    )
+                    raise
+        # This should never be reached due to reraise=True, but mypy needs it
+        raise RuntimeError("Unexpected end of retry loop")
 
     @cached_property
     def logs_hook(self):

@@ -372,7 +441,7 @@
         )
         return None
 
-    def has_job(self, job_name) -> bool:
+    def has_job(self, job_name: str) -> bool:
         """
         Check if the job already exists.
 

@@ -422,6 +491,9 @@
 
         :return:Name of the Job
         """
+        if self.job_name is None:
+            raise ValueError("job_name must be set to get or create a Glue job")
+
         if self.has_job(self.job_name):
             return self.job_name
 

@@ -441,6 +513,9 @@
 
         :return:Name of the Job
         """
+        if self.job_name is None:
+            raise ValueError("job_name must be set to create or update a Glue job")
+
         config = self.create_glue_job_config()
 
         if self.has_job(self.job_name):

@@ -20,7 +20,8 @@
 from __future__ import annotations
 
 import time
-from typing import TYPE_CHECKING, Callable
+from collections.abc import Callable
+from typing import TYPE_CHECKING
 
 from airflow.exceptions import AirflowException, AirflowNotFoundException
 from airflow.providers.amazon.aws.hooks.base_aws import AwsGenericHook

@@ -28,7 +28,7 @@ import os
 import re
 import shutil
 import time
-from collections.abc import AsyncIterator
+from collections.abc import AsyncIterator, Callable
 from contextlib import suppress
 from copy import deepcopy
 from datetime import datetime

@@ -37,7 +37,7 @@ from inspect import signature
 from io import BytesIO
 from pathlib import Path
 from tempfile import NamedTemporaryFile, gettempdir
-from typing import TYPE_CHECKING, Any, Callable
+from typing import TYPE_CHECKING, Any
 from urllib.parse import urlsplit
 from uuid import uuid4
 
@@ -635,6 +635,10 @@ class S3Hook(AwsBaseHook):
         delimiter: str | None = "/",
     ) -> list[Any]:
         """Get a list of files in the bucket."""
+        # Validate that bucket_keys is in fact a list, otherwise, the characters will be split
+        if isinstance(bucket_keys, str):
+            bucket_keys = [bucket_keys]
+
         keys: list[Any] = []
         for key in bucket_keys:
             prefix = key
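The guard matters because iterating a plain string yields its characters, so a single key passed as a string would otherwise be treated as one prefix per character. A quick illustration:

```python
bucket_keys = "data/"
print([key for key in bucket_keys])    # ['d', 'a', 't', 'a', '/'] -- one bogus prefix per character
print([key for key in ["data/"]])      # ['data/'] -- the intended single prefix
```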
@@ -652,7 +656,9 @@ class S3Hook(AwsBaseHook):
             response = paginator.paginate(**params)
             async for page in response:
                 if "Contents" in page:
-                    keys.extend(
+                    keys.extend(
+                        k.get("Key") for k in page["Contents"] if isinstance(k.get("Size"), (int, float))
+                    )
             return keys
 
     async def _list_keys_async(
@@ -1683,3 +1689,80 @@
         """
         s3_client = self.get_conn()
         s3_client.delete_bucket_tagging(Bucket=bucket_name)
+
+    def _sync_to_local_dir_delete_stale_local_files(self, current_s3_objects: list[Path], local_dir: Path):
+        current_s3_keys = {key for key in current_s3_objects}
+
+        for item in local_dir.iterdir():
+            item: Path  # type: ignore[no-redef]
+            absolute_item_path = item.resolve()
+
+            if absolute_item_path not in current_s3_keys:
+                try:
+                    if item.is_file():
+                        item.unlink(missing_ok=True)
+                        self.log.debug("Deleted stale local file: %s", item)
+                    elif item.is_dir():
+                        # delete only when the folder is empty
+                        if not os.listdir(item):
+                            item.rmdir()
+                            self.log.debug("Deleted stale empty directory: %s", item)
+                    else:
+                        self.log.debug("Skipping stale item of unknown type: %s", item)
+                except OSError as e:
+                    self.log.error("Error deleting stale item %s: %s", item, e)
+                    raise e
+
+    def _sync_to_local_dir_if_changed(self, s3_bucket, s3_object, local_target_path: Path):
+        should_download = False
+        download_msg = ""
+        if not local_target_path.exists():
+            should_download = True
+            download_msg = f"Local file {local_target_path} does not exist."
+        else:
+            local_stats = local_target_path.stat()
+
+            if s3_object.size != local_stats.st_size:
+                should_download = True
+                download_msg = (
+                    f"S3 object size ({s3_object.size}) and local file size ({local_stats.st_size}) differ."
+                )
+
+            s3_last_modified = s3_object.last_modified
+            if local_stats.st_mtime < s3_last_modified.microsecond:
+                should_download = True
+                download_msg = f"S3 object last modified ({s3_last_modified.microsecond}) and local file last modified ({local_stats.st_mtime}) differ."
+
+        if should_download:
+            s3_bucket.download_file(s3_object.key, local_target_path)
+            self.log.debug(
+                "%s Downloaded %s to %s", download_msg, s3_object.key, local_target_path.as_posix()
+            )
+        else:
+            self.log.debug(
+                "Local file %s is up-to-date with S3 object %s. Skipping download.",
+                local_target_path.as_posix(),
+                s3_object.key,
+            )
+
+    def sync_to_local_dir(self, bucket_name: str, local_dir: Path, s3_prefix="", delete_stale: bool = True):
+        """Download S3 files from the S3 bucket to the local directory."""
+        self.log.debug("Downloading data from s3://%s/%s to %s", bucket_name, s3_prefix, local_dir)
+
+        local_s3_objects = []
+        s3_bucket = self.get_bucket(bucket_name)
+        for obj in s3_bucket.objects.filter(Prefix=s3_prefix):
+            obj_path = Path(obj.key)
+            local_target_path = local_dir.joinpath(obj_path.relative_to(s3_prefix))
+            if not local_target_path.parent.exists():
+                local_target_path.parent.mkdir(parents=True, exist_ok=True)
+                self.log.debug("Created local directory: %s", local_target_path.parent)
+            self._sync_to_local_dir_if_changed(
+                s3_bucket=s3_bucket, s3_object=obj, local_target_path=local_target_path
+            )
+            local_s3_objects.append(local_target_path)
+
+        if delete_stale:
+            self._sync_to_local_dir_delete_stale_local_files(
+                current_s3_objects=local_s3_objects, local_dir=local_dir
+            )
@@ -23,10 +23,10 @@ import tarfile
 import tempfile
 import time
 from collections import Counter, namedtuple
-from collections.abc import AsyncGenerator, Generator
+from collections.abc import AsyncGenerator, Callable, Generator
 from datetime import datetime
 from functools import partial
-from typing import Any, Callable, cast
+from typing import Any, cast
 
 from asgiref.sync import sync_to_async
 from botocore.exceptions import ClientError

@@ -25,8 +25,8 @@ from sagemaker_studio import ClientConfig
 from sagemaker_studio.sagemaker_studio_api import SageMakerStudioAPI
 
 from airflow.exceptions import AirflowException
-from airflow.hooks.base import BaseHook
 from airflow.providers.amazon.aws.utils.sagemaker_unified_studio import is_local_runner
+from airflow.providers.amazon.version_compat import BaseHook
 
 
 class SageMakerNotebookHook(BaseHook):

@@ -20,20 +20,13 @@ from __future__ import annotations
 from typing import TYPE_CHECKING, ClassVar
 
 from airflow.providers.amazon.aws.utils.suppress import return_on_error
-from airflow.providers.amazon.version_compat import AIRFLOW_V_3_0_PLUS
+from airflow.providers.amazon.version_compat import BaseOperatorLink, XCom
 
 if TYPE_CHECKING:
     from airflow.models import BaseOperator
     from airflow.models.taskinstancekey import TaskInstanceKey
     from airflow.utils.context import Context
 
-if AIRFLOW_V_3_0_PLUS:
-    from airflow.sdk import BaseOperatorLink
-    from airflow.sdk.execution_time.xcom import XCom
-else:
-    from airflow.models import XCom  # type: ignore[no-redef]
-    from airflow.models.baseoperatorlink import BaseOperatorLink  # type: ignore[no-redef]
-
 
 BASE_AWS_CONSOLE_LINK = "https://console.{aws_domain}"
 
@@ -94,8 +87,7 @@ class BaseAwsLink(BaseOperatorLink):
         if not operator.do_xcom_push:
             return
 
-        operator.xcom_push(
-            context,
+        context["ti"].xcom_push(
             key=cls.key,
             value={
                 "region_name": region_name,
@@ -19,13 +19,13 @@ from __future__ import annotations
 
 from collections.abc import Sequence
 
-from airflow.models import BaseOperator
 from airflow.providers.amazon.aws.utils.mixins import (
     AwsBaseHookMixin,
     AwsHookParams,
     AwsHookType,
     aws_template_fields,
 )
+from airflow.providers.amazon.version_compat import BaseOperator
 from airflow.utils.types import NOTSET, ArgNotSet
 
 
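This import swap, and the matching ones for ``BaseHook``, ``BaseSensorOperator``, ``BaseOperatorLink``, and ``XCom`` elsewhere in the diff, relies on the 24 lines added to ``airflow/providers/amazon/version_compat.py``, whose contents are not shown here. A hypothetical sketch of the usual shim pattern, assuming it mirrors the conditional import removed from ``links/base_aws.py``:

```python
# Hypothetical sketch only -- the real version_compat.py additions are not part of this diff.
from __future__ import annotations

from airflow import __version__ as AIRFLOW_VERSION
from packaging.version import Version

# Compare against the base version so rc/dev suffixes do not affect the check.
AIRFLOW_V_3_0_PLUS = Version(Version(AIRFLOW_VERSION).base_version) >= Version("3.0.0")

if AIRFLOW_V_3_0_PLUS:
    from airflow.sdk import BaseOperator, BaseOperatorLink
    from airflow.sdk.execution_time.xcom import XCom
else:
    from airflow.models import BaseOperator, XCom  # type: ignore[no-redef]
    from airflow.models.baseoperatorlink import BaseOperatorLink  # type: ignore[no-redef]
```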
@@ -140,28 +140,12 @@ class BatchOperator(AwsBaseOperator[BatchClientHook]):
         "retry_strategy": "json",
     }
 
-    @property
-    def operator_extra_links(self):
-        op_extra_links = [BatchJobDetailsLink()]
-
-        if isinstance(self, MappedOperator):
-            wait_for_completion = self.partial_kwargs.get(
-                "wait_for_completion"
-            ) or self.expand_input.value.get("wait_for_completion")
-            array_properties = self.partial_kwargs.get("array_properties") or self.expand_input.value.get(
-                "array_properties"
-            )
-        else:
-            wait_for_completion = self.wait_for_completion
-            array_properties = self.array_properties
-
-        if wait_for_completion:
-            op_extra_links.extend([BatchJobDefinitionLink(), BatchJobQueueLink()])
-        if not array_properties:
-            # There is no CloudWatch Link to the parent Batch Job available.
-            op_extra_links.append(CloudWatchEventsLink())
-
-        return tuple(op_extra_links)
+    operator_extra_links = (
+        BatchJobDetailsLink(),
+        BatchJobDefinitionLink(),
+        BatchJobQueueLink(),
+        CloudWatchEventsLink(),
+    )
 
     def __init__(
         self,
@@ -526,7 +526,7 @@ class EcsRunTaskOperator(EcsBaseOperator):
         self._start_task()
 
         if self.do_xcom_push:
-            self.xcom_push(context, key="ecs_task_arn", value=self.arn)
+            context["ti"].xcom_push(key="ecs_task_arn", value=self.arn)
 
         if self.deferrable:
             self.defer(

@@ -60,7 +60,6 @@ class GlueJobOperator(AwsBaseOperator[GlueJobHook]):
     :param script_args: etl script arguments and AWS Glue arguments (templated)
     :param retry_limit: The maximum number of times to retry this job if it fails
     :param num_of_dpus: Number of AWS Glue DPUs to allocate to this Job.
-    :param region_name: aws region name (example: us-east-1)
     :param s3_bucket: S3 bucket where logs and local etl script will be uploaded
     :param iam_role_name: AWS IAM Role for Glue Job Execution. If set `iam_role_arn` must equal None.
     :param iam_role_arn: AWS IAM ARN for Glue Job Execution. If set `iam_role_name` must equal None.

@@ -78,7 +77,20 @@ class GlueJobOperator(AwsBaseOperator[GlueJobHook]):
         of limiting concurrency, Glue needs 5-10 seconds to clean up resources.
         Thus if status is returned immediately it might end up in case of more than 1 concurrent run.
         It is recommended to set this parameter to 10 when you are using concurrency=1.
-        For more information see: https://repost.aws/questions/QUaKgpLBMPSGWO0iq2Fob_bw/glue-run-concurrent-jobs#ANFpCL2fRnQRqgDFuIU_rpvA
+        For more information see:
+        https://repost.aws/questions/QUaKgpLBMPSGWO0iq2Fob_bw/glue-run-concurrent-jobs#ANFpCL2fRnQRqgDFuIU_rpvA
+    :param waiter_delay: Time in seconds to wait between status checks. (default: 60)
+    :param waiter_max_attempts: Maximum number of attempts to check for job completion. (default: 20)
+    :param aws_conn_id: The Airflow connection used for AWS credentials.
+        If this is ``None`` or empty then the default boto3 behaviour is used. If
+        running Airflow in a distributed manner and aws_conn_id is None or
+        empty, then default boto3 configuration would be used (and must be
+        maintained on each worker node).
+    :param region_name: AWS region_name. If not specified then the default boto3 behaviour is used.
+    :param verify: Whether or not to verify SSL certificates. See:
+        https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html
+    :param botocore_config: Configuration dictionary (key-values) for botocore client. See:
+        https://botocore.amazonaws.com/v1/documentation/api/latest/reference/config.html
     """
 
     aws_hook_class = GlueJobHook
@@ -122,9 +134,11 @@ class GlueJobOperator(AwsBaseOperator[GlueJobHook]):
         verbose: bool = False,
         replace_script_file: bool = False,
         update_config: bool = False,
-        job_poll_interval: int | float = 6,
         stop_job_run_on_kill: bool = False,
         sleep_before_return: int = 0,
+        job_poll_interval: int | float = 6,
+        waiter_delay: int = 60,
+        waiter_max_attempts: int = 75,
         **kwargs,
     ):
         super().__init__(**kwargs)
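A hedged usage sketch of the new waiter knobs in deferrable mode (task id, job name, and script location are placeholders, not taken from this diff):

```python
from airflow.providers.amazon.aws.operators.glue import GlueJobOperator

submit_glue_job = GlueJobOperator(
    task_id="submit_glue_job",
    job_name="example-glue-job",                   # placeholder job name
    script_location="s3://example-bucket/etl.py",  # placeholder script location
    deferrable=True,           # completion is awaited by the trigger, not the worker
    waiter_delay=30,           # seconds between GetJobRun status checks
    waiter_max_attempts=120,   # stop polling after roughly an hour
)
```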
@@ -152,6 +166,8 @@ class GlueJobOperator(AwsBaseOperator[GlueJobHook]):
         self._job_run_id: str | None = None
         self.sleep_before_return: int = sleep_before_return
         self.s3_script_location: str | None = None
+        self.waiter_delay = waiter_delay
+        self.waiter_max_attempts = waiter_max_attempts
 
     @property
     def _hook_parameters(self):
@@ -231,7 +247,9 @@ class GlueJobOperator(AwsBaseOperator[GlueJobHook]):
                     run_id=self._job_run_id,
                     verbose=self.verbose,
                     aws_conn_id=self.aws_conn_id,
-                    job_poll_interval=self.job_poll_interval,
+                    waiter_delay=self.waiter_delay,
+                    waiter_max_attempts=self.waiter_max_attempts,
+                    region_name=self.region_name,
                 ),
                 method_name="execute_complete",
             )

@@ -254,7 +272,7 @@ class GlueJobOperator(AwsBaseOperator[GlueJobHook]):
 
         if validated_event["status"] != "success":
             raise AirflowException(f"Error in glue job: {validated_event}")
-        return validated_event["value"]
+        return validated_event["run_id"]
 
     def on_kill(self):
         """Cancel the running AWS Glue Job."""

@@ -282,7 +300,6 @@ class GlueDataQualityOperator(AwsBaseOperator[GlueDataQualityHook]):
     :param description: A description of the data quality ruleset.
     :param update_rule_set: To update existing ruleset, Set this flag to True. (default: False)
     :param data_quality_ruleset_kwargs: Extra arguments for RuleSet.
-
     :param aws_conn_id: The Airflow connection used for AWS credentials.
         If this is ``None`` or empty then the default boto3 behaviour is used. If
         running Airflow in a distributed manner and aws_conn_id is None or

@@ -378,7 +395,6 @@ class GlueDataQualityRuleSetEvaluationRunOperator(AwsBaseOperator[GlueDataQualityHook]):
     :param deferrable: If True, the operator will wait asynchronously for the job to stop.
         This implies waiting for completion. This mode requires aiobotocore module to be installed.
         (default: False)
-
     :param aws_conn_id: The Airflow connection used for AWS credentials.
         If this is ``None`` or empty then the default boto3 behaviour is used. If
         running Airflow in a distributed manner and aws_conn_id is None or

@@ -543,7 +559,6 @@ class GlueDataQualityRuleRecommendationRunOperator(AwsBaseOperator[GlueDataQualityHook]):
     :param deferrable: If True, the operator will wait asynchronously for the job to stop.
         This implies waiting for completion. This mode requires aiobotocore module to be installed.
         (default: False)
-
     :param aws_conn_id: The Airflow connection used for AWS credentials.
         If this is ``None`` or empty then the default boto3 behaviour is used. If
         running Airflow in a distributed manner and aws_conn_id is None or

@@ -159,7 +159,7 @@ class RedshiftDataOperator(AwsBaseOperator[RedshiftDataHook]):
         self.statement_id: str = query_execution_output.statement_id
 
         if query_execution_output.session_id:
-            self.xcom_push(context, key="session_id", value=query_execution_output.session_id)
+            context["ti"].xcom_push(key="session_id", value=query_execution_output.session_id)
 
         if self.deferrable and self.wait_for_completion:
             is_finished: bool = self.hook.check_query_is_finished(self.statement_id)

@@ -20,8 +20,8 @@ import datetime
 import json
 import time
 import urllib
-from collections.abc import Sequence
-from typing import TYPE_CHECKING, Any, Callable, ClassVar
+from collections.abc import Callable, Sequence
+from typing import TYPE_CHECKING, Any, ClassVar
 
 from botocore.exceptions import ClientError
 

@@ -24,7 +24,6 @@ from typing import TYPE_CHECKING
 
 from airflow.configuration import conf
 from airflow.exceptions import AirflowException
-from airflow.models import BaseOperator
 from airflow.providers.amazon.aws.hooks.sagemaker_unified_studio import (
     SageMakerNotebookHook,
 )

@@ -34,6 +33,7 @@ from airflow.providers.amazon.aws.links.sagemaker_unified_studio import (
 from airflow.providers.amazon.aws.triggers.sagemaker_unified_studio import (
     SageMakerNotebookJobTrigger,
 )
+from airflow.providers.amazon.version_compat import BaseOperator
 
 if TYPE_CHECKING:
     from airflow.utils.context import Context

@@ -25,7 +25,7 @@ from airflow.providers.amazon.aws.utils.mixins import (
     AwsHookType,
     aws_template_fields,
 )
-from airflow.sensors.base import BaseSensorOperator
+from airflow.providers.amazon.version_compat import BaseSensorOperator
 from airflow.utils.types import NOTSET, ArgNotSet
 
 