apache-airflow-providers-amazon 8.25.0rc1__py3-none-any.whl → 8.26.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airflow/providers/amazon/__init__.py +1 -1
- airflow/providers/amazon/aws/hooks/athena.py +18 -9
- airflow/providers/amazon/aws/hooks/athena_sql.py +2 -1
- airflow/providers/amazon/aws/hooks/base_aws.py +34 -10
- airflow/providers/amazon/aws/hooks/chime.py +2 -1
- airflow/providers/amazon/aws/hooks/datasync.py +6 -3
- airflow/providers/amazon/aws/hooks/ecr.py +2 -1
- airflow/providers/amazon/aws/hooks/ecs.py +12 -6
- airflow/providers/amazon/aws/hooks/glacier.py +8 -4
- airflow/providers/amazon/aws/hooks/kinesis.py +2 -1
- airflow/providers/amazon/aws/hooks/logs.py +4 -2
- airflow/providers/amazon/aws/hooks/redshift_cluster.py +24 -12
- airflow/providers/amazon/aws/hooks/redshift_data.py +4 -2
- airflow/providers/amazon/aws/hooks/redshift_sql.py +6 -3
- airflow/providers/amazon/aws/hooks/s3.py +70 -53
- airflow/providers/amazon/aws/hooks/sagemaker.py +82 -41
- airflow/providers/amazon/aws/hooks/secrets_manager.py +6 -3
- airflow/providers/amazon/aws/hooks/sts.py +2 -1
- airflow/providers/amazon/aws/operators/athena.py +21 -8
- airflow/providers/amazon/aws/operators/batch.py +12 -6
- airflow/providers/amazon/aws/operators/datasync.py +2 -1
- airflow/providers/amazon/aws/operators/ecs.py +1 -0
- airflow/providers/amazon/aws/operators/emr.py +6 -86
- airflow/providers/amazon/aws/operators/glue.py +4 -2
- airflow/providers/amazon/aws/operators/glue_crawler.py +22 -19
- airflow/providers/amazon/aws/operators/neptune.py +2 -1
- airflow/providers/amazon/aws/operators/redshift_cluster.py +2 -1
- airflow/providers/amazon/aws/operators/sagemaker.py +2 -1
- airflow/providers/amazon/aws/sensors/base_aws.py +2 -1
- airflow/providers/amazon/aws/sensors/glue_catalog_partition.py +25 -17
- airflow/providers/amazon/aws/sensors/glue_crawler.py +16 -12
- airflow/providers/amazon/aws/transfers/mongo_to_s3.py +6 -3
- airflow/providers/amazon/aws/transfers/s3_to_dynamodb.py +2 -1
- airflow/providers/amazon/aws/transfers/s3_to_sql.py +2 -1
- airflow/providers/amazon/aws/triggers/ecs.py +3 -1
- airflow/providers/amazon/aws/triggers/glue.py +15 -3
- airflow/providers/amazon/aws/triggers/glue_crawler.py +8 -1
- airflow/providers/amazon/aws/utils/connection_wrapper.py +10 -5
- airflow/providers/amazon/aws/utils/mixins.py +2 -1
- airflow/providers/amazon/aws/utils/redshift.py +2 -1
- airflow/providers/amazon/get_provider_info.py +2 -1
- {apache_airflow_providers_amazon-8.25.0rc1.dist-info → apache_airflow_providers_amazon-8.26.0rc1.dist-info}/METADATA +6 -6
- {apache_airflow_providers_amazon-8.25.0rc1.dist-info → apache_airflow_providers_amazon-8.26.0rc1.dist-info}/RECORD +45 -45
- {apache_airflow_providers_amazon-8.25.0rc1.dist-info → apache_airflow_providers_amazon-8.26.0rc1.dist-info}/WHEEL +0 -0
- {apache_airflow_providers_amazon-8.25.0rc1.dist-info → apache_airflow_providers_amazon-8.26.0rc1.dist-info}/entry_points.txt +0 -0
--- a/airflow/providers/amazon/aws/operators/athena.py
+++ b/airflow/providers/amazon/aws/operators/athena.py
@@ -175,9 +175,6 @@ class AthenaOperator(AwsBaseOperator[AthenaHook]):
                 f"query_execution_id is {self.query_execution_id}."
             )
 
-        # Save output location from API response for later use in OpenLineage.
-        self.output_location = self.hook.get_output_location(self.query_execution_id)
-
         return self.query_execution_id
 
     def execute_complete(self, context: Context, event: dict[str, Any] | None = None) -> str:
@@ -185,6 +182,9 @@ class AthenaOperator(AwsBaseOperator[AthenaHook]):
 
         if event["status"] != "success":
             raise AirflowException(f"Error while waiting for operation on cluster to complete: {event}")
+
+        # Save query_execution_id to be later used by listeners
+        self.query_execution_id = event["value"]
         return event["value"]
 
     def on_kill(self) -> None:
@@ -208,13 +208,21 @@ class AthenaOperator(AwsBaseOperator[AthenaHook]):
                 )
             self.hook.poll_query_status(self.query_execution_id, sleep_time=self.sleep_time)
 
-    def
-        """Retrieve OpenLineage data by parsing SQL queries and enriching them with Athena API.
+    def get_openlineage_facets_on_complete(self, _) -> OperatorLineage:
+        """
+        Retrieve OpenLineage data by parsing SQL queries and enriching them with Athena API.
 
         In addition to CTAS query, query and calculation results are stored in S3 location.
-        For that reason additional output is attached with this location.
+        For that reason additional output is attached with this location. Instead of using the complete
+        path where the results are saved (user's prefix + some UUID), we are creating a dataset with the
+        user-provided path only. This should make it easier to match this dataset across different processes.
         """
-        from openlineage.client.facet import
+        from openlineage.client.facet import (
+            ExternalQueryRunFacet,
+            ExtractionError,
+            ExtractionErrorRunFacet,
+            SqlJobFacet,
+        )
         from openlineage.client.run import Dataset
 
         from airflow.providers.openlineage.extractors.base import OperatorLineage
@@ -264,6 +272,11 @@ class AthenaOperator(AwsBaseOperator[AthenaHook]):
                 )
             )
 
+        if self.query_execution_id:
+            run_facets["externalQuery"] = ExternalQueryRunFacet(
+                externalQueryId=self.query_execution_id, source="awsathena"
+            )
+
         if self.output_location:
             parsed = urlparse(self.output_location)
             outputs.append(Dataset(namespace=f"{parsed.scheme}://{parsed.netloc}", name=parsed.path or "/"))
@@ -300,7 +313,7 @@ class AthenaOperator(AwsBaseOperator[AthenaHook]):
                 )
             }
             fields = [
-                SchemaField(name=column["Name"], type=column["Type"], description=column
+                SchemaField(name=column["Name"], type=column["Type"], description=column.get("Comment"))
                 for column in table_metadata["TableMetadata"]["Columns"]
             ]
             if fields:
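With lineage collection moved to `get_openlineage_facets_on_complete`, the operator can report the real `query_execution_id` through an `ExternalQueryRunFacet`. Below is a minimal usage sketch; the DAG id, query, database, and S3 prefix are illustrative placeholders, not values taken from this diff.

# Sketch with assumed placeholder values: a plain AthenaOperator task whose OpenLineage
# facets are now built after execution, so they can include the query execution id.
from datetime import datetime

from airflow import DAG
from airflow.providers.amazon.aws.operators.athena import AthenaOperator

with DAG(dag_id="athena_lineage_example", start_date=datetime(2024, 1, 1), schedule=None):
    read_table = AthenaOperator(
        task_id="read_table",
        query="SELECT * FROM my_database.my_table LIMIT 10",  # placeholder query
        database="my_database",  # placeholder database
        output_location="s3://my-bucket/athena-results/",  # placeholder result prefix
        aws_conn_id="aws_default",
    )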
--- a/airflow/providers/amazon/aws/operators/batch.py
+++ b/airflow/providers/amazon/aws/operators/batch.py
@@ -14,7 +14,8 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""AWS Batch services.
+"""
+AWS Batch services.
 
 .. seealso::
 
@@ -54,7 +55,8 @@ if TYPE_CHECKING:
 
 
 class BatchOperator(BaseOperator):
-    """Execute a job on AWS Batch.
+    """
+    Execute a job on AWS Batch.
 
     .. seealso::
         For more information on how to use this operator, take a look at the guide:
@@ -236,7 +238,8 @@ class BatchOperator(BaseOperator):
         )
 
     def execute(self, context: Context) -> str | None:
-        """Submit and monitor an AWS Batch job.
+        """
+        Submit and monitor an AWS Batch job.
 
         :raises: AirflowException
         """
@@ -287,7 +290,8 @@ class BatchOperator(BaseOperator):
         self.log.info("AWS Batch job (%s) terminated: %s", self.job_id, response)
 
     def submit_job(self, context: Context):
-        """Submit an AWS Batch job.
+        """
+        Submit an AWS Batch job.
 
         :raises: AirflowException
         """
@@ -342,7 +346,8 @@ class BatchOperator(BaseOperator):
         )
 
     def monitor_job(self, context: Context):
-        """Monitor an AWS Batch job.
+        """
+        Monitor an AWS Batch job.
 
         This can raise an exception or an AirflowTaskTimeout if the task was
         created with ``execution_timeout``.
@@ -434,7 +439,8 @@ class BatchOperator(BaseOperator):
 
 
 class BatchCreateComputeEnvironmentOperator(BaseOperator):
-    """Create an AWS Batch compute environment.
+    """
+    Create an AWS Batch compute environment.
 
     .. seealso::
         For more information on how to use this operator, take a look at the guide:
--- a/airflow/providers/amazon/aws/operators/datasync.py
+++ b/airflow/providers/amazon/aws/operators/datasync.py
@@ -34,7 +34,8 @@ if TYPE_CHECKING:
 
 
 class DataSyncOperator(AwsBaseOperator[DataSyncHook]):
-    """Find, Create, Update, Execute and Delete AWS DataSync Tasks.
+    """
+    Find, Create, Update, Execute and Delete AWS DataSync Tasks.
 
     If ``do_xcom_push`` is True, then the DataSync TaskArn and TaskExecutionArn
     which were executed will be pushed to an XCom.
--- a/airflow/providers/amazon/aws/operators/ecs.py
+++ b/airflow/providers/amazon/aws/operators/ecs.py
@@ -586,6 +586,7 @@ class EcsRunTaskOperator(EcsBaseOperator):
         if event["status"] != "success":
             raise AirflowException(f"Error in task execution: {event}")
         self.arn = event["task_arn"]  # restore arn to its updated value, needed for next steps
+        self.cluster = event["cluster"]
         self._after_execution()
         if self._aws_logs_enabled():
             # same behavior as non-deferrable mode, return last line of logs of the task.
--- a/airflow/providers/amazon/aws/operators/emr.py
+++ b/airflow/providers/amazon/aws/operators/emr.py
@@ -27,7 +27,6 @@ from uuid import uuid4
 from airflow.configuration import conf
 from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning
 from airflow.models import BaseOperator
-from airflow.models.mappedoperator import MappedOperator
 from airflow.providers.amazon.aws.hooks.emr import EmrContainerHook, EmrHook, EmrServerlessHook
 from airflow.providers.amazon.aws.links.emr import (
     EmrClusterLink,
@@ -1259,91 +1258,12 @@ class EmrServerlessStartJobOperator(BaseOperator):
         "configuration_overrides": "json",
     }
 
-
-
-
-
-
-
-        Only add dashboard links if they're explicitly enabled. These are one-time links that any user
-        can access, but expire on first click or one hour, whichever comes first.
-        """
-        op_extra_links = []
-
-        if isinstance(self, MappedOperator):
-            operator_class = self.operator_class
-            enable_application_ui_links = self.partial_kwargs.get(
-                "enable_application_ui_links"
-            ) or self.expand_input.value.get("enable_application_ui_links")
-            job_driver = self.partial_kwargs.get("job_driver", {}) or self.expand_input.value.get(
-                "job_driver", {}
-            )
-            configuration_overrides = self.partial_kwargs.get(
-                "configuration_overrides"
-            ) or self.expand_input.value.get("configuration_overrides")
-
-            # Configuration overrides can either be a list or a dictionary, depending on whether it's passed in as partial or expand.
-            if isinstance(configuration_overrides, list):
-                if any(
-                    [
-                        operator_class.is_monitoring_in_job_override(
-                            self=operator_class,
-                            config_key="s3MonitoringConfiguration",
-                            job_override=job_override,
-                        )
-                        for job_override in configuration_overrides
-                    ]
-                ):
-                    op_extra_links.extend([EmrServerlessS3LogsLink()])
-                if any(
-                    [
-                        operator_class.is_monitoring_in_job_override(
-                            self=operator_class,
-                            config_key="cloudWatchLoggingConfiguration",
-                            job_override=job_override,
-                        )
-                        for job_override in configuration_overrides
-                    ]
-                ):
-                    op_extra_links.extend([EmrServerlessCloudWatchLogsLink()])
-            else:
-                if operator_class.is_monitoring_in_job_override(
-                    self=operator_class,
-                    config_key="s3MonitoringConfiguration",
-                    job_override=configuration_overrides,
-                ):
-                    op_extra_links.extend([EmrServerlessS3LogsLink()])
-                if operator_class.is_monitoring_in_job_override(
-                    self=operator_class,
-                    config_key="cloudWatchLoggingConfiguration",
-                    job_override=configuration_overrides,
-                ):
-                    op_extra_links.extend([EmrServerlessCloudWatchLogsLink()])
-
-        else:
-            operator_class = self
-            enable_application_ui_links = self.enable_application_ui_links
-            configuration_overrides = self.configuration_overrides
-            job_driver = self.job_driver
-
-            if operator_class.is_monitoring_in_job_override(
-                "s3MonitoringConfiguration", configuration_overrides
-            ):
-                op_extra_links.extend([EmrServerlessS3LogsLink()])
-            if operator_class.is_monitoring_in_job_override(
-                "cloudWatchLoggingConfiguration", configuration_overrides
-            ):
-                op_extra_links.extend([EmrServerlessCloudWatchLogsLink()])
-
-        if enable_application_ui_links:
-            op_extra_links.extend([EmrServerlessDashboardLink()])
-            if isinstance(job_driver, list):
-                if any("sparkSubmit" in ind_job_driver for ind_job_driver in job_driver):
-                    op_extra_links.extend([EmrServerlessLogsLink()])
-            elif "sparkSubmit" in job_driver:
-                op_extra_links.extend([EmrServerlessLogsLink()])
-
-        return tuple(op_extra_links)
+    operator_extra_links = (
+        EmrServerlessS3LogsLink(),
+        EmrServerlessCloudWatchLogsLink(),
+        EmrServerlessDashboardLink(),
+        EmrServerlessLogsLink(),
+    )
 
     def __init__(
         self,
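The removed block computed `operator_extra_links` dynamically (including special handling for `MappedOperator`), while the new code declares the links statically on the class. As a rough illustration of that pattern, a hypothetical operator could register the same provider link classes as shown below; `MyEmrServerlessOperator` is an invented name, not part of the provider.

# Sketch only: static extra-link registration at class level, mirroring the pattern
# EmrServerlessStartJobOperator switches to in this release. The link classes are the
# real ones referenced in the diff; the operator class itself is hypothetical.
from airflow.models import BaseOperator
from airflow.providers.amazon.aws.links.emr import (
    EmrServerlessCloudWatchLogsLink,
    EmrServerlessDashboardLink,
    EmrServerlessLogsLink,
    EmrServerlessS3LogsLink,
)


class MyEmrServerlessOperator(BaseOperator):
    # Links are always declared; whether each one resolves to a URL in the UI still
    # depends on the data the task persists for it at runtime.
    operator_extra_links = (
        EmrServerlessS3LogsLink(),
        EmrServerlessCloudWatchLogsLink(),
        EmrServerlessDashboardLink(),
        EmrServerlessLogsLink(),
    )

    def execute(self, context):
        # A real operator would start the EMR Serverless job here; omitted in this sketch.
        pass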
--- a/airflow/providers/amazon/aws/operators/glue.py
+++ b/airflow/providers/amazon/aws/operators/glue.py
@@ -43,7 +43,8 @@ if TYPE_CHECKING:
 
 
 class GlueJobOperator(BaseOperator):
-    """Create an AWS Glue Job.
+    """
+    Create an AWS Glue Job.
 
     AWS Glue is a serverless Spark ETL service for running Spark Jobs on the AWS
     cloud. Language support: Python and Scala.
@@ -179,7 +180,8 @@ class GlueJobOperator(BaseOperator):
         )
 
     def execute(self, context: Context):
-        """Execute AWS Glue Job from Airflow.
+        """
+        Execute AWS Glue Job from Airflow.
 
         :return: the current Glue job ID.
         """
--- a/airflow/providers/amazon/aws/operators/glue_crawler.py
+++ b/airflow/providers/amazon/aws/operators/glue_crawler.py
@@ -17,22 +17,22 @@
 # under the License.
 from __future__ import annotations
 
-from functools import cached_property
 from typing import TYPE_CHECKING, Any, Sequence
 
 from airflow.configuration import conf
 from airflow.exceptions import AirflowException
+from airflow.providers.amazon.aws.operators.base_aws import AwsBaseOperator
 from airflow.providers.amazon.aws.triggers.glue_crawler import GlueCrawlerCompleteTrigger
 from airflow.providers.amazon.aws.utils import validate_execute_complete_event
+from airflow.providers.amazon.aws.utils.mixins import aws_template_fields
 
 if TYPE_CHECKING:
     from airflow.utils.context import Context
 
-from airflow.models import BaseOperator
 from airflow.providers.amazon.aws.hooks.glue_crawler import GlueCrawlerHook
 
 
-class GlueCrawlerOperator(BaseOperator):
+class GlueCrawlerOperator(AwsBaseOperator[GlueCrawlerHook]):
     """
     Creates, updates and triggers an AWS Glue Crawler.
 
@@ -45,45 +45,45 @@ class GlueCrawlerOperator(BaseOperator):
         :ref:`howto/operator:GlueCrawlerOperator`
 
     :param config: Configurations for the AWS Glue crawler
-    :param aws_conn_id: The Airflow connection used for AWS credentials.
-        If this is None or empty then the default boto3 behaviour is used. If
-        running Airflow in a distributed manner and aws_conn_id is None or
-        empty, then default boto3 configuration would be used (and must be
-        maintained on each worker node).
     :param poll_interval: Time (in seconds) to wait between two consecutive calls to check crawler status
     :param wait_for_completion: Whether to wait for crawl execution completion. (default: True)
     :param deferrable: If True, the operator will wait asynchronously for the crawl to complete.
        This implies waiting for completion. This mode requires aiobotocore module to be installed.
        (default: False)
+    :param aws_conn_id: The Airflow connection used for AWS credentials.
+        If this is ``None`` or empty then the default boto3 behaviour is used. If
+        running Airflow in a distributed manner and aws_conn_id is None or
+        empty, then default boto3 configuration would be used (and must be
+        maintained on each worker node).
+    :param region_name: AWS region_name. If not specified then the default boto3 behaviour is used.
+    :param verify: Whether or not to verify SSL certificates. See:
+        https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html
+    :param botocore_config: Configuration dictionary (key-values) for botocore client. See:
+        https://botocore.amazonaws.com/v1/documentation/api/latest/reference/config.html
     """
 
-
+    aws_hook_class = GlueCrawlerHook
+
+    template_fields: Sequence[str] = aws_template_fields(
+        "config",
+    )
     ui_color = "#ededed"
 
     def __init__(
         self,
         config,
-        aws_conn_id="aws_default",
-        region_name: str | None = None,
         poll_interval: int = 5,
         wait_for_completion: bool = True,
         deferrable: bool = conf.getboolean("operators", "default_deferrable", fallback=False),
         **kwargs,
     ):
         super().__init__(**kwargs)
-        self.aws_conn_id = aws_conn_id
         self.poll_interval = poll_interval
         self.wait_for_completion = wait_for_completion
         self.deferrable = deferrable
-        self.region_name = region_name
         self.config = config
 
-
-    def hook(self) -> GlueCrawlerHook:
-        """Create and return a GlueCrawlerHook."""
-        return GlueCrawlerHook(self.aws_conn_id, region_name=self.region_name)
-
-    def execute(self, context: Context):
+    def execute(self, context: Context) -> str:
         """
         Execute AWS Glue Crawler from Airflow.
 
@@ -103,6 +103,9 @@ class GlueCrawlerOperator(BaseOperator):
                     crawler_name=crawler_name,
                     waiter_delay=self.poll_interval,
                     aws_conn_id=self.aws_conn_id,
+                    region_name=self.region_name,
+                    verify=self.verify,
+                    botocore_config=self.botocore_config,
                 ),
                 method_name="execute_complete",
             )
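Because `GlueCrawlerOperator` now derives from `AwsBaseOperator[GlueCrawlerHook]`, the common AWS arguments (`aws_conn_id`, `region_name`, `verify`, `botocore_config`) are handled by the base class and forwarded to the deferral trigger. A minimal usage sketch follows; the crawler config, region, and retry settings are placeholders.

# Sketch with assumed placeholder values: the new-style GlueCrawlerOperator taking the
# common AWS arguments documented in the hunk above.
from airflow.providers.amazon.aws.operators.glue_crawler import GlueCrawlerOperator

run_crawler = GlueCrawlerOperator(
    task_id="run_crawler",
    config={"Name": "my-crawler"},  # placeholder crawler configuration
    aws_conn_id="aws_default",
    region_name="us-east-1",  # placeholder region
    wait_for_completion=True,
    botocore_config={"retries": {"mode": "standard"}},  # placeholder botocore options
)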
--- a/airflow/providers/amazon/aws/operators/neptune.py
+++ b/airflow/providers/amazon/aws/operators/neptune.py
@@ -81,7 +81,8 @@ def handle_waitable_exception(
 
 
 class NeptuneStartDbClusterOperator(AwsBaseOperator[NeptuneHook]):
-    """Starts an Amazon Neptune DB cluster.
+    """
+    Starts an Amazon Neptune DB cluster.
 
     Amazon Neptune Database is a serverless graph database designed for superior scalability
     and availability. Neptune Database provides built-in security, continuous backups, and
--- a/airflow/providers/amazon/aws/operators/redshift_cluster.py
+++ b/airflow/providers/amazon/aws/operators/redshift_cluster.py
@@ -38,7 +38,8 @@ if TYPE_CHECKING:
 
 
 class RedshiftCreateClusterOperator(BaseOperator):
-    """Creates a new cluster with the specified parameters.
+    """
+    Creates a new cluster with the specified parameters.
 
     .. seealso::
         For more information on how to use this operator, take a look at the guide:
--- a/airflow/providers/amazon/aws/operators/sagemaker.py
+++ b/airflow/providers/amazon/aws/operators/sagemaker.py
@@ -60,7 +60,8 @@ def serialize(result: dict) -> dict:
 
 
 class SageMakerBaseOperator(BaseOperator):
-    """This is the base operator for all SageMaker operators.
+    """
+    This is the base operator for all SageMaker operators.
 
     :param config: The configuration necessary to start a training job (templated)
     """
--- a/airflow/providers/amazon/aws/sensors/base_aws.py
+++ b/airflow/providers/amazon/aws/sensors/base_aws.py
@@ -30,7 +30,8 @@ from airflow.utils.types import NOTSET, ArgNotSet
 
 
 class AwsBaseSensor(BaseSensorOperator, AwsBaseHookMixin[AwsHookType]):
-    """Base AWS (Amazon) Sensor Class for build sensors in top of AWS Hooks.
+    """
+    Base AWS (Amazon) Sensor Class for build sensors in top of AWS Hooks.
 
     .. warning::
         Only for internal usage, this class might be changed, renamed or removed in the future
--- a/airflow/providers/amazon/aws/sensors/glue_catalog_partition.py
+++ b/airflow/providers/amazon/aws/sensors/glue_catalog_partition.py
@@ -18,7 +18,6 @@
 from __future__ import annotations
 
 from datetime import timedelta
-from functools import cached_property
 from typing import TYPE_CHECKING, Any, Sequence
 
 from deprecated import deprecated
@@ -26,18 +25,23 @@ from deprecated import deprecated
 from airflow.configuration import conf
 from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning, AirflowSkipException
 from airflow.providers.amazon.aws.hooks.glue_catalog import GlueCatalogHook
+from airflow.providers.amazon.aws.sensors.base_aws import AwsBaseSensor
 from airflow.providers.amazon.aws.triggers.glue import GlueCatalogPartitionTrigger
 from airflow.providers.amazon.aws.utils import validate_execute_complete_event
-from airflow.
+from airflow.providers.amazon.aws.utils.mixins import aws_template_fields
 
 if TYPE_CHECKING:
     from airflow.utils.context import Context
 
 
-class GlueCatalogPartitionSensor(BaseSensorOperator):
+class GlueCatalogPartitionSensor(AwsBaseSensor[GlueCatalogHook]):
     """
     Waits for a partition to show up in AWS Glue Catalog.
 
+    .. seealso::
+        For more information on how to use this sensor, take a look at the guide:
+        :ref:`howto/sensor:GlueCatalogPartitionSensor`
+
     :param table_name: The name of the table to wait for, supports the dot
         notation (my_database.my_table)
     :param expression: The partition clause to wait for. This is passed as
@@ -46,19 +50,27 @@ class GlueCatalogPartitionSensor(BaseSensorOperator):
         AND type='value'`` and comparison operators as in ``"ds>=2015-01-01"``.
         See https://docs.aws.amazon.com/glue/latest/dg/aws-glue-api-catalog-partitions.html
         #aws-glue-api-catalog-partitions-GetPartitions
-    :param aws_conn_id: ID of the Airflow connection where
-        credentials and extra configuration are stored
-    :param region_name: Optional aws region name (example: us-east-1). Uses region from connection
-        if not specified.
     :param database_name: The name of the catalog database where the partitions reside.
     :param poke_interval: Time in seconds that the job should wait in
         between each tries
     :param deferrable: If true, then the sensor will wait asynchronously for the partition to
         show up in the AWS Glue Catalog.
         (default: False, but can be overridden in config file by setting default_deferrable to True)
+    :param aws_conn_id: The Airflow connection used for AWS credentials.
+        If this is ``None`` or empty then the default boto3 behaviour is used. If
+        running Airflow in a distributed manner and aws_conn_id is None or
+        empty, then default boto3 configuration would be used (and must be
+        maintained on each worker node).
+    :param region_name: AWS region_name. If not specified then the default boto3 behaviour is used.
+    :param verify: Whether or not to verify SSL certificates. See:
+        https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html
+    :param botocore_config: Configuration dictionary (key-values) for botocore client. See:
+        https://botocore.amazonaws.com/v1/documentation/api/latest/reference/config.html
     """
 
-
+    aws_hook_class = GlueCatalogHook
+
+    template_fields: Sequence[str] = aws_template_fields(
         "database_name",
         "table_name",
         "expression",
@@ -70,19 +82,16 @@ class GlueCatalogPartitionSensor(BaseSensorOperator):
         *,
         table_name: str,
         expression: str = "ds='{{ ds }}'",
-        aws_conn_id: str | None = "aws_default",
-        region_name: str | None = None,
         database_name: str = "default",
         poke_interval: int = 60 * 3,
         deferrable: bool = conf.getboolean("operators", "default_deferrable", fallback=False),
         **kwargs,
     ):
-        super().__init__(
-        self.aws_conn_id = aws_conn_id
-        self.region_name = region_name
+        super().__init__(**kwargs)
         self.table_name = table_name
         self.expression = expression
         self.database_name = database_name
+        self.poke_interval = poke_interval
         self.deferrable = deferrable
 
     def execute(self, context: Context) -> Any:
@@ -93,7 +102,10 @@ class GlueCatalogPartitionSensor(BaseSensorOperator):
                     table_name=self.table_name,
                     expression=self.expression,
                     aws_conn_id=self.aws_conn_id,
+                    region_name=self.region_name,
                     waiter_delay=int(self.poke_interval),
+                    verify=self.verify,
+                    botocore_config=self.botocore_config,
                 ),
                 method_name="execute_complete",
                 timeout=timedelta(seconds=self.timeout),
@@ -126,7 +138,3 @@ class GlueCatalogPartitionSensor(BaseSensorOperator):
     def get_hook(self) -> GlueCatalogHook:
         """Get the GlueCatalogHook."""
         return self.hook
-
-    @cached_property
-    def hook(self) -> GlueCatalogHook:
-        return GlueCatalogHook(aws_conn_id=self.aws_conn_id, region_name=self.region_name)
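The sensor follows the same migration to `AwsBaseSensor[GlueCatalogHook]`, so region, TLS verification, and botocore options can be passed directly and are forwarded to `GlueCatalogPartitionTrigger` in deferrable mode. A usage sketch with placeholder database, table, and region values:

# Sketch with assumed placeholder values: GlueCatalogPartitionSensor after the move to
# AwsBaseSensor, waiting (deferred) for a daily partition keyed on the logical date.
from airflow.providers.amazon.aws.sensors.glue_catalog_partition import GlueCatalogPartitionSensor

wait_for_partition = GlueCatalogPartitionSensor(
    task_id="wait_for_partition",
    database_name="my_database",  # placeholder database
    table_name="my_table",  # placeholder table
    expression="ds='{{ ds }}'",
    aws_conn_id="aws_default",
    region_name="eu-west-1",  # placeholder region
    poke_interval=120,
    deferrable=True,
)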
--- a/airflow/providers/amazon/aws/sensors/glue_crawler.py
+++ b/airflow/providers/amazon/aws/sensors/glue_crawler.py
@@ -17,20 +17,20 @@
 # under the License.
 from __future__ import annotations
 
-from functools import cached_property
 from typing import TYPE_CHECKING, Sequence
 
 from deprecated import deprecated
 
 from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning, AirflowSkipException
 from airflow.providers.amazon.aws.hooks.glue_crawler import GlueCrawlerHook
-from airflow.sensors.
+from airflow.providers.amazon.aws.sensors.base_aws import AwsBaseSensor
+from airflow.providers.amazon.aws.utils.mixins import aws_template_fields
 
 if TYPE_CHECKING:
     from airflow.utils.context import Context
 
 
-class GlueCrawlerSensor(BaseSensorOperator):
+class GlueCrawlerSensor(AwsBaseSensor[GlueCrawlerHook]):
     """
     Waits for an AWS Glue crawler to reach any of the statuses below.
 
@@ -41,19 +41,27 @@ class GlueCrawlerSensor(BaseSensorOperator):
         :ref:`howto/sensor:GlueCrawlerSensor`
 
     :param crawler_name: The AWS Glue crawler unique name
-    :param aws_conn_id:
-        If this is None or empty then the default boto3 behaviour is used. If
+    :param aws_conn_id: The Airflow connection used for AWS credentials.
+        If this is ``None`` or empty then the default boto3 behaviour is used. If
         running Airflow in a distributed manner and aws_conn_id is None or
         empty, then default boto3 configuration would be used (and must be
        maintained on each worker node).
+    :param region_name: AWS region_name. If not specified then the default boto3 behaviour is used.
+    :param verify: Whether or not to verify SSL certificates. See:
+        https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html
+    :param botocore_config: Configuration dictionary (key-values) for botocore client. See:
+        https://botocore.amazonaws.com/v1/documentation/api/latest/reference/config.html
     """
 
-
+    aws_hook_class = GlueCrawlerHook
 
-
+    template_fields: Sequence[str] = aws_template_fields(
+        "crawler_name",
+    )
+
+    def __init__(self, *, crawler_name: str, **kwargs) -> None:
         super().__init__(**kwargs)
         self.crawler_name = crawler_name
-        self.aws_conn_id = aws_conn_id
         self.success_statuses = "SUCCEEDED"
         self.errored_statuses = ("FAILED", "CANCELLED")
 
@@ -79,7 +87,3 @@ class GlueCrawlerSensor(BaseSensorOperator):
     def get_hook(self) -> GlueCrawlerHook:
         """Return a new or pre-existing GlueCrawlerHook."""
         return self.hook
-
-    @cached_property
-    def hook(self) -> GlueCrawlerHook:
-        return GlueCrawlerHook(aws_conn_id=self.aws_conn_id)
--- a/airflow/providers/amazon/aws/transfers/mongo_to_s3.py
+++ b/airflow/providers/amazon/aws/transfers/mongo_to_s3.py
@@ -34,7 +34,8 @@ if TYPE_CHECKING:
 
 
 class MongoToS3Operator(BaseOperator):
-    """Move data from MongoDB to S3.
+    """
+    Move data from MongoDB to S3.
 
     .. seealso::
         For more information on how to use this operator, take a look at the guide:
@@ -128,7 +129,8 @@ class MongoToS3Operator(BaseOperator):
 
     @staticmethod
     def _stringify(iterable: Iterable, joinable: str = "\n") -> str:
-        """Stringify an iterable of dicts.
+        """
+        Stringify an iterable of dicts.
 
         This dumps each dict with JSON, and joins them with ``joinable``.
         """
@@ -136,7 +138,8 @@ class MongoToS3Operator(BaseOperator):
 
     @staticmethod
     def transform(docs: Any) -> Any:
-        """Transform the data for transfer.
+        """
+        Transform the data for transfer.
 
         This method is meant to be extended by child classes to perform
         transformations unique to those operators needs. Processes pyMongo
--- a/airflow/providers/amazon/aws/transfers/s3_to_dynamodb.py
+++ b/airflow/providers/amazon/aws/transfers/s3_to_dynamodb.py
@@ -44,7 +44,8 @@ class KeySchema(TypedDict):
 
 
 class S3ToDynamoDBOperator(BaseOperator):
-    """Load Data from S3 into a DynamoDB.
+    """
+    Load Data from S3 into a DynamoDB.
 
     Data stored in S3 can be uploaded to a new or existing DynamoDB. Supported file formats CSV, DynamoDB JSON and
     Amazon ION.
--- a/airflow/providers/amazon/aws/transfers/s3_to_sql.py
+++ b/airflow/providers/amazon/aws/transfers/s3_to_sql.py
@@ -30,7 +30,8 @@ if TYPE_CHECKING:
 
 
 class S3ToSqlOperator(BaseOperator):
-    """Load Data from S3 into a SQL Database.
+    """
+    Load Data from S3 into a SQL Database.
 
     You need to provide a parser function that takes a filename as an input
     and returns an iterable of rows
--- a/airflow/providers/amazon/aws/triggers/ecs.py
+++ b/airflow/providers/amazon/aws/triggers/ecs.py
@@ -179,7 +179,9 @@ class TaskDoneTrigger(BaseTrigger):
                         cluster=self.cluster, tasks=[self.task_arn], WaiterConfig={"MaxAttempts": 1}
                     )
                     # we reach this point only if the waiter met a success criteria
-                    yield TriggerEvent(
+                    yield TriggerEvent(
+                        {"status": "success", "task_arn": self.task_arn, "cluster": self.cluster}
+                    )
                     return
                 except WaiterError as error:
                     if "terminal failure" in str(error):
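`TaskDoneTrigger` now includes the cluster in its success event, which is what lets the deferred `EcsRunTaskOperator.execute_complete` restore `self.cluster` (see the operators/ecs.py hunk above). Below is a small sketch of the event shape and how deferred-completion code can read it; the ARN and cluster name are placeholders.

# Sketch: the success payload TaskDoneTrigger now yields, and how execute_complete-style
# code can read it. Values are placeholders, not real resources.
from airflow.triggers.base import TriggerEvent

event = TriggerEvent(
    {"status": "success", "task_arn": "arn:aws:ecs:us-east-1:111122223333:task/example", "cluster": "example-cluster"}
)

payload = event.payload
if payload["status"] == "success":
    task_arn = payload["task_arn"]
    cluster = payload["cluster"]
    print(f"Task {task_arn} finished on cluster {cluster}")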