apache-airflow-providers-amazon 8.24.0rc1__py3-none-any.whl → 8.24.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airflow/providers/amazon/LICENSE +4 -4
- airflow/providers/amazon/aws/hooks/base_aws.py +8 -3
- airflow/providers/amazon/aws/hooks/glue.py +123 -0
- airflow/providers/amazon/aws/operators/bedrock.py +6 -20
- airflow/providers/amazon/aws/operators/emr.py +38 -30
- airflow/providers/amazon/aws/operators/glue.py +408 -2
- airflow/providers/amazon/aws/operators/sagemaker.py +85 -12
- airflow/providers/amazon/aws/sensors/glue.py +260 -2
- airflow/providers/amazon/aws/sensors/s3.py +35 -5
- airflow/providers/amazon/aws/transfers/dynamodb_to_s3.py +0 -1
- airflow/providers/amazon/aws/triggers/glue.py +76 -2
- airflow/providers/amazon/aws/waiters/glue.json +98 -0
- airflow/providers/amazon/get_provider_info.py +18 -12
- {apache_airflow_providers_amazon-8.24.0rc1.dist-info → apache_airflow_providers_amazon-8.24.0rc2.dist-info}/METADATA +15 -14
- {apache_airflow_providers_amazon-8.24.0rc1.dist-info → apache_airflow_providers_amazon-8.24.0rc2.dist-info}/RECORD +17 -17
- {apache_airflow_providers_amazon-8.24.0rc1.dist-info → apache_airflow_providers_amazon-8.24.0rc2.dist-info}/WHEEL +0 -0
- {apache_airflow_providers_amazon-8.24.0rc1.dist-info → apache_airflow_providers_amazon-8.24.0rc2.dist-info}/entry_points.txt +0 -0
airflow/providers/amazon/LICENSE
CHANGED
@@ -215,7 +215,7 @@ Third party Apache 2.0 licenses
 
 The following components are provided under the Apache 2.0 License.
 See project link for details. The text of each license is also included
-at licenses/LICENSE-[project].txt.
+at 3rd-party-licenses/LICENSE-[project].txt.
 
 (ALv2 License) hue v4.3.0 (https://github.com/cloudera/hue/)
 (ALv2 License) jqclock v2.3.0 (https://github.com/JohnRDOrazio/jQuery-Clock-Plugin)
@@ -227,7 +227,7 @@ MIT licenses
 ========================================================================
 
 The following components are provided under the MIT License. See project link for details.
-The text of each license is also included at licenses/LICENSE-[project].txt.
+The text of each license is also included at 3rd-party-licenses/LICENSE-[project].txt.
 
 (MIT License) jquery v3.5.1 (https://jquery.org/license/)
 (MIT License) dagre-d3 v0.6.4 (https://github.com/cpettitt/dagre-d3)
@@ -243,11 +243,11 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
 BSD 3-Clause licenses
 ========================================================================
 The following components are provided under the BSD 3-Clause license. See project links for details.
-The text of each license is also included at licenses/LICENSE-[project].txt.
+The text of each license is also included at 3rd-party-licenses/LICENSE-[project].txt.
 
 (BSD 3 License) d3 v5.16.0 (https://d3js.org)
 (BSD 3 License) d3-shape v2.1.0 (https://github.com/d3/d3-shape)
 (BSD 3 License) cgroupspy 0.2.1 (https://github.com/cloudsigma/cgroupspy)
 
 ========================================================================
-See licenses/LICENSES-ui.txt for packages used in `/airflow/www`
+See 3rd-party-licenses/LICENSES-ui.txt for packages used in `/airflow/www`
airflow/providers/amazon/aws/hooks/base_aws.py
CHANGED
@@ -156,7 +156,9 @@ class BaseSessionFactory(LoggingMixin):
 
         return async_get_session()
 
-    def create_session(self, deferrable: bool = False) -> boto3.session.Session:
+    def create_session(
+        self, deferrable: bool = False
+    ) -> boto3.session.Session | aiobotocore.session.AioSession:
         """Create boto3 or aiobotocore Session from connection config."""
         if not self.conn:
             self.log.info(
@@ -198,7 +200,7 @@ class BaseSessionFactory(LoggingMixin):
 
     def _create_session_with_assume_role(
         self, session_kwargs: dict[str, Any], deferrable: bool = False
-    ) -> boto3.session.Session:
+    ) -> boto3.session.Session | aiobotocore.session.AioSession:
         if self.conn.assume_role_method == "assume_role_with_web_identity":
             # Deferred credentials have no initial credentials
             credential_fetcher = self._get_web_identity_credential_fetcher()
@@ -239,7 +241,10 @@ class BaseSessionFactory(LoggingMixin):
         session._credentials = credentials
         session.set_config_variable("region", self.basic_session.region_name)
 
-        return boto3.session.Session(botocore_session=session, **session_kwargs)
+        if not deferrable:
+            return boto3.session.Session(botocore_session=session, **session_kwargs)
+
+        return session
 
     def _refresh_credentials(self) -> dict[str, Any]:
         self.log.debug("Refreshing credentials")
airflow/providers/amazon/aws/hooks/glue.py
CHANGED
@@ -20,6 +20,7 @@ from __future__ import annotations
 import asyncio
 import time
 from functools import cached_property
+from typing import Any
 
 from botocore.exceptions import ClientError
 
@@ -430,3 +431,125 @@ class GlueJobHook(AwsBaseHook):
         self.conn.create_job(**config)
 
         return self.job_name
+
+
+class GlueDataQualityHook(AwsBaseHook):
+    """
+    Interact with AWS Glue Data Quality.
+
+    Provide thick wrapper around :external+boto3:py:class:`boto3.client("glue") <Glue.Client>`.
+
+    Additional arguments (such as ``aws_conn_id``) may be specified and
+    are passed down to the underlying AwsBaseHook.
+
+    .. seealso::
+        - :class:`airflow.providers.amazon.aws.hooks.base_aws.AwsBaseHook`
+    """
+
+    def __init__(
+        self,
+        *args,
+        **kwargs,
+    ):
+        kwargs["client_type"] = "glue"
+        super().__init__(*args, **kwargs)
+
+    def has_data_quality_ruleset(self, name: str) -> bool:
+        try:
+            self.conn.get_data_quality_ruleset(Name=name)
+            return True
+        except self.conn.exceptions.EntityNotFoundException:
+            return False
+
+    def _log_results(self, result: dict[str, Any]) -> None:
+        """
+        Print the outcome of an evaluation run; an evaluation run can involve multiple rulesets evaluated against a data source (Glue table).
+
+        Name    Description                                 Result  EvaluatedMetrics                                                                     EvaluationMessage
+        Rule_1  RowCount between 150000 and 600000          PASS    {'Dataset.*.RowCount': 300000.0}                                                     NaN
+        Rule_2  IsComplete "marketplace"                    PASS    {'Column.marketplace.Completeness': 1.0}                                             NaN
+        Rule_3  ColumnLength "marketplace" between 1 and 2  FAIL    {'Column.marketplace.MaximumLength': 9.0, 'Column.marketplace.MinimumLength': 3.0}  Value: 9.0 does not meet the constraint requirement!
+
+        """
+        import pandas as pd
+
+        pd.set_option("display.max_rows", None)
+        pd.set_option("display.max_columns", None)
+        pd.set_option("display.width", None)
+        pd.set_option("display.max_colwidth", None)
+
+        self.log.info(
+            "AWS Glue data quality ruleset evaluation result for RulesetName: %s RulesetEvaluationRunId: %s Score: %s",
+            result.get("RulesetName"),
+            result.get("RulesetEvaluationRunId"),
+            result.get("Score"),
+        )
+
+        rule_results = result["RuleResults"]
+        rule_results_df = pd.DataFrame(rule_results)
+        self.log.info(rule_results_df)
+
+    def get_evaluation_run_results(self, run_id: str) -> dict[str, Any]:
+        response = self.conn.get_data_quality_ruleset_evaluation_run(RunId=run_id)
+
+        return self.conn.batch_get_data_quality_result(ResultIds=response["ResultIds"])
+
+    def validate_evaluation_run_results(
+        self, evaluation_run_id: str, show_results: bool = True, verify_result_status: bool = True
+    ) -> None:
+        results = self.get_evaluation_run_results(evaluation_run_id)
+        total_failed_rules = 0
+
+        if results.get("ResultsNotFound"):
+            self.log.info(
+                "AWS Glue data quality ruleset evaluation run, results not found for %s",
+                results["ResultsNotFound"],
+            )
+
+        for result in results["Results"]:
+            rule_results = result["RuleResults"]
+
+            total_failed_rules += len(
+                list(
+                    filter(
+                        lambda result: result.get("Result") == "FAIL" or result.get("Result") == "ERROR",
+                        rule_results,
+                    )
+                )
+            )
+
+            if show_results:
+                self._log_results(result)
+
+        self.log.info(
+            "AWS Glue data quality ruleset evaluation run, total number of rules failed: %s",
+            total_failed_rules,
+        )
+
+        if verify_result_status and total_failed_rules > 0:
+            raise AirflowException(
+                "AWS Glue data quality ruleset evaluation run failed for one or more rules"
+            )
+
+    def log_recommendation_results(self, run_id: str) -> None:
+        """
+        Print the outcome of a recommendation run; a recommendation run generates multiple rules against a data source (Glue table), in Data Quality Definition Language (DQDL) format.
+
+        Rules = [
+            IsComplete "NAME",
+            ColumnLength "EMP_ID" between 1 and 12,
+            IsUnique "EMP_ID",
+            ColumnValues "INCOME" > 50000
+        ]
+        """
+        result = self.conn.get_data_quality_rule_recommendation_run(RunId=run_id)
+
+        if result.get("RecommendedRuleset"):
+            self.log.info(
+                "AWS Glue data quality recommended rules for DatabaseName: %s TableName: %s",
+                result["DataSource"]["GlueTable"]["DatabaseName"],
+                result["DataSource"]["GlueTable"]["TableName"],
+            )
+            self.log.info(result["RecommendedRuleset"])
+        else:
+            self.log.info("AWS Glue data quality, no recommended rules available for RunId: %s", run_id)
airflow/providers/amazon/aws/operators/bedrock.py
CHANGED
@@ -20,7 +20,6 @@ import json
 from time import sleep
 from typing import TYPE_CHECKING, Any, Sequence
 
-import botocore
 from botocore.exceptions import ClientError
 
 from airflow.configuration import conf
@@ -38,7 +37,7 @@ from airflow.providers.amazon.aws.triggers.bedrock import (
     BedrockKnowledgeBaseActiveTrigger,
     BedrockProvisionModelThroughputCompletedTrigger,
 )
-from airflow.providers.amazon.aws.utils import get_botocore_version, validate_execute_complete_event
+from airflow.providers.amazon.aws.utils import validate_execute_complete_event
 from airflow.providers.amazon.aws.utils.mixins import aws_template_fields
 from airflow.utils.helpers import prune_dict
 from airflow.utils.timezone import utcnow
@@ -799,24 +798,11 @@ class BedrockRaGOperator(AwsBaseOperator[BedrockAgentRuntimeHook]):
     def execute(self, context: Context) -> Any:
         self.validate_inputs()
 
-        try:
-            result = self.hook.conn.retrieve_and_generate(
-                input={"text": self.input},
-                retrieveAndGenerateConfiguration=self.build_rag_config(),
-                **self.rag_kwargs,
-            )
-        except botocore.exceptions.ParamValidationError as error:
-            if (
-                'Unknown parameter in retrieveAndGenerateConfiguration: "externalSourcesConfiguration"'
-                in str(error)
-            ) and (self.source_type == "EXTERNAL_SOURCES"):
-                self.log.error(
-                    "You are attempting to use External Sources and the BOTO API returned an "
-                    "error message which may indicate the need to update botocore to do this. \n"
-                    "Support for External Sources was added in botocore 1.34.90 and you are using botocore %s",
-                    ".".join(map(str, get_botocore_version())),
-                )
-            raise
+        result = self.hook.conn.retrieve_and_generate(
+            input={"text": self.input},
+            retrieveAndGenerateConfiguration=self.build_rag_config(),
+            **self.rag_kwargs,
+        )
 
         self.log.info(
             "\nPrompt: %s\nResponse: %s\nCitations: %s",
airflow/providers/amazon/aws/operators/emr.py
CHANGED
@@ -263,30 +263,34 @@ class EmrStartNotebookExecutionOperator(BaseOperator):
         wait_for_completion: bool = False,
         aws_conn_id: str | None = "aws_default",
         # TODO: waiter_max_attempts and waiter_delay should default to None when the other two are deprecated.
-        waiter_max_attempts: int | None
-        waiter_delay: int | None
-        waiter_countdown: int
-        waiter_check_interval_seconds: int =
+        waiter_max_attempts: int | None = None,
+        waiter_delay: int | None = None,
+        waiter_countdown: int | None = None,
+        waiter_check_interval_seconds: int | None = None,
         **kwargs: Any,
     ):
-        if
+        if waiter_check_interval_seconds:
             warnings.warn(
-                "The parameter
-                "naming conventions. Please use
+                "The parameter `waiter_check_interval_seconds` has been deprecated to "
+                "standardize naming conventions. Please use `waiter_delay` instead. In the "
                 "future this will default to None and defer to the waiter's default value.",
                 AirflowProviderDeprecationWarning,
                 stacklevel=2,
             )
-
-
+        else:
+            waiter_check_interval_seconds = 60
+        if waiter_countdown:
             warnings.warn(
-                "The parameter
-                "
+                "The parameter waiter_countdown has been deprecated to standardize "
+                "naming conventions. Please use waiter_max_attempts instead. In the "
                 "future this will default to None and defer to the waiter's default value.",
                 AirflowProviderDeprecationWarning,
                 stacklevel=2,
             )
-
+            # waiter_countdown defaults to never timing out, which is not supported
+            # by boto waiters, so we will set it here to "a very long time" for now.
+            waiter_max_attempts = (waiter_countdown or 999) // waiter_check_interval_seconds
+
         super().__init__(**kwargs)
         self.editor_id = editor_id
         self.relative_path = relative_path
@@ -298,8 +302,8 @@ class EmrStartNotebookExecutionOperator(BaseOperator):
         self.wait_for_completion = wait_for_completion
         self.cluster_id = cluster_id
         self.aws_conn_id = aws_conn_id
-        self.waiter_max_attempts = waiter_max_attempts
-        self.waiter_delay = waiter_delay
+        self.waiter_max_attempts = waiter_max_attempts or 25
+        self.waiter_delay = waiter_delay or waiter_check_interval_seconds or 60
         self.master_instance_security_group_id = master_instance_security_group_id
 
     def execute(self, context: Context):
@@ -387,36 +391,40 @@ class EmrStopNotebookExecutionOperator(BaseOperator):
         wait_for_completion: bool = False,
         aws_conn_id: str | None = "aws_default",
         # TODO: waiter_max_attempts and waiter_delay should default to None when the other two are deprecated.
-        waiter_max_attempts: int | None
-        waiter_delay: int | None
-        waiter_countdown: int
-        waiter_check_interval_seconds: int =
+        waiter_max_attempts: int | None = None,
+        waiter_delay: int | None = None,
+        waiter_countdown: int | None = None,
+        waiter_check_interval_seconds: int | None = None,
         **kwargs: Any,
     ):
-        if
+        if waiter_check_interval_seconds:
             warnings.warn(
-                "The parameter
-                "naming conventions. Please use
+                "The parameter `waiter_check_interval_seconds` has been deprecated to "
+                "standardize naming conventions. Please use `waiter_delay` instead. In the "
                 "future this will default to None and defer to the waiter's default value.",
                 AirflowProviderDeprecationWarning,
                 stacklevel=2,
             )
-
-
+        else:
+            waiter_check_interval_seconds = 60
+        if waiter_countdown:
             warnings.warn(
-                "The parameter
-                "
+                "The parameter waiter_countdown has been deprecated to standardize "
+                "naming conventions. Please use waiter_max_attempts instead. In the "
                 "future this will default to None and defer to the waiter's default value.",
                 AirflowProviderDeprecationWarning,
                 stacklevel=2,
             )
-
+            # waiter_countdown defaults to never timing out, which is not supported
+            # by boto waiters, so we will set it here to "a very long time" for now.
+            waiter_max_attempts = (waiter_countdown or 999) // waiter_check_interval_seconds
+
         super().__init__(**kwargs)
         self.notebook_execution_id = notebook_execution_id
         self.wait_for_completion = wait_for_completion
         self.aws_conn_id = aws_conn_id
-        self.waiter_max_attempts = waiter_max_attempts
-        self.waiter_delay = waiter_delay
+        self.waiter_max_attempts = waiter_max_attempts or 25
+        self.waiter_delay = waiter_delay or waiter_check_interval_seconds or 60
 
     def execute(self, context: Context) -> None:
         emr_hook = EmrHook(aws_conn_id=self.aws_conn_id)
@@ -822,8 +830,8 @@ class EmrCreateJobFlowOperator(BaseOperator):
             trigger=EmrCreateJobFlowTrigger(
                 job_flow_id=self._job_flow_id,
                 aws_conn_id=self.aws_conn_id,
-
-
+                waiter_delay=self.waiter_delay,
+                waiter_max_attempts=self.waiter_max_attempts,
             ),
             method_name="execute_complete",
             # timeout is set to ensure that if a trigger dies, the timeout does not restart