apache-airflow-providers-google 10.12.0rc1__py3-none-any.whl → 10.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airflow/providers/google/__init__.py +3 -3
- airflow/providers/google/cloud/fs/gcs.py +16 -13
- airflow/providers/google/cloud/hooks/bigquery_dts.py +2 -1
- airflow/providers/google/cloud/hooks/cloud_build.py +2 -1
- airflow/providers/google/cloud/hooks/cloud_composer.py +4 -3
- airflow/providers/google/cloud/hooks/compute_ssh.py +18 -6
- airflow/providers/google/cloud/hooks/dataflow.py +61 -3
- airflow/providers/google/cloud/hooks/dataplex.py +2 -1
- airflow/providers/google/cloud/hooks/dataproc.py +19 -18
- airflow/providers/google/cloud/hooks/gcs.py +10 -6
- airflow/providers/google/cloud/hooks/pubsub.py +3 -2
- airflow/providers/google/cloud/log/gcs_task_handler.py +2 -39
- airflow/providers/google/cloud/log/stackdriver_task_handler.py +2 -11
- airflow/providers/google/cloud/operators/bigquery.py +47 -47
- airflow/providers/google/cloud/operators/cloud_composer.py +1 -1
- airflow/providers/google/cloud/operators/cloud_run.py +3 -3
- airflow/providers/google/cloud/operators/dataflow.py +6 -0
- airflow/providers/google/cloud/operators/dataplex.py +530 -1
- airflow/providers/google/cloud/operators/dataproc.py +11 -11
- airflow/providers/google/cloud/operators/gcs.py +90 -15
- airflow/providers/google/cloud/operators/kubernetes_engine.py +2 -3
- airflow/providers/google/cloud/operators/pubsub.py +47 -55
- airflow/providers/google/cloud/secrets/secret_manager.py +22 -1
- airflow/providers/google/cloud/sensors/cloud_composer.py +14 -1
- airflow/providers/google/cloud/sensors/dataplex.py +118 -0
- airflow/providers/google/cloud/sensors/gcs.py +10 -1
- airflow/providers/google/cloud/transfers/adls_to_gcs.py +5 -5
- airflow/providers/google/cloud/transfers/gcs_to_gcs.py +42 -42
- airflow/providers/google/cloud/transfers/mssql_to_gcs.py +9 -9
- airflow/providers/google/cloud/triggers/cloud_run.py +7 -7
- airflow/providers/google/cloud/triggers/dataplex.py +82 -0
- airflow/providers/google/cloud/triggers/dataproc.py +2 -5
- airflow/providers/google/cloud/triggers/gcs.py +13 -3
- airflow/providers/google/cloud/triggers/kubernetes_engine.py +3 -1
- airflow/providers/google/common/hooks/base_google.py +6 -4
- airflow/providers/google/get_provider_info.py +14 -13
- {apache_airflow_providers_google-10.12.0rc1.dist-info → apache_airflow_providers_google-10.13.0.dist-info}/METADATA +31 -31
- {apache_airflow_providers_google-10.12.0rc1.dist-info → apache_airflow_providers_google-10.13.0.dist-info}/RECORD +40 -40
- {apache_airflow_providers_google-10.12.0rc1.dist-info → apache_airflow_providers_google-10.13.0.dist-info}/WHEEL +0 -0
- {apache_airflow_providers_google-10.12.0rc1.dist-info → apache_airflow_providers_google-10.13.0.dist-info}/entry_points.txt +0 -0
airflow/providers/google/cloud/operators/gcs.py

@@ -188,11 +188,11 @@ class GCSListObjectsOperator(GoogleCloudBaseOperator):
     folder in ``data`` bucket. ::

         GCS_Files = GCSListOperator(
-            task_id=
-            bucket=
-            prefix=
-            match_glob=
-            gcp_conn_id=google_cloud_conn_id
+            task_id="GCS_Files",
+            bucket="data",
+            prefix="sales/sales-2017/",
+            match_glob="**/*/.avro",
+            gcp_conn_id=google_cloud_conn_id,
         )
     """

@@ -313,6 +313,7 @@ class GCSDeleteObjectsOperator(GoogleCloudBaseOperator):
             )
             raise ValueError(err_message)

+        self._objects: list[str] = []
         super().__init__(**kwargs)

     def execute(self, context: Context) -> None:
@@ -322,13 +323,47 @@ class GCSDeleteObjectsOperator(GoogleCloudBaseOperator):
         )

         if self.objects is not None:
-
+            self._objects = self.objects
         else:
-
-            self.log.info("Deleting %s objects from %s", len(
-            for object_name in
+            self._objects = hook.list(bucket_name=self.bucket_name, prefix=self.prefix)
+        self.log.info("Deleting %s objects from %s", len(self._objects), self.bucket_name)
+        for object_name in self._objects:
             hook.delete(bucket_name=self.bucket_name, object_name=object_name)

+    def get_openlineage_facets_on_complete(self, task_instance):
+        """Implementing on_complete as execute() resolves object names."""
+        from openlineage.client.facet import (
+            LifecycleStateChange,
+            LifecycleStateChangeDatasetFacet,
+            LifecycleStateChangeDatasetFacetPreviousIdentifier,
+        )
+        from openlineage.client.run import Dataset
+
+        from airflow.providers.openlineage.extractors import OperatorLineage
+
+        if not self._objects:
+            return OperatorLineage()
+
+        bucket_url = f"gs://{self.bucket_name}"
+        input_datasets = [
+            Dataset(
+                namespace=bucket_url,
+                name=object_name,
+                facets={
+                    "lifecycleStateChange": LifecycleStateChangeDatasetFacet(
+                        lifecycleStateChange=LifecycleStateChange.DROP.value,
+                        previousIdentifier=LifecycleStateChangeDatasetFacetPreviousIdentifier(
+                            namespace=bucket_url,
+                            name=object_name,
+                        ),
+                    )
+                },
+            )
+            for object_name in self._objects
+        ]
+
+        return OperatorLineage(inputs=input_datasets)
+

 class GCSBucketCreateAclEntryOperator(GoogleCloudBaseOperator):
     """
@@ -596,6 +631,22 @@ class GCSFileTransformOperator(GoogleCloudBaseOperator):
                 filename=destination_file.name,
             )

+    def get_openlineage_facets_on_start(self):
+        from openlineage.client.run import Dataset
+
+        from airflow.providers.openlineage.extractors import OperatorLineage
+
+        input_dataset = Dataset(
+            namespace=f"gs://{self.source_bucket}",
+            name=self.source_object,
+        )
+        output_dataset = Dataset(
+            namespace=f"gs://{self.destination_bucket}",
+            name=self.destination_object,
+        )
+
+        return OperatorLineage(inputs=[input_dataset], outputs=[output_dataset])
+

 class GCSTimeSpanFileTransformOperator(GoogleCloudBaseOperator):
     """
@@ -722,6 +773,9 @@ class GCSTimeSpanFileTransformOperator(GoogleCloudBaseOperator):
         self.upload_continue_on_fail = upload_continue_on_fail
         self.upload_num_attempts = upload_num_attempts

+        self._source_object_names: list[str] = []
+        self._destination_object_names: list[str] = []
+
     def execute(self, context: Context) -> list[str]:
         # Define intervals and prefixes.
         try:
@@ -773,7 +827,7 @@ class GCSTimeSpanFileTransformOperator(GoogleCloudBaseOperator):
         )

         # Fetch list of files.
-
+        self._source_object_names = source_hook.list_by_timespan(
             bucket_name=self.source_bucket,
             prefix=source_prefix_interp,
             timespan_start=timespan_start,
@@ -785,7 +839,7 @@ class GCSTimeSpanFileTransformOperator(GoogleCloudBaseOperator):
             temp_output_dir_path = Path(temp_output_dir)

             # TODO: download in parallel.
-            for blob_to_transform in
+            for blob_to_transform in self._source_object_names:
                 destination_file = temp_input_dir_path / blob_to_transform
                 destination_file.parent.mkdir(parents=True, exist_ok=True)
                 try:
@@ -822,8 +876,6 @@ class GCSTimeSpanFileTransformOperator(GoogleCloudBaseOperator):

             self.log.info("Transformation succeeded. Output temporarily located at %s", temp_output_dir_path)

-            files_uploaded = []
-
             # TODO: upload in parallel.
             for upload_file in temp_output_dir_path.glob("**/*"):
                 if upload_file.is_dir():
@@ -844,12 +896,35 @@ class GCSTimeSpanFileTransformOperator(GoogleCloudBaseOperator):
                         chunk_size=self.chunk_size,
                         num_max_attempts=self.upload_num_attempts,
                     )
-
+                    self._destination_object_names.append(str(upload_file_name))
                 except GoogleCloudError:
                     if not self.upload_continue_on_fail:
                         raise

-        return
+        return self._destination_object_names
+
+    def get_openlineage_facets_on_complete(self, task_instance):
+        """Implementing on_complete as execute() resolves object names."""
+        from openlineage.client.run import Dataset
+
+        from airflow.providers.openlineage.extractors import OperatorLineage
+
+        input_datasets = [
+            Dataset(
+                namespace=f"gs://{self.source_bucket}",
+                name=object_name,
+            )
+            for object_name in self._source_object_names
+        ]
+        output_datasets = [
+            Dataset(
+                namespace=f"gs://{self.destination_bucket}",
+                name=object_name,
+            )
+            for object_name in self._destination_object_names
+        ]
+
+        return OperatorLineage(inputs=input_datasets, outputs=output_datasets)


 class GCSDeleteBucketOperator(GoogleCloudBaseOperator):
airflow/providers/google/cloud/operators/kubernetes_engine.py

@@ -192,15 +192,14 @@ class GKECreateClusterOperator(GoogleCloudBaseOperator):
     The **minimum** required to define a cluster to create is:

     ``dict()`` ::
-        cluster_def = {
-            'initial_node_count': 1}
+        cluster_def = {"name": "my-cluster-name", "initial_node_count": 1}

     or

     ``Cluster`` proto ::
         from google.cloud.container_v1.types import Cluster

-        cluster_def = Cluster(name=
+        cluster_def = Cluster(name="my-cluster-name", initial_node_count=1)

     **Operator Creation**: ::

airflow/providers/google/cloud/operators/pubsub.py

@@ -58,24 +58,22 @@ class PubSubCreateTopicOperator(GoogleCloudBaseOperator):
     By default, if the topic already exists, this operator will
     not cause the DAG to fail. ::

-        with DAG(
-            (
-
-
-
+        with DAG("successful DAG") as dag:
+            create_topic = PubSubCreateTopicOperator(project_id="my-project", topic="my_new_topic")
+            create_topic_again = PubSubCreateTopicOperator(project_id="my-project", topic="my_new_topic")
+
+            create_topic >> create_topic_again

     The operator can be configured to fail if the topic already exists. ::

-        with DAG(
-            (
-
-
-            project_id='my-project',
-            topic='my_new_topic',
-            fail_if_exists=True,
-        )
+        with DAG("failing DAG") as dag:
+            create_topic = PubSubCreateTopicOperator(project_id="my-project", topic="my_new_topic")
+            create_topic_again = PubSubCreateTopicOperator(
+                project_id="my-project", topic="my_new_topic", fail_if_exists=True
             )

+            create_topic >> create_topic_again
+
     Both ``project_id`` and ``topic`` are templated so you can use Jinja templating in their values.

     :param project_id: Optional, the Google Cloud project ID where the topic will be created.
@@ -197,43 +195,35 @@ class PubSubCreateSubscriptionOperator(GoogleCloudBaseOperator):
     By default, if the subscription already exists, this operator will
     not cause the DAG to fail. However, the topic must exist in the project. ::

-        with DAG(
-            (
-
-
-
-
-            )
-            >> PubSubCreateSubscriptionOperator(
-                project_id='my-project',
-                topic='my-topic',
-                subscription='my-subscription',
-            )
+        with DAG("successful DAG") as dag:
+            create_subscription = PubSubCreateSubscriptionOperator(
+                project_id="my-project", topic="my-topic", subscription="my-subscription"
+            )
+            create_subscription_again = PubSubCreateSubscriptionOperator(
+                project_id="my-project", topic="my-topic", subscription="my-subscription"
             )

+            create_subscription >> create_subscription_again
+
+
     The operator can be configured to fail if the subscription already exists.
     ::

-        with DAG(
-            (
-
-            project_id='my-project',
-            topic='my-topic',
-            subscription='my-subscription',
-            )
-            >> PubSubCreateSubscriptionOperator(
-                project_id='my-project',
-                topic='my-topic',
-                subscription='my-subscription',
-                fail_if_exists=True,
-            )
+        with DAG("failing DAG") as dag:
+            create_subscription = PubSubCreateSubscriptionOperator(
+                project_id="my-project", topic="my-topic", subscription="my-subscription"
             )
+            create_subscription_again = PubSubCreateSubscriptionOperator(
+                project_id="my-project", topic="my-topic", subscription="my-subscription", fail_if_exists=True
+            )
+
+            create_subscription >> create_subscription_again

     Finally, subscription is not required. If not passed, the operator will
     generated a universally unique identifier for the subscription's name. ::

-        with DAG(
-            PubSubCreateSubscriptionOperator(project_id=
+        with DAG("DAG") as dag:
+            PubSubCreateSubscriptionOperator(project_id="my-project", topic="my-topic")

     ``project_id``, ``topic``, ``subscription``, ``subscription_project_id`` and
     ``impersonation_chain`` are templated so you can use Jinja templating in their values.
@@ -410,14 +400,16 @@ class PubSubDeleteTopicOperator(GoogleCloudBaseOperator):
     By default, if the topic does not exist, this operator will
     not cause the DAG to fail. ::

-        with DAG(
-            PubSubDeleteTopicOperator(project_id=
+        with DAG("successful DAG") as dag:
+            PubSubDeleteTopicOperator(project_id="my-project", topic="non_existing_topic")

     The operator can be configured to fail if the topic does not exist. ::

-        with DAG(
+        with DAG("failing DAG") as dag:
             PubSubDeleteTopicOperator(
-                project_id=
+                project_id="my-project",
+                topic="non_existing_topic",
+                fail_if_not_exists=True,
             )

     Both ``project_id`` and ``topic`` are templated so you can use Jinja templating in their values.
@@ -506,16 +498,18 @@ class PubSubDeleteSubscriptionOperator(GoogleCloudBaseOperator):
     By default, if the subscription does not exist, this operator will
     not cause the DAG to fail. ::

-        with DAG(
-            PubSubDeleteSubscriptionOperator(project_id=
+        with DAG("successful DAG") as dag:
+            PubSubDeleteSubscriptionOperator(project_id="my-project", subscription="non-existing")

     The operator can be configured to fail if the subscription already exists.

     ::

-        with DAG(
+        with DAG("failing DAG") as dag:
             PubSubDeleteSubscriptionOperator(
-                project_id=
+                project_id="my-project",
+                subscription="non-existing",
+                fail_if_not_exists=True,
             )

     ``project_id``, and ``subscription`` are templated so you can use Jinja templating in their values.
@@ -605,15 +599,13 @@ class PubSubPublishMessageOperator(GoogleCloudBaseOperator):
     in a single Google Cloud project. If the topic does not exist, this
     task will fail. ::

-        m1 = {
-
-
-        m2 = {'data': b'Knock, knock'}
-        m3 = {'attributes': {'foo': ''}}
+        m1 = {"data": b"Hello, World!", "attributes": {"type": "greeting"}}
+        m2 = {"data": b"Knock, knock"}
+        m3 = {"attributes": {"foo": ""}}

         t1 = PubSubPublishMessageOperator(
-            project_id=
-            topic=
+            project_id="my-project",
+            topic="my_topic",
             messages=[m1, m2, m3],
             create_topic=True,
             dag=dag,
airflow/providers/google/cloud/secrets/secret_manager.py

@@ -20,12 +20,16 @@ from __future__ import annotations
 import logging
 import re
 import warnings
+from typing import Sequence

 from google.auth.exceptions import DefaultCredentialsError

 from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning
 from airflow.providers.google.cloud._internal_client.secret_manager_client import _SecretManagerClient
-from airflow.providers.google.cloud.utils.credentials_provider import
+from airflow.providers.google.cloud.utils.credentials_provider import (
+    _get_target_principal_and_delegates,
+    get_credentials_and_project_id,
+)
 from airflow.secrets import BaseSecretsBackend
 from airflow.utils.log.logging_mixin import LoggingMixin

@@ -76,6 +80,14 @@ class CloudSecretManagerBackend(BaseSecretsBackend, LoggingMixin):
     :param project_id: Project ID to read the secrets from. If not passed, the project ID from credentials
         will be used.
     :param sep: Separator used to concatenate connections_prefix and conn_id. Default: "-"
+    :param impersonation_chain: Optional service account to impersonate using
+        short-term credentials, or chained list of accounts required to get the
+        access token of the last account in the list, which will be impersonated
+        in the request. If set as a string, the account must grant the
+        originating account the Service Account Token Creator IAM role. If set
+        as a sequence, the identities from the list must grant Service Account
+        Token Creator IAM role to the directly preceding identity, with first
+        account from the list granting this role to the originating account.
     """

     def __init__(
@@ -89,6 +101,7 @@ class CloudSecretManagerBackend(BaseSecretsBackend, LoggingMixin):
         gcp_scopes: str | None = None,
         project_id: str | None = None,
         sep: str = "-",
+        impersonation_chain: str | Sequence[str] | None = None,
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
@@ -103,11 +116,19 @@ class CloudSecretManagerBackend(BaseSecretsBackend, LoggingMixin):
                 f"follows that pattern {SECRET_ID_PATTERN}"
             )
         try:
+            if impersonation_chain:
+                target_principal, delegates = _get_target_principal_and_delegates(impersonation_chain)
+            else:
+                target_principal = None
+                delegates = None
+
             self.credentials, self.project_id = get_credentials_and_project_id(
                 keyfile_dict=gcp_keyfile_dict,
                 key_path=gcp_key_path,
                 credential_config_file=gcp_credential_config_file,
                 scopes=gcp_scopes,
+                target_principal=target_principal,
+                delegates=delegates,
             )
         except (DefaultCredentialsError, FileNotFoundError):
             log.exception(
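
The three hunks above add ``impersonation_chain`` support to CloudSecretManagerBackend. In a deployment this is typically supplied through ``backend_kwargs`` in the ``[secrets]`` section of airflow.cfg; a direct instantiation sketch (account, project and prefix values are illustrative):

    from airflow.providers.google.cloud.secrets.secret_manager import CloudSecretManagerBackend

    # The impersonated account (or each link of a chained list) must have the
    # Service Account Token Creator role granted as described in the new docstring.
    backend = CloudSecretManagerBackend(
        connections_prefix="airflow-connections",
        project_id="my-project",  # illustrative project
        impersonation_chain="workloads-sa@my-project.iam.gserviceaccount.com",
    )

    # Secret Manager is then queried with the impersonated credentials; with the
    # default variables_prefix this resolves the secret "airflow-variables-smtp_password".
    smtp_password = backend.get_variable("smtp_password")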
airflow/providers/google/cloud/sensors/cloud_composer.py

@@ -19,9 +19,10 @@

 from __future__ import annotations

+import warnings
 from typing import TYPE_CHECKING, Any, Sequence

-from airflow.exceptions import AirflowException, AirflowSkipException
+from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning, AirflowSkipException
 from airflow.providers.google.cloud.triggers.cloud_composer import CloudComposerExecutionTrigger
 from airflow.sensors.base import BaseSensorOperator

@@ -33,6 +34,11 @@ class CloudComposerEnvironmentSensor(BaseSensorOperator):
     """
     Check the status of the Cloud Composer Environment task.

+    This Sensor is deprecated. You can achieve the same functionality by using Cloud Composer Operators
+    CloudComposerCreateEnvironmentOperator, CloudComposerDeleteEnvironmentOperator and
+    CloudComposerUpdateEnvironmentOperator in deferrable or non-deferrable mode, since every operator
+    gives user a possibility to wait (asynchronously or synchronously) until Operation will be finished.
+
     :param project_id: Required. The ID of the Google Cloud project that the service belongs to.
     :param region: Required. The ID of the Google Cloud region that the service belongs to.
     :param operation_name: The name of the operation resource
@@ -59,6 +65,13 @@ class CloudComposerEnvironmentSensor(BaseSensorOperator):
         pooling_period_seconds: int = 30,
         **kwargs,
     ):
+        warnings.warn(
+            f"The `{self.__class__.__name__}` operator is deprecated. You can achieve the same functionality "
+            f"by using operators in deferrable or non-deferrable mode, since every operator for Cloud "
+            f"Composer will wait for the operation to complete.",
+            AirflowProviderDeprecationWarning,
+            stacklevel=2,
+        )
         super().__init__(**kwargs)
         self.project_id = project_id
         self.region = region
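
Because the sensor is deprecated, the replacement suggested by the warning above is to let the Cloud Composer operators wait for the long-running operation themselves. A sketch with illustrative IDs and environment spec (assuming the ``deferrable`` flag the deprecation message refers to):

    from airflow.providers.google.cloud.operators.cloud_composer import (
        CloudComposerCreateEnvironmentOperator,
    )

    # No separate CloudComposerEnvironmentSensor is needed: the operator itself
    # waits for the create operation, here asynchronously via deferrable mode.
    create_environment = CloudComposerCreateEnvironmentOperator(
        task_id="create_environment",
        project_id="my-project",  # illustrative project
        region="us-central1",  # illustrative region
        environment_id="example-environment",
        environment={"config": {"software_config": {"image_version": "composer-2-airflow-2"}}},
        deferrable=True,
    )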
airflow/providers/google/cloud/sensors/dataplex.py

@@ -259,3 +259,121 @@ class DataplexDataQualityJobStatusSensor(BaseSensorOperator):
                 raise AirflowSkipException(message)
             raise AirflowDataQualityScanException(message)
         return job_status == DataScanJob.State.SUCCEEDED
+
+
+class DataplexDataProfileJobStatusSensor(BaseSensorOperator):
+    """
+    Check the status of the Dataplex DataProfile job.
+
+    :param project_id: Required. The ID of the Google Cloud project that the task belongs to.
+    :param region: Required. The ID of the Google Cloud region that the task belongs to.
+    :param data_scan_id: Required. Data Quality scan identifier.
+    :param job_id: Required. Job ID.
+    :param api_version: The version of the api that will be requested for example 'v3'.
+    :param retry: A retry object used to retry requests. If `None` is specified, requests
+        will not be retried.
+    :param metadata: Additional metadata that is provided to the method.
+    :param gcp_conn_id: The connection ID to use when fetching connection info.
+    :param impersonation_chain: Optional service account to impersonate using short-term
+        credentials, or chained list of accounts required to get the access_token
+        of the last account in the list, which will be impersonated in the request.
+        If set as a string, the account must grant the originating account
+        the Service Account Token Creator IAM role.
+        If set as a sequence, the identities from the list must grant
+        Service Account Token Creator IAM role to the directly preceding identity, with first
+        account from the list granting this role to the originating account (templated).
+    :param result_timeout: Value in seconds for which operator will wait for the Data Quality scan result.
+        Throws exception if there is no result found after specified amount of seconds.
+
+    :return: Boolean indicating if the job run has reached the ``DataScanJob.State.SUCCEEDED``.
+    """
+
+    template_fields = ["job_id"]
+
+    def __init__(
+        self,
+        project_id: str,
+        region: str,
+        data_scan_id: str,
+        job_id: str,
+        api_version: str = "v1",
+        retry: Retry | _MethodDefault = DEFAULT,
+        metadata: Sequence[tuple[str, str]] = (),
+        gcp_conn_id: str = "google_cloud_default",
+        impersonation_chain: str | Sequence[str] | None = None,
+        result_timeout: float = 60.0 * 10,
+        start_sensor_time: float | None = None,
+        *args,
+        **kwargs,
+    ) -> None:
+        super().__init__(*args, **kwargs)
+        self.project_id = project_id
+        self.region = region
+        self.data_scan_id = data_scan_id
+        self.job_id = job_id
+        self.api_version = api_version
+        self.retry = retry
+        self.metadata = metadata
+        self.gcp_conn_id = gcp_conn_id
+        self.impersonation_chain = impersonation_chain
+        self.result_timeout = result_timeout
+        self.start_sensor_time = start_sensor_time
+
+    def _duration(self):
+        if not self.start_sensor_time:
+            self.start_sensor_time = time.monotonic()
+        return time.monotonic() - self.start_sensor_time
+
+    def poke(self, context: Context) -> bool:
+        self.log.info("Waiting for job %s to be %s", self.job_id, DataScanJob.State.SUCCEEDED)
+        if self.result_timeout:
+            duration = self._duration()
+            if duration > self.result_timeout:
+                # TODO: remove this if check when min_airflow_version is set to higher than 2.7.1
+                message = (
+                    f"Timeout: Data Profile scan {self.job_id} is not ready after {self.result_timeout}s"
+                )
+                if self.soft_fail:
+                    raise AirflowSkipException(message)
+                raise AirflowDataQualityScanResultTimeoutException(message)
+
+        hook = DataplexHook(
+            gcp_conn_id=self.gcp_conn_id,
+            api_version=self.api_version,
+            impersonation_chain=self.impersonation_chain,
+        )
+
+        try:
+            job = hook.get_data_scan_job(
+                project_id=self.project_id,
+                region=self.region,
+                data_scan_id=self.data_scan_id,
+                job_id=self.job_id,
+                timeout=self.timeout,
+                retry=self.retry,
+                metadata=self.metadata,
+            )
+        except GoogleAPICallError as e:
+            # TODO: remove this if check when min_airflow_version is set to higher than 2.7.1
+            message = f"Error occurred when trying to retrieve Data Profile scan job: {self.data_scan_id}"
+            if self.soft_fail:
+                raise AirflowSkipException(message, e)
+            raise AirflowException(message, e)
+
+        job_status = job.state
+        self.log.info(
+            "Current status of the Dataplex Data Profile scan job %s => %s", self.job_id, job_status
+        )
+        if job_status == DataScanJob.State.FAILED:
+            # TODO: remove this if check when min_airflow_version is set to higher than 2.7.1
+            message = f"Data Profile scan job failed: {self.job_id}"
+            if self.soft_fail:
+                raise AirflowSkipException(message)
+            raise AirflowException(message)
+        if job_status == DataScanJob.State.CANCELLED:
+            # TODO: remove this if check when min_airflow_version is set to higher than 2.7.1
+            message = f"Data Profile scan job cancelled: {self.job_id}"
+            if self.soft_fail:
+                raise AirflowSkipException(message)
+            raise AirflowException(message)
+        return job_status == DataScanJob.State.SUCCEEDED
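
A usage sketch for the new DataplexDataProfileJobStatusSensor (IDs below are illustrative; ``job_id`` is a templated field, so it can be pulled from the XCom of whichever task started the Data Profile scan):

    from airflow.providers.google.cloud.sensors.dataplex import DataplexDataProfileJobStatusSensor

    wait_for_profile_scan = DataplexDataProfileJobStatusSensor(
        task_id="wait_for_profile_scan",
        project_id="my-project",  # illustrative project
        region="us-central1",  # illustrative region
        data_scan_id="example-profile-scan",
        # Hypothetical upstream task that triggered the scan and pushed the job id.
        job_id="{{ task_instance.xcom_pull('run_profile_scan') }}",
        poke_interval=60,  # standard BaseSensorOperator parameter
        result_timeout=600,
    )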
airflow/providers/google/cloud/sensors/gcs.py

@@ -50,6 +50,7 @@ class GCSObjectExistenceSensor(BaseSensorOperator):
     :param bucket: The Google Cloud Storage bucket where the object is.
     :param object: The name of the object to check in the Google cloud
         storage bucket.
+    :param use_glob: When set to True the object parameter is interpreted as glob
     :param google_cloud_conn_id: The connection ID to use when
         connecting to Google Cloud Storage.
     :param impersonation_chain: Optional service account to impersonate using short-term
@@ -75,6 +76,7 @@ class GCSObjectExistenceSensor(BaseSensorOperator):
         *,
         bucket: str,
         object: str,
+        use_glob: bool = False,
         google_cloud_conn_id: str = "google_cloud_default",
         impersonation_chain: str | Sequence[str] | None = None,
         retry: Retry = DEFAULT_RETRY,
@@ -84,7 +86,9 @@ class GCSObjectExistenceSensor(BaseSensorOperator):
         super().__init__(**kwargs)
         self.bucket = bucket
         self.object = object
+        self.use_glob = use_glob
         self.google_cloud_conn_id = google_cloud_conn_id
+        self._matches: list[str] = []
         self.impersonation_chain = impersonation_chain
         self.retry = retry

@@ -96,7 +100,11 @@ class GCSObjectExistenceSensor(BaseSensorOperator):
             gcp_conn_id=self.google_cloud_conn_id,
             impersonation_chain=self.impersonation_chain,
         )
-
+        if self.use_glob:
+            self._matches = hook.list(self.bucket, match_glob=self.object)
+            return bool(self._matches)
+        else:
+            return hook.exists(self.bucket, self.object, self.retry)

     def execute(self, context: Context) -> None:
         """Airflow runs this method on the worker and defers using the trigger."""
@@ -109,6 +117,7 @@ class GCSObjectExistenceSensor(BaseSensorOperator):
                 trigger=GCSBlobTrigger(
                     bucket=self.bucket,
                     object_name=self.object,
+                    use_glob=self.use_glob,
                     poke_interval=self.poke_interval,
                     google_cloud_conn_id=self.google_cloud_conn_id,
                     hook_params={
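
A usage sketch of the new ``use_glob`` flag (bucket and pattern are illustrative): when it is enabled, the ``object`` value is forwarded as ``match_glob`` to ``GCSHook.list()`` and the sensor succeeds as soon as at least one object matches.

    from airflow.providers.google.cloud.sensors.gcs import GCSObjectExistenceSensor

    wait_for_any_export = GCSObjectExistenceSensor(
        task_id="wait_for_any_export",
        bucket="my-example-bucket",  # illustrative bucket
        object="exports/2024-01-01/*.parquet",  # interpreted as a glob pattern
        use_glob=True,
    )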
airflow/providers/google/cloud/transfers/adls_to_gcs.py

@@ -58,12 +58,12 @@ class ADLSToGCSOperator(ADLSListOperator):
     resulting gcs path will be ``gs://mybucket/hello/world.avro`` ::

         copy_single_file = AdlsToGoogleCloudStorageOperator(
-            task_id=
-            src_adls=
-            dest_gcs=
+            task_id="copy_single_file",
+            src_adls="hello/world.avro",
+            dest_gcs="gs://mybucket",
             replace=False,
-            azure_data_lake_conn_id=
-            gcp_conn_id=
+            azure_data_lake_conn_id="azure_data_lake_default",
+            gcp_conn_id="google_cloud_default",
         )

     The following Operator would copy all parquet files from ADLS
|