apache-airflow-providers-google 10.12.0rc1__py3-none-any.whl → 10.13.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airflow/providers/google/__init__.py +3 -3
- airflow/providers/google/cloud/fs/gcs.py +16 -13
- airflow/providers/google/cloud/hooks/bigquery_dts.py +2 -1
- airflow/providers/google/cloud/hooks/cloud_build.py +2 -1
- airflow/providers/google/cloud/hooks/cloud_composer.py +4 -3
- airflow/providers/google/cloud/hooks/compute_ssh.py +18 -6
- airflow/providers/google/cloud/hooks/dataflow.py +1 -1
- airflow/providers/google/cloud/hooks/dataplex.py +2 -1
- airflow/providers/google/cloud/hooks/dataproc.py +19 -18
- airflow/providers/google/cloud/hooks/gcs.py +2 -0
- airflow/providers/google/cloud/operators/cloud_composer.py +1 -1
- airflow/providers/google/cloud/operators/cloud_run.py +3 -3
- airflow/providers/google/cloud/operators/dataplex.py +530 -1
- airflow/providers/google/cloud/operators/dataproc.py +10 -8
- airflow/providers/google/cloud/operators/gcs.py +85 -10
- airflow/providers/google/cloud/secrets/secret_manager.py +22 -1
- airflow/providers/google/cloud/sensors/cloud_composer.py +14 -1
- airflow/providers/google/cloud/sensors/dataplex.py +118 -0
- airflow/providers/google/cloud/triggers/cloud_run.py +7 -7
- airflow/providers/google/cloud/triggers/dataplex.py +82 -0
- airflow/providers/google/cloud/triggers/dataproc.py +2 -5
- airflow/providers/google/common/hooks/base_google.py +6 -4
- airflow/providers/google/get_provider_info.py +11 -10
- {apache_airflow_providers_google-10.12.0rc1.dist-info → apache_airflow_providers_google-10.13.0rc2.dist-info}/METADATA +24 -24
- {apache_airflow_providers_google-10.12.0rc1.dist-info → apache_airflow_providers_google-10.13.0rc2.dist-info}/RECORD +27 -27
- {apache_airflow_providers_google-10.12.0rc1.dist-info → apache_airflow_providers_google-10.13.0rc2.dist-info}/WHEEL +0 -0
- {apache_airflow_providers_google-10.12.0rc1.dist-info → apache_airflow_providers_google-10.13.0rc2.dist-info}/entry_points.txt +0 -0
airflow/providers/google/cloud/operators/gcs.py

@@ -313,6 +313,7 @@ class GCSDeleteObjectsOperator(GoogleCloudBaseOperator):
             )
             raise ValueError(err_message)
 
+        self._objects: list[str] = []
         super().__init__(**kwargs)
 
     def execute(self, context: Context) -> None:
@@ -322,13 +323,47 @@ class GCSDeleteObjectsOperator(GoogleCloudBaseOperator):
         )
 
         if self.objects is not None:
-            objects = self.objects
+            self._objects = self.objects
         else:
-            objects = hook.list(bucket_name=self.bucket_name, prefix=self.prefix)
-        self.log.info("Deleting %s objects from %s", len(objects), self.bucket_name)
-        for object_name in objects:
+            self._objects = hook.list(bucket_name=self.bucket_name, prefix=self.prefix)
+        self.log.info("Deleting %s objects from %s", len(self._objects), self.bucket_name)
+        for object_name in self._objects:
             hook.delete(bucket_name=self.bucket_name, object_name=object_name)
 
+    def get_openlineage_facets_on_complete(self, task_instance):
+        """Implementing on_complete as execute() resolves object names."""
+        from openlineage.client.facet import (
+            LifecycleStateChange,
+            LifecycleStateChangeDatasetFacet,
+            LifecycleStateChangeDatasetFacetPreviousIdentifier,
+        )
+        from openlineage.client.run import Dataset
+
+        from airflow.providers.openlineage.extractors import OperatorLineage
+
+        if not self._objects:
+            return OperatorLineage()
+
+        bucket_url = f"gs://{self.bucket_name}"
+        input_datasets = [
+            Dataset(
+                namespace=bucket_url,
+                name=object_name,
+                facets={
+                    "lifecycleStateChange": LifecycleStateChangeDatasetFacet(
+                        lifecycleStateChange=LifecycleStateChange.DROP.value,
+                        previousIdentifier=LifecycleStateChangeDatasetFacetPreviousIdentifier(
+                            namespace=bucket_url,
+                            name=object_name,
+                        ),
+                    )
+                },
+            )
+            for object_name in self._objects
+        ]
+
+        return OperatorLineage(inputs=input_datasets)
+
 
 class GCSBucketCreateAclEntryOperator(GoogleCloudBaseOperator):
     """
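
For context, a minimal usage sketch of the operator after this change; the bucket, prefix, and task_id are illustrative. With the OpenLineage provider installed, the object names resolved during execute() are now reported as input datasets carrying a DROP lifecycle-state facet:

    from airflow.providers.google.cloud.operators.gcs import GCSDeleteObjectsOperator

    # Deletes every object under the prefix; the resolved names are stashed in
    # self._objects so lineage can be emitted after execute() completes.
    delete_staging = GCSDeleteObjectsOperator(
        task_id="delete_staging",
        bucket_name="example-bucket",
        prefix="staging/2023-12-01/",
    )
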
@@ -596,6 +631,22 @@ class GCSFileTransformOperator(GoogleCloudBaseOperator):
             filename=destination_file.name,
         )
 
+    def get_openlineage_facets_on_start(self):
+        from openlineage.client.run import Dataset
+
+        from airflow.providers.openlineage.extractors import OperatorLineage
+
+        input_dataset = Dataset(
+            namespace=f"gs://{self.source_bucket}",
+            name=self.source_object,
+        )
+        output_dataset = Dataset(
+            namespace=f"gs://{self.destination_bucket}",
+            name=self.destination_object,
+        )
+
+        return OperatorLineage(inputs=[input_dataset], outputs=[output_dataset])
+
 
 class GCSTimeSpanFileTransformOperator(GoogleCloudBaseOperator):
     """
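
Unlike the delete operator above, both sides of this transform are known before execution (they are plain operator fields), so lineage can be computed on start. A usage sketch with illustrative values:

    from airflow.providers.google.cloud.operators.gcs import GCSFileTransformOperator

    # One input object and one output object, both known up front.
    transform = GCSFileTransformOperator(
        task_id="transform",
        source_bucket="example-bucket",
        source_object="raw/data.csv",
        destination_bucket="example-bucket",
        destination_object="clean/data.csv",
        transform_script=["python", "/opt/scripts/transform.py"],
    )
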
@@ -722,6 +773,9 @@ class GCSTimeSpanFileTransformOperator(GoogleCloudBaseOperator):
         self.upload_continue_on_fail = upload_continue_on_fail
         self.upload_num_attempts = upload_num_attempts
 
+        self._source_object_names: list[str] = []
+        self._destination_object_names: list[str] = []
+
     def execute(self, context: Context) -> list[str]:
         # Define intervals and prefixes.
         try:
@@ -773,7 +827,7 @@ class GCSTimeSpanFileTransformOperator(GoogleCloudBaseOperator):
         )
 
         # Fetch list of files.
-        blobs_to_transform = source_hook.list_by_timespan(
+        self._source_object_names = source_hook.list_by_timespan(
             bucket_name=self.source_bucket,
             prefix=source_prefix_interp,
             timespan_start=timespan_start,
@@ -785,7 +839,7 @@ class GCSTimeSpanFileTransformOperator(GoogleCloudBaseOperator):
             temp_output_dir_path = Path(temp_output_dir)
 
             # TODO: download in parallel.
-            for blob_to_transform in blobs_to_transform:
+            for blob_to_transform in self._source_object_names:
                 destination_file = temp_input_dir_path / blob_to_transform
                 destination_file.parent.mkdir(parents=True, exist_ok=True)
                 try:
@@ -822,8 +876,6 @@ class GCSTimeSpanFileTransformOperator(GoogleCloudBaseOperator):
 
             self.log.info("Transformation succeeded. Output temporarily located at %s", temp_output_dir_path)
 
-            files_uploaded = []
-
             # TODO: upload in parallel.
             for upload_file in temp_output_dir_path.glob("**/*"):
                 if upload_file.is_dir():
@@ -844,12 +896,35 @@ class GCSTimeSpanFileTransformOperator(GoogleCloudBaseOperator):
                         chunk_size=self.chunk_size,
                         num_max_attempts=self.upload_num_attempts,
                     )
-                    files_uploaded.append(str(upload_file_name))
+                    self._destination_object_names.append(str(upload_file_name))
                 except GoogleCloudError:
                     if not self.upload_continue_on_fail:
                         raise
 
-        return files_uploaded
+        return self._destination_object_names
+
+    def get_openlineage_facets_on_complete(self, task_instance):
+        """Implementing on_complete as execute() resolves object names."""
+        from openlineage.client.run import Dataset
+
+        from airflow.providers.openlineage.extractors import OperatorLineage
+
+        input_datasets = [
+            Dataset(
+                namespace=f"gs://{self.source_bucket}",
+                name=object_name,
+            )
+            for object_name in self._source_object_names
+        ]
+        output_datasets = [
+            Dataset(
+                namespace=f"gs://{self.destination_bucket}",
+                name=object_name,
+            )
+            for object_name in self._destination_object_names
+        ]
+
+        return OperatorLineage(inputs=input_datasets, outputs=output_datasets)
 
 
 class GCSDeleteBucketOperator(GoogleCloudBaseOperator):
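
Here on_complete is used again because the object names only exist after execute() has listed the timespan and uploaded the results; execute() still returns the uploaded names (now from self._destination_object_names), so XCom consumers are unaffected. A sketch of the lineage produced for a single transformed file, with illustrative buckets and object names:

    from openlineage.client.run import Dataset

    from airflow.providers.openlineage.extractors import OperatorLineage

    # One source blob mapped to one uploaded blob.
    lineage = OperatorLineage(
        inputs=[Dataset(namespace="gs://example-src-bucket", name="2023-12-01/a.csv")],
        outputs=[Dataset(namespace="gs://example-dest-bucket", name="out/a.csv")],
    )
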
airflow/providers/google/cloud/secrets/secret_manager.py

@@ -20,12 +20,16 @@ from __future__ import annotations
 import logging
 import re
 import warnings
+from typing import Sequence
 
 from google.auth.exceptions import DefaultCredentialsError
 
 from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning
 from airflow.providers.google.cloud._internal_client.secret_manager_client import _SecretManagerClient
-from airflow.providers.google.cloud.utils.credentials_provider import get_credentials_and_project_id
+from airflow.providers.google.cloud.utils.credentials_provider import (
+    _get_target_principal_and_delegates,
+    get_credentials_and_project_id,
+)
 from airflow.secrets import BaseSecretsBackend
 from airflow.utils.log.logging_mixin import LoggingMixin
 
@@ -76,6 +80,14 @@ class CloudSecretManagerBackend(BaseSecretsBackend, LoggingMixin):
     :param project_id: Project ID to read the secrets from. If not passed, the project ID from credentials
         will be used.
     :param sep: Separator used to concatenate connections_prefix and conn_id. Default: "-"
+    :param impersonation_chain: Optional service account to impersonate using
+        short-term credentials, or chained list of accounts required to get the
+        access token of the last account in the list, which will be impersonated
+        in the request. If set as a string, the account must grant the
+        originating account the Service Account Token Creator IAM role. If set
+        as a sequence, the identities from the list must grant Service Account
+        Token Creator IAM role to the directly preceding identity, with first
+        account from the list granting this role to the originating account.
     """
 
     def __init__(
@@ -89,6 +101,7 @@ class CloudSecretManagerBackend(BaseSecretsBackend, LoggingMixin):
         gcp_scopes: str | None = None,
         project_id: str | None = None,
         sep: str = "-",
+        impersonation_chain: str | Sequence[str] | None = None,
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
@@ -103,11 +116,19 @@ class CloudSecretManagerBackend(BaseSecretsBackend, LoggingMixin):
                 f"follows that pattern {SECRET_ID_PATTERN}"
             )
         try:
+            if impersonation_chain:
+                target_principal, delegates = _get_target_principal_and_delegates(impersonation_chain)
+            else:
+                target_principal = None
+                delegates = None
+
             self.credentials, self.project_id = get_credentials_and_project_id(
                 keyfile_dict=gcp_keyfile_dict,
                 key_path=gcp_key_path,
                 credential_config_file=gcp_credential_config_file,
                 scopes=gcp_scopes,
+                target_principal=target_principal,
+                delegates=delegates,
             )
         except (DefaultCredentialsError, FileNotFoundError):
             log.exception(
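
A configuration sketch for the new parameter; the service-account addresses are placeholders. The backend is normally wired up through the [secrets] section of airflow.cfg (with impersonation_chain inside backend_kwargs), but direct instantiation shows the argument most clearly:

    from airflow.providers.google.cloud.secrets.secret_manager import CloudSecretManagerBackend

    # The last account in the chain is the one impersonated; each account must
    # grant Service Account Token Creator to the account before it.
    backend = CloudSecretManagerBackend(
        connections_prefix="airflow-connections",
        impersonation_chain=[
            "intermediate-sa@example-project.iam.gserviceaccount.com",
            "secrets-reader@example-project.iam.gserviceaccount.com",
        ],
    )
    value = backend.get_variable("my-variable")
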
airflow/providers/google/cloud/sensors/cloud_composer.py

@@ -19,9 +19,10 @@
 
 from __future__ import annotations
 
+import warnings
 from typing import TYPE_CHECKING, Any, Sequence
 
-from airflow.exceptions import AirflowException, AirflowSkipException
+from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning, AirflowSkipException
 from airflow.providers.google.cloud.triggers.cloud_composer import CloudComposerExecutionTrigger
 from airflow.sensors.base import BaseSensorOperator
 
@@ -33,6 +34,11 @@ class CloudComposerEnvironmentSensor(BaseSensorOperator):
     """
     Check the status of the Cloud Composer Environment task.
 
+    This Sensor is deprecated. You can achieve the same functionality by using Cloud Composer Operators
+    CloudComposerCreateEnvironmentOperator, CloudComposerDeleteEnvironmentOperator and
+    CloudComposerUpdateEnvironmentOperator in deferrable or non-deferrable mode, since every operator
+    gives user a possibility to wait (asynchronously or synchronously) until Operation will be finished.
+
     :param project_id: Required. The ID of the Google Cloud project that the service belongs to.
     :param region: Required. The ID of the Google Cloud region that the service belongs to.
     :param operation_name: The name of the operation resource
@@ -59,6 +65,13 @@ class CloudComposerEnvironmentSensor(BaseSensorOperator):
         pooling_period_seconds: int = 30,
         **kwargs,
     ):
+        warnings.warn(
+            f"The `{self.__class__.__name__}` operator is deprecated. You can achieve the same functionality "
+            f"by using operators in deferrable or non-deferrable mode, since every operator for Cloud "
+            f"Composer will wait for the operation to complete.",
+            AirflowProviderDeprecationWarning,
+            stacklevel=2,
+        )
         super().__init__(**kwargs)
         self.project_id = project_id
         self.region = region
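
A sketch of the suggested replacement: instead of pairing an operator with this deprecated sensor, let the operator wait for the operation itself, asynchronously via deferrable=True or synchronously by default. Project, region, and environment values are illustrative:

    from airflow.providers.google.cloud.operators.cloud_composer import (
        CloudComposerCreateEnvironmentOperator,
    )

    create_env = CloudComposerCreateEnvironmentOperator(
        task_id="create_env",
        project_id="example-project",
        region="us-central1",
        environment_id="example-environment",
        environment={},
        deferrable=True,  # omit (or set False) to wait synchronously
    )
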
airflow/providers/google/cloud/sensors/dataplex.py

@@ -259,3 +259,121 @@ class DataplexDataQualityJobStatusSensor(BaseSensorOperator):
                 raise AirflowSkipException(message)
             raise AirflowDataQualityScanException(message)
         return job_status == DataScanJob.State.SUCCEEDED
+
+
+class DataplexDataProfileJobStatusSensor(BaseSensorOperator):
+    """
+    Check the status of the Dataplex DataProfile job.
+
+    :param project_id: Required. The ID of the Google Cloud project that the task belongs to.
+    :param region: Required. The ID of the Google Cloud region that the task belongs to.
+    :param data_scan_id: Required. Data Quality scan identifier.
+    :param job_id: Required. Job ID.
+    :param api_version: The version of the api that will be requested for example 'v3'.
+    :param retry: A retry object used to retry requests. If `None` is specified, requests
+        will not be retried.
+    :param metadata: Additional metadata that is provided to the method.
+    :param gcp_conn_id: The connection ID to use when fetching connection info.
+    :param impersonation_chain: Optional service account to impersonate using short-term
+        credentials, or chained list of accounts required to get the access_token
+        of the last account in the list, which will be impersonated in the request.
+        If set as a string, the account must grant the originating account
+        the Service Account Token Creator IAM role.
+        If set as a sequence, the identities from the list must grant
+        Service Account Token Creator IAM role to the directly preceding identity, with first
+        account from the list granting this role to the originating account (templated).
+    :param result_timeout: Value in seconds for which operator will wait for the Data Quality scan result.
+        Throws exception if there is no result found after specified amount of seconds.
+
+    :return: Boolean indicating if the job run has reached the ``DataScanJob.State.SUCCEEDED``.
+    """
+
+    template_fields = ["job_id"]
+
+    def __init__(
+        self,
+        project_id: str,
+        region: str,
+        data_scan_id: str,
+        job_id: str,
+        api_version: str = "v1",
+        retry: Retry | _MethodDefault = DEFAULT,
+        metadata: Sequence[tuple[str, str]] = (),
+        gcp_conn_id: str = "google_cloud_default",
+        impersonation_chain: str | Sequence[str] | None = None,
+        result_timeout: float = 60.0 * 10,
+        start_sensor_time: float | None = None,
+        *args,
+        **kwargs,
+    ) -> None:
+        super().__init__(*args, **kwargs)
+        self.project_id = project_id
+        self.region = region
+        self.data_scan_id = data_scan_id
+        self.job_id = job_id
+        self.api_version = api_version
+        self.retry = retry
+        self.metadata = metadata
+        self.gcp_conn_id = gcp_conn_id
+        self.impersonation_chain = impersonation_chain
+        self.result_timeout = result_timeout
+        self.start_sensor_time = start_sensor_time
+
+    def _duration(self):
+        if not self.start_sensor_time:
+            self.start_sensor_time = time.monotonic()
+        return time.monotonic() - self.start_sensor_time
+
+    def poke(self, context: Context) -> bool:
+        self.log.info("Waiting for job %s to be %s", self.job_id, DataScanJob.State.SUCCEEDED)
+        if self.result_timeout:
+            duration = self._duration()
+            if duration > self.result_timeout:
+                # TODO: remove this if check when min_airflow_version is set to higher than 2.7.1
+                message = (
+                    f"Timeout: Data Profile scan {self.job_id} is not ready after {self.result_timeout}s"
+                )
+                if self.soft_fail:
+                    raise AirflowSkipException(message)
+                raise AirflowDataQualityScanResultTimeoutException(message)
+
+        hook = DataplexHook(
+            gcp_conn_id=self.gcp_conn_id,
+            api_version=self.api_version,
+            impersonation_chain=self.impersonation_chain,
+        )
+
+        try:
+            job = hook.get_data_scan_job(
+                project_id=self.project_id,
+                region=self.region,
+                data_scan_id=self.data_scan_id,
+                job_id=self.job_id,
+                timeout=self.timeout,
+                retry=self.retry,
+                metadata=self.metadata,
+            )
+        except GoogleAPICallError as e:
+            # TODO: remove this if check when min_airflow_version is set to higher than 2.7.1
+            message = f"Error occurred when trying to retrieve Data Profile scan job: {self.data_scan_id}"
+            if self.soft_fail:
+                raise AirflowSkipException(message, e)
+            raise AirflowException(message, e)
+
+        job_status = job.state
+        self.log.info(
+            "Current status of the Dataplex Data Profile scan job %s => %s", self.job_id, job_status
+        )
+        if job_status == DataScanJob.State.FAILED:
+            # TODO: remove this if check when min_airflow_version is set to higher than 2.7.1
+            message = f"Data Profile scan job failed: {self.job_id}"
+            if self.soft_fail:
+                raise AirflowSkipException(message)
+            raise AirflowException(message)
+        if job_status == DataScanJob.State.CANCELLED:
+            # TODO: remove this if check when min_airflow_version is set to higher than 2.7.1
+            message = f"Data Profile scan job cancelled: {self.job_id}"
+            if self.soft_fail:
+                raise AirflowSkipException(message)
+            raise AirflowException(message)
+        return job_status == DataScanJob.State.SUCCEEDED
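
A usage sketch for the new sensor; all IDs are illustrative, and job_id would typically be pulled from the task that started the scan (the same release adds DataplexRunDataProfileScanOperator for that):

    from airflow.providers.google.cloud.sensors.dataplex import DataplexDataProfileJobStatusSensor

    wait_for_profile_scan = DataplexDataProfileJobStatusSensor(
        task_id="wait_for_profile_scan",
        project_id="example-project",
        region="us-central1",
        data_scan_id="example-profile-scan",
        job_id="{{ ti.xcom_pull(task_ids='run_profile_scan') }}",
        poke_interval=30,
    )
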
airflow/providers/google/cloud/triggers/cloud_run.py

@@ -102,21 +102,21 @@ class CloudRunJobFinishedTrigger(BaseTrigger):
         while timeout is None or timeout > 0:
             operation: operations_pb2.Operation = await hook.get_operation(self.operation_name)
             if operation.done:
-                # An operation can only have one of those two combinations: if it is successful, then
-                # the response field will be populated, else, then the error field will be.
-                if operation.response:
+                # An operation can only have one of those two combinations: if it is failed, then
+                # the error field will be populated, else, then the response field will be.
+                if operation.error.SerializeToString():
                     yield TriggerEvent(
                         {
-                            "status": RunJobStatus.SUCCESS,
+                            "status": RunJobStatus.FAIL.value,
+                            "operation_error_code": operation.error.code,
+                            "operation_error_message": operation.error.message,
                             "job_name": self.job_name,
                         }
                     )
                 else:
                     yield TriggerEvent(
                         {
-                            "status": RunJobStatus.FAIL,
-                            "operation_error_code": operation.error.code,
-                            "operation_error_message": operation.error.message,
+                            "status": RunJobStatus.SUCCESS.value,
                             "job_name": self.job_name,
                         }
                     )
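
A sketch of how a deferrable operator completing from this trigger might interpret the corrected event payload; the function name and error handling are illustrative, not the provider's exact implementation:

    from airflow.exceptions import AirflowException
    from airflow.providers.google.cloud.triggers.cloud_run import RunJobStatus

    def handle_cloud_run_event(event: dict) -> str:
        if event["status"] == RunJobStatus.FAIL.value:
            raise AirflowException(
                f"Cloud Run job {event['job_name']} failed: "
                f"{event['operation_error_code']}: {event['operation_error_message']}"
            )
        return event["job_name"]
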
airflow/providers/google/cloud/triggers/dataplex.py

@@ -107,3 +107,85 @@ class DataplexDataQualityJobTrigger(BaseTrigger):
     def _convert_to_dict(self, job: DataScanJob) -> dict:
         """Returns a representation of a DataScanJob instance as a dict."""
         return DataScanJob.to_dict(job)
+
+
+class DataplexDataProfileJobTrigger(BaseTrigger):
+    """
+    DataplexDataProfileJobTrigger runs on the trigger worker and waits for the job to be `SUCCEEDED` state.
+
+    :param job_id: Optional. The ID of a Dataplex job.
+    :param data_scan_id: Required. DataScan identifier.
+    :param project_id: Google Cloud Project where the job is running.
+    :param region: The ID of the Google Cloud region that the job belongs to.
+    :param gcp_conn_id: Optional, the connection ID used to connect to Google Cloud Platform.
+    :param impersonation_chain: Optional service account to impersonate using short-term
+        credentials, or chained list of accounts required to get the access_token
+        of the last account in the list, which will be impersonated in the request.
+        If set as a string, the account must grant the originating account
+        the Service Account Token Creator IAM role.
+        If set as a sequence, the identities from the list must grant
+        Service Account Token Creator IAM role to the directly preceding identity, with first
+        account from the list granting this role to the originating account (templated).
+    :param polling_interval_seconds: polling period in seconds to check for the status.
+    """
+
+    def __init__(
+        self,
+        job_id: str | None,
+        data_scan_id: str,
+        project_id: str | None,
+        region: str,
+        gcp_conn_id: str = "google_cloud_default",
+        polling_interval_seconds: int = 10,
+        impersonation_chain: str | Sequence[str] | None = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.job_id = job_id
+        self.data_scan_id = data_scan_id
+        self.project_id = project_id
+        self.region = region
+        self.gcp_conn_id = gcp_conn_id
+        self.polling_interval_seconds = polling_interval_seconds
+        self.impersonation_chain = impersonation_chain
+
+    def serialize(self):
+        return (
+            "airflow.providers.google.cloud.triggers.dataplex.DataplexDataProfileJobTrigger",
+            {
+                "job_id": self.job_id,
+                "data_scan_id": self.data_scan_id,
+                "project_id": self.project_id,
+                "region": self.region,
+                "gcp_conn_id": self.gcp_conn_id,
+                "impersonation_chain": self.impersonation_chain,
+                "polling_interval_seconds": self.polling_interval_seconds,
+            },
+        )
+
+    async def run(self) -> AsyncIterator[TriggerEvent]:
+        hook = DataplexAsyncHook(
+            gcp_conn_id=self.gcp_conn_id,
+            impersonation_chain=self.impersonation_chain,
+        )
+        while True:
+            job = await hook.get_data_scan_job(
+                project_id=self.project_id,
+                region=self.region,
+                job_id=self.job_id,
+                data_scan_id=self.data_scan_id,
+            )
+            state = job.state
+            if state in (DataScanJob.State.FAILED, DataScanJob.State.SUCCEEDED, DataScanJob.State.CANCELLED):
+                break
+            self.log.info(
+                "Current state is: %s, sleeping for %s seconds.",
+                DataScanJob.State(state).name,
+                self.polling_interval_seconds,
+            )
+            await asyncio.sleep(self.polling_interval_seconds)
+        yield TriggerEvent({"job_id": self.job_id, "job_state": state, "job": self._convert_to_dict(job)})
+
+    def _convert_to_dict(self, job: DataScanJob) -> dict:
+        """Returns a representation of a DataScanJob instance as a dict."""
+        return DataScanJob.to_dict(job)
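
Illustrative only: how a deferrable Dataplex operator might hand off to this trigger. All argument values are placeholders; a real operator passes its own fields and then defers with method_name="execute_complete":

    from airflow.providers.google.cloud.triggers.dataplex import DataplexDataProfileJobTrigger

    trigger = DataplexDataProfileJobTrigger(
        job_id="example-job-id",
        data_scan_id="example-profile-scan",
        project_id="example-project",
        region="us-central1",
        gcp_conn_id="google_cloud_default",
        polling_interval_seconds=10,
        impersonation_chain=None,
    )
    # operator.defer(trigger=trigger, method_name="execute_complete")
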
airflow/providers/google/cloud/triggers/dataproc.py

@@ -25,7 +25,6 @@ from typing import Any, AsyncIterator, Sequence
 from google.api_core.exceptions import NotFound
 from google.cloud.dataproc_v1 import Batch, ClusterStatus, JobStatus
 
-from airflow.exceptions import AirflowException
 from airflow.providers.google.cloud.hooks.dataproc import DataprocAsyncHook
 from airflow.triggers.base import BaseTrigger, TriggerEvent
 
@@ -98,12 +97,10 @@ class DataprocSubmitTrigger(DataprocBaseTrigger):
             )
             state = job.status.state
             self.log.info("Dataproc job: %s is in state: %s", self.job_id, state)
-            if state in (JobStatus.State.DONE, JobStatus.State.CANCELLED):
+            if state in (JobStatus.State.DONE, JobStatus.State.CANCELLED, JobStatus.State.ERROR):
                 break
-            elif state == JobStatus.State.ERROR:
-                raise AirflowException(f"Dataproc job execution failed {self.job_id}")
             await asyncio.sleep(self.polling_interval_seconds)
-        yield TriggerEvent({"job_id": self.job_id, "job_state": state})
+        yield TriggerEvent({"job_id": self.job_id, "job_state": state, "job": job})
 
 
 class DataprocClusterTrigger(DataprocBaseTrigger):
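
The operator-side consequence of this change: an ERROR state now arrives as a normal TriggerEvent (with the full job payload) instead of an exception raised on the triggerer, so failure handling moves to the worker. A sketch, not the provider's exact implementation:

    from google.cloud.dataproc_v1 import JobStatus

    from airflow.exceptions import AirflowException

    def handle_dataproc_event(event: dict) -> str:
        if event["job_state"] == JobStatus.State.ERROR:
            raise AirflowException(f"Dataproc job {event['job_id']} failed:\n{event['job']}")
        return event["job_id"]
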
airflow/providers/google/common/hooks/base_google.py

@@ -188,8 +188,8 @@ class GoogleBaseHook(BaseHook):
     conn_type = "google_cloud_platform"
     hook_name = "Google Cloud"
 
-    @staticmethod
-    def get_connection_form_widgets() -> dict[str, Any]:
+    @classmethod
+    def get_connection_form_widgets(cls) -> dict[str, Any]:
         """Returns connection widgets to add to connection form."""
         from flask_appbuilder.fieldwidgets import BS3PasswordFieldWidget, BS3TextFieldWidget
         from flask_babel import lazy_gettext
@@ -221,8 +221,8 @@ class GoogleBaseHook(BaseHook):
             ),
         }
 
-    @staticmethod
-    def get_ui_field_behaviour() -> dict[str, Any]:
+    @classmethod
+    def get_ui_field_behaviour(cls) -> dict[str, Any]:
         """Returns custom field behaviour."""
         return {
             "hidden_fields": ["host", "schema", "login", "password", "port", "extra"],
@@ -267,6 +267,8 @@ class GoogleBaseHook(BaseHook):
 
         if not self.impersonation_chain:
             self.impersonation_chain = self._get_field("impersonation_chain", None)
+            if isinstance(self.impersonation_chain, str) and "," in self.impersonation_chain:
+                self.impersonation_chain = [s.strip() for s in self.impersonation_chain.split(",")]
 
         target_principal, delegates = _get_target_principal_and_delegates(self.impersonation_chain)
 
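
A quick illustration of the new parsing applied to a connection's impersonation_chain extra field (the accounts are placeholders):

    # "sa-1@..., sa-2@..." now becomes a delegation chain instead of being
    # treated as a single malformed principal.
    chain = "sa-1@example-project.iam.gserviceaccount.com, sa-2@example-project.iam.gserviceaccount.com"
    if isinstance(chain, str) and "," in chain:
        chain = [s.strip() for s in chain.split(",")]
    assert chain == [
        "sa-1@example-project.iam.gserviceaccount.com",
        "sa-2@example-project.iam.gserviceaccount.com",
    ]
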
airflow/providers/google/get_provider_info.py

@@ -28,8 +28,9 @@ def get_provider_info():
         "name": "Google",
         "description": "Google services including:\n\n - `Google Ads <https://ads.google.com/>`__\n - `Google Cloud (GCP) <https://cloud.google.com/>`__\n - `Google Firebase <https://firebase.google.com/>`__\n - `Google LevelDB <https://github.com/google/leveldb/>`__\n - `Google Marketing Platform <https://marketingplatform.google.com/>`__\n - `Google Workspace <https://workspace.google.com/>`__ (formerly Google Suite)\n",
         "suspended": False,
-        "source-date-epoch":
+        "source-date-epoch": 1701983383,
         "versions": [
+            "10.13.0",
             "10.12.0",
             "10.11.1",
             "10.11.0",
@@ -80,30 +81,30 @@ def get_provider_info():
             "1.0.0",
         ],
         "dependencies": [
-            "apache-airflow>=2.
+            "apache-airflow>=2.6.0",
             "apache-airflow-providers-common-sql>=1.7.2",
             "asgiref>=3.5.2",
             "gcloud-aio-auth>=4.0.0,<5.0.0",
             "gcloud-aio-bigquery>=6.1.2",
             "gcloud-aio-storage",
-            "gcsfs>=2023.
+            "gcsfs>=2023.10.0",
             "google-ads>=22.1.0",
             "google-api-core>=2.11.0",
             "google-api-python-client>=1.6.0",
             "google-auth>=1.0.0",
             "google-auth-httplib2>=0.0.1",
             "google-cloud-aiplatform>=1.22.1",
-            "google-cloud-automl>=2.
-            "google-cloud-bigquery-datatransfer>=3.
+            "google-cloud-automl>=2.12.0",
+            "google-cloud-bigquery-datatransfer>=3.13.0",
             "google-cloud-bigtable>=2.17.0",
-            "google-cloud-build>=3.
+            "google-cloud-build>=3.22.0",
             "google-cloud-compute>=1.10.0",
             "google-cloud-container>=2.17.4",
             "google-cloud-datacatalog>=3.11.1",
-            "google-cloud-dataflow-client>=0.8.
+            "google-cloud-dataflow-client>=0.8.6",
             "google-cloud-dataform>=0.5.0",
-            "google-cloud-dataplex>=1.
-            "google-cloud-dataproc>=5.
+            "google-cloud-dataplex>=1.10.0",
+            "google-cloud-dataproc>=5.8.0",
             "google-cloud-dataproc-metastore>=1.12.0",
             "google-cloud-dlp>=3.12.0",
             "google-cloud-kms>=2.15.0",
@@ -111,7 +112,7 @@ def get_provider_info():
             "google-cloud-logging>=3.5.0",
             "google-cloud-memcache>=1.7.0",
             "google-cloud-monitoring>=2.14.1",
-            "google-cloud-orchestration-airflow>=1.
+            "google-cloud-orchestration-airflow>=1.10.0",
             "google-cloud-os-login>=2.9.1",
             "google-cloud-pubsub>=2.15.0",
             "google-cloud-redis>=2.12.0",