apache-airflow-providers-google 10.14.0rc1__py3-none-any.whl → 10.15.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airflow/providers/google/__init__.py +1 -1
- airflow/providers/google/ads/hooks/ads.py +1 -2
- airflow/providers/google/cloud/hooks/automl.py +13 -13
- airflow/providers/google/cloud/hooks/bigquery.py +208 -256
- airflow/providers/google/cloud/hooks/bigquery_dts.py +6 -6
- airflow/providers/google/cloud/hooks/bigtable.py +8 -8
- airflow/providers/google/cloud/hooks/cloud_batch.py +1 -1
- airflow/providers/google/cloud/hooks/cloud_build.py +19 -20
- airflow/providers/google/cloud/hooks/cloud_composer.py +4 -4
- airflow/providers/google/cloud/hooks/cloud_memorystore.py +10 -10
- airflow/providers/google/cloud/hooks/cloud_run.py +1 -1
- airflow/providers/google/cloud/hooks/cloud_sql.py +18 -19
- airflow/providers/google/cloud/hooks/cloud_storage_transfer_service.py +3 -3
- airflow/providers/google/cloud/hooks/compute.py +16 -16
- airflow/providers/google/cloud/hooks/compute_ssh.py +1 -1
- airflow/providers/google/cloud/hooks/datacatalog.py +22 -22
- airflow/providers/google/cloud/hooks/dataflow.py +48 -49
- airflow/providers/google/cloud/hooks/dataform.py +16 -16
- airflow/providers/google/cloud/hooks/datafusion.py +15 -15
- airflow/providers/google/cloud/hooks/datapipeline.py +3 -3
- airflow/providers/google/cloud/hooks/dataplex.py +19 -19
- airflow/providers/google/cloud/hooks/dataprep.py +10 -10
- airflow/providers/google/cloud/hooks/dataproc.py +132 -14
- airflow/providers/google/cloud/hooks/dataproc_metastore.py +13 -13
- airflow/providers/google/cloud/hooks/datastore.py +3 -3
- airflow/providers/google/cloud/hooks/dlp.py +25 -25
- airflow/providers/google/cloud/hooks/gcs.py +39 -27
- airflow/providers/google/cloud/hooks/gdm.py +3 -3
- airflow/providers/google/cloud/hooks/kms.py +3 -3
- airflow/providers/google/cloud/hooks/kubernetes_engine.py +63 -48
- airflow/providers/google/cloud/hooks/life_sciences.py +13 -12
- airflow/providers/google/cloud/hooks/looker.py +8 -9
- airflow/providers/google/cloud/hooks/mlengine.py +12 -12
- airflow/providers/google/cloud/hooks/natural_language.py +2 -2
- airflow/providers/google/cloud/hooks/os_login.py +1 -1
- airflow/providers/google/cloud/hooks/pubsub.py +9 -9
- airflow/providers/google/cloud/hooks/secret_manager.py +1 -1
- airflow/providers/google/cloud/hooks/spanner.py +11 -11
- airflow/providers/google/cloud/hooks/speech_to_text.py +1 -1
- airflow/providers/google/cloud/hooks/stackdriver.py +7 -7
- airflow/providers/google/cloud/hooks/tasks.py +11 -11
- airflow/providers/google/cloud/hooks/text_to_speech.py +1 -1
- airflow/providers/google/cloud/hooks/translate.py +1 -1
- airflow/providers/google/cloud/hooks/vertex_ai/auto_ml.py +13 -13
- airflow/providers/google/cloud/hooks/vertex_ai/batch_prediction_job.py +6 -6
- airflow/providers/google/cloud/hooks/vertex_ai/custom_job.py +45 -50
- airflow/providers/google/cloud/hooks/vertex_ai/dataset.py +13 -13
- airflow/providers/google/cloud/hooks/vertex_ai/endpoint_service.py +9 -9
- airflow/providers/google/cloud/hooks/vertex_ai/hyperparameter_tuning_job.py +128 -11
- airflow/providers/google/cloud/hooks/vertex_ai/model_service.py +10 -10
- airflow/providers/google/cloud/hooks/vertex_ai/pipeline_job.py +8 -8
- airflow/providers/google/cloud/hooks/video_intelligence.py +2 -2
- airflow/providers/google/cloud/hooks/vision.py +1 -1
- airflow/providers/google/cloud/hooks/workflows.py +10 -10
- airflow/providers/google/cloud/links/datafusion.py +12 -5
- airflow/providers/google/cloud/operators/bigquery.py +11 -11
- airflow/providers/google/cloud/operators/cloud_storage_transfer_service.py +3 -1
- airflow/providers/google/cloud/operators/dataflow.py +16 -16
- airflow/providers/google/cloud/operators/datafusion.py +9 -1
- airflow/providers/google/cloud/operators/dataproc.py +444 -69
- airflow/providers/google/cloud/operators/kubernetes_engine.py +6 -6
- airflow/providers/google/cloud/operators/life_sciences.py +10 -9
- airflow/providers/google/cloud/operators/mlengine.py +96 -96
- airflow/providers/google/cloud/operators/pubsub.py +2 -0
- airflow/providers/google/cloud/operators/vertex_ai/custom_job.py +33 -3
- airflow/providers/google/cloud/operators/vertex_ai/hyperparameter_tuning_job.py +59 -2
- airflow/providers/google/cloud/secrets/secret_manager.py +8 -7
- airflow/providers/google/cloud/sensors/bigquery.py +20 -16
- airflow/providers/google/cloud/sensors/cloud_composer.py +11 -8
- airflow/providers/google/cloud/sensors/dataproc_metastore.py +12 -2
- airflow/providers/google/cloud/sensors/gcs.py +8 -7
- airflow/providers/google/cloud/transfers/bigquery_to_gcs.py +1 -0
- airflow/providers/google/cloud/transfers/cassandra_to_gcs.py +4 -4
- airflow/providers/google/cloud/transfers/gcs_to_bigquery.py +1 -0
- airflow/providers/google/cloud/transfers/gcs_to_sftp.py +1 -1
- airflow/providers/google/cloud/transfers/mssql_to_gcs.py +1 -1
- airflow/providers/google/cloud/transfers/mysql_to_gcs.py +1 -1
- airflow/providers/google/cloud/transfers/oracle_to_gcs.py +1 -1
- airflow/providers/google/cloud/transfers/postgres_to_gcs.py +1 -1
- airflow/providers/google/cloud/transfers/presto_to_gcs.py +1 -1
- airflow/providers/google/cloud/transfers/s3_to_gcs.py +3 -3
- airflow/providers/google/cloud/transfers/sftp_to_gcs.py +1 -1
- airflow/providers/google/cloud/transfers/sql_to_gcs.py +3 -3
- airflow/providers/google/cloud/transfers/trino_to_gcs.py +1 -1
- airflow/providers/google/cloud/triggers/bigquery.py +12 -12
- airflow/providers/google/cloud/triggers/bigquery_dts.py +1 -1
- airflow/providers/google/cloud/triggers/cloud_batch.py +3 -1
- airflow/providers/google/cloud/triggers/cloud_build.py +2 -2
- airflow/providers/google/cloud/triggers/cloud_run.py +1 -1
- airflow/providers/google/cloud/triggers/cloud_storage_transfer_service.py +6 -6
- airflow/providers/google/cloud/triggers/dataflow.py +3 -1
- airflow/providers/google/cloud/triggers/datafusion.py +2 -2
- airflow/providers/google/cloud/triggers/dataplex.py +2 -2
- airflow/providers/google/cloud/triggers/dataproc.py +34 -14
- airflow/providers/google/cloud/triggers/gcs.py +12 -8
- airflow/providers/google/cloud/triggers/kubernetes_engine.py +2 -2
- airflow/providers/google/cloud/triggers/mlengine.py +2 -2
- airflow/providers/google/cloud/triggers/pubsub.py +1 -1
- airflow/providers/google/cloud/triggers/vertex_ai.py +99 -0
- airflow/providers/google/cloud/utils/bigquery.py +2 -2
- airflow/providers/google/cloud/utils/credentials_provider.py +2 -2
- airflow/providers/google/cloud/utils/dataform.py +1 -1
- airflow/providers/google/cloud/utils/dataproc.py +25 -0
- airflow/providers/google/cloud/utils/field_validator.py +2 -2
- airflow/providers/google/cloud/utils/helpers.py +2 -2
- airflow/providers/google/cloud/utils/mlengine_operator_utils.py +1 -1
- airflow/providers/google/cloud/utils/mlengine_prediction_summary.py +1 -1
- airflow/providers/google/common/auth_backend/google_openid.py +2 -2
- airflow/providers/google/common/hooks/base_google.py +87 -23
- airflow/providers/google/common/hooks/discovery_api.py +2 -2
- airflow/providers/google/common/utils/id_token_credentials.py +5 -5
- airflow/providers/google/firebase/hooks/firestore.py +3 -3
- airflow/providers/google/get_provider_info.py +7 -2
- airflow/providers/google/leveldb/hooks/leveldb.py +4 -4
- airflow/providers/google/marketing_platform/hooks/analytics.py +11 -14
- airflow/providers/google/marketing_platform/hooks/campaign_manager.py +11 -11
- airflow/providers/google/marketing_platform/hooks/display_video.py +13 -13
- airflow/providers/google/marketing_platform/hooks/search_ads.py +4 -4
- airflow/providers/google/marketing_platform/operators/analytics.py +37 -32
- airflow/providers/google/suite/hooks/calendar.py +2 -2
- airflow/providers/google/suite/hooks/drive.py +7 -7
- airflow/providers/google/suite/hooks/sheets.py +8 -8
- {apache_airflow_providers_google-10.14.0rc1.dist-info → apache_airflow_providers_google-10.15.0rc1.dist-info}/METADATA +11 -11
- {apache_airflow_providers_google-10.14.0rc1.dist-info → apache_airflow_providers_google-10.15.0rc1.dist-info}/RECORD +126 -124
- {apache_airflow_providers_google-10.14.0rc1.dist-info → apache_airflow_providers_google-10.15.0rc1.dist-info}/WHEEL +0 -0
- {apache_airflow_providers_google-10.14.0rc1.dist-info → apache_airflow_providers_google-10.15.0rc1.dist-info}/entry_points.txt +0 -0
airflow/providers/google/cloud/operators/dataproc.py

@@ -25,11 +25,13 @@ import re
 import time
 import uuid
 import warnings
+from collections.abc import MutableSequence
 from dataclasses import dataclass
 from datetime import datetime, timedelta
 from enum import Enum
 from typing import TYPE_CHECKING, Any, Sequence
 
+from deprecated import deprecated
 from google.api_core.exceptions import AlreadyExists, NotFound
 from google.api_core.gapic_v1.method import DEFAULT, _MethodDefault
 from google.api_core.retry import Retry, exponential_sleep_generator
@@ -56,9 +58,10 @@ from airflow.providers.google.cloud.triggers.dataproc import (
     DataprocBatchTrigger,
     DataprocClusterTrigger,
     DataprocDeleteClusterTrigger,
+    DataprocOperationTrigger,
     DataprocSubmitTrigger,
-    DataprocWorkflowTrigger,
 )
+from airflow.providers.google.cloud.utils.dataproc import DataprocOperationType
 from airflow.utils import timezone
 
 if TYPE_CHECKING:
@@ -66,6 +69,7 @@ if TYPE_CHECKING:
     from google.api_core.retry_async import AsyncRetry
     from google.protobuf.duration_pb2 import Duration
     from google.protobuf.field_mask_pb2 import FieldMask
+    from google.type.interval_pb2 import Interval
 
     from airflow.utils.context import Context
 
@@ -155,12 +159,18 @@ class ClusterGenerator:
         Valid values: ``pd-ssd`` (Persistent Disk Solid State Drive) or
         ``pd-standard`` (Persistent Disk Hard Disk Drive).
     :param master_disk_size: Disk size for the primary node
+    :param master_accelerator_type: Type of the accelerator card (GPU) to attach to the primary node,
+        see https://cloud.google.com/dataproc/docs/reference/rest/v1/InstanceGroupConfig#acceleratorconfig
+    :param master_accelerator_count: Number of accelerator cards (GPUs) to attach to the primary node
     :param worker_machine_type: Compute engine machine type to use for the worker nodes
     :param worker_disk_type: Type of the boot disk for the worker node
         (default is ``pd-standard``).
         Valid values: ``pd-ssd`` (Persistent Disk Solid State Drive) or
         ``pd-standard`` (Persistent Disk Hard Disk Drive).
     :param worker_disk_size: Disk size for the worker nodes
+    :param worker_accelerator_type: Type of the accelerator card (GPU) to attach to the worker nodes,
+        see https://cloud.google.com/dataproc/docs/reference/rest/v1/InstanceGroupConfig#acceleratorconfig
+    :param worker_accelerator_count: Number of accelerator cards (GPUs) to attach to the worker nodes
     :param num_preemptible_workers: The # of VM instances in the instance group as secondary workers
         inside the cluster with Preemptibility enabled by default.
         Note, that it is not possible to mix non-preemptible and preemptible secondary workers in
@@ -197,6 +207,9 @@ class ClusterGenerator:
         identify the driver group in future operations, such as resizing the node group.
     :param secondary_worker_instance_flexibility_policy: Instance flexibility Policy allowing a mixture of VM
         shapes and provisioning models.
+    :param secondary_worker_accelerator_type: Type of the accelerator card (GPU) to attach to the secondary workers,
+        see https://cloud.google.com/dataproc/docs/reference/rest/v1/InstanceGroupConfig#acceleratorconfig
+    :param secondary_worker_accelerator_count: Number of accelerator cards (GPUs) to attach to the secondary workers
     """
 
     def __init__(
@@ -224,9 +237,13 @@ class ClusterGenerator:
         master_machine_type: str = "n1-standard-4",
         master_disk_type: str = "pd-standard",
         master_disk_size: int = 1024,
+        master_accelerator_type: str | None = None,
+        master_accelerator_count: int | None = None,
         worker_machine_type: str = "n1-standard-4",
         worker_disk_type: str = "pd-standard",
         worker_disk_size: int = 1024,
+        worker_accelerator_type: str | None = None,
+        worker_accelerator_count: int | None = None,
         num_preemptible_workers: int = 0,
         preemptibility: str = PreemptibilityType.PREEMPTIBLE.value,
         service_account: str | None = None,
@@ -239,6 +256,8 @@ class ClusterGenerator:
         driver_pool_size: int = 0,
         driver_pool_id: str | None = None,
         secondary_worker_instance_flexibility_policy: InstanceFlexibilityPolicy | None = None,
+        secondary_worker_accelerator_type: str | None = None,
+        secondary_worker_accelerator_count: int | None = None,
         **kwargs,
     ) -> None:
         self.project_id = project_id
@@ -260,10 +279,14 @@ class ClusterGenerator:
         self.master_machine_type = master_machine_type
         self.master_disk_type = master_disk_type
         self.master_disk_size = master_disk_size
+        self.master_accelerator_type = master_accelerator_type
+        self.master_accelerator_count = master_accelerator_count
         self.autoscaling_policy = autoscaling_policy
         self.worker_machine_type = worker_machine_type
         self.worker_disk_type = worker_disk_type
         self.worker_disk_size = worker_disk_size
+        self.worker_accelerator_type = worker_accelerator_type
+        self.worker_accelerator_count = worker_accelerator_count
         self.zone = zone
         self.network_uri = network_uri
         self.subnetwork_uri = subnetwork_uri
@@ -280,6 +303,8 @@ class ClusterGenerator:
         self.driver_pool_size = driver_pool_size
         self.driver_pool_id = driver_pool_id
         self.secondary_worker_instance_flexibility_policy = secondary_worker_instance_flexibility_policy
+        self.secondary_worker_accelerator_type = secondary_worker_accelerator_type
+        self.secondary_worker_accelerator_count = secondary_worker_accelerator_count
 
         if self.custom_image and self.image_version:
             raise ValueError("The custom_image and image_version can't be both set")
@@ -336,10 +361,10 @@ class ClusterGenerator:
         if self.subnetwork_uri:
             cluster_data[config]["subnetwork_uri"] = self.subnetwork_uri
 
-        if self.internal_ip_only:
-            if not self.subnetwork_uri:
+        if self.internal_ip_only is not None:
+            if not self.subnetwork_uri and self.internal_ip_only:
                 raise AirflowException("Set internal_ip_only to true only when you pass a subnetwork_uri.")
-            cluster_data[config]["internal_ip_only"] = True
+            cluster_data[config]["internal_ip_only"] = self.internal_ip_only
 
         if self.tags:
             cluster_data[config]["tags"] = self.tags
@@ -420,6 +445,18 @@ class ClusterGenerator:
         if self.min_num_workers:
             cluster_data["worker_config"]["min_num_instances"] = self.min_num_workers
 
+        if self.master_accelerator_type:
+            cluster_data["master_config"]["accelerators"] = {
+                "accelerator_type_uri": self.master_accelerator_type,
+                "accelerator_count": self.master_accelerator_count,
+            }
+
+        if self.worker_accelerator_type:
+            cluster_data["worker_config"]["accelerators"] = {
+                "accelerator_type_uri": self.worker_accelerator_type,
+                "accelerator_count": self.worker_accelerator_count,
+            }
+
         if self.num_preemptible_workers > 0:
             cluster_data["secondary_worker_config"] = {
                 "num_instances": self.num_preemptible_workers,
@@ -431,6 +468,11 @@ class ClusterGenerator:
                 "is_preemptible": True,
                 "preemptibility": self.preemptibility.value,
             }
+            if self.worker_accelerator_type:
+                cluster_data["secondary_worker_config"]["accelerators"] = {
+                    "accelerator_type_uri": self.secondary_worker_accelerator_type,
+                    "accelerator_count": self.secondary_worker_accelerator_count,
+                }
             if self.secondary_worker_instance_flexibility_policy:
                 cluster_data["secondary_worker_config"]["instance_flexibility_policy"] = {
                     "instance_selection_list": [
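The new accelerator parameters flow straight into the `master_config`, `worker_config` and `secondary_worker_config` sections generated above. A minimal sketch of how the extended `ClusterGenerator` might be used; the project, region, cluster and GPU type values are illustrative placeholders, not taken from this release:

from airflow.providers.google.cloud.operators.dataproc import (
    ClusterGenerator,
    DataprocCreateClusterOperator,
)

# Illustrative values only; substitute your own project, region, cluster and GPU type.
cluster_config = ClusterGenerator(
    project_id="my-project",
    num_workers=2,
    master_machine_type="n1-standard-4",
    master_accelerator_type="nvidia-tesla-t4",  # new in 10.15.0
    master_accelerator_count=1,                 # new in 10.15.0
    worker_machine_type="n1-standard-4",
    worker_accelerator_type="nvidia-tesla-t4",  # new in 10.15.0
    worker_accelerator_count=1,                 # new in 10.15.0
).make()

create_cluster = DataprocCreateClusterOperator(
    task_id="create_cluster",
    project_id="my-project",
    region="us-central1",
    cluster_name="example-cluster",
    cluster_config=cluster_config,
)

Only the dictionary produced by `ClusterGenerator.make()` changes for accelerators; the create operator consumes it as before.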
@@ -681,10 +723,13 @@ class DataprocCreateClusterOperator(GoogleCloudBaseOperator):
             return
         self.log.info("Cluster is in ERROR state")
         self.log.info("Gathering diagnostic information.")
-        gcs_uri = hook.diagnose_cluster(
+        operation = hook.diagnose_cluster(
             region=self.region, cluster_name=self.cluster_name, project_id=self.project_id
         )
+        operation.result()
+        gcs_uri = str(operation.operation.response.value)
         self.log.info("Diagnostic information for cluster %s available at: %s", self.cluster_name, gcs_uri)
+
         if self.delete_on_error:
             self._delete_cluster(hook)
             # The delete op is asynchronous and can cause further failure if the cluster finishes
@@ -718,6 +763,17 @@ class DataprocCreateClusterOperator(GoogleCloudBaseOperator):
             cluster = self._get_cluster(hook)
         return cluster
 
+    def _start_cluster(self, hook: DataprocHook):
+        op: operation.Operation = hook.start_cluster(
+            region=self.region,
+            project_id=self.project_id,
+            cluster_name=self.cluster_name,
+            retry=self.retry,
+            timeout=self.timeout,
+            metadata=self.metadata,
+        )
+        return hook.wait_for_operation(timeout=self.timeout, result_retry=self.retry, operation=op)
+
     def execute(self, context: Context) -> dict:
         self.log.info("Creating cluster: %s", self.cluster_name)
         hook = DataprocHook(gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain)
@@ -795,6 +851,9 @@ class DataprocCreateClusterOperator(GoogleCloudBaseOperator):
             # Create new cluster
             cluster = self._create_cluster(hook)
             self._handle_error_state(hook, cluster)
+        elif cluster.status.state == cluster.status.State.STOPPED:
+            # if the cluster exists and already stopped, then start the cluster
+            self._start_cluster(hook)
 
         return Cluster.to_dict(cluster)
 
@@ -814,6 +873,11 @@ class DataprocCreateClusterOperator(GoogleCloudBaseOperator):
         return event["cluster"]
 
 
+# TODO: Remove one day
+@deprecated(
+    reason="Please use `DataprocUpdateClusterOperator` instead.",
+    category=AirflowProviderDeprecationWarning,
+)
 class DataprocScaleClusterOperator(GoogleCloudBaseOperator):
     """Scale, up or down, a cluster on Google Cloud Dataproc.
 
@@ -882,14 +946,6 @@ class DataprocScaleClusterOperator(GoogleCloudBaseOperator):
         self.gcp_conn_id = gcp_conn_id
         self.impersonation_chain = impersonation_chain
 
-        # TODO: Remove one day
-        warnings.warn(
-            f"The `{type(self).__name__}` operator is deprecated, "
-            "please use `DataprocUpdateClusterOperator` instead.",
-            AirflowProviderDeprecationWarning,
-            stacklevel=2,
-        )
-
     def _build_scale_cluster_data(self) -> dict:
         scale_data = {
             "config": {
@@ -1076,6 +1132,189 @@ class DataprocDeleteClusterOperator(GoogleCloudBaseOperator):
         )
 
 
+class _DataprocStartStopClusterBaseOperator(GoogleCloudBaseOperator):
+    """Base class to start or stop a cluster in a project.
+
+    :param cluster_name: Required. Name of the cluster to create
+    :param region: Required. The specified region where the dataproc cluster is created.
+    :param project_id: Optional. The ID of the Google Cloud project the cluster belongs to.
+    :param cluster_uuid: Optional. Specifying the ``cluster_uuid`` means the RPC should fail
+        if cluster with specified UUID does not exist.
+    :param request_id: Optional. A unique id used to identify the request. If the server receives two
+        ``DeleteClusterRequest`` requests with the same id, then the second request will be ignored and the
+        first ``google.longrunning.Operation`` created and stored in the backend is returned.
+    :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be
+        retried.
+    :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if
+        ``retry`` is specified, the timeout applies to each individual attempt.
+    :param metadata: Additional metadata that is provided to the method.
+    :param gcp_conn_id: The connection ID to use connecting to Google Cloud.
+    :param impersonation_chain: Optional service account to impersonate using short-term
+        credentials, or chained list of accounts required to get the access_token
+        of the last account in the list, which will be impersonated in the request.
+        If set as a string, the account must grant the originating account
+        the Service Account Token Creator IAM role.
+        If set as a sequence, the identities from the list must grant
+        Service Account Token Creator IAM role to the directly preceding identity, with first
+        account from the list granting this role to the originating account (templated).
+    """
+
+    template_fields = (
+        "cluster_name",
+        "region",
+        "project_id",
+        "request_id",
+        "impersonation_chain",
+    )
+
+    def __init__(
+        self,
+        *,
+        cluster_name: str,
+        region: str,
+        project_id: str | None = None,
+        cluster_uuid: str | None = None,
+        request_id: str | None = None,
+        retry: AsyncRetry | _MethodDefault = DEFAULT,
+        timeout: float = 1 * 60 * 60,
+        metadata: Sequence[tuple[str, str]] = (),
+        gcp_conn_id: str = "google_cloud_default",
+        impersonation_chain: str | Sequence[str] | None = None,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.project_id = project_id
+        self.region = region
+        self.cluster_name = cluster_name
+        self.cluster_uuid = cluster_uuid
+        self.request_id = request_id
+        self.retry = retry
+        self.timeout = timeout
+        self.metadata = metadata
+        self.gcp_conn_id = gcp_conn_id
+        self.impersonation_chain = impersonation_chain
+        self._hook: DataprocHook | None = None
+
+    @property
+    def hook(self):
+        if self._hook is None:
+            self._hook = DataprocHook(
+                gcp_conn_id=self.gcp_conn_id,
+                impersonation_chain=self.impersonation_chain,
+            )
+        return self._hook
+
+    def _get_project_id(self) -> str:
+        return self.project_id or self.hook.project_id
+
+    def _get_cluster(self) -> Cluster:
+        """Retrieve the cluster information.
+
+        :return: Instance of ``google.cloud.dataproc_v1.Cluster``` class
+        """
+        return self.hook.get_cluster(
+            project_id=self._get_project_id(),
+            region=self.region,
+            cluster_name=self.cluster_name,
+            retry=self.retry,
+            timeout=self.timeout,
+            metadata=self.metadata,
+        )
+
+    def _check_desired_cluster_state(self, cluster: Cluster) -> tuple[bool, str | None]:
+        """Implement this method in child class to return whether the cluster is in desired state or not.
+
+        If the cluster is in desired stated you can return a log message content as a second value
+        for the return tuple.
+
+        :param cluster: Required. Instance of ``google.cloud.dataproc_v1.Cluster``
+            class to interact with Dataproc API
+        :return: Tuple of (Boolean, Optional[str]) The first value of the tuple is whether the cluster is
+            in desired state or not. The second value of the tuple will use if you want to log something when
+            the cluster is in desired state already.
+        """
+        raise NotImplementedError
+
+    def _get_operation(self) -> operation.Operation:
+        """Implement this method in child class to call the related hook method and return its result.
+
+        :return: ``google.api_core.operation.Operation`` value whether the cluster is in desired state or not
+        """
+        raise NotImplementedError
+
+    def execute(self, context: Context) -> dict | None:
+        cluster: Cluster = self._get_cluster()
+        is_already_desired_state, log_str = self._check_desired_cluster_state(cluster)
+        if is_already_desired_state:
+            self.log.info(log_str)
+            return None
+
+        op: operation.Operation = self._get_operation()
+        result = self.hook.wait_for_operation(timeout=self.timeout, result_retry=self.retry, operation=op)
+        return Cluster.to_dict(result)
+
+
+class DataprocStartClusterOperator(_DataprocStartStopClusterBaseOperator):
+    """Start a cluster in a project."""
+
+    operator_extra_links = (DataprocClusterLink(),)
+
+    def execute(self, context: Context) -> dict | None:
+        self.log.info("Starting the cluster: %s", self.cluster_name)
+        cluster = super().execute(context)
+        DataprocClusterLink.persist(
+            context=context,
+            operator=self,
+            cluster_id=self.cluster_name,
+            project_id=self._get_project_id(),
+            region=self.region,
+        )
+        self.log.info("Cluster started")
+        return cluster
+
+    def _check_desired_cluster_state(self, cluster: Cluster) -> tuple[bool, str | None]:
+        if cluster.status.state == cluster.status.State.RUNNING:
+            return True, f'The cluster "{self.cluster_name}" already running!'
+        return False, None
+
+    def _get_operation(self) -> operation.Operation:
+        return self.hook.start_cluster(
+            region=self.region,
+            project_id=self._get_project_id(),
+            cluster_name=self.cluster_name,
+            cluster_uuid=self.cluster_uuid,
+            retry=self.retry,
+            timeout=self.timeout,
+            metadata=self.metadata,
+        )
+
+
+class DataprocStopClusterOperator(_DataprocStartStopClusterBaseOperator):
+    """Stop a cluster in a project."""
+
+    def execute(self, context: Context) -> dict | None:
+        self.log.info("Stopping the cluster: %s", self.cluster_name)
+        cluster = super().execute(context)
+        self.log.info("Cluster stopped")
+        return cluster
+
+    def _check_desired_cluster_state(self, cluster: Cluster) -> tuple[bool, str | None]:
+        if cluster.status.state in [cluster.status.State.STOPPED, cluster.status.State.STOPPING]:
+            return True, f'The cluster "{self.cluster_name}" already stopped!'
+        return False, None
+
+    def _get_operation(self) -> operation.Operation:
+        return self.hook.stop_cluster(
+            region=self.region,
+            project_id=self._get_project_id(),
+            cluster_name=self.cluster_name,
+            cluster_uuid=self.cluster_uuid,
+            retry=self.retry,
+            timeout=self.timeout,
+            metadata=self.metadata,
+        )
+
+
 class DataprocJobBaseOperator(GoogleCloudBaseOperator):
     """Base class for operators that launch job on DataProc.
 
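The two new operators use the usual Dataproc operator interface (`cluster_name`, `region`, optional `project_id`). A minimal sketch of wiring them into a DAG; all identifiers below are placeholders:

from airflow.providers.google.cloud.operators.dataproc import (
    DataprocStartClusterOperator,
    DataprocStopClusterOperator,
)

# Placeholder values throughout; per the code above, each operator only logs and
# returns None if the cluster is already in the requested state.
start_cluster = DataprocStartClusterOperator(
    task_id="start_cluster",
    project_id="my-project",
    region="us-central1",
    cluster_name="example-cluster",
)

stop_cluster = DataprocStopClusterOperator(
    task_id="stop_cluster",
    project_id="my-project",
    region="us-central1",
    cluster_name="example-cluster",
)

start_cluster >> stop_cluster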
@@ -1250,6 +1489,15 @@ class DataprocJobBaseOperator(GoogleCloudBaseOperator):
             self.hook.cancel_job(project_id=self.project_id, job_id=self.dataproc_job_id, region=self.region)
 
 
+# TODO: Remove one day
+@deprecated(
+    reason=(
+        "Please use `DataprocSubmitJobOperator` instead. "
+        "You can use `generate_job` method to generate dictionary representing your job "
+        "and use it with the new operator."
+    ),
+    category=AirflowProviderDeprecationWarning,
+)
 class DataprocSubmitPigJobOperator(DataprocJobBaseOperator):
     """Start a Pig query Job on a Cloud DataProc cluster.
 
@@ -1324,15 +1572,6 @@ class DataprocSubmitPigJobOperator(DataprocJobBaseOperator):
         dataproc_jars: list[str] | None = None,
         **kwargs,
     ) -> None:
-        # TODO: Remove one day
-        warnings.warn(
-            "The `{cls}` operator is deprecated, please use `DataprocSubmitJobOperator` instead. You can use"
-            " `generate_job` method of `{cls}` to generate dictionary representing your job"
-            " and use it with the new operator.".format(cls=type(self).__name__),
-            AirflowProviderDeprecationWarning,
-            stacklevel=2,
-        )
-
         super().__init__(
             impersonation_chain=impersonation_chain,
             region=region,
@@ -1376,6 +1615,15 @@ class DataprocSubmitPigJobOperator(DataprocJobBaseOperator):
         super().execute(context)
 
 
+# TODO: Remove one day
+@deprecated(
+    reason=(
+        "Please use `DataprocSubmitJobOperator` instead. "
+        "You can use `generate_job` method to generate dictionary representing your job "
+        "and use it with the new operator."
+    ),
+    category=AirflowProviderDeprecationWarning,
+)
 class DataprocSubmitHiveJobOperator(DataprocJobBaseOperator):
     """Start a Hive query Job on a Cloud DataProc cluster.
 
@@ -1416,15 +1664,6 @@ class DataprocSubmitHiveJobOperator(DataprocJobBaseOperator):
         dataproc_jars: list[str] | None = None,
         **kwargs,
     ) -> None:
-        # TODO: Remove one day
-        warnings.warn(
-            "The `{cls}` operator is deprecated, please use `DataprocSubmitJobOperator` instead. You can use"
-            " `generate_job` method of `{cls}` to generate dictionary representing your job"
-            " and use it with the new operator.".format(cls=type(self).__name__),
-            AirflowProviderDeprecationWarning,
-            stacklevel=2,
-        )
-
         super().__init__(
             impersonation_chain=impersonation_chain,
             region=region,
@@ -1468,6 +1707,15 @@ class DataprocSubmitHiveJobOperator(DataprocJobBaseOperator):
         super().execute(context)
 
 
+# TODO: Remove one day
+@deprecated(
+    reason=(
+        "Please use `DataprocSubmitJobOperator` instead. "
+        "You can use `generate_job` method to generate dictionary representing your job "
+        "and use it with the new operator."
+    ),
+    category=AirflowProviderDeprecationWarning,
+)
 class DataprocSubmitSparkSqlJobOperator(DataprocJobBaseOperator):
     """Start a Spark SQL query Job on a Cloud DataProc cluster.
 
@@ -1509,15 +1757,6 @@ class DataprocSubmitSparkSqlJobOperator(DataprocJobBaseOperator):
         dataproc_jars: list[str] | None = None,
         **kwargs,
     ) -> None:
-        # TODO: Remove one day
-        warnings.warn(
-            "The `{cls}` operator is deprecated, please use `DataprocSubmitJobOperator` instead. You can use"
-            " `generate_job` method of `{cls}` to generate dictionary representing your job"
-            " and use it with the new operator.".format(cls=type(self).__name__),
-            AirflowProviderDeprecationWarning,
-            stacklevel=2,
-        )
-
         super().__init__(
             impersonation_chain=impersonation_chain,
             region=region,
@@ -1559,6 +1798,15 @@ class DataprocSubmitSparkSqlJobOperator(DataprocJobBaseOperator):
         super().execute(context)
 
 
+# TODO: Remove one day
+@deprecated(
+    reason=(
+        "Please use `DataprocSubmitJobOperator` instead. "
+        "You can use `generate_job` method to generate dictionary representing your job "
+        "and use it with the new operator."
+    ),
+    category=AirflowProviderDeprecationWarning,
+)
 class DataprocSubmitSparkJobOperator(DataprocJobBaseOperator):
     """Start a Spark Job on a Cloud DataProc cluster.
 
@@ -1604,15 +1852,6 @@ class DataprocSubmitSparkJobOperator(DataprocJobBaseOperator):
         dataproc_jars: list[str] | None = None,
         **kwargs,
     ) -> None:
-        # TODO: Remove one day
-        warnings.warn(
-            "The `{cls}` operator is deprecated, please use `DataprocSubmitJobOperator` instead. You can use"
-            " `generate_job` method of `{cls}` to generate dictionary representing your job"
-            " and use it with the new operator.".format(cls=type(self).__name__),
-            AirflowProviderDeprecationWarning,
-            stacklevel=2,
-        )
-
         super().__init__(
             impersonation_chain=impersonation_chain,
             region=region,
@@ -1650,6 +1889,15 @@ class DataprocSubmitSparkJobOperator(DataprocJobBaseOperator):
         super().execute(context)
 
 
+# TODO: Remove one day
+@deprecated(
+    reason=(
+        "Please use `DataprocSubmitJobOperator` instead. "
+        "You can use `generate_job` method to generate dictionary representing your job "
+        "and use it with the new operator."
+    ),
+    category=AirflowProviderDeprecationWarning,
+)
 class DataprocSubmitHadoopJobOperator(DataprocJobBaseOperator):
     """Start a Hadoop Job on a Cloud DataProc cluster.
 
@@ -1695,15 +1943,6 @@ class DataprocSubmitHadoopJobOperator(DataprocJobBaseOperator):
         dataproc_jars: list[str] | None = None,
         **kwargs,
     ) -> None:
-        # TODO: Remove one day
-        warnings.warn(
-            "The `{cls}` operator is deprecated, please use `DataprocSubmitJobOperator` instead. You can use"
-            " `generate_job` method of `{cls}` to generate dictionary representing your job"
-            " and use it with the new operator.".format(cls=type(self).__name__),
-            AirflowProviderDeprecationWarning,
-            stacklevel=2,
-        )
-
         super().__init__(
             impersonation_chain=impersonation_chain,
             region=region,
@@ -1740,6 +1979,15 @@ class DataprocSubmitHadoopJobOperator(DataprocJobBaseOperator):
         super().execute(context)
 
 
+# TODO: Remove one day
+@deprecated(
+    reason=(
+        "Please use `DataprocSubmitJobOperator` instead. "
+        "You can use `generate_job` method to generate dictionary representing your job "
+        "and use it with the new operator."
+    ),
+    category=AirflowProviderDeprecationWarning,
+)
 class DataprocSubmitPySparkJobOperator(DataprocJobBaseOperator):
     """Start a PySpark Job on a Cloud DataProc cluster.
 
@@ -1809,15 +2057,6 @@ class DataprocSubmitPySparkJobOperator(DataprocJobBaseOperator):
         dataproc_jars: list[str] | None = None,
         **kwargs,
     ) -> None:
-        # TODO: Remove one day
-        warnings.warn(
-            "The `{cls}` operator is deprecated, please use `DataprocSubmitJobOperator` instead. You can use"
-            " `generate_job` method of `{cls}` to generate dictionary representing your job"
-            " and use it with the new operator.".format(cls=type(self).__name__),
-            AirflowProviderDeprecationWarning,
-            stacklevel=2,
-        )
-
         super().__init__(
            impersonation_chain=impersonation_chain,
            region=region,
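With the warning moved from `__init__` to a class-level `@deprecated` decorator, the migration path stays the same: express the job as a plain Dataproc job dict and hand it to `DataprocSubmitJobOperator`. A rough sketch for the Pig case; project, cluster and query values are placeholders:

from airflow.providers.google.cloud.operators.dataproc import DataprocSubmitJobOperator

# Placeholder job definition, roughly equivalent to what DataprocSubmitPigJobOperator builds.
pig_job = {
    "reference": {"project_id": "my-project"},
    "placement": {"cluster_name": "example-cluster"},
    "pig_job": {"query_list": {"queries": ["DEFINE sin HiveUDF('sin');"]}},
}

submit_pig = DataprocSubmitJobOperator(
    task_id="submit_pig",
    project_id="my-project",
    region="us-central1",
    job=pig_job,
)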
@@ -2054,7 +2293,7 @@ class DataprocInstantiateWorkflowTemplateOperator(GoogleCloudBaseOperator):
             self.log.info("Workflow %s completed successfully", workflow_id)
         else:
             self.defer(
-                trigger=DataprocWorkflowTrigger(
+                trigger=DataprocOperationTrigger(
                     name=operation_name,
                     project_id=self.project_id,
                     region=self.region,
@@ -2196,7 +2435,7 @@ class DataprocInstantiateInlineWorkflowTemplateOperator(GoogleCloudBaseOperator)
             self.log.info("Workflow %s completed successfully", workflow_id)
         else:
             self.defer(
-                trigger=DataprocWorkflowTrigger(
+                trigger=DataprocOperationTrigger(
                     name=operation_name,
                     project_id=self.project_id or hook.project_id,
                     region=self.region,
@@ -2530,6 +2769,142 @@ class DataprocUpdateClusterOperator(GoogleCloudBaseOperator):
         self.log.info("%s completed successfully.", self.task_id)
 
 
+class DataprocDiagnoseClusterOperator(GoogleCloudBaseOperator):
+    """Diagnose a cluster in a project.
+
+    After the operation completes, the response contains the Cloud Storage URI of the diagnostic output report containing a summary of collected diagnostics.
+
+    :param region: Required. The Cloud Dataproc region in which to handle the request (templated).
+    :param project_id: Optional. The ID of the Google Cloud project that the cluster belongs to (templated).
+    :param cluster_name: Required. The cluster name (templated).
+    :param tarball_gcs_dir: The output Cloud Storage directory for the diagnostic tarball. If not specified, a task-specific directory in the cluster's staging bucket will be used.
+    :param diagnosis_interval: Time interval in which diagnosis should be carried out on the cluster.
+    :param jobs: Specifies a list of jobs on which diagnosis is to be performed. Format: `projects/{project}/regions/{region}/jobs/{job}`
+    :param yarn_application_ids: Specifies a list of yarn applications on which diagnosis is to be performed.
+    :param metadata: Additional metadata that is provided to the method.
+    :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be
+        retried.
+    :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if
+        ``retry`` is specified, the timeout applies to each individual attempt.
+    :param gcp_conn_id: The connection ID to use connecting to Google Cloud.
+    :param impersonation_chain: Optional service account to impersonate using short-term
+        credentials, or chained list of accounts required to get the access_token
+        of the last account in the list, which will be impersonated in the request.
+        If set as a string, the account must grant the originating account
+        the Service Account Token Creator IAM role.
+        If set as a sequence, the identities from the list must grant
+        Service Account Token Creator IAM role to the directly preceding identity, with first
+        account from the list granting this role to the originating account (templated).
+    :param deferrable: Run operator in the deferrable mode.
+    :param polling_interval_seconds: Time (seconds) to wait between calls to check the cluster status.
+    """
+
+    template_fields: Sequence[str] = (
+        "project_id",
+        "region",
+        "cluster_name",
+        "impersonation_chain",
+        "tarball_gcs_dir",
+        "diagnosis_interval",
+        "jobs",
+        "yarn_application_ids",
+    )
+
+    def __init__(
+        self,
+        *,
+        region: str,
+        cluster_name: str,
+        project_id: str | None = None,
+        tarball_gcs_dir: str | None = None,
+        diagnosis_interval: dict | Interval | None = None,
+        jobs: MutableSequence[str] | None = None,
+        yarn_application_ids: MutableSequence[str] | None = None,
+        retry: AsyncRetry | _MethodDefault = DEFAULT,
+        timeout: float = 1 * 60 * 60,
+        metadata: Sequence[tuple[str, str]] = (),
+        gcp_conn_id: str = "google_cloud_default",
+        impersonation_chain: str | Sequence[str] | None = None,
+        deferrable: bool = conf.getboolean("operators", "default_deferrable", fallback=False),
+        polling_interval_seconds: int = 10,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if deferrable and polling_interval_seconds <= 0:
+            raise ValueError("Invalid value for polling_interval_seconds. Expected value greater than 0")
+        self.project_id = project_id
+        self.region = region
+        self.cluster_name = cluster_name
+        self.tarball_gcs_dir = tarball_gcs_dir
+        self.diagnosis_interval = diagnosis_interval
+        self.jobs = jobs
+        self.yarn_application_ids = yarn_application_ids
+        self.retry = retry
+        self.timeout = timeout
+        self.metadata = metadata
+        self.gcp_conn_id = gcp_conn_id
+        self.impersonation_chain = impersonation_chain
+        self.deferrable = deferrable
+        self.polling_interval_seconds = polling_interval_seconds
+
+    def execute(self, context: Context):
+        hook = DataprocHook(gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain)
+        self.log.info("Collecting diagnostic tarball for cluster: %s", self.cluster_name)
+        operation = hook.diagnose_cluster(
+            region=self.region,
+            cluster_name=self.cluster_name,
+            project_id=self.project_id,
+            tarball_gcs_dir=self.tarball_gcs_dir,
+            diagnosis_interval=self.diagnosis_interval,
+            jobs=self.jobs,
+            yarn_application_ids=self.yarn_application_ids,
+            retry=self.retry,
+            timeout=self.timeout,
+            metadata=self.metadata,
+        )
+
+        if not self.deferrable:
+            result = hook.wait_for_operation(
+                timeout=self.timeout, result_retry=self.retry, operation=operation
+            )
+            self.log.info(
+                "The diagnostic output for cluster %s is available at: %s",
+                self.cluster_name,
+                result.output_uri,
+            )
+        else:
+            self.defer(
+                trigger=DataprocOperationTrigger(
+                    name=operation.operation.name,
+                    operation_type=DataprocOperationType.DIAGNOSE.value,
+                    project_id=self.project_id,
+                    region=self.region,
+                    gcp_conn_id=self.gcp_conn_id,
+                    impersonation_chain=self.impersonation_chain,
+                    polling_interval_seconds=self.polling_interval_seconds,
+                ),
+                method_name="execute_complete",
+            )
+
+    def execute_complete(self, context: Context, event: dict[str, Any] | None = None) -> None:
+        """Callback for when the trigger fires.
+
+        This returns immediately. It relies on trigger to throw an exception,
+        otherwise it assumes execution was successful.
+        """
+        if event:
+            status = event.get("status")
+            if status in ("failed", "error"):
+                self.log.exception("Unexpected error in the operation.")
+                raise AirflowException(event.get("message"))
+
+            self.log.info(
+                "The diagnostic output for cluster %s is available at: %s",
+                self.cluster_name,
+                event.get("output_uri"),
+            )
+
+
 class DataprocCreateBatchOperator(GoogleCloudBaseOperator):
     """Create a batch workload.
 
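For completeness, a sketch of how the new diagnose operator might be scheduled; the identifiers below are placeholders and only parameters shown in the `__init__` above are used:

from airflow.providers.google.cloud.operators.dataproc import DataprocDiagnoseClusterOperator

# Placeholder usage of the new operator; with deferrable=True the long-running
# diagnose operation is awaited by the triggerer via DataprocOperationTrigger.
diagnose_cluster = DataprocDiagnoseClusterOperator(
    task_id="diagnose_cluster",
    project_id="my-project",
    region="us-central1",
    cluster_name="example-cluster",
    deferrable=True,
)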