apache-airflow-providers-google 10.14.0rc1__py3-none-any.whl → 10.15.0rc1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (126)
  1. airflow/providers/google/__init__.py +1 -1
  2. airflow/providers/google/ads/hooks/ads.py +1 -2
  3. airflow/providers/google/cloud/hooks/automl.py +13 -13
  4. airflow/providers/google/cloud/hooks/bigquery.py +208 -256
  5. airflow/providers/google/cloud/hooks/bigquery_dts.py +6 -6
  6. airflow/providers/google/cloud/hooks/bigtable.py +8 -8
  7. airflow/providers/google/cloud/hooks/cloud_batch.py +1 -1
  8. airflow/providers/google/cloud/hooks/cloud_build.py +19 -20
  9. airflow/providers/google/cloud/hooks/cloud_composer.py +4 -4
  10. airflow/providers/google/cloud/hooks/cloud_memorystore.py +10 -10
  11. airflow/providers/google/cloud/hooks/cloud_run.py +1 -1
  12. airflow/providers/google/cloud/hooks/cloud_sql.py +18 -19
  13. airflow/providers/google/cloud/hooks/cloud_storage_transfer_service.py +3 -3
  14. airflow/providers/google/cloud/hooks/compute.py +16 -16
  15. airflow/providers/google/cloud/hooks/compute_ssh.py +1 -1
  16. airflow/providers/google/cloud/hooks/datacatalog.py +22 -22
  17. airflow/providers/google/cloud/hooks/dataflow.py +48 -49
  18. airflow/providers/google/cloud/hooks/dataform.py +16 -16
  19. airflow/providers/google/cloud/hooks/datafusion.py +15 -15
  20. airflow/providers/google/cloud/hooks/datapipeline.py +3 -3
  21. airflow/providers/google/cloud/hooks/dataplex.py +19 -19
  22. airflow/providers/google/cloud/hooks/dataprep.py +10 -10
  23. airflow/providers/google/cloud/hooks/dataproc.py +132 -14
  24. airflow/providers/google/cloud/hooks/dataproc_metastore.py +13 -13
  25. airflow/providers/google/cloud/hooks/datastore.py +3 -3
  26. airflow/providers/google/cloud/hooks/dlp.py +25 -25
  27. airflow/providers/google/cloud/hooks/gcs.py +39 -27
  28. airflow/providers/google/cloud/hooks/gdm.py +3 -3
  29. airflow/providers/google/cloud/hooks/kms.py +3 -3
  30. airflow/providers/google/cloud/hooks/kubernetes_engine.py +63 -48
  31. airflow/providers/google/cloud/hooks/life_sciences.py +13 -12
  32. airflow/providers/google/cloud/hooks/looker.py +8 -9
  33. airflow/providers/google/cloud/hooks/mlengine.py +12 -12
  34. airflow/providers/google/cloud/hooks/natural_language.py +2 -2
  35. airflow/providers/google/cloud/hooks/os_login.py +1 -1
  36. airflow/providers/google/cloud/hooks/pubsub.py +9 -9
  37. airflow/providers/google/cloud/hooks/secret_manager.py +1 -1
  38. airflow/providers/google/cloud/hooks/spanner.py +11 -11
  39. airflow/providers/google/cloud/hooks/speech_to_text.py +1 -1
  40. airflow/providers/google/cloud/hooks/stackdriver.py +7 -7
  41. airflow/providers/google/cloud/hooks/tasks.py +11 -11
  42. airflow/providers/google/cloud/hooks/text_to_speech.py +1 -1
  43. airflow/providers/google/cloud/hooks/translate.py +1 -1
  44. airflow/providers/google/cloud/hooks/vertex_ai/auto_ml.py +13 -13
  45. airflow/providers/google/cloud/hooks/vertex_ai/batch_prediction_job.py +6 -6
  46. airflow/providers/google/cloud/hooks/vertex_ai/custom_job.py +45 -50
  47. airflow/providers/google/cloud/hooks/vertex_ai/dataset.py +13 -13
  48. airflow/providers/google/cloud/hooks/vertex_ai/endpoint_service.py +9 -9
  49. airflow/providers/google/cloud/hooks/vertex_ai/hyperparameter_tuning_job.py +128 -11
  50. airflow/providers/google/cloud/hooks/vertex_ai/model_service.py +10 -10
  51. airflow/providers/google/cloud/hooks/vertex_ai/pipeline_job.py +8 -8
  52. airflow/providers/google/cloud/hooks/video_intelligence.py +2 -2
  53. airflow/providers/google/cloud/hooks/vision.py +1 -1
  54. airflow/providers/google/cloud/hooks/workflows.py +10 -10
  55. airflow/providers/google/cloud/links/datafusion.py +12 -5
  56. airflow/providers/google/cloud/operators/bigquery.py +11 -11
  57. airflow/providers/google/cloud/operators/cloud_storage_transfer_service.py +3 -1
  58. airflow/providers/google/cloud/operators/dataflow.py +16 -16
  59. airflow/providers/google/cloud/operators/datafusion.py +9 -1
  60. airflow/providers/google/cloud/operators/dataproc.py +444 -69
  61. airflow/providers/google/cloud/operators/kubernetes_engine.py +6 -6
  62. airflow/providers/google/cloud/operators/life_sciences.py +10 -9
  63. airflow/providers/google/cloud/operators/mlengine.py +96 -96
  64. airflow/providers/google/cloud/operators/pubsub.py +2 -0
  65. airflow/providers/google/cloud/operators/vertex_ai/custom_job.py +33 -3
  66. airflow/providers/google/cloud/operators/vertex_ai/hyperparameter_tuning_job.py +59 -2
  67. airflow/providers/google/cloud/secrets/secret_manager.py +8 -7
  68. airflow/providers/google/cloud/sensors/bigquery.py +20 -16
  69. airflow/providers/google/cloud/sensors/cloud_composer.py +11 -8
  70. airflow/providers/google/cloud/sensors/dataproc_metastore.py +12 -2
  71. airflow/providers/google/cloud/sensors/gcs.py +8 -7
  72. airflow/providers/google/cloud/transfers/bigquery_to_gcs.py +1 -0
  73. airflow/providers/google/cloud/transfers/cassandra_to_gcs.py +4 -4
  74. airflow/providers/google/cloud/transfers/gcs_to_bigquery.py +1 -0
  75. airflow/providers/google/cloud/transfers/gcs_to_sftp.py +1 -1
  76. airflow/providers/google/cloud/transfers/mssql_to_gcs.py +1 -1
  77. airflow/providers/google/cloud/transfers/mysql_to_gcs.py +1 -1
  78. airflow/providers/google/cloud/transfers/oracle_to_gcs.py +1 -1
  79. airflow/providers/google/cloud/transfers/postgres_to_gcs.py +1 -1
  80. airflow/providers/google/cloud/transfers/presto_to_gcs.py +1 -1
  81. airflow/providers/google/cloud/transfers/s3_to_gcs.py +3 -3
  82. airflow/providers/google/cloud/transfers/sftp_to_gcs.py +1 -1
  83. airflow/providers/google/cloud/transfers/sql_to_gcs.py +3 -3
  84. airflow/providers/google/cloud/transfers/trino_to_gcs.py +1 -1
  85. airflow/providers/google/cloud/triggers/bigquery.py +12 -12
  86. airflow/providers/google/cloud/triggers/bigquery_dts.py +1 -1
  87. airflow/providers/google/cloud/triggers/cloud_batch.py +3 -1
  88. airflow/providers/google/cloud/triggers/cloud_build.py +2 -2
  89. airflow/providers/google/cloud/triggers/cloud_run.py +1 -1
  90. airflow/providers/google/cloud/triggers/cloud_storage_transfer_service.py +6 -6
  91. airflow/providers/google/cloud/triggers/dataflow.py +3 -1
  92. airflow/providers/google/cloud/triggers/datafusion.py +2 -2
  93. airflow/providers/google/cloud/triggers/dataplex.py +2 -2
  94. airflow/providers/google/cloud/triggers/dataproc.py +34 -14
  95. airflow/providers/google/cloud/triggers/gcs.py +12 -8
  96. airflow/providers/google/cloud/triggers/kubernetes_engine.py +2 -2
  97. airflow/providers/google/cloud/triggers/mlengine.py +2 -2
  98. airflow/providers/google/cloud/triggers/pubsub.py +1 -1
  99. airflow/providers/google/cloud/triggers/vertex_ai.py +99 -0
  100. airflow/providers/google/cloud/utils/bigquery.py +2 -2
  101. airflow/providers/google/cloud/utils/credentials_provider.py +2 -2
  102. airflow/providers/google/cloud/utils/dataform.py +1 -1
  103. airflow/providers/google/cloud/utils/dataproc.py +25 -0
  104. airflow/providers/google/cloud/utils/field_validator.py +2 -2
  105. airflow/providers/google/cloud/utils/helpers.py +2 -2
  106. airflow/providers/google/cloud/utils/mlengine_operator_utils.py +1 -1
  107. airflow/providers/google/cloud/utils/mlengine_prediction_summary.py +1 -1
  108. airflow/providers/google/common/auth_backend/google_openid.py +2 -2
  109. airflow/providers/google/common/hooks/base_google.py +87 -23
  110. airflow/providers/google/common/hooks/discovery_api.py +2 -2
  111. airflow/providers/google/common/utils/id_token_credentials.py +5 -5
  112. airflow/providers/google/firebase/hooks/firestore.py +3 -3
  113. airflow/providers/google/get_provider_info.py +7 -2
  114. airflow/providers/google/leveldb/hooks/leveldb.py +4 -4
  115. airflow/providers/google/marketing_platform/hooks/analytics.py +11 -14
  116. airflow/providers/google/marketing_platform/hooks/campaign_manager.py +11 -11
  117. airflow/providers/google/marketing_platform/hooks/display_video.py +13 -13
  118. airflow/providers/google/marketing_platform/hooks/search_ads.py +4 -4
  119. airflow/providers/google/marketing_platform/operators/analytics.py +37 -32
  120. airflow/providers/google/suite/hooks/calendar.py +2 -2
  121. airflow/providers/google/suite/hooks/drive.py +7 -7
  122. airflow/providers/google/suite/hooks/sheets.py +8 -8
  123. {apache_airflow_providers_google-10.14.0rc1.dist-info → apache_airflow_providers_google-10.15.0rc1.dist-info}/METADATA +11 -11
  124. {apache_airflow_providers_google-10.14.0rc1.dist-info → apache_airflow_providers_google-10.15.0rc1.dist-info}/RECORD +126 -124
  125. {apache_airflow_providers_google-10.14.0rc1.dist-info → apache_airflow_providers_google-10.15.0rc1.dist-info}/WHEEL +0 -0
  126. {apache_airflow_providers_google-10.14.0rc1.dist-info → apache_airflow_providers_google-10.15.0rc1.dist-info}/entry_points.txt +0 -0
airflow/providers/google/cloud/operators/dataproc.py

@@ -25,11 +25,13 @@ import re
 import time
 import uuid
 import warnings
+from collections.abc import MutableSequence
 from dataclasses import dataclass
 from datetime import datetime, timedelta
 from enum import Enum
 from typing import TYPE_CHECKING, Any, Sequence

+from deprecated import deprecated
 from google.api_core.exceptions import AlreadyExists, NotFound
 from google.api_core.gapic_v1.method import DEFAULT, _MethodDefault
 from google.api_core.retry import Retry, exponential_sleep_generator
@@ -56,9 +58,10 @@ from airflow.providers.google.cloud.triggers.dataproc import (
     DataprocBatchTrigger,
     DataprocClusterTrigger,
     DataprocDeleteClusterTrigger,
+    DataprocOperationTrigger,
     DataprocSubmitTrigger,
-    DataprocWorkflowTrigger,
 )
+from airflow.providers.google.cloud.utils.dataproc import DataprocOperationType
 from airflow.utils import timezone

 if TYPE_CHECKING:
@@ -66,6 +69,7 @@ if TYPE_CHECKING:
     from google.api_core.retry_async import AsyncRetry
     from google.protobuf.duration_pb2 import Duration
     from google.protobuf.field_mask_pb2 import FieldMask
+    from google.type.interval_pb2 import Interval

     from airflow.utils.context import Context

@@ -155,12 +159,18 @@ class ClusterGenerator:
         Valid values: ``pd-ssd`` (Persistent Disk Solid State Drive) or
         ``pd-standard`` (Persistent Disk Hard Disk Drive).
     :param master_disk_size: Disk size for the primary node
+    :param master_accelerator_type: Type of the accelerator card (GPU) to attach to the primary node,
+        see https://cloud.google.com/dataproc/docs/reference/rest/v1/InstanceGroupConfig#acceleratorconfig
+    :param master_accelerator_count: Number of accelerator cards (GPUs) to attach to the primary node
     :param worker_machine_type: Compute engine machine type to use for the worker nodes
     :param worker_disk_type: Type of the boot disk for the worker node
         (default is ``pd-standard``).
         Valid values: ``pd-ssd`` (Persistent Disk Solid State Drive) or
         ``pd-standard`` (Persistent Disk Hard Disk Drive).
     :param worker_disk_size: Disk size for the worker nodes
+    :param worker_accelerator_type: Type of the accelerator card (GPU) to attach to the worker nodes,
+        see https://cloud.google.com/dataproc/docs/reference/rest/v1/InstanceGroupConfig#acceleratorconfig
+    :param worker_accelerator_count: Number of accelerator cards (GPUs) to attach to the worker nodes
     :param num_preemptible_workers: The # of VM instances in the instance group as secondary workers
         inside the cluster with Preemptibility enabled by default.
         Note, that it is not possible to mix non-preemptible and preemptible secondary workers in
@@ -197,6 +207,9 @@ class ClusterGenerator:
         identify the driver group in future operations, such as resizing the node group.
     :param secondary_worker_instance_flexibility_policy: Instance flexibility Policy allowing a mixture of VM
         shapes and provisioning models.
+    :param secondary_worker_accelerator_type: Type of the accelerator card (GPU) to attach to the secondary workers,
+        see https://cloud.google.com/dataproc/docs/reference/rest/v1/InstanceGroupConfig#acceleratorconfig
+    :param secondary_worker_accelerator_count: Number of accelerator cards (GPUs) to attach to the secondary workers
     """

     def __init__(
@@ -224,9 +237,13 @@ class ClusterGenerator:
         master_machine_type: str = "n1-standard-4",
         master_disk_type: str = "pd-standard",
         master_disk_size: int = 1024,
+        master_accelerator_type: str | None = None,
+        master_accelerator_count: int | None = None,
         worker_machine_type: str = "n1-standard-4",
         worker_disk_type: str = "pd-standard",
         worker_disk_size: int = 1024,
+        worker_accelerator_type: str | None = None,
+        worker_accelerator_count: int | None = None,
         num_preemptible_workers: int = 0,
         preemptibility: str = PreemptibilityType.PREEMPTIBLE.value,
         service_account: str | None = None,
@@ -239,6 +256,8 @@ class ClusterGenerator:
         driver_pool_size: int = 0,
         driver_pool_id: str | None = None,
         secondary_worker_instance_flexibility_policy: InstanceFlexibilityPolicy | None = None,
+        secondary_worker_accelerator_type: str | None = None,
+        secondary_worker_accelerator_count: int | None = None,
         **kwargs,
     ) -> None:
         self.project_id = project_id
@@ -260,10 +279,14 @@ class ClusterGenerator:
         self.master_machine_type = master_machine_type
         self.master_disk_type = master_disk_type
         self.master_disk_size = master_disk_size
+        self.master_accelerator_type = master_accelerator_type
+        self.master_accelerator_count = master_accelerator_count
         self.autoscaling_policy = autoscaling_policy
         self.worker_machine_type = worker_machine_type
         self.worker_disk_type = worker_disk_type
         self.worker_disk_size = worker_disk_size
+        self.worker_accelerator_type = worker_accelerator_type
+        self.worker_accelerator_count = worker_accelerator_count
         self.zone = zone
         self.network_uri = network_uri
         self.subnetwork_uri = subnetwork_uri
@@ -280,6 +303,8 @@ class ClusterGenerator:
         self.driver_pool_size = driver_pool_size
         self.driver_pool_id = driver_pool_id
         self.secondary_worker_instance_flexibility_policy = secondary_worker_instance_flexibility_policy
+        self.secondary_worker_accelerator_type = secondary_worker_accelerator_type
+        self.secondary_worker_accelerator_count = secondary_worker_accelerator_count

         if self.custom_image and self.image_version:
             raise ValueError("The custom_image and image_version can't be both set")
@@ -336,10 +361,10 @@ class ClusterGenerator:
         if self.subnetwork_uri:
             cluster_data[config]["subnetwork_uri"] = self.subnetwork_uri

-        if self.internal_ip_only:
-            if not self.subnetwork_uri:
+        if self.internal_ip_only is not None:
+            if not self.subnetwork_uri and self.internal_ip_only:
                 raise AirflowException("Set internal_ip_only to true only when you pass a subnetwork_uri.")
-            cluster_data[config]["internal_ip_only"] = True
+            cluster_data[config]["internal_ip_only"] = self.internal_ip_only

         if self.tags:
             cluster_data[config]["tags"] = self.tags
@@ -420,6 +445,18 @@ class ClusterGenerator:
         if self.min_num_workers:
             cluster_data["worker_config"]["min_num_instances"] = self.min_num_workers

+        if self.master_accelerator_type:
+            cluster_data["master_config"]["accelerators"] = {
+                "accelerator_type_uri": self.master_accelerator_type,
+                "accelerator_count": self.master_accelerator_count,
+            }
+
+        if self.worker_accelerator_type:
+            cluster_data["worker_config"]["accelerators"] = {
+                "accelerator_type_uri": self.worker_accelerator_type,
+                "accelerator_count": self.worker_accelerator_count,
+            }
+
         if self.num_preemptible_workers > 0:
             cluster_data["secondary_worker_config"] = {
                 "num_instances": self.num_preemptible_workers,
@@ -431,6 +468,11 @@ class ClusterGenerator:
                 "is_preemptible": True,
                 "preemptibility": self.preemptibility.value,
             }
+            if self.worker_accelerator_type:
+                cluster_data["secondary_worker_config"]["accelerators"] = {
+                    "accelerator_type_uri": self.secondary_worker_accelerator_type,
+                    "accelerator_count": self.secondary_worker_accelerator_count,
+                }
             if self.secondary_worker_instance_flexibility_policy:
                 cluster_data["secondary_worker_config"]["instance_flexibility_policy"] = {
                     "instance_selection_list": [
@@ -681,10 +723,13 @@ class DataprocCreateClusterOperator(GoogleCloudBaseOperator):
             return
         self.log.info("Cluster is in ERROR state")
         self.log.info("Gathering diagnostic information.")
-        gcs_uri = hook.diagnose_cluster(
+        operation = hook.diagnose_cluster(
             region=self.region, cluster_name=self.cluster_name, project_id=self.project_id
         )
+        operation.result()
+        gcs_uri = str(operation.operation.response.value)
         self.log.info("Diagnostic information for cluster %s available at: %s", self.cluster_name, gcs_uri)
+
         if self.delete_on_error:
             self._delete_cluster(hook)
             # The delete op is asynchronous and can cause further failure if the cluster finishes
@@ -718,6 +763,17 @@ class DataprocCreateClusterOperator(GoogleCloudBaseOperator):
         cluster = self._get_cluster(hook)
         return cluster

+    def _start_cluster(self, hook: DataprocHook):
+        op: operation.Operation = hook.start_cluster(
+            region=self.region,
+            project_id=self.project_id,
+            cluster_name=self.cluster_name,
+            retry=self.retry,
+            timeout=self.timeout,
+            metadata=self.metadata,
+        )
+        return hook.wait_for_operation(timeout=self.timeout, result_retry=self.retry, operation=op)
+
     def execute(self, context: Context) -> dict:
         self.log.info("Creating cluster: %s", self.cluster_name)
         hook = DataprocHook(gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain)
@@ -795,6 +851,9 @@ class DataprocCreateClusterOperator(GoogleCloudBaseOperator):
             # Create new cluster
             cluster = self._create_cluster(hook)
             self._handle_error_state(hook, cluster)
+        elif cluster.status.state == cluster.status.State.STOPPED:
+            # if the cluster exists and already stopped, then start the cluster
+            self._start_cluster(hook)

         return Cluster.to_dict(cluster)

@@ -814,6 +873,11 @@ class DataprocCreateClusterOperator(GoogleCloudBaseOperator):
         return event["cluster"]


+# TODO: Remove one day
+@deprecated(
+    reason="Please use `DataprocUpdateClusterOperator` instead.",
+    category=AirflowProviderDeprecationWarning,
+)
 class DataprocScaleClusterOperator(GoogleCloudBaseOperator):
     """Scale, up or down, a cluster on Google Cloud Dataproc.

@@ -882,14 +946,6 @@ class DataprocScaleClusterOperator(GoogleCloudBaseOperator):
         self.gcp_conn_id = gcp_conn_id
         self.impersonation_chain = impersonation_chain

-        # TODO: Remove one day
-        warnings.warn(
-            f"The `{type(self).__name__}` operator is deprecated, "
-            "please use `DataprocUpdateClusterOperator` instead.",
-            AirflowProviderDeprecationWarning,
-            stacklevel=2,
-        )
-

    def _build_scale_cluster_data(self) -> dict:
        scale_data = {
@@ -1076,6 +1132,189 @@ class DataprocDeleteClusterOperator(GoogleCloudBaseOperator):
         )


+class _DataprocStartStopClusterBaseOperator(GoogleCloudBaseOperator):
+    """Base class to start or stop a cluster in a project.
+
+    :param cluster_name: Required. Name of the cluster to create
+    :param region: Required. The specified region where the dataproc cluster is created.
+    :param project_id: Optional. The ID of the Google Cloud project the cluster belongs to.
+    :param cluster_uuid: Optional. Specifying the ``cluster_uuid`` means the RPC should fail
+        if cluster with specified UUID does not exist.
+    :param request_id: Optional. A unique id used to identify the request. If the server receives two
+        ``DeleteClusterRequest`` requests with the same id, then the second request will be ignored and the
+        first ``google.longrunning.Operation`` created and stored in the backend is returned.
+    :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be
+        retried.
+    :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if
+        ``retry`` is specified, the timeout applies to each individual attempt.
+    :param metadata: Additional metadata that is provided to the method.
+    :param gcp_conn_id: The connection ID to use connecting to Google Cloud.
+    :param impersonation_chain: Optional service account to impersonate using short-term
+        credentials, or chained list of accounts required to get the access_token
+        of the last account in the list, which will be impersonated in the request.
+        If set as a string, the account must grant the originating account
+        the Service Account Token Creator IAM role.
+        If set as a sequence, the identities from the list must grant
+        Service Account Token Creator IAM role to the directly preceding identity, with first
+        account from the list granting this role to the originating account (templated).
+    """
+
+    template_fields = (
+        "cluster_name",
+        "region",
+        "project_id",
+        "request_id",
+        "impersonation_chain",
+    )
+
+    def __init__(
+        self,
+        *,
+        cluster_name: str,
+        region: str,
+        project_id: str | None = None,
+        cluster_uuid: str | None = None,
+        request_id: str | None = None,
+        retry: AsyncRetry | _MethodDefault = DEFAULT,
+        timeout: float = 1 * 60 * 60,
+        metadata: Sequence[tuple[str, str]] = (),
+        gcp_conn_id: str = "google_cloud_default",
+        impersonation_chain: str | Sequence[str] | None = None,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.project_id = project_id
+        self.region = region
+        self.cluster_name = cluster_name
+        self.cluster_uuid = cluster_uuid
+        self.request_id = request_id
+        self.retry = retry
+        self.timeout = timeout
+        self.metadata = metadata
+        self.gcp_conn_id = gcp_conn_id
+        self.impersonation_chain = impersonation_chain
+        self._hook: DataprocHook | None = None
+
+    @property
+    def hook(self):
+        if self._hook is None:
+            self._hook = DataprocHook(
+                gcp_conn_id=self.gcp_conn_id,
+                impersonation_chain=self.impersonation_chain,
+            )
+        return self._hook
+
+    def _get_project_id(self) -> str:
+        return self.project_id or self.hook.project_id
+
+    def _get_cluster(self) -> Cluster:
+        """Retrieve the cluster information.
+
+        :return: Instance of ``google.cloud.dataproc_v1.Cluster``` class
+        """
+        return self.hook.get_cluster(
+            project_id=self._get_project_id(),
+            region=self.region,
+            cluster_name=self.cluster_name,
+            retry=self.retry,
+            timeout=self.timeout,
+            metadata=self.metadata,
+        )
+
+    def _check_desired_cluster_state(self, cluster: Cluster) -> tuple[bool, str | None]:
+        """Implement this method in child class to return whether the cluster is in desired state or not.
+
+        If the cluster is in desired stated you can return a log message content as a second value
+        for the return tuple.
+
+        :param cluster: Required. Instance of ``google.cloud.dataproc_v1.Cluster``
+            class to interact with Dataproc API
+        :return: Tuple of (Boolean, Optional[str]) The first value of the tuple is whether the cluster is
+            in desired state or not. The second value of the tuple will use if you want to log something when
+            the cluster is in desired state already.
+        """
+        raise NotImplementedError
+
+    def _get_operation(self) -> operation.Operation:
+        """Implement this method in child class to call the related hook method and return its result.
+
+        :return: ``google.api_core.operation.Operation`` value whether the cluster is in desired state or not
+        """
+        raise NotImplementedError
+
+    def execute(self, context: Context) -> dict | None:
+        cluster: Cluster = self._get_cluster()
+        is_already_desired_state, log_str = self._check_desired_cluster_state(cluster)
+        if is_already_desired_state:
+            self.log.info(log_str)
+            return None
+
+        op: operation.Operation = self._get_operation()
+        result = self.hook.wait_for_operation(timeout=self.timeout, result_retry=self.retry, operation=op)
+        return Cluster.to_dict(result)
+
+
+class DataprocStartClusterOperator(_DataprocStartStopClusterBaseOperator):
+    """Start a cluster in a project."""
+
+    operator_extra_links = (DataprocClusterLink(),)
+
+    def execute(self, context: Context) -> dict | None:
+        self.log.info("Starting the cluster: %s", self.cluster_name)
+        cluster = super().execute(context)
+        DataprocClusterLink.persist(
+            context=context,
+            operator=self,
+            cluster_id=self.cluster_name,
+            project_id=self._get_project_id(),
+            region=self.region,
+        )
+        self.log.info("Cluster started")
+        return cluster
+
+    def _check_desired_cluster_state(self, cluster: Cluster) -> tuple[bool, str | None]:
+        if cluster.status.state == cluster.status.State.RUNNING:
+            return True, f'The cluster "{self.cluster_name}" already running!'
+        return False, None
+
+    def _get_operation(self) -> operation.Operation:
+        return self.hook.start_cluster(
+            region=self.region,
+            project_id=self._get_project_id(),
+            cluster_name=self.cluster_name,
+            cluster_uuid=self.cluster_uuid,
+            retry=self.retry,
+            timeout=self.timeout,
+            metadata=self.metadata,
+        )
+
+
+class DataprocStopClusterOperator(_DataprocStartStopClusterBaseOperator):
+    """Stop a cluster in a project."""
+
+    def execute(self, context: Context) -> dict | None:
+        self.log.info("Stopping the cluster: %s", self.cluster_name)
+        cluster = super().execute(context)
+        self.log.info("Cluster stopped")
+        return cluster
+
+    def _check_desired_cluster_state(self, cluster: Cluster) -> tuple[bool, str | None]:
+        if cluster.status.state in [cluster.status.State.STOPPED, cluster.status.State.STOPPING]:
+            return True, f'The cluster "{self.cluster_name}" already stopped!'
+        return False, None
+
+    def _get_operation(self) -> operation.Operation:
+        return self.hook.stop_cluster(
+            region=self.region,
+            project_id=self._get_project_id(),
+            cluster_name=self.cluster_name,
+            cluster_uuid=self.cluster_uuid,
+            retry=self.retry,
+            timeout=self.timeout,
+            metadata=self.metadata,
+        )
+
+
 class DataprocJobBaseOperator(GoogleCloudBaseOperator):
     """Base class for operators that launch job on DataProc.

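The new `DataprocStartClusterOperator` and `DataprocStopClusterOperator` added in the hunk above can be used like any other Dataproc operator. A minimal DAG sketch; the DAG id, project, region, and cluster name are placeholders:

```python
from datetime import datetime

from airflow import DAG
from airflow.providers.google.cloud.operators.dataproc import (
    DataprocStartClusterOperator,
    DataprocStopClusterOperator,
)

with DAG("dataproc_start_stop_example", start_date=datetime(2024, 1, 1), schedule=None):
    start_cluster = DataprocStartClusterOperator(
        task_id="start_cluster",
        project_id="my-project",
        region="us-central1",
        cluster_name="my-cluster",
    )
    stop_cluster = DataprocStopClusterOperator(
        task_id="stop_cluster",
        project_id="my-project",
        region="us-central1",
        cluster_name="my-cluster",
    )
    start_cluster >> stop_cluster
```

Both operators simply log and return when the cluster is already in the requested state.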
@@ -1250,6 +1489,15 @@ class DataprocJobBaseOperator(GoogleCloudBaseOperator):
         self.hook.cancel_job(project_id=self.project_id, job_id=self.dataproc_job_id, region=self.region)


+# TODO: Remove one day
+@deprecated(
+    reason=(
+        "Please use `DataprocSubmitJobOperator` instead. "
+        "You can use `generate_job` method to generate dictionary representing your job "
+        "and use it with the new operator."
+    ),
+    category=AirflowProviderDeprecationWarning,
+)
 class DataprocSubmitPigJobOperator(DataprocJobBaseOperator):
     """Start a Pig query Job on a Cloud DataProc cluster.

@@ -1324,15 +1572,6 @@ class DataprocSubmitPigJobOperator(DataprocJobBaseOperator):
         dataproc_jars: list[str] | None = None,
         **kwargs,
     ) -> None:
-        # TODO: Remove one day
-        warnings.warn(
-            "The `{cls}` operator is deprecated, please use `DataprocSubmitJobOperator` instead. You can use"
-            " `generate_job` method of `{cls}` to generate dictionary representing your job"
-            " and use it with the new operator.".format(cls=type(self).__name__),
-            AirflowProviderDeprecationWarning,
-            stacklevel=2,
-        )
-
         super().__init__(
             impersonation_chain=impersonation_chain,
             region=region,
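
The deprecation notices added above point to `DataprocSubmitJobOperator`, which takes a Dataproc `Job` dict rather than per-engine constructor arguments. A rough sketch of an equivalent Pig submission; the project, cluster, and query are placeholders:

```python
from airflow.providers.google.cloud.operators.dataproc import DataprocSubmitJobOperator

pig_job = {
    "reference": {"project_id": "my-project"},
    "placement": {"cluster_name": "my-cluster"},
    "pig_job": {"query_list": {"queries": ["DEFINE sin HiveUDF('sin');"]}},
}

submit_pig = DataprocSubmitJobOperator(
    task_id="submit_pig",
    job=pig_job,
    region="us-central1",
    project_id="my-project",
)
```

The deprecated operators also expose a `generate_job()` method that returns the same kind of dict, which can ease the migration.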
@@ -1376,6 +1615,15 @@ class DataprocSubmitPigJobOperator(DataprocJobBaseOperator):
         super().execute(context)


+# TODO: Remove one day
+@deprecated(
+    reason=(
+        "Please use `DataprocSubmitJobOperator` instead. "
+        "You can use `generate_job` method to generate dictionary representing your job "
+        "and use it with the new operator."
+    ),
+    category=AirflowProviderDeprecationWarning,
+)
 class DataprocSubmitHiveJobOperator(DataprocJobBaseOperator):
     """Start a Hive query Job on a Cloud DataProc cluster.

@@ -1416,15 +1664,6 @@ class DataprocSubmitHiveJobOperator(DataprocJobBaseOperator):
         dataproc_jars: list[str] | None = None,
         **kwargs,
     ) -> None:
-        # TODO: Remove one day
-        warnings.warn(
-            "The `{cls}` operator is deprecated, please use `DataprocSubmitJobOperator` instead. You can use"
-            " `generate_job` method of `{cls}` to generate dictionary representing your job"
-            " and use it with the new operator.".format(cls=type(self).__name__),
-            AirflowProviderDeprecationWarning,
-            stacklevel=2,
-        )
-
         super().__init__(
             impersonation_chain=impersonation_chain,
             region=region,
@@ -1468,6 +1707,15 @@ class DataprocSubmitHiveJobOperator(DataprocJobBaseOperator):
         super().execute(context)


+# TODO: Remove one day
+@deprecated(
+    reason=(
+        "Please use `DataprocSubmitJobOperator` instead. "
+        "You can use `generate_job` method to generate dictionary representing your job "
+        "and use it with the new operator."
+    ),
+    category=AirflowProviderDeprecationWarning,
+)
 class DataprocSubmitSparkSqlJobOperator(DataprocJobBaseOperator):
     """Start a Spark SQL query Job on a Cloud DataProc cluster.

@@ -1509,15 +1757,6 @@ class DataprocSubmitSparkSqlJobOperator(DataprocJobBaseOperator):
         dataproc_jars: list[str] | None = None,
         **kwargs,
     ) -> None:
-        # TODO: Remove one day
-        warnings.warn(
-            "The `{cls}` operator is deprecated, please use `DataprocSubmitJobOperator` instead. You can use"
-            " `generate_job` method of `{cls}` to generate dictionary representing your job"
-            " and use it with the new operator.".format(cls=type(self).__name__),
-            AirflowProviderDeprecationWarning,
-            stacklevel=2,
-        )
-
         super().__init__(
             impersonation_chain=impersonation_chain,
             region=region,
@@ -1559,6 +1798,15 @@ class DataprocSubmitSparkSqlJobOperator(DataprocJobBaseOperator):
         super().execute(context)


+# TODO: Remove one day
+@deprecated(
+    reason=(
+        "Please use `DataprocSubmitJobOperator` instead. "
+        "You can use `generate_job` method to generate dictionary representing your job "
+        "and use it with the new operator."
+    ),
+    category=AirflowProviderDeprecationWarning,
+)
 class DataprocSubmitSparkJobOperator(DataprocJobBaseOperator):
     """Start a Spark Job on a Cloud DataProc cluster.

@@ -1604,15 +1852,6 @@ class DataprocSubmitSparkJobOperator(DataprocJobBaseOperator):
         dataproc_jars: list[str] | None = None,
         **kwargs,
     ) -> None:
-        # TODO: Remove one day
-        warnings.warn(
-            "The `{cls}` operator is deprecated, please use `DataprocSubmitJobOperator` instead. You can use"
-            " `generate_job` method of `{cls}` to generate dictionary representing your job"
-            " and use it with the new operator.".format(cls=type(self).__name__),
-            AirflowProviderDeprecationWarning,
-            stacklevel=2,
-        )
-
         super().__init__(
             impersonation_chain=impersonation_chain,
             region=region,
@@ -1650,6 +1889,15 @@ class DataprocSubmitSparkJobOperator(DataprocJobBaseOperator):
         super().execute(context)


+# TODO: Remove one day
+@deprecated(
+    reason=(
+        "Please use `DataprocSubmitJobOperator` instead. "
+        "You can use `generate_job` method to generate dictionary representing your job "
+        "and use it with the new operator."
+    ),
+    category=AirflowProviderDeprecationWarning,
+)
 class DataprocSubmitHadoopJobOperator(DataprocJobBaseOperator):
     """Start a Hadoop Job on a Cloud DataProc cluster.

@@ -1695,15 +1943,6 @@ class DataprocSubmitHadoopJobOperator(DataprocJobBaseOperator):
         dataproc_jars: list[str] | None = None,
         **kwargs,
     ) -> None:
-        # TODO: Remove one day
-        warnings.warn(
-            "The `{cls}` operator is deprecated, please use `DataprocSubmitJobOperator` instead. You can use"
-            " `generate_job` method of `{cls}` to generate dictionary representing your job"
-            " and use it with the new operator.".format(cls=type(self).__name__),
-            AirflowProviderDeprecationWarning,
-            stacklevel=2,
-        )
-
         super().__init__(
             impersonation_chain=impersonation_chain,
             region=region,
@@ -1740,6 +1979,15 @@ class DataprocSubmitHadoopJobOperator(DataprocJobBaseOperator):
         super().execute(context)


+# TODO: Remove one day
+@deprecated(
+    reason=(
+        "Please use `DataprocSubmitJobOperator` instead. "
+        "You can use `generate_job` method to generate dictionary representing your job "
+        "and use it with the new operator."
+    ),
+    category=AirflowProviderDeprecationWarning,
+)
 class DataprocSubmitPySparkJobOperator(DataprocJobBaseOperator):
     """Start a PySpark Job on a Cloud DataProc cluster.

@@ -1809,15 +2057,6 @@ class DataprocSubmitPySparkJobOperator(DataprocJobBaseOperator):
         dataproc_jars: list[str] | None = None,
         **kwargs,
     ) -> None:
-        # TODO: Remove one day
-        warnings.warn(
-            "The `{cls}` operator is deprecated, please use `DataprocSubmitJobOperator` instead. You can use"
-            " `generate_job` method of `{cls}` to generate dictionary representing your job"
-            " and use it with the new operator.".format(cls=type(self).__name__),
-            AirflowProviderDeprecationWarning,
-            stacklevel=2,
-        )
-
         super().__init__(
             impersonation_chain=impersonation_chain,
             region=region,
@@ -2054,7 +2293,7 @@ class DataprocInstantiateWorkflowTemplateOperator(GoogleCloudBaseOperator):
             self.log.info("Workflow %s completed successfully", workflow_id)
         else:
             self.defer(
-                trigger=DataprocWorkflowTrigger(
+                trigger=DataprocOperationTrigger(
                     name=operation_name,
                     project_id=self.project_id,
                     region=self.region,
@@ -2196,7 +2435,7 @@ class DataprocInstantiateInlineWorkflowTemplateOperator(GoogleCloudBaseOperator)
             self.log.info("Workflow %s completed successfully", workflow_id)
         else:
             self.defer(
-                trigger=DataprocWorkflowTrigger(
+                trigger=DataprocOperationTrigger(
                     name=operation_name,
                     project_id=self.project_id or hook.project_id,
                     region=self.region,
@@ -2530,6 +2769,142 @@ class DataprocUpdateClusterOperator(GoogleCloudBaseOperator):
         self.log.info("%s completed successfully.", self.task_id)


+class DataprocDiagnoseClusterOperator(GoogleCloudBaseOperator):
+    """Diagnose a cluster in a project.
+
+    After the operation completes, the response contains the Cloud Storage URI of the diagnostic output report containing a summary of collected diagnostics.
+
+    :param region: Required. The Cloud Dataproc region in which to handle the request (templated).
+    :param project_id: Optional. The ID of the Google Cloud project that the cluster belongs to (templated).
+    :param cluster_name: Required. The cluster name (templated).
+    :param tarball_gcs_dir: The output Cloud Storage directory for the diagnostic tarball. If not specified, a task-specific directory in the cluster's staging bucket will be used.
+    :param diagnosis_interval: Time interval in which diagnosis should be carried out on the cluster.
+    :param jobs: Specifies a list of jobs on which diagnosis is to be performed. Format: `projects/{project}/regions/{region}/jobs/{job}`
+    :param yarn_application_ids: Specifies a list of yarn applications on which diagnosis is to be performed.
+    :param metadata: Additional metadata that is provided to the method.
+    :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be
+        retried.
+    :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if
+        ``retry`` is specified, the timeout applies to each individual attempt.
+    :param gcp_conn_id: The connection ID to use connecting to Google Cloud.
+    :param impersonation_chain: Optional service account to impersonate using short-term
+        credentials, or chained list of accounts required to get the access_token
+        of the last account in the list, which will be impersonated in the request.
+        If set as a string, the account must grant the originating account
+        the Service Account Token Creator IAM role.
+        If set as a sequence, the identities from the list must grant
+        Service Account Token Creator IAM role to the directly preceding identity, with first
+        account from the list granting this role to the originating account (templated).
+    :param deferrable: Run operator in the deferrable mode.
+    :param polling_interval_seconds: Time (seconds) to wait between calls to check the cluster status.
+    """
+
+    template_fields: Sequence[str] = (
+        "project_id",
+        "region",
+        "cluster_name",
+        "impersonation_chain",
+        "tarball_gcs_dir",
+        "diagnosis_interval",
+        "jobs",
+        "yarn_application_ids",
+    )
+
+    def __init__(
+        self,
+        *,
+        region: str,
+        cluster_name: str,
+        project_id: str | None = None,
+        tarball_gcs_dir: str | None = None,
+        diagnosis_interval: dict | Interval | None = None,
+        jobs: MutableSequence[str] | None = None,
+        yarn_application_ids: MutableSequence[str] | None = None,
+        retry: AsyncRetry | _MethodDefault = DEFAULT,
+        timeout: float = 1 * 60 * 60,
+        metadata: Sequence[tuple[str, str]] = (),
+        gcp_conn_id: str = "google_cloud_default",
+        impersonation_chain: str | Sequence[str] | None = None,
+        deferrable: bool = conf.getboolean("operators", "default_deferrable", fallback=False),
+        polling_interval_seconds: int = 10,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if deferrable and polling_interval_seconds <= 0:
+            raise ValueError("Invalid value for polling_interval_seconds. Expected value greater than 0")
+        self.project_id = project_id
+        self.region = region
+        self.cluster_name = cluster_name
+        self.tarball_gcs_dir = tarball_gcs_dir
+        self.diagnosis_interval = diagnosis_interval
+        self.jobs = jobs
+        self.yarn_application_ids = yarn_application_ids
+        self.retry = retry
+        self.timeout = timeout
+        self.metadata = metadata
+        self.gcp_conn_id = gcp_conn_id
+        self.impersonation_chain = impersonation_chain
+        self.deferrable = deferrable
+        self.polling_interval_seconds = polling_interval_seconds
+
+    def execute(self, context: Context):
+        hook = DataprocHook(gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain)
+        self.log.info("Collecting diagnostic tarball for cluster: %s", self.cluster_name)
+        operation = hook.diagnose_cluster(
+            region=self.region,
+            cluster_name=self.cluster_name,
+            project_id=self.project_id,
+            tarball_gcs_dir=self.tarball_gcs_dir,
+            diagnosis_interval=self.diagnosis_interval,
+            jobs=self.jobs,
+            yarn_application_ids=self.yarn_application_ids,
+            retry=self.retry,
+            timeout=self.timeout,
+            metadata=self.metadata,
+        )
+
+        if not self.deferrable:
+            result = hook.wait_for_operation(
+                timeout=self.timeout, result_retry=self.retry, operation=operation
+            )
+            self.log.info(
+                "The diagnostic output for cluster %s is available at: %s",
+                self.cluster_name,
+                result.output_uri,
+            )
+        else:
+            self.defer(
+                trigger=DataprocOperationTrigger(
+                    name=operation.operation.name,
+                    operation_type=DataprocOperationType.DIAGNOSE.value,
+                    project_id=self.project_id,
+                    region=self.region,
+                    gcp_conn_id=self.gcp_conn_id,
+                    impersonation_chain=self.impersonation_chain,
+                    polling_interval_seconds=self.polling_interval_seconds,
+                ),
+                method_name="execute_complete",
+            )
+
+    def execute_complete(self, context: Context, event: dict[str, Any] | None = None) -> None:
+        """Callback for when the trigger fires.
+
+        This returns immediately. It relies on trigger to throw an exception,
+        otherwise it assumes execution was successful.
+        """
+        if event:
+            status = event.get("status")
+            if status in ("failed", "error"):
+                self.log.exception("Unexpected error in the operation.")
+                raise AirflowException(event.get("message"))
+
+            self.log.info(
+                "The diagnostic output for cluster %s is available at: %s",
+                self.cluster_name,
+                event.get("output_uri"),
+            )
+
+
 class DataprocCreateBatchOperator(GoogleCloudBaseOperator):
     """Create a batch workload.
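
A minimal sketch of the new `DataprocDiagnoseClusterOperator` added in the hunk above; the identifiers are placeholders. In deferrable mode it hands polling of the long-running diagnose operation to `DataprocOperationTrigger` and logs the Cloud Storage URI of the diagnostic tarball when the trigger completes:

```python
from airflow.providers.google.cloud.operators.dataproc import DataprocDiagnoseClusterOperator

diagnose_cluster = DataprocDiagnoseClusterOperator(
    task_id="diagnose_cluster",
    project_id="my-project",
    region="us-central1",
    cluster_name="my-cluster",
    deferrable=True,  # poll the diagnose operation from the triggerer
)
```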