apache-airflow-providers-google 14.1.0__py3-none-any.whl → 15.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. airflow/providers/google/__init__.py +1 -1
  2. airflow/providers/google/ads/hooks/ads.py +7 -33
  3. airflow/providers/google/ads/transfers/ads_to_gcs.py +1 -17
  4. airflow/providers/google/cloud/hooks/bigquery.py +6 -11
  5. airflow/providers/google/cloud/hooks/cloud_batch.py +1 -2
  6. airflow/providers/google/cloud/hooks/cloud_build.py +1 -54
  7. airflow/providers/google/cloud/hooks/compute.py +4 -3
  8. airflow/providers/google/cloud/hooks/dataflow.py +2 -139
  9. airflow/providers/google/cloud/hooks/dataform.py +6 -12
  10. airflow/providers/google/cloud/hooks/datafusion.py +1 -2
  11. airflow/providers/google/cloud/hooks/dataplex.py +1 -1
  12. airflow/providers/google/cloud/hooks/gcs.py +13 -5
  13. airflow/providers/google/cloud/hooks/life_sciences.py +1 -1
  14. airflow/providers/google/cloud/hooks/translate.py +1 -1
  15. airflow/providers/google/cloud/hooks/vertex_ai/auto_ml.py +3 -2
  16. airflow/providers/google/cloud/hooks/vertex_ai/batch_prediction_job.py +1 -1
  17. airflow/providers/google/cloud/hooks/vertex_ai/custom_job.py +2 -272
  18. airflow/providers/google/cloud/hooks/vertex_ai/endpoint_service.py +2 -1
  19. airflow/providers/google/cloud/hooks/vertex_ai/hyperparameter_tuning_job.py +1 -1
  20. airflow/providers/google/cloud/hooks/vertex_ai/model_service.py +2 -1
  21. airflow/providers/google/cloud/links/cloud_storage_transfer.py +1 -3
  22. airflow/providers/google/cloud/links/dataproc.py +0 -1
  23. airflow/providers/google/cloud/log/gcs_task_handler.py +147 -115
  24. airflow/providers/google/cloud/openlineage/facets.py +32 -32
  25. airflow/providers/google/cloud/openlineage/mixins.py +2 -2
  26. airflow/providers/google/cloud/operators/automl.py +1 -1
  27. airflow/providers/google/cloud/operators/cloud_storage_transfer_service.py +0 -3
  28. airflow/providers/google/cloud/operators/datafusion.py +1 -22
  29. airflow/providers/google/cloud/operators/dataproc.py +1 -143
  30. airflow/providers/google/cloud/operators/dataproc_metastore.py +0 -1
  31. airflow/providers/google/cloud/operators/mlengine.py +3 -1406
  32. airflow/providers/google/cloud/operators/spanner.py +1 -2
  33. airflow/providers/google/cloud/operators/translate.py +2 -2
  34. airflow/providers/google/cloud/operators/vertex_ai/auto_ml.py +0 -12
  35. airflow/providers/google/cloud/operators/vertex_ai/custom_job.py +1 -22
  36. airflow/providers/google/cloud/sensors/cloud_storage_transfer_service.py +4 -3
  37. airflow/providers/google/cloud/sensors/dataproc_metastore.py +1 -1
  38. airflow/providers/google/cloud/transfers/gcs_to_bigquery.py +1 -2
  39. airflow/providers/google/cloud/transfers/sftp_to_gcs.py +23 -10
  40. airflow/providers/google/cloud/triggers/cloud_storage_transfer_service.py +2 -2
  41. airflow/providers/google/common/auth_backend/google_openid.py +1 -1
  42. airflow/providers/google/common/hooks/base_google.py +7 -28
  43. airflow/providers/google/get_provider_info.py +3 -1
  44. airflow/providers/google/marketing_platform/sensors/display_video.py +1 -1
  45. airflow/providers/google/suite/hooks/drive.py +2 -2
  46. {apache_airflow_providers_google-14.1.0.dist-info → apache_airflow_providers_google-15.0.0rc1.dist-info}/METADATA +11 -9
  47. {apache_airflow_providers_google-14.1.0.dist-info → apache_airflow_providers_google-15.0.0rc1.dist-info}/RECORD +49 -50
  48. airflow/providers/google/cloud/utils/mlengine_operator_utils.py +0 -273
  49. {apache_airflow_providers_google-14.1.0.dist-info → apache_airflow_providers_google-15.0.0rc1.dist-info}/WHEEL +0 -0
  50. {apache_airflow_providers_google-14.1.0.dist-info → apache_airflow_providers_google-15.0.0rc1.dist-info}/entry_points.txt +0 -0
@@ -25,6 +25,8 @@ from functools import cached_property
 from pathlib import Path
 from typing import TYPE_CHECKING
 
+import attrs
+
 # not sure why but mypy complains on missing `storage` but it is clearly there and is importable
 from google.cloud import storage  # type: ignore[attr-defined]
 
@@ -42,6 +44,8 @@ from airflow.utils.log.logging_mixin import LoggingMixin
 
 if TYPE_CHECKING:
     from airflow.models.taskinstance import TaskInstance
+    from airflow.sdk.types import RuntimeTaskInstanceProtocol as RuntimeTI
+    from airflow.utils.log.file_task_handler import LogMessages, LogSourceInfo
 
 _DEFAULT_SCOPESS = frozenset(
     [
@@ -52,6 +56,126 @@ _DEFAULT_SCOPESS = frozenset(
 logger = logging.getLogger(__name__)
 
 
+@attrs.define
+class GCSRemoteLogIO(LoggingMixin):  # noqa: D101
+    remote_base: str
+    base_log_folder: Path = attrs.field(converter=Path)
+    delete_local_copy: bool
+
+    gcp_key_path: str | None
+    gcp_keyfile_dict: dict | None
+    scopes: Collection[str] | None
+    project_id: str
+
+    def upload(self, path: os.PathLike, ti: RuntimeTI):
+        """Upload the given log path to the remote storage."""
+        path = Path(path)
+        if path.is_absolute():
+            local_loc = path
+            remote_loc = os.path.join(self.remote_base, path.relative_to(self.base_log_folder))
+        else:
+            local_loc = self.base_log_folder.joinpath(path)
+            remote_loc = os.path.join(self.remote_base, path)
+
+        if local_loc.is_file():
+            # read log and remove old logs to get just the latest additions
+            log = local_loc.read_text()
+            has_uploaded = self.write(log, remote_loc)
+            if has_uploaded and self.delete_local_copy:
+                shutil.rmtree(os.path.dirname(local_loc))
+
+    @cached_property
+    def hook(self) -> GCSHook | None:
+        """Returns GCSHook if remote_log_conn_id configured."""
+        conn_id = conf.get("logging", "remote_log_conn_id", fallback=None)
+        if conn_id:
+            try:
+                return GCSHook(gcp_conn_id=conn_id)
+            except AirflowNotFoundException:
+                pass
+        return None
+
+    @cached_property
+    def client(self) -> storage.Client:
+        """Returns GCS Client."""
+        if self.hook:
+            credentials, project_id = self.hook.get_credentials_and_project_id()
+        else:
+            credentials, project_id = get_credentials_and_project_id(
+                key_path=self.gcp_key_path,
+                keyfile_dict=self.gcp_keyfile_dict,
+                scopes=self.scopes,
+                disable_logging=True,
+            )
+        return storage.Client(
+            credentials=credentials,
+            client_info=CLIENT_INFO,
+            project=self.project_id if self.project_id else project_id,
+        )
+
+    def write(self, log: str, remote_log_location: str) -> bool:
+        """
+        Write the log to the remote location and return `True`; fail silently and return `False` on error.
+
+        :param log: the log to write to the remote_log_location
+        :param remote_log_location: the log's location in remote storage
+        :return: whether the log is successfully written to remote location or not.
+        """
+        try:
+            blob = storage.Blob.from_string(remote_log_location, self.client)
+            old_log = blob.download_as_bytes().decode()
+            log = f"{old_log}\n{log}" if old_log else log
+        except Exception as e:
+            if not self.no_log_found(e):
+                self.log.warning("Error checking for previous log: %s", e)
+        try:
+            blob = storage.Blob.from_string(remote_log_location, self.client)
+            blob.upload_from_string(log, content_type="text/plain")
+        except Exception as e:
+            self.log.error("Could not write logs to %s: %s", remote_log_location, e)
+            return False
+        return True
+
+    @staticmethod
+    def no_log_found(exc):
+        """
+        Given exception, determine whether it is result of log not found.
+
+        :meta private:
+        """
+        return (exc.args and isinstance(exc.args[0], str) and "No such object" in exc.args[0]) or getattr(
+            exc, "resp", {}
+        ).get("status") == "404"
+
+    def read(self, relative_path: str, ti: RuntimeTI) -> tuple[LogSourceInfo, LogMessages | None]:
+        messages = []
+        logs = []
+        remote_loc = os.path.join(self.remote_base, relative_path)
+        uris = []
+        bucket, prefix = _parse_gcs_url(remote_loc)
+        blobs = list(self.client.list_blobs(bucket_or_name=bucket, prefix=prefix))
+
+        if blobs:
+            uris = [f"gs://{bucket}/{b.name}" for b in blobs]
+            if AIRFLOW_V_3_0_PLUS:
+                messages = uris
+            else:
+                messages.extend(["Found remote logs:", *[f"  * {x}" for x in sorted(uris)]])
+        else:
+            return messages, None
+
+        try:
+            for key in sorted(uris):
+                blob = storage.Blob.from_string(key, self.client)
+                remote_log = blob.download_as_bytes().decode()
+                if remote_log:
+                    logs.append(remote_log)
+        except Exception as e:
+            if not AIRFLOW_V_3_0_PLUS:
+                messages.append(f"Unable to read remote log {e}")
+        return messages, logs
+
+
 class GCSTaskHandler(FileTaskHandler, LoggingMixin):
     """
     GCSTaskHandler is a python log handler that handles and reads task instance logs.
@@ -91,45 +215,19 @@ class GCSTaskHandler(FileTaskHandler, LoggingMixin):
     ):
         super().__init__(base_log_folder)
         self.handler: logging.FileHandler | None = None
-        self.remote_base = gcs_log_folder
         self.log_relative_path = ""
         self.closed = False
        self.upload_on_close = True
-        self.gcp_key_path = gcp_key_path
-        self.gcp_keyfile_dict = gcp_keyfile_dict
-        self.scopes = gcp_scopes
-        self.project_id = project_id
-        self.delete_local_copy = kwargs.get(
-            "delete_local_copy", conf.getboolean("logging", "delete_local_logs")
-        )
-
-    @cached_property
-    def hook(self) -> GCSHook | None:
-        """Returns GCSHook if remote_log_conn_id configured."""
-        conn_id = conf.get("logging", "remote_log_conn_id", fallback=None)
-        if conn_id:
-            try:
-                return GCSHook(gcp_conn_id=conn_id)
-            except AirflowNotFoundException:
-                pass
-        return None
-
-    @cached_property
-    def client(self) -> storage.Client:
-        """Returns GCS Client."""
-        if self.hook:
-            credentials, project_id = self.hook.get_credentials_and_project_id()
-        else:
-            credentials, project_id = get_credentials_and_project_id(
-                key_path=self.gcp_key_path,
-                keyfile_dict=self.gcp_keyfile_dict,
-                scopes=self.scopes,
-                disable_logging=True,
-            )
-        return storage.Client(
-            credentials=credentials,
-            client_info=CLIENT_INFO,
-            project=self.project_id if self.project_id else project_id,
+        self.io = GCSRemoteLogIO(
+            base_log_folder=base_log_folder,
+            remote_base=gcs_log_folder,
+            delete_local_copy=kwargs.get(
+                "delete_local_copy", conf.getboolean("logging", "delete_local_logs")
+            ),
+            gcp_key_path=gcp_key_path,
+            gcp_keyfile_dict=gcp_keyfile_dict,
+            scopes=gcp_scopes,
+            project_id=project_id,
         )
 
     def set_context(self, ti: TaskInstance, *, identifier: str | None = None) -> None:
@@ -140,6 +238,8 @@ class GCSTaskHandler(FileTaskHandler, LoggingMixin):
         if TYPE_CHECKING:
             assert self.handler is not None
 
+        self.ti = ti
+
         full_path = self.handler.baseFilename
         self.log_relative_path = Path(full_path).relative_to(self.local_base).as_posix()
         is_trigger_log_context = getattr(ti, "is_trigger_log_context", False)
@@ -159,91 +259,23 @@ class GCSTaskHandler(FileTaskHandler, LoggingMixin):
         if not self.upload_on_close:
             return
 
-        local_loc = os.path.join(self.local_base, self.log_relative_path)
-        remote_loc = os.path.join(self.remote_base, self.log_relative_path)
-        if os.path.exists(local_loc):
-            # read log and remove old logs to get just the latest additions
-            with open(local_loc) as logfile:
-                log = logfile.read()
-            gcs_write = self.gcs_write(log, remote_loc)
-            if gcs_write and self.delete_local_copy:
-                shutil.rmtree(os.path.dirname(local_loc))
+        if hasattr(self, "ti"):
+            self.io.upload(self.log_relative_path, self.ti)
 
         # Mark closed so we don't double write if close is called twice
         self.closed = True
 
-    def _add_message(self, msg):
-        filename, lineno, func, stackinfo = logger.findCaller()
-        record = logging.LogRecord("", logging.INFO, filename, lineno, msg + "\n", None, None, func=func)
-        return self.format(record)
+    def _read_remote_logs(self, ti, try_number, metadata=None) -> tuple[LogSourceInfo, LogMessages]:
+        # Explicitly getting log relative path is necessary as the given
+        # task instance might be different than task instance passed in
+        # in set_context method.
+        worker_log_rel_path = self._render_filename(ti, try_number)
 
-    def _read_remote_logs(self, ti, try_number, metadata=None) -> tuple[list[str], list[str]]:
-        # Explicitly getting log relative path is necessary because this method
-        # is called from webserver from TaskLogReader, where we don't call set_context
-        # and can read logs for different TIs in each request
-        messages = []
-        logs = []
-        worker_log_relative_path = self._render_filename(ti, try_number)
-        remote_loc = os.path.join(self.remote_base, worker_log_relative_path)
-        uris = []
-        bucket, prefix = _parse_gcs_url(remote_loc)
-        blobs = list(self.client.list_blobs(bucket_or_name=bucket, prefix=prefix))
+        messages, logs = self.io.read(worker_log_rel_path, ti)
 
-        if blobs:
-            uris = [f"gs://{bucket}/{b.name}" for b in blobs]
-            if AIRFLOW_V_3_0_PLUS:
-                messages = uris
-            else:
-                messages.extend(["Found remote logs:", *[f"  * {x}" for x in sorted(uris)]])
-        else:
+        if logs is None:
+            logs = []
            if not AIRFLOW_V_3_0_PLUS:
                messages.append(f"No logs found in GCS; ti={ti}")
-        try:
-            for key in sorted(uris):
-                blob = storage.Blob.from_string(key, self.client)
-                remote_log = blob.download_as_bytes().decode()
-                if remote_log:
-                    logs.append(remote_log)
-        except Exception as e:
-            if not AIRFLOW_V_3_0_PLUS:
-                messages.append(f"Unable to read remote log {e}")
-        return messages, logs
-
-    def gcs_write(self, log, remote_log_location) -> bool:
-        """
-        Write the log to the remote location and return `True`; fail silently and return `False` on error.
-
-        :param log: the log to write to the remote_log_location
-        :param remote_log_location: the log's location in remote storage
-        :return: whether the log is successfully written to remote location or not.
-        """
-        try:
-            blob = storage.Blob.from_string(remote_log_location, self.client)
-            old_log = blob.download_as_bytes().decode()
-            log = f"{old_log}\n{log}" if old_log else log
-        except Exception as e:
-            if not self.no_log_found(e):
-                log += self._add_message(
-                    f"Error checking for previous log; if exists, may be overwritten: {e}"
-                )
-                self.log.warning("Error checking for previous log: %s", e)
-        try:
-            blob = storage.Blob.from_string(remote_log_location, self.client)
-            blob.upload_from_string(log, content_type="text/plain")
-        except Exception as e:
-            self.log.error("Could not write logs to %s: %s", remote_log_location, e)
-            return False
-        return True
 
-    @staticmethod
-    def no_log_found(exc):
-        """
-        Given exception, determine whether it is result of log not found.
-
-        :meta private:
-        """
-        if (exc.args and isinstance(exc.args[0], str) and "No such object" in exc.args[0]) or getattr(
-            exc, "resp", {}
-        ).get("status") == "404":
-            return True
-        return False
+        return messages, logs
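
For orientation, the hunks above move all GCS I/O for task logs into the new `GCSRemoteLogIO` helper, which `GCSTaskHandler` now delegates to. The sketch below is illustrative only and uses nothing beyond the attributes and methods visible in these hunks; the bucket, paths, and project are hypothetical, and listing or writing blobs of course requires reachable GCP credentials.

```python
# Hedged sketch (not from the diff): driving GCSRemoteLogIO directly.
from airflow.providers.google.cloud.log.gcs_task_handler import GCSRemoteLogIO

remote_io = GCSRemoteLogIO(
    remote_base="gs://my-airflow-logs",   # hypothetical bucket
    base_log_folder="/opt/airflow/logs",  # the attrs converter turns this into a Path
    delete_local_copy=False,
    gcp_key_path=None,
    gcp_keyfile_dict=None,
    scopes=None,
    project_id="my-project",              # hypothetical project
)

relative_path = "dag_id=example/run_id=manual/task_id=t1/attempt=1.log"  # hypothetical layout

# upload() resolves the path against base_log_folder and appends to any existing
# blob via write(); it silently does nothing if the local file does not exist.
remote_io.upload(relative_path, ti=None)

# read() lists blobs under the rendered prefix and returns (messages, logs);
# logs is None when nothing was found, which _read_remote_logs above normalizes
# back to an empty list.
messages, logs = remote_io.read(relative_path, ti=None)
```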
@@ -25,43 +25,43 @@ from airflow.providers.google import __version__ as provider_version
 
 if TYPE_CHECKING:
     from openlineage.client.generated.base import RunFacet
-else:
+
+try:
     try:
-        try:
-            from openlineage.client.generated.base import RunFacet
-        except ImportError:  # Old OpenLineage client is used
-            from openlineage.client.facet import BaseFacet as RunFacet
+        from openlineage.client.generated.base import RunFacet
+    except ImportError:  # Old OpenLineage client is used
+        from openlineage.client.facet import BaseFacet as RunFacet  # type: ignore[assignment]
 
-        @define
-        class BigQueryJobRunFacet(RunFacet):
-            """
-            Facet that represents relevant statistics of bigquery run.
+    @define
+    class BigQueryJobRunFacet(RunFacet):
+        """
+        Facet that represents relevant statistics of bigquery run.
 
-            :param cached: BigQuery caches query results. Rest of the statistics will not be provided for cached queries.
-            :param billedBytes: How many bytes BigQuery bills for.
-            :param properties: Full property tree of BigQUery run.
-            """
+        :param cached: BigQuery caches query results. Rest of the statistics will not be provided for cached queries.
+        :param billedBytes: How many bytes BigQuery bills for.
+        :param properties: Full property tree of BigQUery run.
+        """
 
-            cached: bool
-            billedBytes: int | None = field(default=None)
-            properties: str | None = field(default=None)
+        cached: bool
+        billedBytes: int | None = field(default=None)
+        properties: str | None = field(default=None)
 
-            @staticmethod
-            def _get_schema() -> str:
-                return (
-                    "https://raw.githubusercontent.com/apache/airflow/"
-                    f"providers-google/{provider_version}/airflow/providers/google/"
-                    "openlineage/BigQueryJobRunFacet.json"
-                )
-    except ImportError:  # OpenLineage is not available
+        @staticmethod
+        def _get_schema() -> str:
+            return (
+                "https://raw.githubusercontent.com/apache/airflow/"
+                f"providers-google/{provider_version}/airflow/providers/google/"
+                "openlineage/BigQueryJobRunFacet.json"
+            )
+except ImportError:  # OpenLineage is not available
 
-        def create_no_op(*_, **__) -> None:
-            """
-            Create a no-op placeholder.
+    def create_no_op(*_, **__) -> None:
+        """
+        Create a no-op placeholder.
 
-            This function creates and returns a None value, used as a placeholder when the OpenLineage client
-            library is available. It represents an action that has no effect.
-            """
-            return None
+        This function creates and returns a None value, used as a placeholder when the OpenLineage client
+        library is available. It represents an action that has no effect.
+        """
+        return None
 
-        BigQueryJobRunFacet = create_no_op
+    BigQueryJobRunFacet = create_no_op  # type: ignore[misc, assignment]
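
The facet's fields are unchanged here; only the import/try structure moved out of the old `else:` branch. A quick construction example, illustrative only: the field names come straight from the class definition above, and when the OpenLineage client is not installed `BigQueryJobRunFacet` resolves to the `create_no_op` placeholder and simply returns `None`.

```python
# Illustrative only; values are made up.
from airflow.providers.google.cloud.openlineage.facets import BigQueryJobRunFacet

facet = BigQueryJobRunFacet(
    cached=False,                                # required
    billedBytes=10_485_760,                      # optional, defaults to None
    properties='{"statistics": {"query": {}}}',  # optional JSON string, defaults to None
)
```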
@@ -188,10 +188,10 @@ class _BigQueryInsertJobOperatorOpenLineageMixin:
         return list(final_outputs.values())
 
     def _get_input_dataset(self, table: dict) -> InputDataset:
-        return cast(InputDataset, self._get_dataset(table, "input"))
+        return cast("InputDataset", self._get_dataset(table, "input"))
 
     def _get_output_dataset(self, table: dict) -> OutputDataset:
-        return cast(OutputDataset, self._get_dataset(table, "output"))
+        return cast("OutputDataset", self._get_dataset(table, "output"))
 
     def _get_dataset(self, table: dict, dataset_type: str) -> Dataset:
         project = table.get("projectId")
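
The only change in this hunk is quoting the type passed to `cast()`. A short sketch of why that matters, with a hypothetical module name standing in for the optional OpenLineage imports that live under `TYPE_CHECKING`:

```python
# Sketch, not provider code: a quoted cast() target is never evaluated at runtime,
# so the class only needs to be importable for type checkers.
from __future__ import annotations

from typing import TYPE_CHECKING, cast

if TYPE_CHECKING:
    from heavy_optional_dependency import InputDataset  # hypothetical module


def as_input_dataset(obj) -> InputDataset:
    # cast(InputDataset, obj) would raise NameError here when the dependency is
    # absent at runtime; the string form defers the lookup to the type checker.
    return cast("InputDataset", obj)
```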
@@ -268,7 +268,7 @@ class AutoMLPredictOperator(GoogleCloudBaseOperator):
     @cached_property
     def model(self) -> Model | None:
         if self.model_id:
-            hook = cast(CloudAutoMLHook, self.hook)
+            hook = cast("CloudAutoMLHook", self.hook)
             return hook.get_model(
                 model_id=self.model_id,
                 location=self.location,
@@ -280,7 +280,6 @@ class CloudDataTransferServiceCreateJobOperator(GoogleCloudBaseOperator):
         if project_id:
             CloudStorageTransferJobLink.persist(
                 context=context,
-                task_instance=self,
                 project_id=project_id,
                 job_name=result[NAME],
             )
@@ -370,7 +369,6 @@ class CloudDataTransferServiceUpdateJobOperator(GoogleCloudBaseOperator):
         if project_id:
             CloudStorageTransferJobLink.persist(
                 context=context,
-                task_instance=self,
                 project_id=project_id,
                 job_name=self.job_name,
             )
@@ -516,7 +514,6 @@ class CloudDataTransferServiceRunJobOperator(GoogleCloudBaseOperator):
         if project_id:
             CloudStorageTransferJobLink.persist(
                 context=context,
-                task_instance=self,
                 project_id=project_id,
                 job_name=self.job_name,
             )
@@ -26,7 +26,7 @@ from google.api_core.retry import exponential_sleep_generator
 from googleapiclient.errors import HttpError
 
 from airflow.configuration import conf
-from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning
+from airflow.exceptions import AirflowException
 from airflow.providers.google.cloud.hooks.datafusion import SUCCESS_STATES, DataFusionHook, PipelineStates
 from airflow.providers.google.cloud.links.datafusion import (
     DataFusionInstanceLink,
@@ -37,33 +37,12 @@ from airflow.providers.google.cloud.operators.cloud_base import GoogleCloudBaseO
 from airflow.providers.google.cloud.triggers.datafusion import DataFusionStartPipelineTrigger
 from airflow.providers.google.cloud.utils.datafusion import DataFusionPipelineType
 from airflow.providers.google.cloud.utils.helpers import resource_path_to_dict
-from airflow.providers.google.common.deprecated import deprecated
 from airflow.providers.google.common.hooks.base_google import PROVIDE_PROJECT_ID
 
 if TYPE_CHECKING:
     from airflow.utils.context import Context
 
 
-class DataFusionPipelineLinkHelper:
-    """
-    Helper class for Pipeline links.
-
-    .. warning::
-        This class is deprecated. Consider using ``resource_path_to_dict()`` instead.
-    """
-
-    @staticmethod
-    @deprecated(
-        planned_removal_date="March 01, 2025",
-        use_instead="airflow.providers.google.cloud.utils.helpers.resource_path_to_dict",
-        category=AirflowProviderDeprecationWarning,
-    )
-    def get_project_id(instance):
-        instance = instance["name"]
-        project_id = next(x for x in instance.split("/") if x.startswith("airflow"))
-        return project_id
-
-
 class CloudDataFusionRestartInstanceOperator(GoogleCloudBaseOperator):
     """
     Restart a single Data Fusion instance.
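
The removed `DataFusionPipelineLinkHelper.get_project_id` already named its replacement in the deprecation notice. A hedged sketch of that replacement, assuming `resource_path_to_dict()` pairs the path segments of a resource name into a dict (the instance payload below is hypothetical):

```python
# Hedged sketch of the suggested replacement; the instance dict is hypothetical.
from airflow.providers.google.cloud.utils.helpers import resource_path_to_dict

instance = {"name": "projects/my-project/locations/us-west1/instances/airflow-fusion"}

# Splits "projects/<id>/locations/<loc>/instances/<name>" into
# {"projects": "<id>", "locations": "<loc>", "instances": "<name>"}.
project_id = resource_path_to_dict(resource_name=instance["name"])["projects"]
```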
@@ -44,7 +44,6 @@ from airflow.providers.google.cloud.hooks.dataproc import (
 )
 from airflow.providers.google.cloud.links.dataproc import (
     DATAPROC_BATCH_LINK,
-    DATAPROC_CLUSTER_LINK_DEPRECATED,
     DATAPROC_JOB_LINK_DEPRECATED,
     DataprocBatchesListLink,
     DataprocBatchLink,
@@ -63,7 +62,6 @@ from airflow.providers.google.cloud.triggers.dataproc import (
     DataprocSubmitTrigger,
 )
 from airflow.providers.google.cloud.utils.dataproc import DataprocOperationType
-from airflow.providers.google.common.deprecated import deprecated
 from airflow.providers.google.common.hooks.base_google import PROVIDE_PROJECT_ID
 from airflow.utils import timezone
 
@@ -919,145 +917,6 @@ class DataprocCreateClusterOperator(GoogleCloudBaseOperator):
         return event["cluster"]
 
 
-# TODO: Remove one day
-@deprecated(
-    planned_removal_date="March 01, 2025",
-    use_instead="DataprocUpdateClusterOperator",
-    category=AirflowProviderDeprecationWarning,
-)
-class DataprocScaleClusterOperator(GoogleCloudBaseOperator):
-    """
-    Scale, up or down, a cluster on Google Cloud Dataproc.
-
-    The operator will wait until the cluster is re-scaled.
-
-    Example usage:
-
-    .. code-block:: python
-
-        t1 = DataprocClusterScaleOperator(
-            task_id="dataproc_scale",
-            project_id="my-project",
-            cluster_name="cluster-1",
-            num_workers=10,
-            num_preemptible_workers=10,
-            graceful_decommission_timeout="1h",
-        )
-
-    .. seealso::
-        For more detail on about scaling clusters have a look at the reference:
-        https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/scaling-clusters
-
-    :param cluster_name: The name of the cluster to scale. (templated)
-    :param project_id: The ID of the google cloud project in which
-        the cluster runs. (templated)
-    :param region: The region for the dataproc cluster. (templated)
-    :param num_workers: The new number of workers
-    :param num_preemptible_workers: The new number of preemptible workers
-    :param graceful_decommission_timeout: Timeout for graceful YARN decommissioning.
-        Maximum value is 1d
-    :param gcp_conn_id: The connection ID to use connecting to Google Cloud.
-    :param impersonation_chain: Optional service account to impersonate using short-term
-        credentials, or chained list of accounts required to get the access_token
-        of the last account in the list, which will be impersonated in the request.
-        If set as a string, the account must grant the originating account
-        the Service Account Token Creator IAM role.
-        If set as a sequence, the identities from the list must grant
-        Service Account Token Creator IAM role to the directly preceding identity, with first
-        account from the list granting this role to the originating account (templated).
-    """
-
-    template_fields: Sequence[str] = ("cluster_name", "project_id", "region", "impersonation_chain")
-
-    operator_extra_links = (DataprocLink(),)
-
-    def __init__(
-        self,
-        *,
-        cluster_name: str,
-        project_id: str = PROVIDE_PROJECT_ID,
-        region: str = "global",
-        num_workers: int = 2,
-        num_preemptible_workers: int = 0,
-        graceful_decommission_timeout: str | None = None,
-        gcp_conn_id: str = "google_cloud_default",
-        impersonation_chain: str | Sequence[str] | None = None,
-        **kwargs,
-    ) -> None:
-        super().__init__(**kwargs)
-        self.project_id = project_id
-        self.region = region
-        self.cluster_name = cluster_name
-        self.num_workers = num_workers
-        self.num_preemptible_workers = num_preemptible_workers
-        self.graceful_decommission_timeout = graceful_decommission_timeout
-        self.gcp_conn_id = gcp_conn_id
-        self.impersonation_chain = impersonation_chain
-
-    def _build_scale_cluster_data(self) -> dict:
-        scale_data = {
-            "config": {
-                "worker_config": {"num_instances": self.num_workers},
-                "secondary_worker_config": {"num_instances": self.num_preemptible_workers},
-            }
-        }
-        return scale_data
-
-    @property
-    def _graceful_decommission_timeout_object(self) -> dict[str, int] | None:
-        if not self.graceful_decommission_timeout:
-            return None
-
-        timeout = None
-        match = re.fullmatch(r"(\d+)([smdh])", self.graceful_decommission_timeout)
-        if match:
-            val = int(match.group(1))
-            unit = match.group(2)
-            if unit == "s":
-                timeout = val
-            elif unit == "m":
-                timeout = int(timedelta(minutes=val).total_seconds())
-            elif unit == "h":
-                timeout = int(timedelta(hours=val).total_seconds())
-            elif unit == "d":
-                timeout = int(timedelta(days=val).total_seconds())
-
-        if not timeout:
-            raise AirflowException(
-                "DataprocClusterScaleOperator "
-                " should be expressed in day, hours, minutes or seconds. "
-                " i.e. 1d, 4h, 10m, 30s"
-            )
-
-        return {"seconds": timeout}
-
-    def execute(self, context: Context) -> None:
-        """Scale, up or down, a cluster on Google Cloud Dataproc."""
-        self.log.info("Scaling cluster: %s", self.cluster_name)
-
-        scaling_cluster_data = self._build_scale_cluster_data()
-        update_mask = ["config.worker_config.num_instances", "config.secondary_worker_config.num_instances"]
-
-        hook = DataprocHook(gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain)
-        # Save data required to display extra link no matter what the cluster status will be
-        DataprocLink.persist(
-            context=context,
-            task_instance=self,
-            url=DATAPROC_CLUSTER_LINK_DEPRECATED,
-            resource=self.cluster_name,
-        )
-        operation = hook.update_cluster(
-            project_id=self.project_id,
-            region=self.region,
-            cluster_name=self.cluster_name,
-            cluster=scaling_cluster_data,
-            graceful_decommission_timeout=self._graceful_decommission_timeout_object,
-            update_mask={"paths": update_mask},
-        )
-        operation.result()
-        self.log.info("Cluster scaling finished")
-
-
 class DataprocDeleteClusterOperator(GoogleCloudBaseOperator):
     """
     Delete a cluster in a project.
@@ -1463,8 +1322,7 @@ class DataprocJobBaseOperator(GoogleCloudBaseOperator):
         """Initialize `self.job_template` with default values."""
         if self.project_id is None:
             raise AirflowException(
-                "project id should either be set via project_id "
-                "parameter or retrieved from the connection,"
+                "project id should either be set via project_id parameter or retrieved from the connection,"
            )
         job_template = DataProcJobBuilder(
            project_id=self.project_id,
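
Because the removed `DataprocScaleClusterOperator` named `DataprocUpdateClusterOperator` as its replacement, a migration sketch may be useful. The cluster payload and update mask mirror the removed `_build_scale_cluster_data()` and `execute()` bodies; project, region, and cluster values are hypothetical.

```python
# Hedged migration sketch; not part of the diff. Values are hypothetical.
from airflow.providers.google.cloud.operators.dataproc import DataprocUpdateClusterOperator

scale_cluster = DataprocUpdateClusterOperator(
    task_id="scale_cluster",
    project_id="my-project",
    region="us-central1",
    cluster_name="cluster-1",
    cluster={
        "config": {
            "worker_config": {"num_instances": 10},
            "secondary_worker_config": {"num_instances": 10},
        }
    },
    update_mask={
        "paths": [
            "config.worker_config.num_instances",
            "config.secondary_worker_config.num_instances",
        ]
    },
    # Roughly equivalent to the removed graceful_decommission_timeout="1h" string parsing.
    graceful_decommission_timeout={"seconds": 3600},
)
```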
@@ -31,7 +31,6 @@ from google.cloud.metastore_v1.types import Backup, MetadataImport, Service
 from google.cloud.metastore_v1.types.metastore import DatabaseDumpSpec, Restore
 
 from airflow.exceptions import AirflowException
-from airflow.models import BaseOperator, BaseOperatorLink
 from airflow.providers.google.cloud.hooks.dataproc_metastore import DataprocMetastoreHook
 from airflow.providers.google.cloud.operators.cloud_base import GoogleCloudBaseOperator
 from airflow.providers.google.common.links.storage import StorageLink