apache-airflow-providers-google 14.1.0__py3-none-any.whl → 15.0.0rc1__py3-none-any.whl
This diff compares two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- airflow/providers/google/__init__.py +1 -1
- airflow/providers/google/ads/hooks/ads.py +7 -33
- airflow/providers/google/ads/transfers/ads_to_gcs.py +1 -17
- airflow/providers/google/cloud/hooks/bigquery.py +6 -11
- airflow/providers/google/cloud/hooks/cloud_batch.py +1 -2
- airflow/providers/google/cloud/hooks/cloud_build.py +1 -54
- airflow/providers/google/cloud/hooks/compute.py +4 -3
- airflow/providers/google/cloud/hooks/dataflow.py +2 -139
- airflow/providers/google/cloud/hooks/dataform.py +6 -12
- airflow/providers/google/cloud/hooks/datafusion.py +1 -2
- airflow/providers/google/cloud/hooks/dataplex.py +1 -1
- airflow/providers/google/cloud/hooks/gcs.py +13 -5
- airflow/providers/google/cloud/hooks/life_sciences.py +1 -1
- airflow/providers/google/cloud/hooks/translate.py +1 -1
- airflow/providers/google/cloud/hooks/vertex_ai/auto_ml.py +3 -2
- airflow/providers/google/cloud/hooks/vertex_ai/batch_prediction_job.py +1 -1
- airflow/providers/google/cloud/hooks/vertex_ai/custom_job.py +2 -272
- airflow/providers/google/cloud/hooks/vertex_ai/endpoint_service.py +2 -1
- airflow/providers/google/cloud/hooks/vertex_ai/hyperparameter_tuning_job.py +1 -1
- airflow/providers/google/cloud/hooks/vertex_ai/model_service.py +2 -1
- airflow/providers/google/cloud/links/cloud_storage_transfer.py +1 -3
- airflow/providers/google/cloud/links/dataproc.py +0 -1
- airflow/providers/google/cloud/log/gcs_task_handler.py +147 -115
- airflow/providers/google/cloud/openlineage/facets.py +32 -32
- airflow/providers/google/cloud/openlineage/mixins.py +2 -2
- airflow/providers/google/cloud/operators/automl.py +1 -1
- airflow/providers/google/cloud/operators/cloud_storage_transfer_service.py +0 -3
- airflow/providers/google/cloud/operators/datafusion.py +1 -22
- airflow/providers/google/cloud/operators/dataproc.py +1 -143
- airflow/providers/google/cloud/operators/dataproc_metastore.py +0 -1
- airflow/providers/google/cloud/operators/mlengine.py +3 -1406
- airflow/providers/google/cloud/operators/spanner.py +1 -2
- airflow/providers/google/cloud/operators/translate.py +2 -2
- airflow/providers/google/cloud/operators/vertex_ai/auto_ml.py +0 -12
- airflow/providers/google/cloud/operators/vertex_ai/custom_job.py +1 -22
- airflow/providers/google/cloud/sensors/cloud_storage_transfer_service.py +4 -3
- airflow/providers/google/cloud/sensors/dataproc_metastore.py +1 -1
- airflow/providers/google/cloud/transfers/gcs_to_bigquery.py +1 -2
- airflow/providers/google/cloud/transfers/sftp_to_gcs.py +23 -10
- airflow/providers/google/cloud/triggers/cloud_storage_transfer_service.py +2 -2
- airflow/providers/google/common/auth_backend/google_openid.py +1 -1
- airflow/providers/google/common/hooks/base_google.py +7 -28
- airflow/providers/google/get_provider_info.py +3 -1
- airflow/providers/google/marketing_platform/sensors/display_video.py +1 -1
- airflow/providers/google/suite/hooks/drive.py +2 -2
- {apache_airflow_providers_google-14.1.0.dist-info → apache_airflow_providers_google-15.0.0rc1.dist-info}/METADATA +11 -9
- {apache_airflow_providers_google-14.1.0.dist-info → apache_airflow_providers_google-15.0.0rc1.dist-info}/RECORD +49 -50
- airflow/providers/google/cloud/utils/mlengine_operator_utils.py +0 -273
- {apache_airflow_providers_google-14.1.0.dist-info → apache_airflow_providers_google-15.0.0rc1.dist-info}/WHEEL +0 -0
- {apache_airflow_providers_google-14.1.0.dist-info → apache_airflow_providers_google-15.0.0rc1.dist-info}/entry_points.txt +0 -0
@@ -25,6 +25,8 @@ from functools import cached_property
 from pathlib import Path
 from typing import TYPE_CHECKING
 
+import attrs
+
 # not sure why but mypy complains on missing `storage` but it is clearly there and is importable
 from google.cloud import storage  # type: ignore[attr-defined]
 
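The newly added `attrs` import supports the attrs-based `GCSRemoteLogIO` class introduced further down in this file. A minimal sketch of the pattern it relies on, with illustrative names that are not part of the package:

```python
from pathlib import Path

import attrs


@attrs.define
class ExampleLogIO:
    """Illustrative only; mirrors the attrs.define + converter pattern used below."""

    remote_base: str
    # converter=Path lets callers pass a plain string; attrs coerces it on init
    base_log_folder: Path = attrs.field(converter=Path)
    delete_local_copy: bool = False


io = ExampleLogIO(remote_base="gs://my-bucket/logs", base_log_folder="/tmp/airflow/logs")
assert isinstance(io.base_log_folder, Path)
```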
@@ -42,6 +44,8 @@ from airflow.utils.log.logging_mixin import LoggingMixin
 
 if TYPE_CHECKING:
     from airflow.models.taskinstance import TaskInstance
+    from airflow.sdk.types import RuntimeTaskInstanceProtocol as RuntimeTI
+    from airflow.utils.log.file_task_handler import LogMessages, LogSourceInfo
 
 _DEFAULT_SCOPESS = frozenset(
     [
@@ -52,6 +56,126 @@ _DEFAULT_SCOPESS = frozenset(
 logger = logging.getLogger(__name__)
 
 
+@attrs.define
+class GCSRemoteLogIO(LoggingMixin):  # noqa: D101
+    remote_base: str
+    base_log_folder: Path = attrs.field(converter=Path)
+    delete_local_copy: bool
+
+    gcp_key_path: str | None
+    gcp_keyfile_dict: dict | None
+    scopes: Collection[str] | None
+    project_id: str
+
+    def upload(self, path: os.PathLike, ti: RuntimeTI):
+        """Upload the given log path to the remote storage."""
+        path = Path(path)
+        if path.is_absolute():
+            local_loc = path
+            remote_loc = os.path.join(self.remote_base, path.relative_to(self.base_log_folder))
+        else:
+            local_loc = self.base_log_folder.joinpath(path)
+            remote_loc = os.path.join(self.remote_base, path)
+
+        if local_loc.is_file():
+            # read log and remove old logs to get just the latest additions
+            log = local_loc.read_text()
+            has_uploaded = self.write(log, remote_loc)
+            if has_uploaded and self.delete_local_copy:
+                shutil.rmtree(os.path.dirname(local_loc))
+
+    @cached_property
+    def hook(self) -> GCSHook | None:
+        """Returns GCSHook if remote_log_conn_id configured."""
+        conn_id = conf.get("logging", "remote_log_conn_id", fallback=None)
+        if conn_id:
+            try:
+                return GCSHook(gcp_conn_id=conn_id)
+            except AirflowNotFoundException:
+                pass
+        return None
+
+    @cached_property
+    def client(self) -> storage.Client:
+        """Returns GCS Client."""
+        if self.hook:
+            credentials, project_id = self.hook.get_credentials_and_project_id()
+        else:
+            credentials, project_id = get_credentials_and_project_id(
+                key_path=self.gcp_key_path,
+                keyfile_dict=self.gcp_keyfile_dict,
+                scopes=self.scopes,
+                disable_logging=True,
+            )
+        return storage.Client(
+            credentials=credentials,
+            client_info=CLIENT_INFO,
+            project=self.project_id if self.project_id else project_id,
+        )
+
+    def write(self, log: str, remote_log_location: str) -> bool:
+        """
+        Write the log to the remote location and return `True`; fail silently and return `False` on error.
+
+        :param log: the log to write to the remote_log_location
+        :param remote_log_location: the log's location in remote storage
+        :return: whether the log is successfully written to remote location or not.
+        """
+        try:
+            blob = storage.Blob.from_string(remote_log_location, self.client)
+            old_log = blob.download_as_bytes().decode()
+            log = f"{old_log}\n{log}" if old_log else log
+        except Exception as e:
+            if not self.no_log_found(e):
+                self.log.warning("Error checking for previous log: %s", e)
+        try:
+            blob = storage.Blob.from_string(remote_log_location, self.client)
+            blob.upload_from_string(log, content_type="text/plain")
+        except Exception as e:
+            self.log.error("Could not write logs to %s: %s", remote_log_location, e)
+            return False
+        return True
+
+    @staticmethod
+    def no_log_found(exc):
+        """
+        Given exception, determine whether it is result of log not found.
+
+        :meta private:
+        """
+        return (exc.args and isinstance(exc.args[0], str) and "No such object" in exc.args[0]) or getattr(
+            exc, "resp", {}
+        ).get("status") == "404"
+
+    def read(self, relative_path: str, ti: RuntimeTI) -> tuple[LogSourceInfo, LogMessages | None]:
+        messages = []
+        logs = []
+        remote_loc = os.path.join(self.remote_base, relative_path)
+        uris = []
+        bucket, prefix = _parse_gcs_url(remote_loc)
+        blobs = list(self.client.list_blobs(bucket_or_name=bucket, prefix=prefix))
+
+        if blobs:
+            uris = [f"gs://{bucket}/{b.name}" for b in blobs]
+            if AIRFLOW_V_3_0_PLUS:
+                messages = uris
+            else:
+                messages.extend(["Found remote logs:", *[f"  * {x}" for x in sorted(uris)]])
+        else:
+            return messages, None
+
+        try:
+            for key in sorted(uris):
+                blob = storage.Blob.from_string(key, self.client)
+                remote_log = blob.download_as_bytes().decode()
+                if remote_log:
+                    logs.append(remote_log)
+        except Exception as e:
+            if not AIRFLOW_V_3_0_PLUS:
+                messages.append(f"Unable to read remote log {e}")
+        return messages, logs
+
+
 class GCSTaskHandler(FileTaskHandler, LoggingMixin):
     """
     GCSTaskHandler is a python log handler that handles and reads task instance logs.
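The `GCSRemoteLogIO` class added above pulls the upload/read logic out of `GCSTaskHandler` itself. A hedged sketch of how it could be driven directly; the bucket, folders, and log path are placeholder values, valid GCP credentials are assumed, and since `ti` is only passed through, `None` stands in for a runtime task instance here:

```python
from airflow.providers.google.cloud.log.gcs_task_handler import GCSRemoteLogIO

io = GCSRemoteLogIO(
    remote_base="gs://my-log-bucket/logs",    # placeholder bucket
    base_log_folder="/opt/airflow/logs",      # converted to Path by attrs
    delete_local_copy=False,
    gcp_key_path=None,
    gcp_keyfile_dict=None,
    scopes=None,
    project_id="my-gcp-project",              # placeholder project
)

rel_path = "dag_id=example/run_id=manual/task_id=t1/attempt=1.log"  # example layout

# Upload a finished task log (relative to base_log_folder), then read it back.
io.upload(rel_path, ti=None)
messages, logs = io.read(rel_path, ti=None)  # logs is None when nothing was found
```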
@@ -91,45 +215,19 @@ class GCSTaskHandler(FileTaskHandler, LoggingMixin):
     ):
         super().__init__(base_log_folder)
         self.handler: logging.FileHandler | None = None
-        self.remote_base = gcs_log_folder
         self.log_relative_path = ""
         self.closed = False
         self.upload_on_close = True
-        self.gcp_key_path = gcp_key_path
-        self.gcp_keyfile_dict = gcp_keyfile_dict
-        self.scopes = gcp_scopes
-        self.project_id = project_id
-        self.delete_local_copy = kwargs.get(
-            "delete_local_copy", conf.getboolean("logging", "delete_local_logs")
-        )
-
-    @cached_property
-    def hook(self) -> GCSHook | None:
-        """Returns GCSHook if remote_log_conn_id configured."""
-        conn_id = conf.get("logging", "remote_log_conn_id", fallback=None)
-        if conn_id:
-            try:
-                return GCSHook(gcp_conn_id=conn_id)
-            except AirflowNotFoundException:
-                pass
-        return None
-
-    @cached_property
-    def client(self) -> storage.Client:
-        """Returns GCS Client."""
-        if self.hook:
-            credentials, project_id = self.hook.get_credentials_and_project_id()
-        else:
-            credentials, project_id = get_credentials_and_project_id(
-                key_path=self.gcp_key_path,
-                keyfile_dict=self.gcp_keyfile_dict,
-                scopes=self.scopes,
-                disable_logging=True,
-            )
-        return storage.Client(
-            credentials=credentials,
-            client_info=CLIENT_INFO,
-            project=self.project_id if self.project_id else project_id,
+        self.io = GCSRemoteLogIO(
+            base_log_folder=base_log_folder,
+            remote_base=gcs_log_folder,
+            delete_local_copy=kwargs.get(
+                "delete_local_copy", conf.getboolean("logging", "delete_local_logs")
+            ),
+            gcp_key_path=gcp_key_path,
+            gcp_keyfile_dict=gcp_keyfile_dict,
+            scopes=gcp_scopes,
+            project_id=project_id,
         )
 
     def set_context(self, ti: TaskInstance, *, identifier: str | None = None) -> None:
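With this change the handler delegates all remote I/O to `self.io`. A hedged sketch of the wiring, using the keyword arguments visible in the hunk; in a real deployment the handler is created by Airflow's logging configuration rather than by hand, and the paths below are placeholders:

```python
from airflow.providers.google.cloud.log.gcs_task_handler import GCSTaskHandler

handler = GCSTaskHandler(
    base_log_folder="/opt/airflow/logs",       # local folder, placeholder
    gcs_log_folder="gs://my-log-bucket/logs",  # remote base, placeholder
    gcp_key_path=None,
    delete_local_copy=False,                   # forwarded via **kwargs to GCSRemoteLogIO
)

# After this refactor the remote read/write logic lives on handler.io
# (a GCSRemoteLogIO instance) instead of on the handler itself.
print(type(handler.io).__name__)  # -> "GCSRemoteLogIO"
```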
@@ -140,6 +238,8 @@ class GCSTaskHandler(FileTaskHandler, LoggingMixin):
         if TYPE_CHECKING:
             assert self.handler is not None
 
+        self.ti = ti
+
         full_path = self.handler.baseFilename
         self.log_relative_path = Path(full_path).relative_to(self.local_base).as_posix()
         is_trigger_log_context = getattr(ti, "is_trigger_log_context", False)
@@ -159,91 +259,23 @@ class GCSTaskHandler(FileTaskHandler, LoggingMixin):
         if not self.upload_on_close:
             return
 
-        local_loc = os.path.join(self.local_base, self.log_relative_path)
-        remote_loc = os.path.join(self.remote_base, self.log_relative_path)
-        if os.path.exists(local_loc):
-            # read log and remove old logs to get just the latest additions
-            with open(local_loc) as logfile:
-                log = logfile.read()
-            gcs_write = self.gcs_write(log, remote_loc)
-            if gcs_write and self.delete_local_copy:
-                shutil.rmtree(os.path.dirname(local_loc))
+        if hasattr(self, "ti"):
+            self.io.upload(self.log_relative_path, self.ti)
 
         # Mark closed so we don't double write if close is called twice
         self.closed = True
 
-    def _add_message(self, msg):
-        filename, lineno, func, stackinfo = logger.findCaller()
-        record = logging.LogRecord("", logging.INFO, filename, lineno, msg + "\n", None, None, func=func)
-        return self.format(record)
+    def _read_remote_logs(self, ti, try_number, metadata=None) -> tuple[LogSourceInfo, LogMessages]:
+        # Explicitly getting log relative path is necessary as the given
+        # task instance might be different than task instance passed in
+        # in set_context method.
+        worker_log_rel_path = self._render_filename(ti, try_number)
 
-    def _read_remote_logs(self, ti, try_number, metadata=None):
-        # Explicitly getting log relative path is necessary because this method
-        # is called from webserver from TaskLogReader, where we don't call set_context
-        # and can read logs for different TIs in each request
-        messages = []
-        logs = []
-        worker_log_relative_path = self._render_filename(ti, try_number)
-        remote_loc = os.path.join(self.remote_base, worker_log_relative_path)
-        uris = []
-        bucket, prefix = _parse_gcs_url(remote_loc)
-        blobs = list(self.client.list_blobs(bucket_or_name=bucket, prefix=prefix))
+        messages, logs = self.io.read(worker_log_rel_path, ti)
 
-        if blobs:
-            uris = [f"gs://{bucket}/{b.name}" for b in blobs]
-            if AIRFLOW_V_3_0_PLUS:
-                messages = uris
-            else:
-                messages.extend(["Found remote logs:", *[f"  * {x}" for x in sorted(uris)]])
-        else:
+        if logs is None:
+            logs = []
             if not AIRFLOW_V_3_0_PLUS:
                 messages.append(f"No logs found in GCS; ti={ti}")
-        try:
-            for key in sorted(uris):
-                blob = storage.Blob.from_string(key, self.client)
-                remote_log = blob.download_as_bytes().decode()
-                if remote_log:
-                    logs.append(remote_log)
-        except Exception as e:
-            if not AIRFLOW_V_3_0_PLUS:
-                messages.append(f"Unable to read remote log {e}")
-        return messages, logs
-
-    def gcs_write(self, log, remote_log_location) -> bool:
-        """
-        Write the log to the remote location and return `True`; fail silently and return `False` on error.
-
-        :param log: the log to write to the remote_log_location
-        :param remote_log_location: the log's location in remote storage
-        :return: whether the log is successfully written to remote location or not.
-        """
-        try:
-            blob = storage.Blob.from_string(remote_log_location, self.client)
-            old_log = blob.download_as_bytes().decode()
-            log = f"{old_log}\n{log}" if old_log else log
-        except Exception as e:
-            if not self.no_log_found(e):
-                log += self._add_message(
-                    f"Error checking for previous log; if exists, may be overwritten: {e}"
-                )
-                self.log.warning("Error checking for previous log: %s", e)
-        try:
-            blob = storage.Blob.from_string(remote_log_location, self.client)
-            blob.upload_from_string(log, content_type="text/plain")
-        except Exception as e:
-            self.log.error("Could not write logs to %s: %s", remote_log_location, e)
-            return False
-        return True
 
-    @staticmethod
-    def no_log_found(exc):
-        """
-        Given exception, determine whether it is result of log not found.
-
-        :meta private:
-        """
-        if (exc.args and isinstance(exc.args[0], str) and "No such object" in exc.args[0]) or getattr(
-            exc, "resp", {}
-        ).get("status") == "404":
-            return True
-        return False
+        return messages, logs
@@ -25,43 +25,43 @@ from airflow.providers.google import __version__ as provider_version
 
 if TYPE_CHECKING:
     from openlineage.client.generated.base import RunFacet
-else:
+
+try:
     try:
-        try:
-            from openlineage.client.generated.base import RunFacet
-        except ImportError:  # Old OpenLineage client is used
-            from openlineage.client.facet import BaseFacet as RunFacet
+        from openlineage.client.generated.base import RunFacet
+    except ImportError:  # Old OpenLineage client is used
+        from openlineage.client.facet import BaseFacet as RunFacet  # type: ignore[assignment]
 
-        @define
-        class BigQueryJobRunFacet(RunFacet):
-            """
-            Facet that represents relevant statistics of bigquery run.
+    @define
+    class BigQueryJobRunFacet(RunFacet):
+        """
+        Facet that represents relevant statistics of bigquery run.
 
-            :param cached: BigQuery caches query results. Rest of the statistics will not be provided for cached queries.
-            :param billedBytes: How many bytes BigQuery bills for.
-            :param properties: Full property tree of BigQUery run.
-            """
+        :param cached: BigQuery caches query results. Rest of the statistics will not be provided for cached queries.
+        :param billedBytes: How many bytes BigQuery bills for.
+        :param properties: Full property tree of BigQUery run.
+        """
 
-            cached: bool
-            billedBytes: int | None = field(default=None)
-            properties: str | None = field(default=None)
+        cached: bool
+        billedBytes: int | None = field(default=None)
+        properties: str | None = field(default=None)
 
-            @staticmethod
-            def _get_schema() -> str:
-                return (
-                    "https://raw.githubusercontent.com/apache/airflow/"
-                    f"providers-google/{provider_version}/airflow/providers/google/"
-                    "openlineage/BigQueryJobRunFacet.json"
-                )
-    except ImportError:  # OpenLineage is not available
+        @staticmethod
+        def _get_schema() -> str:
+            return (
+                "https://raw.githubusercontent.com/apache/airflow/"
+                f"providers-google/{provider_version}/airflow/providers/google/"
+                "openlineage/BigQueryJobRunFacet.json"
+            )
+except ImportError:  # OpenLineage is not available
 
-        def create_no_op(*_, **__) -> None:
-            """
-            Create a no-op placeholder.
+    def create_no_op(*_, **__) -> None:
+        """
+        Create a no-op placeholder.
 
-            This function creates and returns a None value, used as a placeholder when the OpenLineage client
-            library is available. It represents an action that has no effect.
-            """
-            return None
+        This function creates and returns a None value, used as a placeholder when the OpenLineage client
+        library is available. It represents an action that has no effect.
+        """
+        return None
 
-        BigQueryJobRunFacet = create_no_op
+    BigQueryJobRunFacet = create_no_op  # type: ignore[misc, assignment]
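The `BigQueryJobRunFacet` itself is unchanged apart from the added `type: ignore` comments and the reduced nesting. A hedged sketch of constructing it by hand, with made-up statistics and a facet key chosen only for illustration:

```python
from airflow.providers.google.cloud.openlineage.facets import BigQueryJobRunFacet

facet = BigQueryJobRunFacet(
    cached=False,
    billedBytes=10_485_760,            # example value: bytes billed for the query
    properties='{"statistics": {}}',   # example value: serialized job properties
)
run_facets = {"bigQueryJob": facet}    # hypothetical key, for illustration only
```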
@@ -188,10 +188,10 @@ class _BigQueryInsertJobOperatorOpenLineageMixin:
         return list(final_outputs.values())
 
     def _get_input_dataset(self, table: dict) -> InputDataset:
-        return cast(InputDataset, self._get_dataset(table, "input"))
+        return cast("InputDataset", self._get_dataset(table, "input"))
 
     def _get_output_dataset(self, table: dict) -> OutputDataset:
-        return cast(OutputDataset, self._get_dataset(table, "output"))
+        return cast("OutputDataset", self._get_dataset(table, "output"))
 
     def _get_dataset(self, table: dict, dataset_type: str) -> Dataset:
         project = table.get("projectId")
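These hunks switch `typing.cast` to string type names, so the casted classes only need to be importable for type checkers. A minimal, self-contained sketch of the same pattern with stand-in names that are not taken from the provider:

```python
from __future__ import annotations

from typing import TYPE_CHECKING, cast

if TYPE_CHECKING:
    # Stands in for a heavyweight, type-check-only import.
    from decimal import Decimal


def as_decimal(value: object) -> Decimal:
    # cast() never evaluates its first argument, so the string form works
    # even though Decimal is not imported at runtime.
    return cast("Decimal", value)
```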
@@ -268,7 +268,7 @@ class AutoMLPredictOperator(GoogleCloudBaseOperator):
     @cached_property
     def model(self) -> Model | None:
         if self.model_id:
-            hook = cast(CloudAutoMLHook, self.hook)
+            hook = cast("CloudAutoMLHook", self.hook)
             return hook.get_model(
                 model_id=self.model_id,
                 location=self.location,
@@ -280,7 +280,6 @@ class CloudDataTransferServiceCreateJobOperator(GoogleCloudBaseOperator):
         if project_id:
             CloudStorageTransferJobLink.persist(
                 context=context,
-                task_instance=self,
                 project_id=project_id,
                 job_name=result[NAME],
             )
@@ -370,7 +369,6 @@ class CloudDataTransferServiceUpdateJobOperator(GoogleCloudBaseOperator):
         if project_id:
             CloudStorageTransferJobLink.persist(
                 context=context,
-                task_instance=self,
                 project_id=project_id,
                 job_name=self.job_name,
             )
@@ -516,7 +514,6 @@ class CloudDataTransferServiceRunJobOperator(GoogleCloudBaseOperator):
         if project_id:
             CloudStorageTransferJobLink.persist(
                 context=context,
-                task_instance=self,
                 project_id=project_id,
                 job_name=self.job_name,
             )
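Several transfer-service operators stop passing `task_instance=self` when persisting their extra link. A hedged sketch of the new call shape as it appears in these hunks; the wrapper function is hypothetical and the exact `persist` signature in 15.0.0 is assumed from the diff:

```python
from airflow.providers.google.cloud.links.cloud_storage_transfer import CloudStorageTransferJobLink


def persist_job_link(context, project_id: str, job_name: str) -> None:
    # task_instance=self is no longer passed; the link now works from the
    # Airflow context it receives.
    CloudStorageTransferJobLink.persist(
        context=context,
        project_id=project_id,
        job_name=job_name,
    )
```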
@@ -26,7 +26,7 @@ from google.api_core.retry import exponential_sleep_generator
 from googleapiclient.errors import HttpError
 
 from airflow.configuration import conf
-from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning
+from airflow.exceptions import AirflowException
 from airflow.providers.google.cloud.hooks.datafusion import SUCCESS_STATES, DataFusionHook, PipelineStates
 from airflow.providers.google.cloud.links.datafusion import (
     DataFusionInstanceLink,
@@ -37,33 +37,12 @@ from airflow.providers.google.cloud.operators.cloud_base import GoogleCloudBaseOperator
 from airflow.providers.google.cloud.triggers.datafusion import DataFusionStartPipelineTrigger
 from airflow.providers.google.cloud.utils.datafusion import DataFusionPipelineType
 from airflow.providers.google.cloud.utils.helpers import resource_path_to_dict
-from airflow.providers.google.common.deprecated import deprecated
 from airflow.providers.google.common.hooks.base_google import PROVIDE_PROJECT_ID
 
 if TYPE_CHECKING:
     from airflow.utils.context import Context
 
 
-class DataFusionPipelineLinkHelper:
-    """
-    Helper class for Pipeline links.
-
-    .. warning::
-        This class is deprecated. Consider using ``resource_path_to_dict()`` instead.
-    """
-
-    @staticmethod
-    @deprecated(
-        planned_removal_date="March 01, 2025",
-        use_instead="airflow.providers.google.cloud.utils.helpers.resource_path_to_dict",
-        category=AirflowProviderDeprecationWarning,
-    )
-    def get_project_id(instance):
-        instance = instance["name"]
-        project_id = next(x for x in instance.split("/") if x.startswith("airflow"))
-        return project_id
-
-
 class CloudDataFusionRestartInstanceOperator(GoogleCloudBaseOperator):
     """
     Restart a single Data Fusion instance.
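The removed `DataFusionPipelineLinkHelper` pointed users at `resource_path_to_dict()` instead. A hedged sketch of extracting a project id with that helper, assuming it splits a resource name into collection/id pairs; the instance payload below is a made-up example:

```python
from airflow.providers.google.cloud.utils.helpers import resource_path_to_dict

# Hypothetical Data Fusion instance payload; only "name" matters here.
instance = {"name": "projects/my-project/locations/europe-west1/instances/airflow-fusion"}

parts = resource_path_to_dict(instance["name"])  # assumed: {"projects": "my-project", ...}
project_id = parts["projects"]                   # -> "my-project"
location = parts["locations"]                    # -> "europe-west1"
```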
@@ -44,7 +44,6 @@ from airflow.providers.google.cloud.hooks.dataproc import (
 )
 from airflow.providers.google.cloud.links.dataproc import (
     DATAPROC_BATCH_LINK,
-    DATAPROC_CLUSTER_LINK_DEPRECATED,
     DATAPROC_JOB_LINK_DEPRECATED,
     DataprocBatchesListLink,
     DataprocBatchLink,
@@ -63,7 +62,6 @@ from airflow.providers.google.cloud.triggers.dataproc import (
     DataprocSubmitTrigger,
 )
 from airflow.providers.google.cloud.utils.dataproc import DataprocOperationType
-from airflow.providers.google.common.deprecated import deprecated
 from airflow.providers.google.common.hooks.base_google import PROVIDE_PROJECT_ID
 from airflow.utils import timezone
 
@@ -919,145 +917,6 @@ class DataprocCreateClusterOperator(GoogleCloudBaseOperator):
         return event["cluster"]
 
 
-# TODO: Remove one day
-@deprecated(
-    planned_removal_date="March 01, 2025",
-    use_instead="DataprocUpdateClusterOperator",
-    category=AirflowProviderDeprecationWarning,
-)
-class DataprocScaleClusterOperator(GoogleCloudBaseOperator):
-    """
-    Scale, up or down, a cluster on Google Cloud Dataproc.
-
-    The operator will wait until the cluster is re-scaled.
-
-    Example usage:
-
-    .. code-block:: python
-
-        t1 = DataprocClusterScaleOperator(
-            task_id="dataproc_scale",
-            project_id="my-project",
-            cluster_name="cluster-1",
-            num_workers=10,
-            num_preemptible_workers=10,
-            graceful_decommission_timeout="1h",
-        )
-
-    .. seealso::
-        For more detail on about scaling clusters have a look at the reference:
-        https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/scaling-clusters
-
-    :param cluster_name: The name of the cluster to scale. (templated)
-    :param project_id: The ID of the google cloud project in which
-        the cluster runs. (templated)
-    :param region: The region for the dataproc cluster. (templated)
-    :param num_workers: The new number of workers
-    :param num_preemptible_workers: The new number of preemptible workers
-    :param graceful_decommission_timeout: Timeout for graceful YARN decommissioning.
-        Maximum value is 1d
-    :param gcp_conn_id: The connection ID to use connecting to Google Cloud.
-    :param impersonation_chain: Optional service account to impersonate using short-term
-        credentials, or chained list of accounts required to get the access_token
-        of the last account in the list, which will be impersonated in the request.
-        If set as a string, the account must grant the originating account
-        the Service Account Token Creator IAM role.
-        If set as a sequence, the identities from the list must grant
-        Service Account Token Creator IAM role to the directly preceding identity, with first
-        account from the list granting this role to the originating account (templated).
-    """
-
-    template_fields: Sequence[str] = ("cluster_name", "project_id", "region", "impersonation_chain")
-
-    operator_extra_links = (DataprocLink(),)
-
-    def __init__(
-        self,
-        *,
-        cluster_name: str,
-        project_id: str = PROVIDE_PROJECT_ID,
-        region: str = "global",
-        num_workers: int = 2,
-        num_preemptible_workers: int = 0,
-        graceful_decommission_timeout: str | None = None,
-        gcp_conn_id: str = "google_cloud_default",
-        impersonation_chain: str | Sequence[str] | None = None,
-        **kwargs,
-    ) -> None:
-        super().__init__(**kwargs)
-        self.project_id = project_id
-        self.region = region
-        self.cluster_name = cluster_name
-        self.num_workers = num_workers
-        self.num_preemptible_workers = num_preemptible_workers
-        self.graceful_decommission_timeout = graceful_decommission_timeout
-        self.gcp_conn_id = gcp_conn_id
-        self.impersonation_chain = impersonation_chain
-
-    def _build_scale_cluster_data(self) -> dict:
-        scale_data = {
-            "config": {
-                "worker_config": {"num_instances": self.num_workers},
-                "secondary_worker_config": {"num_instances": self.num_preemptible_workers},
-            }
-        }
-        return scale_data
-
-    @property
-    def _graceful_decommission_timeout_object(self) -> dict[str, int] | None:
-        if not self.graceful_decommission_timeout:
-            return None
-
-        timeout = None
-        match = re.fullmatch(r"(\d+)([smdh])", self.graceful_decommission_timeout)
-        if match:
-            val = int(match.group(1))
-            unit = match.group(2)
-            if unit == "s":
-                timeout = val
-            elif unit == "m":
-                timeout = int(timedelta(minutes=val).total_seconds())
-            elif unit == "h":
-                timeout = int(timedelta(hours=val).total_seconds())
-            elif unit == "d":
-                timeout = int(timedelta(days=val).total_seconds())
-
-        if not timeout:
-            raise AirflowException(
-                "DataprocClusterScaleOperator "
-                " should be expressed in day, hours, minutes or seconds. "
-                " i.e. 1d, 4h, 10m, 30s"
-            )
-
-        return {"seconds": timeout}
-
-    def execute(self, context: Context) -> None:
-        """Scale, up or down, a cluster on Google Cloud Dataproc."""
-        self.log.info("Scaling cluster: %s", self.cluster_name)
-
-        scaling_cluster_data = self._build_scale_cluster_data()
-        update_mask = ["config.worker_config.num_instances", "config.secondary_worker_config.num_instances"]
-
-        hook = DataprocHook(gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain)
-        # Save data required to display extra link no matter what the cluster status will be
-        DataprocLink.persist(
-            context=context,
-            task_instance=self,
-            url=DATAPROC_CLUSTER_LINK_DEPRECATED,
-            resource=self.cluster_name,
-        )
-        operation = hook.update_cluster(
-            project_id=self.project_id,
-            region=self.region,
-            cluster_name=self.cluster_name,
-            cluster=scaling_cluster_data,
-            graceful_decommission_timeout=self._graceful_decommission_timeout_object,
-            update_mask={"paths": update_mask},
-        )
-        operation.result()
-        self.log.info("Cluster scaling finished")
-
-
 class DataprocDeleteClusterOperator(GoogleCloudBaseOperator):
     """
     Delete a cluster in a project.
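The deprecation notice on the removed operator named `DataprocUpdateClusterOperator` as the replacement. A hedged sketch of an equivalent scale operation, rebuilt from the cluster payload and update mask the removed `execute()` produced; task id, project, region, and cluster name are example values:

```python
from airflow.providers.google.cloud.operators.dataproc import DataprocUpdateClusterOperator

scale_cluster = DataprocUpdateClusterOperator(
    task_id="dataproc_scale",
    project_id="my-project",
    region="europe-west1",
    cluster_name="cluster-1",
    cluster={
        "config": {
            "worker_config": {"num_instances": 10},
            "secondary_worker_config": {"num_instances": 10},
        }
    },
    update_mask={
        "paths": [
            "config.worker_config.num_instances",
            "config.secondary_worker_config.num_instances",
        ]
    },
    graceful_decommission_timeout={"seconds": 3600},  # the old "1h" string, expressed in seconds
)
```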
@@ -1463,8 +1322,7 @@ class DataprocJobBaseOperator(GoogleCloudBaseOperator):
         """Initialize `self.job_template` with default values."""
         if self.project_id is None:
             raise AirflowException(
-                "project id should either be set via project_id "
-                "parameter or retrieved from the connection,"
+                "project id should either be set via project_id parameter or retrieved from the connection,"
             )
         job_template = DataProcJobBuilder(
             project_id=self.project_id,
@@ -31,7 +31,6 @@ from google.cloud.metastore_v1.types import Backup, MetadataImport, Service
 from google.cloud.metastore_v1.types.metastore import DatabaseDumpSpec, Restore
 
 from airflow.exceptions import AirflowException
-from airflow.models import BaseOperator, BaseOperatorLink
 from airflow.providers.google.cloud.hooks.dataproc_metastore import DataprocMetastoreHook
 from airflow.providers.google.cloud.operators.cloud_base import GoogleCloudBaseOperator
 from airflow.providers.google.common.links.storage import StorageLink
|