apache-airflow-providers-google 11.0.0__py3-none-any.whl → 12.0.0__py3-none-any.whl
This diff shows the content changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- airflow/providers/google/__init__.py +3 -3
- airflow/providers/google/assets/gcs.py +1 -7
- airflow/providers/google/cloud/hooks/alloy_db.py +289 -0
- airflow/providers/google/cloud/hooks/cloud_batch.py +13 -5
- airflow/providers/google/cloud/hooks/dataproc.py +7 -3
- airflow/providers/google/cloud/hooks/dataproc_metastore.py +41 -22
- airflow/providers/google/cloud/hooks/kubernetes_engine.py +7 -38
- airflow/providers/google/cloud/hooks/translate.py +355 -0
- airflow/providers/google/cloud/hooks/vertex_ai/feature_store.py +147 -0
- airflow/providers/google/cloud/hooks/vertex_ai/generative_model.py +10 -0
- airflow/providers/google/cloud/links/alloy_db.py +55 -0
- airflow/providers/google/cloud/links/translate.py +98 -0
- airflow/providers/google/cloud/log/stackdriver_task_handler.py +1 -5
- airflow/providers/google/cloud/openlineage/mixins.py +4 -12
- airflow/providers/google/cloud/openlineage/utils.py +200 -22
- airflow/providers/google/cloud/operators/alloy_db.py +459 -0
- airflow/providers/google/cloud/operators/automl.py +55 -44
- airflow/providers/google/cloud/operators/bigquery.py +60 -15
- airflow/providers/google/cloud/operators/dataproc.py +12 -0
- airflow/providers/google/cloud/operators/gcs.py +5 -14
- airflow/providers/google/cloud/operators/kubernetes_engine.py +377 -705
- airflow/providers/google/cloud/operators/mlengine.py +41 -31
- airflow/providers/google/cloud/operators/translate.py +586 -1
- airflow/providers/google/cloud/operators/vertex_ai/feature_store.py +163 -0
- airflow/providers/google/cloud/operators/vertex_ai/generative_model.py +5 -0
- airflow/providers/google/cloud/sensors/dataproc.py +2 -2
- airflow/providers/google/cloud/sensors/vertex_ai/__init__.py +16 -0
- airflow/providers/google/cloud/sensors/vertex_ai/feature_store.py +112 -0
- airflow/providers/google/cloud/transfers/bigquery_to_gcs.py +6 -11
- airflow/providers/google/cloud/transfers/bigquery_to_mssql.py +3 -0
- airflow/providers/google/cloud/transfers/bigquery_to_mysql.py +3 -0
- airflow/providers/google/cloud/transfers/gcs_to_bigquery.py +5 -10
- airflow/providers/google/cloud/transfers/gcs_to_gcs.py +3 -15
- airflow/providers/google/cloud/transfers/gcs_to_local.py +9 -0
- airflow/providers/google/cloud/transfers/local_to_gcs.py +41 -6
- airflow/providers/google/cloud/transfers/s3_to_gcs.py +15 -0
- airflow/providers/google/get_provider_info.py +30 -18
- airflow/providers/google/version_compat.py +36 -0
- {apache_airflow_providers_google-11.0.0.dist-info → apache_airflow_providers_google-12.0.0.dist-info}/METADATA +16 -18
- {apache_airflow_providers_google-11.0.0.dist-info → apache_airflow_providers_google-12.0.0.dist-info}/RECORD +42 -37
- airflow/providers/google/cloud/hooks/datapipeline.py +0 -71
- airflow/providers/google/cloud/openlineage/BigQueryErrorRunFacet.json +0 -30
- airflow/providers/google/cloud/operators/datapipeline.py +0 -63
- {apache_airflow_providers_google-11.0.0.dist-info → apache_airflow_providers_google-12.0.0.dist-info}/WHEEL +0 -0
- {apache_airflow_providers_google-11.0.0.dist-info → apache_airflow_providers_google-12.0.0.dist-info}/entry_points.txt +0 -0
airflow/providers/google/cloud/operators/vertex_ai/feature_store.py
@@ -0,0 +1,163 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""This module contains Google Vertex AI Feature Store operators."""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+from typing import TYPE_CHECKING, Any
+
+from airflow.providers.google.cloud.hooks.vertex_ai.feature_store import FeatureStoreHook
+from airflow.providers.google.cloud.operators.cloud_base import GoogleCloudBaseOperator
+
+if TYPE_CHECKING:
+    from airflow.utils.context import Context
+
+
+class SyncFeatureViewOperator(GoogleCloudBaseOperator):
+    """
+    Initiate a synchronization operation for a Feature View in Vertex AI Feature Store.
+
+    This operator triggers a sync operation that updates the online serving data for a feature view
+    based on the latest data in the underlying batch source. The sync operation ensures that
+    the online feature values are up-to-date for real-time serving.
+
+    :param project_id: Required. The ID of the Google Cloud project that contains the feature store.
+        This is used to identify which project's resources to interact with.
+    :param location: Required. The location of the feature store (e.g., 'us-central1', 'us-east1').
+        This specifies the Google Cloud region where the feature store resources are located.
+    :param feature_online_store_id: Required. The ID of the online feature store that contains
+        the feature view to be synchronized. This store serves as the online serving layer.
+    :param feature_view_id: Required. The ID of the feature view to synchronize. This identifies
+        the specific view that needs to have its online values updated from the batch source.
+    :param gcp_conn_id: The connection ID to use for connecting to Google Cloud Platform.
+        Defaults to 'google_cloud_default'.
+    :param impersonation_chain: Optional service account to impersonate using short-term
+        credentials. Can be either a single account or a chain of accounts required to
+        get the access_token of the last account in the list, which will be impersonated
+        in the request. If set as a string, the account must grant the originating account
+        the Service Account Token Creator IAM role. If set as a sequence, the identities
+        from the list must grant Service Account Token Creator IAM role to the directly
+        preceding identity, with first account from the list granting this role to the
+        originating account.
+    """
+
+    template_fields: Sequence[str] = (
+        "project_id",
+        "location",
+        "feature_online_store_id",
+        "feature_view_id",
+    )
+
+    def __init__(
+        self,
+        *,
+        project_id: str,
+        location: str,
+        feature_online_store_id: str,
+        feature_view_id: str,
+        gcp_conn_id: str = "google_cloud_default",
+        impersonation_chain: str | Sequence[str] | None = None,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.project_id = project_id
+        self.location = location
+        self.feature_online_store_id = feature_online_store_id
+        self.feature_view_id = feature_view_id
+        self.gcp_conn_id = gcp_conn_id
+        self.impersonation_chain = impersonation_chain
+
+    def execute(self, context: Context) -> str:
+        """Execute the feature view sync operation."""
+        self.hook = FeatureStoreHook(
+            gcp_conn_id=self.gcp_conn_id,
+            impersonation_chain=self.impersonation_chain,
+        )
+        self.log.info("Submitting Feature View sync job now...")
+        response = self.hook.sync_feature_view(
+            project_id=self.project_id,
+            location=self.location,
+            feature_online_store_id=self.feature_online_store_id,
+            feature_view_id=self.feature_view_id,
+        )
+        self.log.info("Retrieved Feature View sync: %s", response)
+
+        return response
+
+
+class GetFeatureViewSyncOperator(GoogleCloudBaseOperator):
+    """
+    Retrieve the status and details of a Feature View synchronization operation.
+
+    This operator fetches information about a specific feature view sync operation,
+    including its current status, timing information, and synchronization metrics.
+    It's typically used to monitor the progress of a sync operation initiated by
+    the SyncFeatureViewOperator.
+
+    :param location: Required. The location of the feature store (e.g., 'us-central1', 'us-east1').
+        This specifies the Google Cloud region where the feature store resources are located.
+    :param feature_view_sync_name: Required. The full resource name of the feature view
+        sync operation to retrieve. This is typically the return value from a
+        SyncFeatureViewOperator execution.
+    :param gcp_conn_id: The connection ID to use for connecting to Google Cloud Platform.
+        Defaults to 'google_cloud_default'.
+    :param impersonation_chain: Optional service account to impersonate using short-term
+        credentials. Can be either a single account or a chain of accounts required to
+        get the access_token of the last account in the list, which will be impersonated
+        in the request. If set as a string, the account must grant the originating account
+        the Service Account Token Creator IAM role. If set as a sequence, the identities
+        from the list must grant Service Account Token Creator IAM role to the directly
+        preceding identity, with first account from the list granting this role to the
+        originating account.
+    """
+
+    template_fields: Sequence[str] = (
+        "location",
+        "feature_view_sync_name",
+    )
+
+    def __init__(
+        self,
+        *,
+        location: str,
+        feature_view_sync_name: str,
+        gcp_conn_id: str = "google_cloud_default",
+        impersonation_chain: str | Sequence[str] | None = None,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.location = location
+        self.feature_view_sync_name = feature_view_sync_name
+        self.gcp_conn_id = gcp_conn_id
+        self.impersonation_chain = impersonation_chain
+
+    def execute(self, context: Context) -> dict[str, Any]:
+        """Execute the get feature view sync operation."""
+        self.hook = FeatureStoreHook(
+            gcp_conn_id=self.gcp_conn_id,
+            impersonation_chain=self.impersonation_chain,
+        )
+        self.log.info("Retrieving Feature View sync job now...")
+        response = self.hook.get_feature_view_sync(
+            location=self.location, feature_view_sync_name=self.feature_view_sync_name
+        )
+        self.log.info("Retrieved Feature View sync: %s", self.feature_view_sync_name)
+        self.log.info(response)
+
+        return response
airflow/providers/google/cloud/operators/vertex_ai/generative_model.py
@@ -353,6 +353,11 @@ class PromptMultimodalModelWithMediaOperator(GoogleCloudBaseOperator):
         return response
 
 
+@deprecated(
+    planned_removal_date="April 09, 2025",
+    use_instead="GenerativeModelGenerateContentOperator",
+    category=AirflowProviderDeprecationWarning,
+)
 class TextGenerationModelPredictOperator(GoogleCloudBaseOperator):
     """
     Uses the Vertex AI PaLM API to generate natural language text.
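Migration note (not part of the diff): the deprecation above names GenerativeModelGenerateContentOperator as the replacement. A hedged sketch follows; the contents and pretrained_model parameter names reflect the generate_content-style API and should be verified against the installed provider version, and the project and model names are placeholders:

from airflow.providers.google.cloud.operators.vertex_ai.generative_model import (
    GenerativeModelGenerateContentOperator,
)

generate_content = GenerativeModelGenerateContentOperator(
    task_id="generate_content",
    project_id="my-project",  # hypothetical
    location="us-central1",
    contents=["Summarize our Q4 results in two sentences."],
    pretrained_model="gemini-1.5-pro",  # placeholder model name
)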
airflow/providers/google/cloud/sensors/dataproc.py
@@ -107,10 +107,10 @@ class DataprocJobSensor(BaseSensorOperator):
         }:
             message = f"Job was cancelled:\n{job}"
             raise AirflowException(message)
-        elif JobStatus.State.DONE == state:
+        elif state == JobStatus.State.DONE:
             self.log.debug("Job %s completed successfully.", self.dataproc_job_id)
             return True
-        elif JobStatus.State.ATTEMPT_FAILURE == state:
+        elif state == JobStatus.State.ATTEMPT_FAILURE:
             self.log.debug("Job %s attempt has failed.", self.dataproc_job_id)
 
         self.log.info("Waiting for job %s to complete.", self.dataproc_job_id)
airflow/providers/google/cloud/sensors/vertex_ai/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
airflow/providers/google/cloud/sensors/vertex_ai/feature_store.py
@@ -0,0 +1,112 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""This module contains a Vertex AI Feature Store sensor."""
+
+from __future__ import annotations
+
+import time
+from collections.abc import Sequence
+from typing import TYPE_CHECKING
+
+from airflow.exceptions import AirflowException
+from airflow.providers.google.cloud.hooks.vertex_ai.feature_store import FeatureStoreHook
+from airflow.sensors.base import BaseSensorOperator
+
+if TYPE_CHECKING:
+    from airflow.utils.context import Context
+
+
+class FeatureViewSyncSensor(BaseSensorOperator):
+    """
+    Sensor to monitor the state of a Vertex AI Feature View sync operation.
+
+    :param feature_view_sync_name: The name of the feature view sync operation to monitor. (templated)
+    :param location: Required. The Cloud region in which to handle the request. (templated)
+    :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform.
+    :param wait_timeout: How many seconds to wait for sync to complete.
+    :param impersonation_chain: Optional service account to impersonate using short-term
+        credentials.
+    """
+
+    template_fields: Sequence[str] = ("location", "feature_view_sync_name")
+    ui_color = "#f0eee4"
+
+    def __init__(
+        self,
+        *,
+        feature_view_sync_name: str,
+        location: str,
+        gcp_conn_id: str = "google_cloud_default",
+        wait_timeout: int | None = None,
+        impersonation_chain: str | Sequence[str] | None = None,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.feature_view_sync_name = feature_view_sync_name
+        self.location = location
+        self.gcp_conn_id = gcp_conn_id
+        self.wait_timeout = wait_timeout
+        self.impersonation_chain = impersonation_chain
+        self.start_sensor_time: float | None = None
+
+    def execute(self, context: Context) -> None:
+        self.start_sensor_time = time.monotonic()
+        super().execute(context)
+
+    def _duration(self):
+        return time.monotonic() - self.start_sensor_time
+
+    def poke(self, context: Context) -> bool:
+        hook = FeatureStoreHook(
+            gcp_conn_id=self.gcp_conn_id,
+            impersonation_chain=self.impersonation_chain,
+        )
+
+        try:
+            response = hook.get_feature_view_sync(
+                location=self.location,
+                feature_view_sync_name=self.feature_view_sync_name,
+            )
+
+            # Check if the sync has completed by verifying end_time exists
+            if response.get("end_time", 0) > 0:
+                self.log.info(
+                    "Feature View sync %s completed. Rows synced: %d, Total slots: %d",
+                    self.feature_view_sync_name,
+                    int(response.get("sync_summary", "").get("row_synced", "")),
+                    int(response.get("sync_summary", "").get("total_slot", "")),
+                )
+                return True
+
+            if self.wait_timeout and self._duration() > self.wait_timeout:
+                raise AirflowException(
+                    f"Timeout: Feature View sync {self.feature_view_sync_name} "
+                    f"not completed after {self.wait_timeout}s"
+                )
+
+            self.log.info("Waiting for Feature View sync %s to complete.", self.feature_view_sync_name)
+            return False
+
+        except Exception as e:
+            if self.wait_timeout and self._duration() > self.wait_timeout:
+                raise AirflowException(
+                    f"Timeout: Feature View sync {self.feature_view_sync_name} "
+                    f"not completed after {self.wait_timeout}s"
+                )
+            self.log.info("Error checking sync status, will retry: %s", str(e))
+            return False
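Usage note (not part of the diff): a sketch of the sensor polling a sync started by an upstream SyncFeatureViewOperator task. The upstream task_id and timings are illustrative, and poke_interval is inherited from BaseSensorOperator:

from airflow.providers.google.cloud.sensors.vertex_ai.feature_store import FeatureViewSyncSensor

wait_for_sync = FeatureViewSyncSensor(
    task_id="wait_for_feature_view_sync",
    location="us-central1",
    # Sync operation name produced by an upstream SyncFeatureViewOperator task.
    feature_view_sync_name="{{ ti.xcom_pull(task_ids='sync_feature_view') }}",
    poke_interval=60,  # re-check once a minute (BaseSensorOperator argument)
    wait_timeout=3600,  # raise AirflowException after an hour without completion
)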
airflow/providers/google/cloud/transfers/bigquery_to_gcs.py
@@ -292,8 +292,6 @@ class BigQueryToGCSOperator(BaseOperator):
 
     def get_openlineage_facets_on_complete(self, task_instance):
         """Implement on_complete as we will include final BQ job id."""
-        from pathlib import Path
-
         from airflow.providers.common.compat.openlineage.facet import (
             BaseFacet,
             Dataset,
@@ -303,6 +301,8 @@ class BigQueryToGCSOperator(BaseOperator):
         )
         from airflow.providers.google.cloud.hooks.gcs import _parse_gcs_url
         from airflow.providers.google.cloud.openlineage.utils import (
+            WILDCARD,
+            extract_ds_name_from_gcs_path,
             get_facets_from_bq_table,
             get_identity_column_lineage_facet,
         )
@@ -333,24 +333,19 @@ class BigQueryToGCSOperator(BaseOperator):
         output_datasets = []
         for uri in sorted(self.destination_cloud_storage_uris):
             bucket, blob = _parse_gcs_url(uri)
-            additional_facets = {}
 
-            if "*" in blob:
-                # If wildcard ("*") is used in gcs path, we want the "directory" before it to be the dataset name,
-                # but we create a symlink to the full object path with wildcard.
+            additional_facets = {}
+            if WILDCARD in blob:
+                # For path with wildcard we attach a symlink with unmodified path.
                 additional_facets = {
                     "symlink": SymlinksDatasetFacet(
                         identifiers=[Identifier(namespace=f"gs://{bucket}", name=blob, type="file")]
                     ),
                 }
-                blob = Path(blob).parent.as_posix()
-                if blob == ".":
-                    # blob path does not have leading slash, but we need root dataset name to be "/"
-                    blob = "/"
 
             dataset = Dataset(
                 namespace=f"gs://{bucket}",
-                name=blob,
+                name=extract_ds_name_from_gcs_path(blob),
                 facets=merge_dicts(output_dataset_facets, additional_facets),
             )
             output_datasets.append(dataset)
airflow/providers/google/cloud/transfers/bigquery_to_mssql.py
@@ -45,6 +45,9 @@ class BigQueryToMsSqlOperator(BigQueryToSqlBaseOperator):
     :param mssql_table: target MsSQL table. It is deprecated: use target_table_name instead. (templated)
     :param target_table_name: target MsSQL table. It takes precedence over mssql_table. (templated)
     :param mssql_conn_id: reference to a specific mssql hook
+
+    .. warning::
+        The `mssql_table` parameter has been deprecated. Use `target_table_name` instead.
     """
 
     template_fields: Sequence[str] = (
airflow/providers/google/cloud/transfers/bigquery_to_mysql.py
@@ -39,6 +39,9 @@ class BigQueryToMySqlOperator(BigQueryToSqlBaseOperator):
         specific database. It is deprecated: use target_table_name instead. (templated)
     :param target_table_name: target MySQL table. It takes precedence over mysql_table. (templated)
     :param mysql_conn_id: Reference to :ref:`mysql connection id <howto/connection:mysql>`.
+
+    .. warning::
+        The `mysql_table` parameter has been deprecated. Use `target_table_name` instead.
     """
 
     template_fields: Sequence[str] = (*BigQueryToSqlBaseOperator.template_fields, "dataset_id", "table_id")
airflow/providers/google/cloud/transfers/gcs_to_bigquery.py
@@ -756,8 +756,6 @@ class GCSToBigQueryOperator(BaseOperator):
 
     def get_openlineage_facets_on_complete(self, task_instance):
         """Implement on_complete as we will include final BQ job id."""
-        from pathlib import Path
-
         from airflow.providers.common.compat.openlineage.facet import (
             Dataset,
             ExternalQueryRunFacet,
@@ -765,6 +763,8 @@ class GCSToBigQueryOperator(BaseOperator):
             SymlinksDatasetFacet,
         )
         from airflow.providers.google.cloud.openlineage.utils import (
+            WILDCARD,
+            extract_ds_name_from_gcs_path,
             get_facets_from_bq_table,
             get_identity_column_lineage_facet,
         )
@@ -793,22 +793,17 @@ class GCSToBigQueryOperator(BaseOperator):
         for blob in sorted(source_objects):
             additional_facets = {}
 
-            if "*" in blob:
-                # If wildcard ("*") is used in gcs path, we want the "directory" before it to be the dataset name,
-                # but we create a symlink to the full object path with wildcard.
+            if WILDCARD in blob:
+                # For path with wildcard we attach a symlink with unmodified path.
                 additional_facets = {
                     "symlink": SymlinksDatasetFacet(
                         identifiers=[Identifier(namespace=f"gs://{self.bucket}", name=blob, type="file")]
                     ),
                 }
-                blob = Path(blob).parent.as_posix()
-                if blob == ".":
-                    # blob path does not have leading slash, but we need root dataset name to be "/"
-                    blob = "/"
 
             dataset = Dataset(
                 namespace=f"gs://{self.bucket}",
-                name=blob,
+                name=extract_ds_name_from_gcs_path(blob),
                 facets=merge_dicts(input_dataset_facets, additional_facets),
             )
             input_datasets.append(dataset)
airflow/providers/google/cloud/transfers/gcs_to_gcs.py
@@ -551,28 +551,16 @@ class GCSToGCSOperator(BaseOperator):
         This means we won't have to normalize self.source_object and self.source_objects,
         destination bucket and so on.
         """
-        from pathlib import Path
-
         from airflow.providers.common.compat.openlineage.facet import Dataset
+        from airflow.providers.google.cloud.openlineage.utils import extract_ds_name_from_gcs_path
         from airflow.providers.openlineage.extractors import OperatorLineage
 
-        def _process_prefix(pref):
-            if WILDCARD in pref:
-                pref = pref.split(WILDCARD)[0]
-            # Use parent if not a file (dot not in name) and not a dir (ends with slash)
-            if "." not in pref.split("/")[-1] and not pref.endswith("/"):
-                pref = Path(pref).parent.as_posix()
-            return ["/" if pref in ("", "/", ".") else pref.rstrip("/")]  # Adjust root path
-
-        inputs = []
-        for prefix in self.source_objects:
-            result = _process_prefix(prefix)
-            inputs.extend(result)
+        inputs = [extract_ds_name_from_gcs_path(path) for path in self.source_objects]
 
         if self.destination_object is None:
             outputs = inputs.copy()
         else:
-            outputs = _process_prefix(self.destination_object)
+            outputs = [extract_ds_name_from_gcs_path(self.destination_object)]
 
         return OperatorLineage(
             inputs=[
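Note (not part of the diff): the `_process_prefix` helper removed above is essentially the logic that 12.0.0 centralizes as `extract_ds_name_from_gcs_path` in `cloud/openlineage/utils.py` (+200 −22 in this diff). A standalone sketch of that normalization, reconstructed from the deleted code rather than from the new implementation, which may differ in detail:

from pathlib import Path

WILDCARD = "*"


def extract_ds_name_from_gcs_path_sketch(path: str) -> str:
    """Normalize a GCS blob path into an OpenLineage dataset name (reconstruction)."""
    if WILDCARD in path:
        # Drop everything from the first wildcard onwards.
        path = path.split(WILDCARD)[0]
    # Use the parent directory when the remainder is neither a file
    # (no dot in the last segment) nor an explicit directory (trailing slash).
    if "." not in path.split("/")[-1] and not path.endswith("/"):
        path = Path(path).parent.as_posix()
    # Root paths collapse to "/"; otherwise strip any trailing slash.
    return "/" if path in ("", "/", ".") else path.rstrip("/")


assert extract_ds_name_from_gcs_path_sketch("data/part-*.csv") == "data"
assert extract_ds_name_from_gcs_path_sketch("a/b/c.txt") == "a/b/c.txt"
assert extract_ds_name_from_gcs_path_sketch("*") == "/"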
airflow/providers/google/cloud/transfers/gcs_to_local.py
@@ -113,3 +113,12 @@ class GCSToLocalFilesystemOperator(BaseOperator):
                 raise AirflowException("The size of the downloaded file is too large to push to XCom!")
         else:
             hook.download(bucket_name=self.bucket, object_name=self.object_name, filename=self.filename)
+
+    def get_openlineage_facets_on_start(self):
+        from airflow.providers.common.compat.openlineage.facet import Dataset
+        from airflow.providers.openlineage.extractors import OperatorLineage
+
+        return OperatorLineage(
+            inputs=[Dataset(namespace=f"gs://{self.bucket}", name=self.object_name)],
+            outputs=[Dataset(namespace="file", name=self.filename)] if self.filename else [],
+        )
airflow/providers/google/cloud/transfers/local_to_gcs.py
@@ -69,12 +69,12 @@ class LocalFilesystemToGCSOperator(BaseOperator):
     def __init__(
         self,
         *,
-        src,
-        dst,
-        bucket,
-        gcp_conn_id="google_cloud_default",
-        mime_type="application/octet-stream",
-        gzip=False,
+        src: str | list[str],
+        dst: str,
+        bucket: str,
+        gcp_conn_id: str = "google_cloud_default",
+        mime_type: str = "application/octet-stream",
+        gzip: bool = False,
         chunk_size: int | None = None,
         impersonation_chain: str | Sequence[str] | None = None,
         **kwargs,
@@ -120,3 +120,38 @@ class LocalFilesystemToGCSOperator(BaseOperator):
             gzip=self.gzip,
             chunk_size=self.chunk_size,
         )
+
+    def get_openlineage_facets_on_start(self):
+        from airflow.providers.common.compat.openlineage.facet import (
+            Dataset,
+            Identifier,
+            SymlinksDatasetFacet,
+        )
+        from airflow.providers.google.cloud.openlineage.utils import WILDCARD, extract_ds_name_from_gcs_path
+        from airflow.providers.openlineage.extractors import OperatorLineage
+
+        source_facets = {}
+        if isinstance(self.src, str):  # Single path provided, possibly relative or with wildcard
+            original_src = f"{self.src}"
+            absolute_src = os.path.abspath(self.src)
+            resolved_src = extract_ds_name_from_gcs_path(absolute_src)
+            if original_src.startswith("/") and not resolved_src.startswith("/"):
+                resolved_src = "/" + resolved_src
+            source_objects = [resolved_src]
+
+            if WILDCARD in original_src or absolute_src != resolved_src:
+                # We attach a symlink with unmodified path.
+                source_facets = {
+                    "symlink": SymlinksDatasetFacet(
+                        identifiers=[Identifier(namespace="file", name=original_src, type="file")]
+                    ),
+                }
+        else:
+            source_objects = self.src
+
+        dest_object = self.dst if os.path.basename(self.dst) else extract_ds_name_from_gcs_path(self.dst)
+
+        return OperatorLineage(
+            inputs=[Dataset(namespace="file", name=src, facets=source_facets) for src in source_objects],
+            outputs=[Dataset(namespace=f"gs://{self.bucket}", name=dest_object)],
+        )
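Illustration (not part of the diff) of the single-string branch above, with hypothetical bucket and paths: a relative wildcard src resolves to an absolute directory, so the unmodified path is preserved in a symlink facet, while a trailing-slash dst collapses to its directory dataset name:

from airflow.providers.google.cloud.transfers.local_to_gcs import LocalFilesystemToGCSOperator

upload = LocalFilesystemToGCSOperator(
    task_id="upload_csvs",
    src="data/*.csv",  # relative + wildcard -> symlink facet with the unmodified path
    dst="raw/",  # trailing slash -> output dataset name resolves to "raw"
    bucket="my-bucket",  # hypothetical
)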
airflow/providers/google/cloud/transfers/s3_to_gcs.py
@@ -345,3 +345,18 @@ class S3ToGCSOperator(S3ListOperator):
             gcp_conn_id=self.gcp_conn_id,
             impersonation_chain=self.google_impersonation_chain,
         )
+
+    def get_openlineage_facets_on_start(self):
+        from airflow.providers.common.compat.openlineage.facet import Dataset
+        from airflow.providers.google.cloud.hooks.gcs import _parse_gcs_url
+        from airflow.providers.google.cloud.openlineage.utils import extract_ds_name_from_gcs_path
+        from airflow.providers.openlineage.extractors import OperatorLineage
+
+        gcs_bucket, gcs_blob = _parse_gcs_url(self.dest_gcs)
+        if not self.apply_gcs_prefix:
+            gcs_blob += self.prefix
+
+        return OperatorLineage(
+            inputs=[Dataset(namespace=f"s3://{self.bucket}", name=self.prefix.strip("/") or "/")],
+            outputs=[Dataset(namespace=f"gs://{gcs_bucket}", name=extract_ds_name_from_gcs_path(gcs_blob))],
+        )
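Illustration (not part of the diff), with hypothetical buckets: assuming apply_gcs_prefix keeps its default of False, the S3 prefix is appended to the GCS blob, so the operator below would report an input s3://src-bucket dataset named "data" and an output gs://dst-bucket dataset named "raw/data":

from airflow.providers.google.cloud.transfers.s3_to_gcs import S3ToGCSOperator

s3_to_gcs = S3ToGCSOperator(
    task_id="s3_to_gcs",
    bucket="src-bucket",  # hypothetical S3 source
    prefix="data/",
    dest_gcs="gs://dst-bucket/raw/",  # hypothetical GCS destination
)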