apache-airflow-providers-google 11.0.0rc1__py3-none-any.whl → 12.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. airflow/providers/google/__init__.py +3 -3
  2. airflow/providers/google/assets/gcs.py +1 -7
  3. airflow/providers/google/cloud/hooks/alloy_db.py +289 -0
  4. airflow/providers/google/cloud/hooks/cloud_batch.py +13 -5
  5. airflow/providers/google/cloud/hooks/dataproc.py +7 -3
  6. airflow/providers/google/cloud/hooks/dataproc_metastore.py +41 -22
  7. airflow/providers/google/cloud/hooks/kubernetes_engine.py +7 -38
  8. airflow/providers/google/cloud/hooks/translate.py +355 -0
  9. airflow/providers/google/cloud/hooks/vertex_ai/feature_store.py +147 -0
  10. airflow/providers/google/cloud/hooks/vertex_ai/generative_model.py +10 -0
  11. airflow/providers/google/cloud/links/alloy_db.py +55 -0
  12. airflow/providers/google/cloud/links/translate.py +98 -0
  13. airflow/providers/google/cloud/log/stackdriver_task_handler.py +1 -5
  14. airflow/providers/google/cloud/openlineage/mixins.py +4 -12
  15. airflow/providers/google/cloud/openlineage/utils.py +200 -22
  16. airflow/providers/google/cloud/operators/alloy_db.py +459 -0
  17. airflow/providers/google/cloud/operators/automl.py +55 -44
  18. airflow/providers/google/cloud/operators/bigquery.py +60 -15
  19. airflow/providers/google/cloud/operators/dataproc.py +12 -0
  20. airflow/providers/google/cloud/operators/gcs.py +5 -14
  21. airflow/providers/google/cloud/operators/kubernetes_engine.py +377 -705
  22. airflow/providers/google/cloud/operators/mlengine.py +41 -31
  23. airflow/providers/google/cloud/operators/translate.py +586 -1
  24. airflow/providers/google/cloud/operators/vertex_ai/feature_store.py +163 -0
  25. airflow/providers/google/cloud/operators/vertex_ai/generative_model.py +5 -0
  26. airflow/providers/google/cloud/sensors/dataproc.py +2 -2
  27. airflow/providers/google/cloud/sensors/vertex_ai/__init__.py +16 -0
  28. airflow/providers/google/cloud/sensors/vertex_ai/feature_store.py +112 -0
  29. airflow/providers/google/cloud/transfers/bigquery_to_gcs.py +6 -11
  30. airflow/providers/google/cloud/transfers/bigquery_to_mssql.py +3 -0
  31. airflow/providers/google/cloud/transfers/bigquery_to_mysql.py +3 -0
  32. airflow/providers/google/cloud/transfers/gcs_to_bigquery.py +5 -10
  33. airflow/providers/google/cloud/transfers/gcs_to_gcs.py +3 -15
  34. airflow/providers/google/cloud/transfers/gcs_to_local.py +9 -0
  35. airflow/providers/google/cloud/transfers/local_to_gcs.py +41 -6
  36. airflow/providers/google/cloud/transfers/s3_to_gcs.py +15 -0
  37. airflow/providers/google/get_provider_info.py +30 -18
  38. airflow/providers/google/version_compat.py +36 -0
  39. {apache_airflow_providers_google-11.0.0rc1.dist-info → apache_airflow_providers_google-12.0.0rc1.dist-info}/METADATA +16 -18
  40. {apache_airflow_providers_google-11.0.0rc1.dist-info → apache_airflow_providers_google-12.0.0rc1.dist-info}/RECORD +42 -37
  41. airflow/providers/google/cloud/hooks/datapipeline.py +0 -71
  42. airflow/providers/google/cloud/openlineage/BigQueryErrorRunFacet.json +0 -30
  43. airflow/providers/google/cloud/operators/datapipeline.py +0 -63
  44. {apache_airflow_providers_google-11.0.0rc1.dist-info → apache_airflow_providers_google-12.0.0rc1.dist-info}/WHEEL +0 -0
  45. {apache_airflow_providers_google-11.0.0rc1.dist-info → apache_airflow_providers_google-12.0.0rc1.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,163 @@
+ #
+ # Licensed to the Apache Software Foundation (ASF) under one
+ # or more contributor license agreements. See the NOTICE file
+ # distributed with this work for additional information
+ # regarding copyright ownership. The ASF licenses this file
+ # to you under the Apache License, Version 2.0 (the
+ # "License"); you may not use this file except in compliance
+ # with the License. You may obtain a copy of the License at
+ #
+ #   http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing,
+ # software distributed under the License is distributed on an
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ # KIND, either express or implied. See the License for the
+ # specific language governing permissions and limitations
+ # under the License.
+ """This module contains Google Vertex AI Feature Store operators."""
+
+ from __future__ import annotations
+
+ from collections.abc import Sequence
+ from typing import TYPE_CHECKING, Any
+
+ from airflow.providers.google.cloud.hooks.vertex_ai.feature_store import FeatureStoreHook
+ from airflow.providers.google.cloud.operators.cloud_base import GoogleCloudBaseOperator
+
+ if TYPE_CHECKING:
+     from airflow.utils.context import Context
+
+
+ class SyncFeatureViewOperator(GoogleCloudBaseOperator):
+     """
+     Initiate a synchronization operation for a Feature View in Vertex AI Feature Store.
+
+     This operator triggers a sync operation that updates the online serving data for a feature view
+     based on the latest data in the underlying batch source. The sync operation ensures that
+     the online feature values are up-to-date for real-time serving.
+
+     :param project_id: Required. The ID of the Google Cloud project that contains the feature store.
+         This is used to identify which project's resources to interact with.
+     :param location: Required. The location of the feature store (e.g., 'us-central1', 'us-east1').
+         This specifies the Google Cloud region where the feature store resources are located.
+     :param feature_online_store_id: Required. The ID of the online feature store that contains
+         the feature view to be synchronized. This store serves as the online serving layer.
+     :param feature_view_id: Required. The ID of the feature view to synchronize. This identifies
+         the specific view that needs to have its online values updated from the batch source.
+     :param gcp_conn_id: The connection ID to use for connecting to Google Cloud Platform.
+         Defaults to 'google_cloud_default'.
+     :param impersonation_chain: Optional service account to impersonate using short-term
+         credentials. Can be either a single account or a chain of accounts required to
+         get the access_token of the last account in the list, which will be impersonated
+         in the request. If set as a string, the account must grant the originating account
+         the Service Account Token Creator IAM role. If set as a sequence, the identities
+         from the list must grant Service Account Token Creator IAM role to the directly
+         preceding identity, with first account from the list granting this role to the
+         originating account.
+     """
+
+     template_fields: Sequence[str] = (
+         "project_id",
+         "location",
+         "feature_online_store_id",
+         "feature_view_id",
+     )
+
+     def __init__(
+         self,
+         *,
+         project_id: str,
+         location: str,
+         feature_online_store_id: str,
+         feature_view_id: str,
+         gcp_conn_id: str = "google_cloud_default",
+         impersonation_chain: str | Sequence[str] | None = None,
+         **kwargs,
+     ) -> None:
+         super().__init__(**kwargs)
+         self.project_id = project_id
+         self.location = location
+         self.feature_online_store_id = feature_online_store_id
+         self.feature_view_id = feature_view_id
+         self.gcp_conn_id = gcp_conn_id
+         self.impersonation_chain = impersonation_chain
+
+     def execute(self, context: Context) -> str:
+         """Execute the feature view sync operation."""
+         self.hook = FeatureStoreHook(
+             gcp_conn_id=self.gcp_conn_id,
+             impersonation_chain=self.impersonation_chain,
+         )
+         self.log.info("Submitting Feature View sync job now...")
+         response = self.hook.sync_feature_view(
+             project_id=self.project_id,
+             location=self.location,
+             feature_online_store_id=self.feature_online_store_id,
+             feature_view_id=self.feature_view_id,
+         )
+         self.log.info("Retrieved Feature View sync: %s", response)
+
+         return response
+
+
+ class GetFeatureViewSyncOperator(GoogleCloudBaseOperator):
+     """
+     Retrieve the status and details of a Feature View synchronization operation.
+
+     This operator fetches information about a specific feature view sync operation,
+     including its current status, timing information, and synchronization metrics.
+     It's typically used to monitor the progress of a sync operation initiated by
+     the SyncFeatureViewOperator.
+
+     :param location: Required. The location of the feature store (e.g., 'us-central1', 'us-east1').
+         This specifies the Google Cloud region where the feature store resources are located.
+     :param feature_view_sync_name: Required. The full resource name of the feature view
+         sync operation to retrieve. This is typically the return value from a
+         SyncFeatureViewOperator execution.
+     :param gcp_conn_id: The connection ID to use for connecting to Google Cloud Platform.
+         Defaults to 'google_cloud_default'.
+     :param impersonation_chain: Optional service account to impersonate using short-term
+         credentials. Can be either a single account or a chain of accounts required to
+         get the access_token of the last account in the list, which will be impersonated
+         in the request. If set as a string, the account must grant the originating account
+         the Service Account Token Creator IAM role. If set as a sequence, the identities
+         from the list must grant Service Account Token Creator IAM role to the directly
+         preceding identity, with first account from the list granting this role to the
+         originating account.
+     """
+
+     template_fields: Sequence[str] = (
+         "location",
+         "feature_view_sync_name",
+     )
+
+     def __init__(
+         self,
+         *,
+         location: str,
+         feature_view_sync_name: str,
+         gcp_conn_id: str = "google_cloud_default",
+         impersonation_chain: str | Sequence[str] | None = None,
+         **kwargs,
+     ) -> None:
+         super().__init__(**kwargs)
+         self.location = location
+         self.feature_view_sync_name = feature_view_sync_name
+         self.gcp_conn_id = gcp_conn_id
+         self.impersonation_chain = impersonation_chain
+
+     def execute(self, context: Context) -> dict[str, Any]:
+         """Execute the get feature view sync operation."""
+         self.hook = FeatureStoreHook(
+             gcp_conn_id=self.gcp_conn_id,
+             impersonation_chain=self.impersonation_chain,
+         )
+         self.log.info("Retrieving Feature View sync job now...")
+         response = self.hook.get_feature_view_sync(
+             location=self.location, feature_view_sync_name=self.feature_view_sync_name
+         )
+         self.log.info("Retrieved Feature View sync: %s", self.feature_view_sync_name)
+         self.log.info(response)
+
+         return response
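
Taken together, the two operators above cover the trigger-then-inspect workflow for a feature view sync. Below is a minimal usage sketch, not taken from the package itself: the project, region, store and view IDs are placeholders, and the XCom wiring assumes the default behavior where the operator's return value (the sync resource name) is pushed under the "return_value" key.

from __future__ import annotations

from datetime import datetime

from airflow import DAG
from airflow.providers.google.cloud.operators.vertex_ai.feature_store import (
    GetFeatureViewSyncOperator,
    SyncFeatureViewOperator,
)

with DAG(dag_id="example_feature_view_sync", start_date=datetime(2025, 1, 1), schedule=None) as dag:
    # Kick off a sync; the returned sync resource name is pushed to XCom.
    sync_feature_view = SyncFeatureViewOperator(
        task_id="sync_feature_view",
        project_id="my-project",             # placeholder
        location="us-central1",              # placeholder
        feature_online_store_id="my_store",  # placeholder
        feature_view_id="my_view",           # placeholder
    )

    # Read back status and metrics for the sync started above.
    get_feature_view_sync = GetFeatureViewSyncOperator(
        task_id="get_feature_view_sync",
        location="us-central1",
        feature_view_sync_name="{{ ti.xcom_pull(task_ids='sync_feature_view') }}",
    )

    sync_feature_view >> get_feature_view_sync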
@@ -353,6 +353,11 @@ class PromptMultimodalModelWithMediaOperator(GoogleCloudBaseOperator):
          return response


+ @deprecated(
+     planned_removal_date="April 09, 2025",
+     use_instead="GenerativeModelGenerateContentOperator",
+     category=AirflowProviderDeprecationWarning,
+ )
  class TextGenerationModelPredictOperator(GoogleCloudBaseOperator):
      """
      Uses the Vertex AI PaLM API to generate natural language text.
@@ -107,10 +107,10 @@ class DataprocJobSensor(BaseSensorOperator):
          }:
              message = f"Job was cancelled:\n{job}"
              raise AirflowException(message)
-         elif JobStatus.State.DONE == state:
+         elif state == JobStatus.State.DONE:
              self.log.debug("Job %s completed successfully.", self.dataproc_job_id)
              return True
-         elif JobStatus.State.ATTEMPT_FAILURE == state:
+         elif state == JobStatus.State.ATTEMPT_FAILURE:
              self.log.debug("Job %s attempt has failed.", self.dataproc_job_id)

          self.log.info("Waiting for job %s to complete.", self.dataproc_job_id)
@@ -0,0 +1,16 @@
+ # Licensed to the Apache Software Foundation (ASF) under one
+ # or more contributor license agreements. See the NOTICE file
+ # distributed with this work for additional information
+ # regarding copyright ownership. The ASF licenses this file
+ # to you under the Apache License, Version 2.0 (the
+ # "License"); you may not use this file except in compliance
+ # with the License. You may obtain a copy of the License at
+ #
+ #   http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing,
+ # software distributed under the License is distributed on an
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ # KIND, either express or implied. See the License for the
+ # specific language governing permissions and limitations
+ # under the License.
@@ -0,0 +1,112 @@
+ # Licensed to the Apache Software Foundation (ASF) under one
+ # or more contributor license agreements. See the NOTICE file
+ # distributed with this work for additional information
+ # regarding copyright ownership. The ASF licenses this file
+ # to you under the Apache License, Version 2.0 (the
+ # "License"); you may not use this file except in compliance
+ # with the License. You may obtain a copy of the License at
+ #
+ #   http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing,
+ # software distributed under the License is distributed on an
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ # KIND, either express or implied. See the License for the
+ # specific language governing permissions and limitations
+ # under the License.
+
+ """This module contains a Vertex AI Feature Store sensor."""
+
+ from __future__ import annotations
+
+ import time
+ from collections.abc import Sequence
+ from typing import TYPE_CHECKING
+
+ from airflow.exceptions import AirflowException
+ from airflow.providers.google.cloud.hooks.vertex_ai.feature_store import FeatureStoreHook
+ from airflow.sensors.base import BaseSensorOperator
+
+ if TYPE_CHECKING:
+     from airflow.utils.context import Context
+
+
+ class FeatureViewSyncSensor(BaseSensorOperator):
+     """
+     Sensor to monitor the state of a Vertex AI Feature View sync operation.
+
+     :param feature_view_sync_name: The name of the feature view sync operation to monitor. (templated)
+     :param location: Required. The Cloud region in which to handle the request. (templated)
+     :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform.
+     :param wait_timeout: How many seconds to wait for sync to complete.
+     :param impersonation_chain: Optional service account to impersonate using short-term
+         credentials.
+     """
+
+     template_fields: Sequence[str] = ("location", "feature_view_sync_name")
+     ui_color = "#f0eee4"
+
+     def __init__(
+         self,
+         *,
+         feature_view_sync_name: str,
+         location: str,
+         gcp_conn_id: str = "google_cloud_default",
+         wait_timeout: int | None = None,
+         impersonation_chain: str | Sequence[str] | None = None,
+         **kwargs,
+     ) -> None:
+         super().__init__(**kwargs)
+         self.feature_view_sync_name = feature_view_sync_name
+         self.location = location
+         self.gcp_conn_id = gcp_conn_id
+         self.wait_timeout = wait_timeout
+         self.impersonation_chain = impersonation_chain
+         self.start_sensor_time: float | None = None
+
+     def execute(self, context: Context) -> None:
+         self.start_sensor_time = time.monotonic()
+         super().execute(context)
+
+     def _duration(self):
+         return time.monotonic() - self.start_sensor_time
+
+     def poke(self, context: Context) -> bool:
+         hook = FeatureStoreHook(
+             gcp_conn_id=self.gcp_conn_id,
+             impersonation_chain=self.impersonation_chain,
+         )
+
+         try:
+             response = hook.get_feature_view_sync(
+                 location=self.location,
+                 feature_view_sync_name=self.feature_view_sync_name,
+             )
+
+             # Check if the sync has completed by verifying end_time exists
+             if response.get("end_time", 0) > 0:
+                 self.log.info(
+                     "Feature View sync %s completed. Rows synced: %d, Total slots: %d",
+                     self.feature_view_sync_name,
+                     int(response.get("sync_summary", "").get("row_synced", "")),
+                     int(response.get("sync_summary", "").get("total_slot", "")),
+                 )
+                 return True
+
+             if self.wait_timeout and self._duration() > self.wait_timeout:
+                 raise AirflowException(
+                     f"Timeout: Feature View sync {self.feature_view_sync_name} "
+                     f"not completed after {self.wait_timeout}s"
+                 )
+
+             self.log.info("Waiting for Feature View sync %s to complete.", self.feature_view_sync_name)
+             return False
+
+         except Exception as e:
+             if self.wait_timeout and self._duration() > self.wait_timeout:
+                 raise AirflowException(
+                     f"Timeout: Feature View sync {self.feature_view_sync_name} "
+                     f"not completed after {self.wait_timeout}s"
+                 )
+             self.log.info("Error checking sync status, will retry: %s", str(e))
+             return False
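
A minimal sketch of wiring this sensor after a sync task. It assumes an upstream SyncFeatureViewOperator task with task_id "sync_feature_view" pushed the sync resource name to XCom; the region and timing values are placeholders, and poke_interval is the standard BaseSensorOperator parameter.

from __future__ import annotations

from datetime import datetime

from airflow import DAG
from airflow.providers.google.cloud.sensors.vertex_ai.feature_store import FeatureViewSyncSensor

with DAG(dag_id="example_feature_view_sync_wait", start_date=datetime(2025, 1, 1), schedule=None) as dag:
    # Poll the sync started by the upstream "sync_feature_view" task every 60 seconds,
    # giving up after 30 minutes of waiting.
    wait_for_sync = FeatureViewSyncSensor(
        task_id="wait_for_feature_view_sync",
        location="us-central1",  # placeholder region
        feature_view_sync_name="{{ ti.xcom_pull(task_ids='sync_feature_view') }}",
        poke_interval=60,
        wait_timeout=1800,
    )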
@@ -292,8 +292,6 @@ class BigQueryToGCSOperator(BaseOperator):
 
      def get_openlineage_facets_on_complete(self, task_instance):
          """Implement on_complete as we will include final BQ job id."""
-         from pathlib import Path
-
          from airflow.providers.common.compat.openlineage.facet import (
              BaseFacet,
              Dataset,
@@ -303,6 +301,8 @@ class BigQueryToGCSOperator(BaseOperator):
          )
          from airflow.providers.google.cloud.hooks.gcs import _parse_gcs_url
          from airflow.providers.google.cloud.openlineage.utils import (
+             WILDCARD,
+             extract_ds_name_from_gcs_path,
              get_facets_from_bq_table,
              get_identity_column_lineage_facet,
          )
@@ -333,24 +333,19 @@ class BigQueryToGCSOperator(BaseOperator):
          output_datasets = []
          for uri in sorted(self.destination_cloud_storage_uris):
              bucket, blob = _parse_gcs_url(uri)
-             additional_facets = {}
 
-             if "*" in blob:
-                 # If wildcard ("*") is used in gcs path, we want the name of dataset to be directory name,
-                 # but we create a symlink to the full object path with wildcard.
+             additional_facets = {}
+             if WILDCARD in blob:
+                 # For path with wildcard we attach a symlink with unmodified path.
                  additional_facets = {
                      "symlink": SymlinksDatasetFacet(
                          identifiers=[Identifier(namespace=f"gs://{bucket}", name=blob, type="file")]
                      ),
                  }
-                 blob = Path(blob).parent.as_posix()
-                 if blob == ".":
-                     # blob path does not have leading slash, but we need root dataset name to be "/"
-                     blob = "/"
 
              dataset = Dataset(
                  namespace=f"gs://{bucket}",
-                 name=blob,
+                 name=extract_ds_name_from_gcs_path(blob),
                  facets=merge_dicts(output_dataset_facets, additional_facets),
              )
              output_datasets.append(dataset)
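
The new extract_ds_name_from_gcs_path helper (added to airflow/providers/google/cloud/openlineage/utils.py in this release) centralizes the path normalization previously inlined here and in the GCS transfer operators below. The sketch that follows is only an approximation of its behavior, inferred from the inline code it replaces (wildcard truncation, parent-directory fallback, root collapsed to "/"); it is not the provider's actual implementation.

from pathlib import Path

WILDCARD = "*"


def extract_ds_name_from_gcs_path_approx(path: str) -> str:
    """Approximate the dataset-name normalization performed by the provider helper."""
    if WILDCARD in path:
        # Keep only the part before the first wildcard.
        path = path.split(WILDCARD, maxsplit=1)[0]
    last_component = path.split("/")[-1]
    if "." not in last_component and not path.endswith("/"):
        # Not a concrete file and not already a directory: fall back to the parent directory.
        path = Path(path).parent.as_posix()
    # Root-like paths collapse to "/"; otherwise drop any trailing slash.
    return "/" if path in ("", "/", ".") else path.rstrip("/")


assert extract_ds_name_from_gcs_path_approx("data/year=2024/*.parquet") == "data/year=2024"
assert extract_ds_name_from_gcs_path_approx("*.csv") == "/"
assert extract_ds_name_from_gcs_path_approx("dir/file.csv") == "dir/file.csv"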
@@ -45,6 +45,9 @@ class BigQueryToMsSqlOperator(BigQueryToSqlBaseOperator):
      :param mssql_table: target MsSQL table. It is deprecated: use target_table_name instead. (templated)
      :param target_table_name: target MsSQL table. It takes precedence over mssql_table. (templated)
      :param mssql_conn_id: reference to a specific mssql hook
+
+     .. warning::
+         The `mssql_table` parameter has been deprecated. Use `target_table_name` instead.
      """
 
      template_fields: Sequence[str] = (
@@ -39,6 +39,9 @@ class BigQueryToMySqlOperator(BigQueryToSqlBaseOperator):
          specific database. It is deprecated: use target_table_name instead. (templated)
      :param target_table_name: target MySQL table. It takes precedence over mysql_table. (templated)
      :param mysql_conn_id: Reference to :ref:`mysql connection id <howto/connection:mysql>`.
+
+     .. warning::
+         The `mysql_table` parameter has been deprecated. Use `target_table_name` instead.
      """
 
      template_fields: Sequence[str] = (*BigQueryToSqlBaseOperator.template_fields, "dataset_id", "table_id")
@@ -756,8 +756,6 @@ class GCSToBigQueryOperator(BaseOperator):
 
      def get_openlineage_facets_on_complete(self, task_instance):
          """Implement on_complete as we will include final BQ job id."""
-         from pathlib import Path
-
          from airflow.providers.common.compat.openlineage.facet import (
              Dataset,
              ExternalQueryRunFacet,
@@ -765,6 +763,8 @@ class GCSToBigQueryOperator(BaseOperator):
              SymlinksDatasetFacet,
          )
          from airflow.providers.google.cloud.openlineage.utils import (
+             WILDCARD,
+             extract_ds_name_from_gcs_path,
              get_facets_from_bq_table,
              get_identity_column_lineage_facet,
          )
@@ -793,22 +793,17 @@ class GCSToBigQueryOperator(BaseOperator):
          for blob in sorted(source_objects):
              additional_facets = {}
 
-             if "*" in blob:
-                 # If wildcard ("*") is used in gcs path, we want the name of dataset to be directory name,
-                 # but we create a symlink to the full object path with wildcard.
+             if WILDCARD in blob:
+                 # For path with wildcard we attach a symlink with unmodified path.
                  additional_facets = {
                      "symlink": SymlinksDatasetFacet(
                          identifiers=[Identifier(namespace=f"gs://{self.bucket}", name=blob, type="file")]
                      ),
                  }
-                 blob = Path(blob).parent.as_posix()
-                 if blob == ".":
-                     # blob path does not have leading slash, but we need root dataset name to be "/"
-                     blob = "/"
 
              dataset = Dataset(
                  namespace=f"gs://{self.bucket}",
-                 name=blob,
+                 name=extract_ds_name_from_gcs_path(blob),
                  facets=merge_dicts(input_dataset_facets, additional_facets),
              )
              input_datasets.append(dataset)
@@ -551,28 +551,16 @@ class GCSToGCSOperator(BaseOperator):
          This means we won't have to normalize self.source_object and self.source_objects,
          destination bucket and so on.
          """
-         from pathlib import Path
-
          from airflow.providers.common.compat.openlineage.facet import Dataset
+         from airflow.providers.google.cloud.openlineage.utils import extract_ds_name_from_gcs_path
          from airflow.providers.openlineage.extractors import OperatorLineage
 
-         def _process_prefix(pref):
-             if WILDCARD in pref:
-                 pref = pref.split(WILDCARD)[0]
-             # Use parent if not a file (dot not in name) and not a dir (ends with slash)
-             if "." not in pref.split("/")[-1] and not pref.endswith("/"):
-                 pref = Path(pref).parent.as_posix()
-             return ["/" if pref in ("", "/", ".") else pref.rstrip("/")]  # Adjust root path
-
-         inputs = []
-         for prefix in self.source_objects:
-             result = _process_prefix(prefix)
-             inputs.extend(result)
+         inputs = [extract_ds_name_from_gcs_path(path) for path in self.source_objects]
 
          if self.destination_object is None:
              outputs = inputs.copy()
          else:
-             outputs = _process_prefix(self.destination_object)
+             outputs = [extract_ds_name_from_gcs_path(self.destination_object)]
 
          return OperatorLineage(
              inputs=[
@@ -113,3 +113,12 @@ class GCSToLocalFilesystemOperator(BaseOperator):
                  raise AirflowException("The size of the downloaded file is too large to push to XCom!")
          else:
              hook.download(bucket_name=self.bucket, object_name=self.object_name, filename=self.filename)
+
+     def get_openlineage_facets_on_start(self):
+         from airflow.providers.common.compat.openlineage.facet import Dataset
+         from airflow.providers.openlineage.extractors import OperatorLineage
+
+         return OperatorLineage(
+             inputs=[Dataset(namespace=f"gs://{self.bucket}", name=self.object_name)],
+             outputs=[Dataset(namespace="file", name=self.filename)] if self.filename else [],
+         )
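
For context, a minimal sketch of a download task that now reports lineage on start; the bucket, object name, and local path are placeholders.

from __future__ import annotations

from datetime import datetime

from airflow import DAG
from airflow.providers.google.cloud.transfers.gcs_to_local import GCSToLocalFilesystemOperator

with DAG(dag_id="example_gcs_download_lineage", start_date=datetime(2025, 1, 1), schedule=None) as dag:
    # Input dataset: gs://my-bucket/reports/latest.csv; output dataset: file /tmp/latest.csv.
    download_report = GCSToLocalFilesystemOperator(
        task_id="download_report",
        bucket="my-bucket",                # placeholder
        object_name="reports/latest.csv",  # placeholder
        filename="/tmp/latest.csv",        # placeholder
    )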
@@ -69,12 +69,12 @@ class LocalFilesystemToGCSOperator(BaseOperator):
      def __init__(
          self,
          *,
-         src,
-         dst,
-         bucket,
-         gcp_conn_id="google_cloud_default",
-         mime_type="application/octet-stream",
-         gzip=False,
+         src: str | list[str],
+         dst: str,
+         bucket: str,
+         gcp_conn_id: str = "google_cloud_default",
+         mime_type: str = "application/octet-stream",
+         gzip: bool = False,
          chunk_size: int | None = None,
          impersonation_chain: str | Sequence[str] | None = None,
          **kwargs,
@@ -120,3 +120,38 @@ class LocalFilesystemToGCSOperator(BaseOperator):
              gzip=self.gzip,
              chunk_size=self.chunk_size,
          )
+
+     def get_openlineage_facets_on_start(self):
+         from airflow.providers.common.compat.openlineage.facet import (
+             Dataset,
+             Identifier,
+             SymlinksDatasetFacet,
+         )
+         from airflow.providers.google.cloud.openlineage.utils import WILDCARD, extract_ds_name_from_gcs_path
+         from airflow.providers.openlineage.extractors import OperatorLineage
+
+         source_facets = {}
+         if isinstance(self.src, str):  # Single path provided, possibly relative or with wildcard
+             original_src = f"{self.src}"
+             absolute_src = os.path.abspath(self.src)
+             resolved_src = extract_ds_name_from_gcs_path(absolute_src)
+             if original_src.startswith("/") and not resolved_src.startswith("/"):
+                 resolved_src = "/" + resolved_src
+             source_objects = [resolved_src]
+
+             if WILDCARD in original_src or absolute_src != resolved_src:
+                 # We attach a symlink with unmodified path.
+                 source_facets = {
+                     "symlink": SymlinksDatasetFacet(
+                         identifiers=[Identifier(namespace="file", name=original_src, type="file")]
+                     ),
+                 }
+         else:
+             source_objects = self.src
+
+         dest_object = self.dst if os.path.basename(self.dst) else extract_ds_name_from_gcs_path(self.dst)
+
+         return OperatorLineage(
+             inputs=[Dataset(namespace="file", name=src, facets=source_facets) for src in source_objects],
+             outputs=[Dataset(namespace=f"gs://{self.bucket}", name=dest_object)],
+         )
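
A minimal sketch of an upload whose lineage benefits from the wildcard handling above; the local glob, destination prefix, and bucket are placeholders, and the described dataset names assume the helper behaves as sketched earlier.

from __future__ import annotations

from datetime import datetime

from airflow import DAG
from airflow.providers.google.cloud.transfers.local_to_gcs import LocalFilesystemToGCSOperator

with DAG(dag_id="example_local_to_gcs_lineage", start_date=datetime(2025, 1, 1), schedule=None) as dag:
    # The wildcard source should resolve to the directory /data/exports as the input dataset,
    # with a symlink facet preserving the original /data/exports/*.csv pattern.
    upload_exports = LocalFilesystemToGCSOperator(
        task_id="upload_exports",
        src="/data/exports/*.csv",  # placeholder local glob
        dst="exports/",             # placeholder GCS prefix (directory-style)
        bucket="my-bucket",         # placeholder
    )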
@@ -345,3 +345,18 @@ class S3ToGCSOperator(S3ListOperator):
              gcp_conn_id=self.gcp_conn_id,
              impersonation_chain=self.google_impersonation_chain,
          )
+
+     def get_openlineage_facets_on_start(self):
+         from airflow.providers.common.compat.openlineage.facet import Dataset
+         from airflow.providers.google.cloud.hooks.gcs import _parse_gcs_url
+         from airflow.providers.google.cloud.openlineage.utils import extract_ds_name_from_gcs_path
+         from airflow.providers.openlineage.extractors import OperatorLineage
+
+         gcs_bucket, gcs_blob = _parse_gcs_url(self.dest_gcs)
+         if not self.apply_gcs_prefix:
+             gcs_blob += self.prefix
+
+         return OperatorLineage(
+             inputs=[Dataset(namespace=f"s3://{self.bucket}", name=self.prefix.strip("/") or "/")],
+             outputs=[Dataset(namespace=f"gs://{gcs_bucket}", name=extract_ds_name_from_gcs_path(gcs_blob))],
+         )
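
With this hook in place, an S3-to-GCS copy reports its source prefix and destination as OpenLineage datasets without extra configuration. A minimal usage sketch; the bucket names, prefix, and destination are placeholders.

from __future__ import annotations

from datetime import datetime

from airflow import DAG
from airflow.providers.google.cloud.transfers.s3_to_gcs import S3ToGCSOperator

with DAG(dag_id="example_s3_to_gcs_lineage", start_date=datetime(2025, 1, 1), schedule=None) as dag:
    # Emits an input dataset named raw/2025 under s3://my-s3-bucket, and an output dataset under
    # gs://my-gcs-bucket whose name is derived from the destination path (here landing/raw/2025,
    # since apply_gcs_prefix=False appends the S3 prefix to the GCS blob).
    copy_files = S3ToGCSOperator(
        task_id="copy_s3_to_gcs",
        bucket="my-s3-bucket",                    # placeholder source bucket
        prefix="raw/2025/",                       # placeholder source prefix
        dest_gcs="gs://my-gcs-bucket/landing/",   # placeholder destination
        apply_gcs_prefix=False,
    )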