apache-airflow-providers-google 15.1.0rc1__py3-none-any.whl → 19.3.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- airflow/providers/google/3rd-party-licenses/NOTICE +2 -12
- airflow/providers/google/__init__.py +3 -3
- airflow/providers/google/ads/hooks/ads.py +39 -6
- airflow/providers/google/ads/operators/ads.py +2 -2
- airflow/providers/google/ads/transfers/ads_to_gcs.py +2 -2
- airflow/providers/google/assets/gcs.py +1 -11
- airflow/providers/google/cloud/bundles/__init__.py +16 -0
- airflow/providers/google/cloud/bundles/gcs.py +161 -0
- airflow/providers/google/cloud/hooks/alloy_db.py +1 -1
- airflow/providers/google/cloud/hooks/bigquery.py +176 -293
- airflow/providers/google/cloud/hooks/cloud_batch.py +1 -1
- airflow/providers/google/cloud/hooks/cloud_build.py +1 -1
- airflow/providers/google/cloud/hooks/cloud_composer.py +288 -15
- airflow/providers/google/cloud/hooks/cloud_logging.py +109 -0
- airflow/providers/google/cloud/hooks/cloud_memorystore.py +1 -1
- airflow/providers/google/cloud/hooks/cloud_run.py +18 -10
- airflow/providers/google/cloud/hooks/cloud_sql.py +102 -23
- airflow/providers/google/cloud/hooks/cloud_storage_transfer_service.py +29 -7
- airflow/providers/google/cloud/hooks/compute.py +1 -1
- airflow/providers/google/cloud/hooks/compute_ssh.py +6 -2
- airflow/providers/google/cloud/hooks/datacatalog.py +10 -1
- airflow/providers/google/cloud/hooks/dataflow.py +72 -95
- airflow/providers/google/cloud/hooks/dataform.py +1 -1
- airflow/providers/google/cloud/hooks/datafusion.py +21 -19
- airflow/providers/google/cloud/hooks/dataplex.py +2 -2
- airflow/providers/google/cloud/hooks/dataprep.py +1 -1
- airflow/providers/google/cloud/hooks/dataproc.py +73 -72
- airflow/providers/google/cloud/hooks/dataproc_metastore.py +1 -1
- airflow/providers/google/cloud/hooks/dlp.py +1 -1
- airflow/providers/google/cloud/hooks/functions.py +1 -1
- airflow/providers/google/cloud/hooks/gcs.py +112 -15
- airflow/providers/google/cloud/hooks/gdm.py +1 -1
- airflow/providers/google/cloud/hooks/gen_ai.py +196 -0
- airflow/providers/google/cloud/hooks/kubernetes_engine.py +3 -3
- airflow/providers/google/cloud/hooks/looker.py +6 -2
- airflow/providers/google/cloud/hooks/managed_kafka.py +1 -1
- airflow/providers/google/cloud/hooks/mlengine.py +4 -3
- airflow/providers/google/cloud/hooks/pubsub.py +3 -0
- airflow/providers/google/cloud/hooks/secret_manager.py +102 -10
- airflow/providers/google/cloud/hooks/spanner.py +74 -9
- airflow/providers/google/cloud/hooks/stackdriver.py +11 -9
- airflow/providers/google/cloud/hooks/tasks.py +1 -1
- airflow/providers/google/cloud/hooks/translate.py +2 -2
- airflow/providers/google/cloud/hooks/vertex_ai/auto_ml.py +2 -210
- airflow/providers/google/cloud/hooks/vertex_ai/batch_prediction_job.py +3 -3
- airflow/providers/google/cloud/hooks/vertex_ai/custom_job.py +28 -2
- airflow/providers/google/cloud/hooks/vertex_ai/experiment_service.py +202 -0
- airflow/providers/google/cloud/hooks/vertex_ai/feature_store.py +308 -8
- airflow/providers/google/cloud/hooks/vertex_ai/generative_model.py +79 -75
- airflow/providers/google/cloud/hooks/vertex_ai/hyperparameter_tuning_job.py +1 -1
- airflow/providers/google/cloud/hooks/vertex_ai/model_service.py +1 -1
- airflow/providers/google/cloud/hooks/vertex_ai/pipeline_job.py +1 -1
- airflow/providers/google/cloud/hooks/vertex_ai/ray.py +223 -0
- airflow/providers/google/cloud/hooks/vision.py +3 -3
- airflow/providers/google/cloud/hooks/workflows.py +1 -1
- airflow/providers/google/cloud/links/alloy_db.py +0 -46
- airflow/providers/google/cloud/links/base.py +77 -13
- airflow/providers/google/cloud/links/bigquery.py +0 -47
- airflow/providers/google/cloud/links/bigquery_dts.py +0 -20
- airflow/providers/google/cloud/links/bigtable.py +0 -48
- airflow/providers/google/cloud/links/cloud_build.py +0 -73
- airflow/providers/google/cloud/links/cloud_functions.py +0 -33
- airflow/providers/google/cloud/links/cloud_memorystore.py +0 -58
- airflow/providers/google/cloud/links/{life_sciences.py → cloud_run.py} +5 -27
- airflow/providers/google/cloud/links/cloud_sql.py +0 -33
- airflow/providers/google/cloud/links/cloud_storage_transfer.py +17 -44
- airflow/providers/google/cloud/links/cloud_tasks.py +7 -26
- airflow/providers/google/cloud/links/compute.py +0 -58
- airflow/providers/google/cloud/links/data_loss_prevention.py +0 -169
- airflow/providers/google/cloud/links/datacatalog.py +23 -54
- airflow/providers/google/cloud/links/dataflow.py +0 -34
- airflow/providers/google/cloud/links/dataform.py +0 -64
- airflow/providers/google/cloud/links/datafusion.py +1 -96
- airflow/providers/google/cloud/links/dataplex.py +0 -154
- airflow/providers/google/cloud/links/dataprep.py +0 -24
- airflow/providers/google/cloud/links/dataproc.py +11 -95
- airflow/providers/google/cloud/links/datastore.py +0 -31
- airflow/providers/google/cloud/links/kubernetes_engine.py +9 -60
- airflow/providers/google/cloud/links/managed_kafka.py +0 -70
- airflow/providers/google/cloud/links/mlengine.py +0 -70
- airflow/providers/google/cloud/links/pubsub.py +0 -32
- airflow/providers/google/cloud/links/spanner.py +0 -33
- airflow/providers/google/cloud/links/stackdriver.py +0 -30
- airflow/providers/google/cloud/links/translate.py +17 -187
- airflow/providers/google/cloud/links/vertex_ai.py +28 -195
- airflow/providers/google/cloud/links/workflows.py +0 -52
- airflow/providers/google/cloud/log/gcs_task_handler.py +58 -22
- airflow/providers/google/cloud/log/stackdriver_task_handler.py +9 -6
- airflow/providers/google/cloud/openlineage/CloudStorageTransferJobFacet.json +68 -0
- airflow/providers/google/cloud/openlineage/CloudStorageTransferRunFacet.json +60 -0
- airflow/providers/google/cloud/openlineage/DataFusionRunFacet.json +32 -0
- airflow/providers/google/cloud/openlineage/facets.py +102 -1
- airflow/providers/google/cloud/openlineage/mixins.py +10 -8
- airflow/providers/google/cloud/openlineage/utils.py +15 -1
- airflow/providers/google/cloud/operators/alloy_db.py +71 -56
- airflow/providers/google/cloud/operators/bigquery.py +73 -636
- airflow/providers/google/cloud/operators/bigquery_dts.py +4 -6
- airflow/providers/google/cloud/operators/bigtable.py +37 -8
- airflow/providers/google/cloud/operators/cloud_base.py +21 -1
- airflow/providers/google/cloud/operators/cloud_batch.py +3 -3
- airflow/providers/google/cloud/operators/cloud_build.py +76 -33
- airflow/providers/google/cloud/operators/cloud_composer.py +129 -41
- airflow/providers/google/cloud/operators/cloud_logging_sink.py +341 -0
- airflow/providers/google/cloud/operators/cloud_memorystore.py +69 -43
- airflow/providers/google/cloud/operators/cloud_run.py +24 -6
- airflow/providers/google/cloud/operators/cloud_sql.py +8 -17
- airflow/providers/google/cloud/operators/cloud_storage_transfer_service.py +93 -12
- airflow/providers/google/cloud/operators/compute.py +9 -41
- airflow/providers/google/cloud/operators/datacatalog.py +157 -21
- airflow/providers/google/cloud/operators/dataflow.py +40 -16
- airflow/providers/google/cloud/operators/dataform.py +15 -5
- airflow/providers/google/cloud/operators/datafusion.py +42 -21
- airflow/providers/google/cloud/operators/dataplex.py +194 -110
- airflow/providers/google/cloud/operators/dataprep.py +1 -5
- airflow/providers/google/cloud/operators/dataproc.py +80 -36
- airflow/providers/google/cloud/operators/dataproc_metastore.py +97 -89
- airflow/providers/google/cloud/operators/datastore.py +23 -7
- airflow/providers/google/cloud/operators/dlp.py +6 -29
- airflow/providers/google/cloud/operators/functions.py +17 -8
- airflow/providers/google/cloud/operators/gcs.py +12 -9
- airflow/providers/google/cloud/operators/gen_ai.py +389 -0
- airflow/providers/google/cloud/operators/kubernetes_engine.py +62 -100
- airflow/providers/google/cloud/operators/looker.py +2 -2
- airflow/providers/google/cloud/operators/managed_kafka.py +108 -53
- airflow/providers/google/cloud/operators/natural_language.py +1 -1
- airflow/providers/google/cloud/operators/pubsub.py +68 -15
- airflow/providers/google/cloud/operators/spanner.py +26 -13
- airflow/providers/google/cloud/operators/speech_to_text.py +2 -3
- airflow/providers/google/cloud/operators/stackdriver.py +1 -9
- airflow/providers/google/cloud/operators/tasks.py +1 -12
- airflow/providers/google/cloud/operators/text_to_speech.py +2 -3
- airflow/providers/google/cloud/operators/translate.py +41 -17
- airflow/providers/google/cloud/operators/translate_speech.py +2 -3
- airflow/providers/google/cloud/operators/vertex_ai/auto_ml.py +39 -19
- airflow/providers/google/cloud/operators/vertex_ai/batch_prediction_job.py +30 -10
- airflow/providers/google/cloud/operators/vertex_ai/custom_job.py +55 -27
- airflow/providers/google/cloud/operators/vertex_ai/dataset.py +70 -8
- airflow/providers/google/cloud/operators/vertex_ai/endpoint_service.py +43 -9
- airflow/providers/google/cloud/operators/vertex_ai/experiment_service.py +435 -0
- airflow/providers/google/cloud/operators/vertex_ai/feature_store.py +532 -1
- airflow/providers/google/cloud/operators/vertex_ai/generative_model.py +135 -115
- airflow/providers/google/cloud/operators/vertex_ai/hyperparameter_tuning_job.py +12 -10
- airflow/providers/google/cloud/operators/vertex_ai/model_service.py +57 -11
- airflow/providers/google/cloud/operators/vertex_ai/pipeline_job.py +31 -8
- airflow/providers/google/cloud/operators/vertex_ai/ray.py +393 -0
- airflow/providers/google/cloud/operators/video_intelligence.py +1 -1
- airflow/providers/google/cloud/operators/vision.py +2 -2
- airflow/providers/google/cloud/operators/workflows.py +18 -15
- airflow/providers/google/cloud/secrets/secret_manager.py +3 -2
- airflow/providers/google/cloud/sensors/bigquery.py +3 -3
- airflow/providers/google/cloud/sensors/bigquery_dts.py +2 -3
- airflow/providers/google/cloud/sensors/bigtable.py +11 -4
- airflow/providers/google/cloud/sensors/cloud_composer.py +533 -30
- airflow/providers/google/cloud/sensors/cloud_storage_transfer_service.py +2 -3
- airflow/providers/google/cloud/sensors/dataflow.py +26 -10
- airflow/providers/google/cloud/sensors/dataform.py +2 -3
- airflow/providers/google/cloud/sensors/datafusion.py +4 -5
- airflow/providers/google/cloud/sensors/dataplex.py +2 -3
- airflow/providers/google/cloud/sensors/dataprep.py +2 -2
- airflow/providers/google/cloud/sensors/dataproc.py +2 -3
- airflow/providers/google/cloud/sensors/dataproc_metastore.py +2 -3
- airflow/providers/google/cloud/sensors/gcs.py +4 -5
- airflow/providers/google/cloud/sensors/looker.py +2 -3
- airflow/providers/google/cloud/sensors/pubsub.py +4 -5
- airflow/providers/google/cloud/sensors/tasks.py +2 -2
- airflow/providers/google/cloud/sensors/vertex_ai/feature_store.py +2 -3
- airflow/providers/google/cloud/sensors/workflows.py +2 -3
- airflow/providers/google/cloud/transfers/adls_to_gcs.py +1 -1
- airflow/providers/google/cloud/transfers/azure_blob_to_gcs.py +2 -2
- airflow/providers/google/cloud/transfers/azure_fileshare_to_gcs.py +4 -3
- airflow/providers/google/cloud/transfers/bigquery_to_bigquery.py +11 -8
- airflow/providers/google/cloud/transfers/bigquery_to_gcs.py +10 -5
- airflow/providers/google/cloud/transfers/bigquery_to_mssql.py +7 -3
- airflow/providers/google/cloud/transfers/bigquery_to_mysql.py +12 -1
- airflow/providers/google/cloud/transfers/bigquery_to_postgres.py +24 -10
- airflow/providers/google/cloud/transfers/bigquery_to_sql.py +104 -5
- airflow/providers/google/cloud/transfers/calendar_to_gcs.py +1 -1
- airflow/providers/google/cloud/transfers/cassandra_to_gcs.py +3 -3
- airflow/providers/google/cloud/transfers/facebook_ads_to_gcs.py +4 -4
- airflow/providers/google/cloud/transfers/gcs_to_bigquery.py +21 -13
- airflow/providers/google/cloud/transfers/gcs_to_gcs.py +4 -3
- airflow/providers/google/cloud/transfers/gcs_to_local.py +6 -4
- airflow/providers/google/cloud/transfers/gcs_to_sftp.py +11 -5
- airflow/providers/google/cloud/transfers/gdrive_to_gcs.py +6 -2
- airflow/providers/google/cloud/transfers/gdrive_to_local.py +2 -2
- airflow/providers/google/cloud/transfers/http_to_gcs.py +193 -0
- airflow/providers/google/cloud/transfers/local_to_gcs.py +2 -2
- airflow/providers/google/cloud/transfers/mssql_to_gcs.py +1 -1
- airflow/providers/google/cloud/transfers/oracle_to_gcs.py +36 -11
- airflow/providers/google/cloud/transfers/postgres_to_gcs.py +42 -9
- airflow/providers/google/cloud/transfers/s3_to_gcs.py +13 -7
- airflow/providers/google/cloud/transfers/salesforce_to_gcs.py +2 -2
- airflow/providers/google/cloud/transfers/sftp_to_gcs.py +14 -5
- airflow/providers/google/cloud/transfers/sheets_to_gcs.py +3 -3
- airflow/providers/google/cloud/transfers/sql_to_gcs.py +10 -10
- airflow/providers/google/cloud/triggers/bigquery.py +76 -35
- airflow/providers/google/cloud/triggers/cloud_build.py +1 -1
- airflow/providers/google/cloud/triggers/cloud_composer.py +303 -47
- airflow/providers/google/cloud/triggers/cloud_run.py +3 -3
- airflow/providers/google/cloud/triggers/cloud_storage_transfer_service.py +92 -2
- airflow/providers/google/cloud/triggers/dataflow.py +122 -0
- airflow/providers/google/cloud/triggers/datafusion.py +1 -1
- airflow/providers/google/cloud/triggers/dataplex.py +14 -2
- airflow/providers/google/cloud/triggers/dataproc.py +123 -53
- airflow/providers/google/cloud/triggers/kubernetes_engine.py +47 -28
- airflow/providers/google/cloud/triggers/mlengine.py +1 -1
- airflow/providers/google/cloud/triggers/pubsub.py +15 -19
- airflow/providers/google/cloud/triggers/vertex_ai.py +1 -1
- airflow/providers/google/cloud/utils/bigquery_get_data.py +1 -1
- airflow/providers/google/cloud/utils/credentials_provider.py +2 -2
- airflow/providers/google/cloud/utils/field_sanitizer.py +1 -1
- airflow/providers/google/cloud/utils/field_validator.py +2 -3
- airflow/providers/google/common/auth_backend/google_openid.py +4 -4
- airflow/providers/google/common/deprecated.py +2 -1
- airflow/providers/google/common/hooks/base_google.py +27 -9
- airflow/providers/google/common/hooks/operation_helpers.py +1 -1
- airflow/providers/google/common/links/storage.py +0 -22
- airflow/providers/google/common/utils/get_secret.py +31 -0
- airflow/providers/google/common/utils/id_token_credentials.py +3 -4
- airflow/providers/google/firebase/hooks/firestore.py +1 -1
- airflow/providers/google/firebase/operators/firestore.py +3 -3
- airflow/providers/google/get_provider_info.py +56 -52
- airflow/providers/google/go_module_utils.py +35 -3
- airflow/providers/google/leveldb/hooks/leveldb.py +27 -2
- airflow/providers/google/leveldb/operators/leveldb.py +2 -2
- airflow/providers/google/marketing_platform/hooks/campaign_manager.py +1 -1
- airflow/providers/google/marketing_platform/hooks/display_video.py +3 -109
- airflow/providers/google/marketing_platform/hooks/search_ads.py +1 -1
- airflow/providers/google/marketing_platform/links/analytics_admin.py +5 -14
- airflow/providers/google/marketing_platform/operators/analytics_admin.py +2 -3
- airflow/providers/google/marketing_platform/operators/campaign_manager.py +6 -6
- airflow/providers/google/marketing_platform/operators/display_video.py +28 -489
- airflow/providers/google/marketing_platform/operators/search_ads.py +2 -2
- airflow/providers/google/marketing_platform/sensors/campaign_manager.py +2 -2
- airflow/providers/google/marketing_platform/sensors/display_video.py +3 -64
- airflow/providers/google/suite/hooks/calendar.py +2 -2
- airflow/providers/google/suite/hooks/sheets.py +16 -2
- airflow/providers/google/suite/operators/sheets.py +8 -3
- airflow/providers/google/suite/sensors/drive.py +2 -2
- airflow/providers/google/suite/transfers/gcs_to_gdrive.py +3 -3
- airflow/providers/google/suite/transfers/gcs_to_sheets.py +1 -1
- airflow/providers/google/suite/transfers/local_to_drive.py +3 -3
- airflow/providers/google/suite/transfers/sql_to_sheets.py +5 -4
- airflow/providers/google/version_compat.py +15 -1
- {apache_airflow_providers_google-15.1.0rc1.dist-info → apache_airflow_providers_google-19.3.0.dist-info}/METADATA +90 -46
- apache_airflow_providers_google-19.3.0.dist-info/RECORD +331 -0
- apache_airflow_providers_google-19.3.0.dist-info/licenses/NOTICE +5 -0
- airflow/providers/google/cloud/hooks/automl.py +0 -673
- airflow/providers/google/cloud/hooks/life_sciences.py +0 -159
- airflow/providers/google/cloud/links/automl.py +0 -193
- airflow/providers/google/cloud/operators/automl.py +0 -1362
- airflow/providers/google/cloud/operators/life_sciences.py +0 -119
- airflow/providers/google/cloud/operators/mlengine.py +0 -112
- apache_airflow_providers_google-15.1.0rc1.dist-info/RECORD +0 -321
- {apache_airflow_providers_google-15.1.0rc1.dist-info → apache_airflow_providers_google-19.3.0.dist-info}/WHEEL +0 -0
- {apache_airflow_providers_google-15.1.0rc1.dist-info → apache_airflow_providers_google-19.3.0.dist-info}/entry_points.txt +0 -0
- {airflow/providers/google → apache_airflow_providers_google-19.3.0.dist-info/licenses}/LICENSE +0 -0

airflow/providers/google/cloud/log/gcs_task_handler.py

@@ -27,11 +27,11 @@ from typing import TYPE_CHECKING
 
 import attrs
 
-#
-
+# Make mypy happy by importing as aliases
+import google.cloud.storage as storage
 
 from airflow.configuration import conf
-from airflow.
+from airflow.providers.common.compat.sdk import AirflowNotFoundException
 from airflow.providers.google.cloud.hooks.gcs import GCSHook, _parse_gcs_url
 from airflow.providers.google.cloud.utils.credentials_provider import (
     get_credentials_and_project_id,
@@ -43,9 +43,11 @@ from airflow.utils.log.file_task_handler import FileTaskHandler
 from airflow.utils.log.logging_mixin import LoggingMixin
 
 if TYPE_CHECKING:
+    from io import TextIOWrapper
+
     from airflow.models.taskinstance import TaskInstance
     from airflow.sdk.types import RuntimeTaskInstanceProtocol as RuntimeTI
-    from airflow.utils.log.file_task_handler import
+    from airflow.utils.log.file_task_handler import LogResponse, RawLogStream, StreamingLogResponse
 
 _DEFAULT_SCOPESS = frozenset(
     [
@@ -61,13 +63,15 @@ class GCSRemoteLogIO(LoggingMixin):  # noqa: D101
     remote_base: str
     base_log_folder: Path = attrs.field(converter=Path)
     delete_local_copy: bool
+    project_id: str | None = None
+
+    gcp_key_path: str | None = None
+    gcp_keyfile_dict: dict | None = None
+    scopes: Collection[str] | None = _DEFAULT_SCOPESS
 
-
-    gcp_keyfile_dict: dict | None
-    scopes: Collection[str] | None
-    project_id: str
+    processors = ()
 
-    def upload(self, path: os.PathLike, ti: RuntimeTI):
+    def upload(self, path: os.PathLike | str, ti: RuntimeTI):
         """Upload the given log path to the remote storage."""
         path = Path(path)
         if path.is_absolute():
@@ -147,11 +151,26 @@ class GCSRemoteLogIO(LoggingMixin):  # noqa: D101
             exc, "resp", {}
         ).get("status") == "404"
 
-    def read(self, relative_path: str, ti: RuntimeTI) ->
-        messages =
-
+    def read(self, relative_path: str, ti: RuntimeTI) -> LogResponse:
+        messages, log_streams = self.stream(relative_path, ti)
+        if not log_streams:
+            return messages, None
+
+        logs: list[str] = []
+        try:
+            # for each log_stream, exhaust the generator into a string
+            logs = ["".join(line for line in log_stream) for log_stream in log_streams]
+        except Exception as e:
+            if not AIRFLOW_V_3_0_PLUS:
+                messages.append(f"Unable to read remote log {e}")
+
+        return messages, logs
+
+    def stream(self, relative_path: str, ti: RuntimeTI) -> StreamingLogResponse:
+        messages: list[str] = []
+        log_streams: list[RawLogStream] = []
         remote_loc = os.path.join(self.remote_base, relative_path)
-        uris = []
+        uris: list[str] = []
         bucket, prefix = _parse_gcs_url(remote_loc)
         blobs = list(self.client.list_blobs(bucket_or_name=bucket, prefix=prefix))
 
@@ -162,18 +181,29 @@ class GCSRemoteLogIO(LoggingMixin):  # noqa: D101
             else:
                 messages.extend(["Found remote logs:", *[f"  * {x}" for x in sorted(uris)]])
         else:
-            return messages,
+            return messages, []
 
         try:
             for key in sorted(uris):
                 blob = storage.Blob.from_string(key, self.client)
-
-
-                logs.append(remote_log)
+                stream = blob.open("r")
+                log_streams.append(self._get_log_stream(stream))
         except Exception as e:
             if not AIRFLOW_V_3_0_PLUS:
                 messages.append(f"Unable to read remote log {e}")
-        return messages,
+        return messages, log_streams
+
+    def _get_log_stream(self, stream: TextIOWrapper) -> RawLogStream:
+        """
+        Yield lines from the given stream.
+
+        :param stream: The opened stream to read from.
+        :yield: Lines of the log file.
+        """
+        try:
+            yield from stream
+        finally:
+            stream.close()
 
 
 class GCSTaskHandler(FileTaskHandler, LoggingMixin):
@@ -211,9 +241,15 @@ class GCSTaskHandler(FileTaskHandler, LoggingMixin):
         gcp_keyfile_dict: dict | None = None,
         gcp_scopes: Collection[str] | None = _DEFAULT_SCOPESS,
         project_id: str = PROVIDE_PROJECT_ID,
+        max_bytes: int = 0,
+        backup_count: int = 0,
+        delay: bool = False,
         **kwargs,
-    ):
-
+    ) -> None:
+        # support log file size handling of FileTaskHandler
+        super().__init__(
+            base_log_folder=base_log_folder, max_bytes=max_bytes, backup_count=backup_count, delay=delay
+        )
         self.handler: logging.FileHandler | None = None
         self.log_relative_path = ""
         self.closed = False
@@ -265,7 +301,7 @@ class GCSTaskHandler(FileTaskHandler, LoggingMixin):
         # Mark closed so we don't double write if close is called twice
         self.closed = True
 
-    def _read_remote_logs(self, ti, try_number, metadata=None) ->
+    def _read_remote_logs(self, ti, try_number, metadata=None) -> LogResponse:
         # Explicitly getting log relative path is necessary as the given
         # task instance might be different than task instance passed in
         # in set_context method.
@@ -275,7 +311,7 @@ class GCSTaskHandler(FileTaskHandler, LoggingMixin):
 
         if logs is None:
             logs = []
-        if not AIRFLOW_V_3_0_PLUS:
+        if not AIRFLOW_V_3_0_PLUS and not messages:
            messages.append(f"No logs found in GCS; ti={ti}")
 
         return messages, logs
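
The `stream()`/`read()` split above is the core behavioral change: `stream()` hands back one lazy line generator per remote blob (each closing its handle once exhausted), and `read()` only joins them into full strings when a complete log body is needed. A minimal standalone sketch of that pattern, with `StringIO` objects standing in for `blob.open("r")` handles (all names here are illustrative, not part of the provider):

```python
from collections.abc import Generator
from io import StringIO


def line_stream(stream) -> Generator[str, None, None]:
    # Same shape as GCSRemoteLogIO._get_log_stream: yield lines lazily and
    # close the underlying handle once the consumer is done with it.
    try:
        yield from stream
    finally:
        stream.close()


# StringIO stands in for blob.open("r") on a google.cloud.storage blob.
streams = [line_stream(StringIO("line 1\nline 2\n")), line_stream(StringIO("line 3\n"))]

# read() exhausts each generator into one string per remote log file;
# stream() consumers can instead iterate lazily without buffering whole files.
logs = ["".join(line for line in s) for s in streams]
print(logs)  # ['line 1\nline 2\n', 'line 3\n']
```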

airflow/providers/google/cloud/log/stackdriver_task_handler.py

@@ -35,17 +35,20 @@ from airflow.exceptions import AirflowProviderDeprecationWarning
 from airflow.providers.google.cloud.utils.credentials_provider import get_credentials_and_project_id
 from airflow.providers.google.common.consts import CLIENT_INFO
 from airflow.providers.google.version_compat import AIRFLOW_V_3_0_PLUS
-
+
+try:
+    from airflow.sdk.definitions._internal.types import NOTSET, ArgNotSet
+except ImportError:
+    from airflow.utils.types import NOTSET, ArgNotSet  # type: ignore[attr-defined,no-redef]
+
+if not AIRFLOW_V_3_0_PLUS:
+    from airflow.utils.log.trigger_handler import ctx_indiv_trigger
 
 if TYPE_CHECKING:
     from google.auth.credentials import Credentials
 
     from airflow.models import TaskInstance
 
-
-if not AIRFLOW_V_3_0_PLUS:
-    from airflow.utils.log.trigger_handler import ctx_indiv_trigger
-
 DEFAULT_LOGGER_NAME = "airflow"
 _GLOBAL_RESOURCE = Resource(type="global", labels={})
 
@@ -159,7 +162,7 @@ class StackdriverTaskHandler(logging.Handler):
         """Object responsible for sending data to Stackdriver."""
         # The Transport object is badly defined (no init) but in the docs client/name as constructor
         # arguments are a requirement for any class that derives from Transport class, hence ignore:
-        return self.transport_type(self._client, self.gcp_log_name)
+        return self.transport_type(self._client, self.gcp_log_name)
 
     def _get_labels(self, task_instance=None):
         if task_instance:
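
The `NOTSET`/`ArgNotSet` import added here is Airflow's sentinel for telling an omitted argument apart from an explicit `None`. A self-contained sketch of the pattern (the two names mirror `airflow.utils.types`; this standalone copy is for illustration only):

```python
class ArgNotSet:
    """Sentinel type for arguments that were not supplied at all."""


NOTSET = ArgNotSet()


def update_timeout(timeout: float | None | ArgNotSet = NOTSET) -> str:
    # Omitted, explicit None, and a real value are three distinct cases.
    if isinstance(timeout, ArgNotSet):
        return "keep the existing timeout"
    if timeout is None:
        return "disable the timeout"
    return f"set timeout to {timeout}s"


print(update_timeout())      # keep the existing timeout
print(update_timeout(None))  # disable the timeout
print(update_timeout(2.5))   # set timeout to 2.5s
```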

airflow/providers/google/cloud/openlineage/CloudStorageTransferJobFacet.json (new file)

@@ -0,0 +1,68 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$defs": {
+    "CloudStorageTransferJobFacet": {
+      "allOf": [
+        {
+          "$ref": "https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/JobFacet"
+        },
+        {
+          "type": "object",
+          "properties": {
+            "jobName": {
+              "type": "string",
+              "description": "Transfer job name assigned by GCP Storage Transfer Service."
+            },
+            "projectId": {
+              "type": "string",
+              "description": "GCP project ID."
+            },
+            "description": {
+              "type": "string",
+              "description": "Optional description of the transfer job."
+            },
+            "status": {
+              "type": "string",
+              "description": "Status of the transfer job (ENABLED, DISABLED)."
+            },
+            "sourceBucket": {
+              "type": "string",
+              "description": "Source AWS S3 bucket."
+            },
+            "sourcePath": {
+              "type": "string",
+              "description": "Prefix path inside the source bucket."
+            },
+            "targetBucket": {
+              "type": "string",
+              "description": "Target GCS bucket."
+            },
+            "targetPath": {
+              "type": "string",
+              "description": "Prefix path inside the target bucket."
+            },
+            "objectConditions": {
+              "type": "object",
+              "description": "Filtering conditions for objects transferred."
+            },
+            "transferOptions": {
+              "type": "object",
+              "description": "Transfer options such as overwrite or delete."
+            },
+            "schedule": {
+              "type": "object",
+              "description": "Transfer schedule details."
+            }
+          }
+        }
+      ],
+      "type": "object"
+    }
+  },
+  "type": "object",
+  "properties": {
+    "cloudStorageTransferJob": {
+      "$ref": "#/$defs/CloudStorageTransferJobFacet"
+    }
+  }
+}

airflow/providers/google/cloud/openlineage/CloudStorageTransferRunFacet.json (new file)

@@ -0,0 +1,60 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$defs": {
+    "CloudStorageTransferRunFacet": {
+      "allOf": [
+        {
+          "$ref": "https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/RunFacet"
+        },
+        {
+          "type": "object",
+          "properties": {
+            "jobName": {
+              "type": "string",
+              "description": "Transfer job name associated with this run."
+            },
+            "operationName": {
+              "type": "string",
+              "description": "Transfer operation name if available."
+            },
+            "status": {
+              "type": "string",
+              "description": "Run status if available."
+            },
+            "startTime": {
+              "type": "string",
+              "description": "Start time of the transfer operation."
+            },
+            "endTime": {
+              "type": "string",
+              "description": "End time of the transfer operation."
+            },
+            "wait": {
+              "type": "boolean",
+              "description": "Whether the operator waited for completion."
+            },
+            "timeout": {
+              "type": ["number", "null"],
+              "description": "Timeout in seconds."
+            },
+            "deferrable": {
+              "type": "boolean",
+              "description": "Whether the operator used deferrable mode."
+            },
+            "deleteJobAfterCompletion": {
+              "type": "boolean",
+              "description": "Whether the transfer job was deleted after completion."
+            }
+          }
+        }
+      ],
+      "type": "object"
+    }
+  },
+  "type": "object",
+  "properties": {
+    "cloudStorageTransferRun": {
+      "$ref": "#/$defs/CloudStorageTransferRunFacet"
+    }
+  }
+}

airflow/providers/google/cloud/openlineage/DataFusionRunFacet.json (new file)

@@ -0,0 +1,32 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$defs": {
+    "DataFusionRunFacet": {
+      "allOf": [
+        {
+          "$ref": "https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/RunFacet"
+        },
+        {
+          "type": "object",
+          "properties": {
+            "runId": {
+              "type": "string",
+              "description": "Pipeline run ID assigned by Cloud Data Fusion."
+            },
+            "runtimeArgs": {
+              "type": "object",
+              "description": "Runtime arguments provided when starting the pipeline."
+            }
+          }
+        }
+      ],
+      "type": "object"
+    }
+  },
+  "type": "object",
+  "properties": {
+    "dataFusionRun": {
+      "$ref": "#/$defs/DataFusionRunFacet"
+    }
+  }
+}

airflow/providers/google/cloud/openlineage/facets.py

@@ -24,13 +24,17 @@ from attr import define, field
 from airflow.providers.google import __version__ as provider_version
 
 if TYPE_CHECKING:
-    from openlineage.client.generated.base import RunFacet
+    from openlineage.client.generated.base import JobFacet, RunFacet
 
 try:
     try:
         from openlineage.client.generated.base import RunFacet
     except ImportError:  # Old OpenLineage client is used
         from openlineage.client.facet import BaseFacet as RunFacet  # type: ignore[assignment]
+    try:
+        from openlineage.client.generated.base import JobFacet
+    except ImportError:  # Old OpenLineage client is used
+        from openlineage.client.facet import BaseFacet as JobFacet  # type: ignore[assignment]
 
     @define
     class BigQueryJobRunFacet(RunFacet):
@@ -53,6 +57,100 @@ try:
                 f"providers-google/{provider_version}/airflow/providers/google/"
                 "openlineage/BigQueryJobRunFacet.json"
             )
+
+    @define
+    class CloudStorageTransferJobFacet(JobFacet):
+        """
+        Facet representing a Cloud Storage Transfer Service job configuration.
+
+        :param jobName: Unique name of the transfer job.
+        :param projectId: GCP project where the transfer job is defined.
+        :param description: User-provided description of the transfer job.
+        :param status: Current status of the transfer job (e.g. "ENABLED", "DISABLED").
+        :param sourceBucket: Name of the source bucket (e.g. AWS S3).
+        :param sourcePath: Prefix/path inside the source bucket.
+        :param targetBucket: Name of the destination bucket (e.g. GCS).
+        :param targetPath: Prefix/path inside the destination bucket.
+        :param objectConditions: Object selection rules (e.g. include/exclude prefixes).
+        :param transferOptions: Transfer options, such as overwrite behavior or whether to delete objects
+            from the source after transfer.
+        :param schedule: Schedule for the transfer job (if recurring).
+        """
+
+        jobName: str | None = field(default=None)
+        projectId: str | None = field(default=None)
+        description: str | None = field(default=None)
+        status: str | None = field(default=None)
+        sourceBucket: str | None = field(default=None)
+        sourcePath: str | None = field(default=None)
+        targetBucket: str | None = field(default=None)
+        targetPath: str | None = field(default=None)
+        objectConditions: dict | None = field(default=None)
+        transferOptions: dict | None = field(default=None)
+        schedule: dict | None = field(default=None)
+
+        @staticmethod
+        def _get_schema() -> str:
+            return (
+                "https://raw.githubusercontent.com/apache/airflow/"
+                f"providers-google/{provider_version}/airflow/providers/google/"
+                "openlineage/CloudStorageTransferJobFacet.json"
+            )
+
+    @define
+    class CloudStorageTransferRunFacet(RunFacet):
+        """
+        Facet representing a Cloud Storage Transfer Service job execution run.
+
+        :param jobName: Name of the transfer job being executed.
+        :param operationName: Name of the specific transfer operation instance.
+        :param status: Current status of the operation (e.g. "IN_PROGRESS", "SUCCESS", "FAILED").
+        :param startTime: Time when the transfer job execution started (ISO 8601 format).
+        :param endTime: Time when the transfer job execution finished (ISO 8601 format).
+        :param wait: Whether the operator waits for the job to complete before finishing.
+        :param timeout: Timeout (in seconds) for the transfer run to complete.
+        :param deferrable: Whether the operator defers execution until job completion.
+        :param deleteJobAfterCompletion: Whether the operator deletes the transfer job after the run completes.
+        """
+
+        jobName: str | None = field(default=None)
+        operationName: str | None = field(default=None)
+        status: str | None = field(default=None)
+        startTime: str | None = field(default=None)
+        endTime: str | None = field(default=None)
+        wait: bool = field(default=True)
+        timeout: float | None = field(default=None)
+        deferrable: bool = field(default=False)
+        deleteJobAfterCompletion: bool = field(default=False)
+
+        @staticmethod
+        def _get_schema() -> str:
+            return (
+                "https://raw.githubusercontent.com/apache/airflow/"
+                f"providers-google/{provider_version}/airflow/providers/google/"
+                "openlineage/CloudStorageTransferRunFacet.json"
+            )
+
+    @define
+    class DataFusionRunFacet(RunFacet):
+        """
+        Facet that represents relevant details of a Cloud Data Fusion pipeline run.
+
+        :param runId: The pipeline execution id.
+        :param runtimeArgs: Runtime arguments passed to the pipeline.
+        """
+
+        runId: str | None = field(default=None)
+        runtimeArgs: dict[str, str] | None = field(default=None)
+
+        @staticmethod
+        def _get_schema() -> str:
+            return (
+                "https://raw.githubusercontent.com/apache/airflow/"
+                f"providers-google/{provider_version}/airflow/providers/google/"
+                "openlineage/DataFusionRunFacet.json"
+            )
+
 except ImportError:  # OpenLineage is not available
 
     def create_no_op(*_, **__) -> None:
@@ -65,3 +163,6 @@ except ImportError:  # OpenLineage is not available
         return None
 
     BigQueryJobRunFacet = create_no_op  # type: ignore[misc, assignment]
+    CloudStorageTransferJobFacet = create_no_op  # type: ignore[misc, assignment]
+    CloudStorageTransferRunFacet = create_no_op  # type: ignore[misc, assignment]
+    DataFusionRunFacet = create_no_op  # type: ignore[misc, assignment]
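
Assuming apache-airflow-providers-google 19.3.0 and the OpenLineage client are both installed (otherwise the names above resolve to `create_no_op`), the new facets attach to lineage events like any other job/run facet. The field names below come straight from the class definitions; the project, bucket, and job values are placeholders:

```python
from airflow.providers.google.cloud.openlineage.facets import (
    CloudStorageTransferJobFacet,
    CloudStorageTransferRunFacet,
)

job_facet = CloudStorageTransferJobFacet(
    jobName="transferJobs/123456789",  # placeholder job name
    projectId="my-project",            # placeholder project
    status="ENABLED",
    sourceBucket="my-s3-bucket",
    sourcePath="exports/",
    targetBucket="my-gcs-bucket",
    targetPath="imports/",
    transferOptions={"overwriteObjectsAlreadyExistingInSink": True},
)
run_facet = CloudStorageTransferRunFacet(jobName="transferJobs/123456789", wait=True)

# Keyed the same way as the top-level properties in the JSON schemas above.
job_facets = {"cloudStorageTransferJob": job_facet}
run_facets = {"cloudStorageTransferRun": run_facet}
```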

airflow/providers/google/cloud/openlineage/mixins.py

@@ -80,7 +80,7 @@ class _BigQueryInsertJobOperatorOpenLineageMixin:
         from airflow.providers.openlineage.sqlparser import SQLParser
 
         if not self.job_id:
-            self.log.warning("No BigQuery job_id was found by OpenLineage.")
+            self.log.warning("No BigQuery job_id was found by OpenLineage.")
             return OperatorLineage()
 
         if not self.hook:
@@ -92,14 +92,16 @@ class _BigQueryInsertJobOperatorOpenLineageMixin:
             impersonation_chain=self.impersonation_chain,
         )
 
-        self.log.debug("Extracting data from bigquery job: `%s`", self.job_id)
+        self.log.debug("Extracting data from bigquery job: `%s`", self.job_id)
         inputs, outputs = [], []
         run_facets: dict[str, RunFacet] = {
             "externalQuery": ExternalQueryRunFacet(externalQueryId=self.job_id, source="bigquery")
         }
-        self._client = self.hook.get_client(
+        self._client = self.hook.get_client(
+            project_id=self.project_id or self.hook.project_id, location=self.location
+        )
         try:
-            job_properties = self._client.get_job(job_id=self.job_id)._properties
+            job_properties = self._client.get_job(job_id=self.job_id)._properties
 
             if get_from_nullable_chain(job_properties, ["status", "state"]) != "DONE":
                 raise ValueError(f"Trying to extract data from running bigquery job: `{self.job_id}`")
@@ -107,11 +109,11 @@ class _BigQueryInsertJobOperatorOpenLineageMixin:
             run_facets["bigQueryJob"] = self._get_bigquery_job_run_facet(job_properties)
 
             if get_from_nullable_chain(job_properties, ["statistics", "numChildJobs"]):
-                self.log.debug("Found SCRIPT job. Extracting lineage from child jobs instead.")
+                self.log.debug("Found SCRIPT job. Extracting lineage from child jobs instead.")
                 # SCRIPT job type has no input / output information but spawns child jobs that have one
                 # https://cloud.google.com/bigquery/docs/information-schema-jobs#multi-statement_query_job
                 for child_job_id in self._client.list_jobs(parent_job=self.job_id):
-                    child_job_properties = self._client.get_job(job_id=child_job_id)._properties
+                    child_job_properties = self._client.get_job(job_id=child_job_id)._properties
                     child_inputs, child_outputs = self._get_inputs_and_outputs(child_job_properties)
                     inputs.extend(child_inputs)
                     outputs.extend(child_outputs)
@@ -119,7 +121,7 @@ class _BigQueryInsertJobOperatorOpenLineageMixin:
                 inputs, outputs = self._get_inputs_and_outputs(job_properties)
 
         except Exception as e:
-            self.log.warning("Cannot retrieve job details from BigQuery.Client. %s", e, exc_info=True)
+            self.log.warning("Cannot retrieve job details from BigQuery.Client. %s", e, exc_info=True)
             exception_msg = traceback.format_exc()
             run_facets.update(
                 {
@@ -173,7 +175,7 @@ class _BigQueryInsertJobOperatorOpenLineageMixin:
             if (
                 single_output.facets
                 and final_outputs[key].facets
-                and "columnLineage" in single_output.facets
+                and "columnLineage" in single_output.facets
                 and "columnLineage" in final_outputs[key].facets  # type: ignore
             ):
                 single_output.facets["columnLineage"] = merge_column_lineage_facets(
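
The SCRIPT branch above is the subtle part of this mixin: a multi-statement (SCRIPT) BigQuery job carries no input/output tables of its own, so lineage has to be read from the child job each statement spawns. A sketch of that walk with the plain BigQuery client (requires credentials; the project and job IDs are placeholders):

```python
from google.cloud import bigquery

client = bigquery.Client(project="my-project")  # placeholder project
parent = client.get_job("my_script_job_id")     # placeholder job id

# SCRIPT jobs expose statistics.numChildJobs; the referenced and destination
# tables live on the children, one child job per statement.
if getattr(parent, "num_child_jobs", 0):
    for child in client.list_jobs(parent_job=parent.job_id):
        print(child.job_id, child.job_type)
```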

airflow/providers/google/cloud/openlineage/utils.py

@@ -49,7 +49,7 @@ if TYPE_CHECKING:
     from google.cloud.bigquery.table import Table
 
     from airflow.providers.common.compat.openlineage.facet import Dataset
-    from airflow.
+    from airflow.providers.common.compat.sdk import Context
 
 
 log = logging.getLogger(__name__)
@@ -214,7 +214,20 @@ def extract_ds_name_from_gcs_path(path: str) -> str:
 
 def get_facets_from_bq_table(table: Table) -> dict[str, DatasetFacet]:
     """Get facets from BigQuery table object."""
+    return get_facets_from_bq_table_for_given_fields(table, selected_fields=None)
+
+
+def get_facets_from_bq_table_for_given_fields(
+    table: Table, selected_fields: list[str] | None
+) -> dict[str, DatasetFacet]:
+    """
+    Get facets from BigQuery table object for selected fields only.
+
+    If selected_fields is None, include all fields.
+    """
     facets: dict[str, DatasetFacet] = {}
+    selected_fields_set = set(selected_fields) if selected_fields else None
+
     if table.schema:
         facets["schema"] = SchemaDatasetFacet(
             fields=[
@@ -222,6 +235,7 @@ def get_facets_from_bq_table(table: Table) -> dict[str, DatasetFacet]:
                     name=schema_field.name, type=schema_field.field_type, description=schema_field.description
                 )
                 for schema_field in table.schema
+                if selected_fields_set is None or schema_field.name in selected_fields_set
             ]
         )
         if table.description:
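
A sketch of how the new selected-fields variant narrows the schema facet (placeholder table and columns; assumes google-cloud-bigquery and a provider release carrying this change are installed):

```python
from google.cloud import bigquery

from airflow.providers.google.cloud.openlineage.utils import (
    get_facets_from_bq_table_for_given_fields,
)

table = bigquery.Table(
    "my-project.my_dataset.my_table",  # placeholder table id
    schema=[
        bigquery.SchemaField("id", "INTEGER"),
        bigquery.SchemaField("name", "STRING"),
        bigquery.SchemaField("payload", "JSON"),
    ],
)

# Only "id" and "name" land in the schema facet; selected_fields=None keeps
# every column, which is what get_facets_from_bq_table now delegates to.
facets = get_facets_from_bq_table_for_given_fields(table, selected_fields=["id", "name"])
print(facets["schema"].fields)
```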