apache-airflow-providers-google 12.0.0rc2__py3-none-any.whl → 13.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airflow/providers/google/LICENSE +0 -52
- airflow/providers/google/__init__.py +1 -1
- airflow/providers/google/ads/hooks/ads.py +27 -13
- airflow/providers/google/ads/transfers/ads_to_gcs.py +18 -4
- airflow/providers/google/assets/bigquery.py +17 -0
- airflow/providers/google/cloud/_internal_client/secret_manager_client.py +2 -3
- airflow/providers/google/cloud/hooks/alloy_db.py +736 -8
- airflow/providers/google/cloud/hooks/automl.py +10 -4
- airflow/providers/google/cloud/hooks/bigquery.py +125 -22
- airflow/providers/google/cloud/hooks/bigquery_dts.py +8 -8
- airflow/providers/google/cloud/hooks/bigtable.py +2 -3
- airflow/providers/google/cloud/hooks/cloud_batch.py +3 -4
- airflow/providers/google/cloud/hooks/cloud_build.py +4 -5
- airflow/providers/google/cloud/hooks/cloud_composer.py +3 -4
- airflow/providers/google/cloud/hooks/cloud_memorystore.py +3 -4
- airflow/providers/google/cloud/hooks/cloud_run.py +3 -4
- airflow/providers/google/cloud/hooks/cloud_sql.py +7 -3
- airflow/providers/google/cloud/hooks/cloud_storage_transfer_service.py +119 -7
- airflow/providers/google/cloud/hooks/compute.py +3 -3
- airflow/providers/google/cloud/hooks/datacatalog.py +3 -4
- airflow/providers/google/cloud/hooks/dataflow.py +12 -12
- airflow/providers/google/cloud/hooks/dataform.py +2 -3
- airflow/providers/google/cloud/hooks/datafusion.py +2 -2
- airflow/providers/google/cloud/hooks/dataplex.py +1032 -11
- airflow/providers/google/cloud/hooks/dataproc.py +4 -5
- airflow/providers/google/cloud/hooks/dataproc_metastore.py +3 -4
- airflow/providers/google/cloud/hooks/dlp.py +3 -4
- airflow/providers/google/cloud/hooks/gcs.py +7 -6
- airflow/providers/google/cloud/hooks/kms.py +2 -3
- airflow/providers/google/cloud/hooks/kubernetes_engine.py +8 -8
- airflow/providers/google/cloud/hooks/life_sciences.py +1 -1
- airflow/providers/google/cloud/hooks/managed_kafka.py +482 -0
- airflow/providers/google/cloud/hooks/natural_language.py +2 -3
- airflow/providers/google/cloud/hooks/os_login.py +2 -3
- airflow/providers/google/cloud/hooks/pubsub.py +6 -6
- airflow/providers/google/cloud/hooks/secret_manager.py +2 -3
- airflow/providers/google/cloud/hooks/spanner.py +2 -2
- airflow/providers/google/cloud/hooks/speech_to_text.py +2 -3
- airflow/providers/google/cloud/hooks/stackdriver.py +4 -4
- airflow/providers/google/cloud/hooks/tasks.py +3 -4
- airflow/providers/google/cloud/hooks/text_to_speech.py +2 -3
- airflow/providers/google/cloud/hooks/translate.py +236 -5
- airflow/providers/google/cloud/hooks/vertex_ai/auto_ml.py +9 -4
- airflow/providers/google/cloud/hooks/vertex_ai/batch_prediction_job.py +3 -4
- airflow/providers/google/cloud/hooks/vertex_ai/custom_job.py +4 -5
- airflow/providers/google/cloud/hooks/vertex_ai/dataset.py +3 -4
- airflow/providers/google/cloud/hooks/vertex_ai/endpoint_service.py +2 -3
- airflow/providers/google/cloud/hooks/vertex_ai/feature_store.py +3 -4
- airflow/providers/google/cloud/hooks/vertex_ai/generative_model.py +1 -181
- airflow/providers/google/cloud/hooks/vertex_ai/hyperparameter_tuning_job.py +3 -4
- airflow/providers/google/cloud/hooks/vertex_ai/model_service.py +2 -3
- airflow/providers/google/cloud/hooks/vertex_ai/pipeline_job.py +3 -4
- airflow/providers/google/cloud/hooks/vertex_ai/prediction_service.py +2 -3
- airflow/providers/google/cloud/hooks/video_intelligence.py +2 -3
- airflow/providers/google/cloud/hooks/vision.py +3 -4
- airflow/providers/google/cloud/hooks/workflows.py +2 -3
- airflow/providers/google/cloud/links/alloy_db.py +46 -0
- airflow/providers/google/cloud/links/bigquery.py +25 -0
- airflow/providers/google/cloud/links/dataplex.py +172 -2
- airflow/providers/google/cloud/links/kubernetes_engine.py +1 -2
- airflow/providers/google/cloud/links/managed_kafka.py +104 -0
- airflow/providers/google/cloud/links/translate.py +28 -0
- airflow/providers/google/cloud/log/gcs_task_handler.py +3 -3
- airflow/providers/google/cloud/log/stackdriver_task_handler.py +11 -10
- airflow/providers/google/cloud/openlineage/facets.py +67 -0
- airflow/providers/google/cloud/openlineage/mixins.py +438 -173
- airflow/providers/google/cloud/openlineage/utils.py +394 -61
- airflow/providers/google/cloud/operators/alloy_db.py +980 -69
- airflow/providers/google/cloud/operators/automl.py +83 -245
- airflow/providers/google/cloud/operators/bigquery.py +377 -74
- airflow/providers/google/cloud/operators/bigquery_dts.py +126 -13
- airflow/providers/google/cloud/operators/bigtable.py +1 -3
- airflow/providers/google/cloud/operators/cloud_base.py +1 -2
- airflow/providers/google/cloud/operators/cloud_batch.py +2 -4
- airflow/providers/google/cloud/operators/cloud_build.py +3 -5
- airflow/providers/google/cloud/operators/cloud_composer.py +5 -7
- airflow/providers/google/cloud/operators/cloud_memorystore.py +4 -6
- airflow/providers/google/cloud/operators/cloud_run.py +6 -5
- airflow/providers/google/cloud/operators/cloud_sql.py +20 -8
- airflow/providers/google/cloud/operators/cloud_storage_transfer_service.py +62 -8
- airflow/providers/google/cloud/operators/compute.py +3 -4
- airflow/providers/google/cloud/operators/datacatalog.py +9 -11
- airflow/providers/google/cloud/operators/dataflow.py +1 -112
- airflow/providers/google/cloud/operators/dataform.py +3 -5
- airflow/providers/google/cloud/operators/datafusion.py +1 -1
- airflow/providers/google/cloud/operators/dataplex.py +2046 -7
- airflow/providers/google/cloud/operators/dataproc.py +102 -17
- airflow/providers/google/cloud/operators/dataproc_metastore.py +7 -9
- airflow/providers/google/cloud/operators/dlp.py +17 -19
- airflow/providers/google/cloud/operators/gcs.py +14 -17
- airflow/providers/google/cloud/operators/kubernetes_engine.py +2 -2
- airflow/providers/google/cloud/operators/managed_kafka.py +788 -0
- airflow/providers/google/cloud/operators/natural_language.py +3 -5
- airflow/providers/google/cloud/operators/pubsub.py +39 -7
- airflow/providers/google/cloud/operators/speech_to_text.py +3 -5
- airflow/providers/google/cloud/operators/stackdriver.py +3 -5
- airflow/providers/google/cloud/operators/tasks.py +4 -6
- airflow/providers/google/cloud/operators/text_to_speech.py +2 -4
- airflow/providers/google/cloud/operators/translate.py +414 -5
- airflow/providers/google/cloud/operators/translate_speech.py +2 -4
- airflow/providers/google/cloud/operators/vertex_ai/auto_ml.py +9 -8
- airflow/providers/google/cloud/operators/vertex_ai/batch_prediction_job.py +4 -6
- airflow/providers/google/cloud/operators/vertex_ai/custom_job.py +6 -8
- airflow/providers/google/cloud/operators/vertex_ai/dataset.py +4 -6
- airflow/providers/google/cloud/operators/vertex_ai/endpoint_service.py +4 -6
- airflow/providers/google/cloud/operators/vertex_ai/generative_model.py +0 -322
- airflow/providers/google/cloud/operators/vertex_ai/hyperparameter_tuning_job.py +4 -6
- airflow/providers/google/cloud/operators/vertex_ai/model_service.py +4 -6
- airflow/providers/google/cloud/operators/vertex_ai/pipeline_job.py +4 -6
- airflow/providers/google/cloud/operators/video_intelligence.py +3 -5
- airflow/providers/google/cloud/operators/vision.py +4 -6
- airflow/providers/google/cloud/operators/workflows.py +5 -7
- airflow/providers/google/cloud/secrets/secret_manager.py +1 -2
- airflow/providers/google/cloud/sensors/bigquery_dts.py +3 -5
- airflow/providers/google/cloud/sensors/bigtable.py +2 -3
- airflow/providers/google/cloud/sensors/cloud_composer.py +32 -8
- airflow/providers/google/cloud/sensors/cloud_storage_transfer_service.py +39 -1
- airflow/providers/google/cloud/sensors/dataplex.py +4 -6
- airflow/providers/google/cloud/sensors/dataproc.py +2 -3
- airflow/providers/google/cloud/sensors/dataproc_metastore.py +1 -2
- airflow/providers/google/cloud/sensors/gcs.py +2 -4
- airflow/providers/google/cloud/sensors/pubsub.py +2 -3
- airflow/providers/google/cloud/sensors/workflows.py +3 -5
- airflow/providers/google/cloud/transfers/bigquery_to_gcs.py +5 -5
- airflow/providers/google/cloud/transfers/gcs_to_bigquery.py +10 -12
- airflow/providers/google/cloud/transfers/gcs_to_gcs.py +1 -1
- airflow/providers/google/cloud/transfers/gcs_to_sftp.py +36 -4
- airflow/providers/google/cloud/transfers/mssql_to_gcs.py +27 -2
- airflow/providers/google/cloud/transfers/mysql_to_gcs.py +27 -2
- airflow/providers/google/cloud/transfers/postgres_to_gcs.py +27 -2
- airflow/providers/google/cloud/transfers/sftp_to_gcs.py +34 -5
- airflow/providers/google/cloud/transfers/sql_to_gcs.py +15 -0
- airflow/providers/google/cloud/transfers/trino_to_gcs.py +25 -2
- airflow/providers/google/cloud/triggers/bigquery_dts.py +1 -2
- airflow/providers/google/cloud/triggers/cloud_batch.py +1 -2
- airflow/providers/google/cloud/triggers/cloud_build.py +1 -2
- airflow/providers/google/cloud/triggers/cloud_composer.py +13 -3
- airflow/providers/google/cloud/triggers/cloud_storage_transfer_service.py +102 -4
- airflow/providers/google/cloud/triggers/dataflow.py +2 -3
- airflow/providers/google/cloud/triggers/dataplex.py +1 -2
- airflow/providers/google/cloud/triggers/dataproc.py +2 -3
- airflow/providers/google/cloud/triggers/kubernetes_engine.py +1 -1
- airflow/providers/google/cloud/triggers/pubsub.py +1 -2
- airflow/providers/google/cloud/triggers/vertex_ai.py +7 -8
- airflow/providers/google/cloud/utils/credentials_provider.py +15 -8
- airflow/providers/google/cloud/utils/external_token_supplier.py +1 -0
- airflow/providers/google/common/auth_backend/google_openid.py +4 -4
- airflow/providers/google/common/consts.py +1 -2
- airflow/providers/google/common/hooks/base_google.py +8 -7
- airflow/providers/google/get_provider_info.py +186 -134
- airflow/providers/google/marketing_platform/hooks/analytics_admin.py +2 -3
- airflow/providers/google/marketing_platform/hooks/search_ads.py +1 -1
- airflow/providers/google/marketing_platform/operators/analytics_admin.py +5 -7
- {apache_airflow_providers_google-12.0.0rc2.dist-info → apache_airflow_providers_google-13.0.0.dist-info}/METADATA +41 -58
- {apache_airflow_providers_google-12.0.0rc2.dist-info → apache_airflow_providers_google-13.0.0.dist-info}/RECORD +157 -159
- airflow/providers/google/cloud/example_dags/example_facebook_ads_to_gcs.py +0 -141
- airflow/providers/google/cloud/example_dags/example_looker.py +0 -64
- airflow/providers/google/cloud/example_dags/example_presto_to_gcs.py +0 -194
- airflow/providers/google/cloud/example_dags/example_salesforce_to_gcs.py +0 -129
- airflow/providers/google/marketing_platform/example_dags/__init__.py +0 -16
- airflow/providers/google/marketing_platform/example_dags/example_display_video.py +0 -213
- {apache_airflow_providers_google-12.0.0rc2.dist-info → apache_airflow_providers_google-13.0.0.dist-info}/WHEEL +0 -0
- {apache_airflow_providers_google-12.0.0rc2.dist-info → apache_airflow_providers_google-13.0.0.dist-info}/entry_points.txt +0 -0
airflow/providers/google/cloud/openlineage/utils.py
@@ -17,36 +17,38 @@
 # under the License.
 from __future__ import annotations
 
+import copy
 import logging
 import os
 import pathlib
+import re
+from collections import defaultdict
+from collections.abc import Iterable
 from typing import TYPE_CHECKING, Any
 
-from attr import define, field
-
-if TYPE_CHECKING:
-    from google.cloud.bigquery.table import Table
-
-    from airflow.providers.common.compat.openlineage.facet import Dataset
-    from airflow.utils.context import Context
-
 from airflow.providers.common.compat.openlineage.facet import (
-    BaseFacet,
     ColumnLineageDatasetFacet,
+    DatasetFacet,
     DocumentationDatasetFacet,
     Fields,
     Identifier,
     InputField,
-    RunFacet,
     SchemaDatasetFacet,
     SchemaDatasetFacetFields,
     SymlinksDatasetFacet,
 )
 from airflow.providers.common.compat.openlineage.utils.spark import (
     inject_parent_job_information_into_spark_properties,
+    inject_transport_information_into_spark_properties,
 )
-from airflow.providers.google import __version__ as provider_version
 from airflow.providers.google.cloud.hooks.gcs import _parse_gcs_url
+from google.cloud.dataproc_v1 import Batch, RuntimeConfig
+
+if TYPE_CHECKING:
+    from airflow.providers.common.compat.openlineage.facet import Dataset
+    from airflow.utils.context import Context
+    from google.cloud.bigquery.table import Table
+
 
 log = logging.getLogger(__name__)
 
@@ -55,6 +57,106 @@ BIGQUERY_URI = "bigquery"
 WILDCARD = "*"
 
 
+def merge_column_lineage_facets(facets: list[ColumnLineageDatasetFacet]) -> ColumnLineageDatasetFacet:
+    """
+    Merge multiple column lineage facets into a single consolidated facet.
+
+    Specifically, it aggregates input fields and transformations for each field across all provided facets.
+
+    Args:
+        facets: Column Lineage Facets to be merged.
+
+    Returns:
+        A new Column Lineage Facet containing all fields, their respective input fields and transformations.
+
+    Notes:
+        - Input fields are uniquely identified by their `(namespace, name, field)` tuple.
+        - If multiple facets contain the same field with the same input field, those input
+          fields are merged without duplication.
+        - Transformations associated with input fields are also merged. If transformations
+          are not supported by the version of the `InputField` class, they will be omitted.
+        - Transformation merging relies on a composite key of the field name and input field
+          tuple to track and consolidate transformations.
+
+    Examples:
+        Case 1: Two facets with the same input field
+        ```
+        >>> facet1 = ColumnLineageDatasetFacet(
+        ...     fields={"columnA": Fields(inputFields=[InputField("namespace1", "dataset1", "field1")])}
+        ... )
+        >>> facet2 = ColumnLineageDatasetFacet(
+        ...     fields={"columnA": Fields(inputFields=[InputField("namespace1", "dataset1", "field1")])}
+        ... )
+        >>> merged = merge_column_lineage_facets([facet1, facet2])
+        >>> merged.fields["columnA"].inputFields
+        [InputField("namespace1", "dataset1", "field1")]
+        ```
+
+        Case 2: Two facets with different transformations for the same input field
+        ```
+        >>> facet1 = ColumnLineageDatasetFacet(
+        ...     fields={
+        ...         "columnA": Fields(
+        ...             inputFields=[InputField("namespace1", "dataset1", "field1", transformations=["t1"])]
+        ...         )
+        ...     }
+        ... )
+        >>> facet2 = ColumnLineageDatasetFacet(
+        ...     fields={
+        ...         "columnA": Fields(
+        ...             inputFields=[InputField("namespace1", "dataset1", "field1", transformations=["t2"])]
+        ...         )
+        ...     }
+        ... )
+        >>> merged = merge_column_lineage_facets([facet1, facet2])
+        >>> merged.fields["columnA"].inputFields[0].transformations
+        ["t1", "t2"]
+        ```
+    """
+    # Dictionary to collect all unique input fields for each field name
+    fields_sources: dict[str, set[tuple[str, str, str]]] = defaultdict(set)
+    # Dictionary to aggregate transformations for each input field
+    transformations: dict[str, list] = defaultdict(list)
+
+    for facet in facets:
+        for field_name, single_field in facet.fields.items():
+            for input_field in single_field.inputFields:
+                input_key_fields = (input_field.namespace, input_field.name, input_field.field)
+                fields_sources[field_name].add(input_key_fields)
+
+                if single_transformations := getattr(input_field, "transformations", []):
+                    transformation_key = "".join((field_name, *input_key_fields))
+                    transformations[transformation_key].extend(single_transformations)
+
+    # Check if the `InputField` class supports the `transformations` attribute (since OL client 1.17.1)
+    input_field_allows_transformation_info = True
+    try:
+        InputField(namespace="a", name="b", field="c", transformations=[])
+    except TypeError:
+        input_field_allows_transformation_info = False
+
+    return ColumnLineageDatasetFacet(
+        fields={
+            field_name: Fields(
+                inputFields=[
+                    InputField(
+                        namespace,
+                        name,
+                        column,
+                        transformations.get("".join((field_name, namespace, name, column)), []),
+                    )
+                    if input_field_allows_transformation_info
+                    else InputField(namespace, name, column)
+                    for namespace, name, column in sorted(input_fields)
+                ],
+                transformationType="",  # Legacy transformation information
+                transformationDescription="",  # Legacy transformation information
+            )
+            for field_name, input_fields in fields_sources.items()
+        }
+    )
+
+
 def extract_ds_name_from_gcs_path(path: str) -> str:
     """
     Extract and process the dataset name from a given path.
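For context, a short usage sketch of the new helper above. This is illustrative only: it assumes provider 13.0.0 is installed and that this diff targets airflow/providers/google/cloud/openlineage/utils.py (the module path is inferred from the file listing, not stated in the hunk); the facet classes come from the compat layer already imported in the diff, and the positional InputField arguments follow the docstring's own doctest.

```python
# Illustrative sketch only: the import path of merge_column_lineage_facets is an assumption
# based on the file listing above (cloud/openlineage/utils.py).
from airflow.providers.common.compat.openlineage.facet import (
    ColumnLineageDatasetFacet,
    Fields,
    InputField,
)
from airflow.providers.google.cloud.openlineage.utils import merge_column_lineage_facets

# Two partial lineage facets that both feed the output column "total".
facet_a = ColumnLineageDatasetFacet(
    fields={"total": Fields(inputFields=[InputField("bigquery", "proj.ds.orders", "amount")])}
)
facet_b = ColumnLineageDatasetFacet(
    fields={"total": Fields(inputFields=[InputField("bigquery", "proj.ds.refunds", "amount")])}
)

merged = merge_column_lineage_facets([facet_a, facet_b])
# Both upstream columns are now listed once under the single "total" output field.
for input_field in merged.fields["total"].inputFields:
    print(input_field.namespace, input_field.name, input_field.field)
```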
@@ -108,9 +210,9 @@ def extract_ds_name_from_gcs_path(path: str) -> str:
     return path
 
 
-def get_facets_from_bq_table(table: Table) -> dict[str, BaseFacet]:
+def get_facets_from_bq_table(table: Table) -> dict[str, DatasetFacet]:
     """Get facets from BigQuery table object."""
-    facets: dict[str, BaseFacet] = {}
+    facets: dict[str, DatasetFacet] = {}
     if table.schema:
         facets["schema"] = SchemaDatasetFacet(
             fields=[
@@ -124,26 +226,37 @@ def get_facets_from_bq_table(table: Table) -> dict[str, BaseFacet]:
         facets["documentation"] = DocumentationDatasetFacet(description=table.description)
 
     if table.external_data_configuration:
-        symlinks = set()
-        for uri in table.external_data_configuration.source_uris:
-            if uri.startswith("gs://"):
-                bucket, blob = _parse_gcs_url(uri)
-                blob = extract_ds_name_from_gcs_path(blob)
-                symlinks.add((f"gs://{bucket}", blob))
-
+        symlinks = get_namespace_name_from_source_uris(table.external_data_configuration.source_uris)
         facets["symlink"] = SymlinksDatasetFacet(
             identifiers=[
-                Identifier(
+                Identifier(
+                    namespace=namespace, name=name, type="file" if namespace.startswith("gs://") else "table"
+                )
                 for namespace, name in sorted(symlinks)
             ]
         )
     return facets
 
 
+def get_namespace_name_from_source_uris(source_uris: Iterable[str]) -> set[tuple[str, str]]:
+    result = set()
+    for uri in source_uris:
+        if uri.startswith("gs://"):
+            bucket, blob = _parse_gcs_url(uri)
+            result.add((f"gs://{bucket}", extract_ds_name_from_gcs_path(blob)))
+        elif uri.startswith("https://googleapis.com/bigtable"):
+            regex = r"https://googleapis.com/bigtable/projects/([^/]+)/instances/([^/]+)(?:/appProfiles/([^/]+))?/tables/([^/]+)"
+            match = re.match(regex, uri)
+            if match:
+                project_id, instance_id, table_id = match.groups()[0], match.groups()[1], match.groups()[3]
+                result.add((f"bigtable://{project_id}/{instance_id}", table_id))
+    return result
+
+
 def get_identity_column_lineage_facet(
-    dest_field_names:
-    input_datasets:
-) -> dict[str,
+    dest_field_names: Iterable[str],
+    input_datasets: Iterable[Dataset],
+) -> dict[str, DatasetFacet]:
     """
     Get column lineage facet for identity transformations.
 
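The new get_namespace_name_from_source_uris helper above maps external-table source URIs to OpenLineage (namespace, name) pairs. A minimal sketch of that mapping follows; it assumes the same module path as above. The Bigtable result follows directly from the regex shown in the hunk, while the GCS name depends on extract_ds_name_from_gcs_path, whose full behaviour is not visible in this diff.

```python
# Illustrative sketch only: the import path is an assumption based on the file listing above.
from airflow.providers.google.cloud.openlineage.utils import get_namespace_name_from_source_uris

uris = [
    "gs://my-bucket/warehouse/orders/part-0001.parquet",
    "https://googleapis.com/bigtable/projects/my-project/instances/my-instance/tables/my-table",
]

pairs = get_namespace_name_from_source_uris(uris)
# The Bigtable URI yields ("bigtable://my-project/my-instance", "my-table");
# the GCS URI yields ("gs://my-bucket", <dataset name>), where the dataset name is
# whatever extract_ds_name_from_gcs_path derives from the blob path.
for namespace, name in sorted(pairs):
    print(namespace, name)
```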
@@ -200,31 +313,6 @@ def get_identity_column_lineage_facet(
     return {"columnLineage": column_lineage_facet}
 
 
-@define
-class BigQueryJobRunFacet(RunFacet):
-    """
-    Facet that represents relevant statistics of bigquery run.
-
-    This facet is used to provide statistics about bigquery run.
-
-    :param cached: BigQuery caches query results. Rest of the statistics will not be provided for cached queries.
-    :param billedBytes: How many bytes BigQuery bills for.
-    :param properties: Full property tree of BigQUery run.
-    """
-
-    cached: bool
-    billedBytes: int | None = field(default=None)
-    properties: str | None = field(default=None)
-
-    @staticmethod
-    def _get_schema() -> str:
-        return (
-            "https://raw.githubusercontent.com/apache/airflow/"
-            f"providers-google/{provider_version}/airflow/providers/google/"
-            "openlineage/BigQueryJobRunFacet.json"
-        )
-
-
 def get_from_nullable_chain(source: Any, chain: list[str]) -> Any | None:
     """
     Get object from nested structure of objects, where it's not guaranteed that all keys in the nested structure exist.
@@ -336,31 +424,38 @@ def _replace_dataproc_job_properties(job: dict, job_type: str, new_properties: d
 
 
 def inject_openlineage_properties_into_dataproc_job(
-    job: dict, context: Context, inject_parent_job_info: bool
+    job: dict, context: Context, inject_parent_job_info: bool, inject_transport_info: bool
 ) -> dict:
     """
     Inject OpenLineage properties into Spark job definition.
 
-
-
+    This function does not remove existing configurations or modify the job definition in any way,
+    except to add the required OpenLineage properties if they are not already present.
+
+    The entire properties injection process will be skipped if any condition is met:
+        - The OpenLineage provider is not accessible.
+        - The job type is unsupported.
+        - Both `inject_parent_job_info` and `inject_transport_info` are set to False.
 
-
-
-
-
-
-
-
+    Additionally, specific information will not be injected if relevant OpenLineage properties already exist.
+
+    Parent job information will not be injected if:
+        - Any property prefixed with `spark.openlineage.parent` exists.
+        - `inject_parent_job_info` is False.
+    Transport information will not be injected if:
+        - Any property prefixed with `spark.openlineage.transport` exists.
+        - `inject_transport_info` is False.
 
     Args:
         job: The original Dataproc job definition.
         context: The Airflow context in which the job is running.
         inject_parent_job_info: Flag indicating whether to inject parent job information.
+        inject_transport_info: Flag indicating whether to inject transport information.
 
     Returns:
         The modified job definition with OpenLineage properties injected, if applicable.
     """
-    if not inject_parent_job_info:
+    if not inject_parent_job_info and not inject_transport_info:
         log.debug("Automatic injection of OpenLineage information is disabled.")
         return job
 
@@ -380,9 +475,247 @@ def inject_openlineage_properties_into_dataproc_job(
 
     properties = job[job_type].get("properties", {})
 
-
+    if inject_parent_job_info:
+        log.debug("Injecting OpenLineage parent job information into Spark properties.")
+        properties = inject_parent_job_information_into_spark_properties(
+            properties=properties, context=context
+        )
+
+    if inject_transport_info:
+        log.debug("Injecting OpenLineage transport information into Spark properties.")
+        properties = inject_transport_information_into_spark_properties(
+            properties=properties, context=context
+        )
 
     job_with_ol_config = _replace_dataproc_job_properties(
         job=job, job_type=job_type, new_properties=properties
     )
     return job_with_ol_config
+
+
+def _is_dataproc_batch_of_supported_type(batch: dict | Batch) -> bool:
+    """
+    Check if a Dataproc batch is of a supported type for Openlineage automatic injection.
+
+    This function determines if the given batch is of a supported type
+    by checking for specific job type attributes or keys in the batch.
+
+    Args:
+        batch: The Dataproc batch to check.
+
+    Returns:
+        True if the batch is of a supported type (`spark_batch` or
+        `pyspark_batch`), otherwise False.
+    """
+    supported_job_types = ("spark_batch", "pyspark_batch")
+    if isinstance(batch, Batch):
+        if any(getattr(batch, job_type) for job_type in supported_job_types):
+            return True
+        return False
+
+    # For dictionary-based batch
+    if any(job_type in batch for job_type in supported_job_types):
+        return True
+    return False
+
+
+def _extract_dataproc_batch_properties(batch: dict | Batch) -> dict:
+    """
+    Extract Dataproc batch properties from a Batch object or dictionary.
+
+    This function retrieves the `properties` from the `runtime_config` of a
+    Dataproc `Batch` object or a dictionary representation of a batch.
+
+    Args:
+        batch: The Dataproc batch to extract properties from.
+
+    Returns:
+        Extracted `properties` if found, otherwise an empty dictionary.
+    """
+    if isinstance(batch, Batch):
+        return dict(batch.runtime_config.properties)
+
+    # For dictionary-based batch
+    run_time_config = batch.get("runtime_config", {})
+    if isinstance(run_time_config, RuntimeConfig):
+        return dict(run_time_config.properties)
+    return run_time_config.get("properties", {})
+
+
+def _replace_dataproc_batch_properties(batch: dict | Batch, new_properties: dict) -> dict | Batch:
+    """
+    Replace the properties of a Dataproc batch.
+
+    Args:
+        batch: The original Dataproc batch definition.
+        new_properties: The new properties to replace the existing ones.
+
+    Returns:
+        A modified copy of the Dataproc batch definition with updated properties.
+    """
+    batch = copy.deepcopy(batch)
+    if isinstance(batch, Batch):
+        if not batch.runtime_config:
+            batch.runtime_config = RuntimeConfig(properties=new_properties)
+        elif isinstance(batch.runtime_config, dict):
+            batch.runtime_config["properties"] = new_properties
+        else:
+            batch.runtime_config.properties = new_properties
+        return batch
+
+    # For dictionary-based batch
+    run_time_config = batch.get("runtime_config")
+    if not run_time_config:
+        batch["runtime_config"] = {"properties": new_properties}
+    elif isinstance(run_time_config, dict):
+        run_time_config["properties"] = new_properties
+    else:
+        run_time_config.properties = new_properties
+    return batch
+
+
+def inject_openlineage_properties_into_dataproc_batch(
+    batch: dict | Batch, context: Context, inject_parent_job_info: bool, inject_transport_info: bool
+) -> dict | Batch:
+    """
+    Inject OpenLineage properties into Dataproc batch definition.
+
+    This function does not remove existing configurations or modify the batch definition in any way,
+    except to add the required OpenLineage properties if they are not already present.
+
+    The entire properties injection process will be skipped if any condition is met:
+        - The OpenLineage provider is not accessible.
+        - The batch type is unsupported.
+        - Both `inject_parent_job_info` and `inject_transport_info` are set to False.
+
+    Additionally, specific information will not be injected if relevant OpenLineage properties already exist.
+
+    Parent job information will not be injected if:
+        - Any property prefixed with `spark.openlineage.parent` exists.
+        - `inject_parent_job_info` is False.
+    Transport information will not be injected if:
+        - Any property prefixed with `spark.openlineage.transport` exists.
+        - `inject_transport_info` is False.
+
+    Args:
+        batch: The original Dataproc batch definition.
+        context: The Airflow context in which the job is running.
+        inject_parent_job_info: Flag indicating whether to inject parent job information.
+        inject_transport_info: Flag indicating whether to inject transport information.
+
+    Returns:
+        The modified batch definition with OpenLineage properties injected, if applicable.
+    """
+    if not inject_parent_job_info and not inject_transport_info:
+        log.debug("Automatic injection of OpenLineage information is disabled.")
+        return batch
+
+    if not _is_openlineage_provider_accessible():
+        log.warning(
+            "Could not access OpenLineage provider for automatic OpenLineage "
+            "properties injection. No action will be performed."
+        )
+        return batch
+
+    if not _is_dataproc_batch_of_supported_type(batch):
+        log.warning(
+            "Could not find a supported Dataproc batch type for automatic OpenLineage "
+            "properties injection. No action will be performed.",
+        )
+        return batch
+
+    properties = _extract_dataproc_batch_properties(batch)
+
+    if inject_parent_job_info:
+        log.debug("Injecting OpenLineage parent job information into Spark properties.")
+        properties = inject_parent_job_information_into_spark_properties(
+            properties=properties, context=context
+        )
+
+    if inject_transport_info:
+        log.debug("Injecting OpenLineage transport information into Spark properties.")
+        properties = inject_transport_information_into_spark_properties(
+            properties=properties, context=context
+        )
+
+    batch_with_ol_config = _replace_dataproc_batch_properties(batch=batch, new_properties=properties)
+    return batch_with_ol_config
+
+
+def inject_openlineage_properties_into_dataproc_workflow_template(
+    template: dict, context: Context, inject_parent_job_info: bool, inject_transport_info: bool
+) -> dict:
+    """
+    Inject OpenLineage properties into all Spark jobs within Workflow Template.
+
+    This function does not remove existing configurations or modify the jobs definition in any way,
+    except to add the required OpenLineage properties if they are not already present.
+
+    The entire properties injection process for each job will be skipped if any condition is met:
+        - The OpenLineage provider is not accessible.
+        - The job type is unsupported.
+        - Both `inject_parent_job_info` and `inject_transport_info` are set to False.
+
+    Additionally, specific information will not be injected if relevant OpenLineage properties already exist.
+
+    Parent job information will not be injected if:
+        - Any property prefixed with `spark.openlineage.parent` exists.
+        - `inject_parent_job_info` is False.
+    Transport information will not be injected if:
+        - Any property prefixed with `spark.openlineage.transport` exists.
+        - `inject_transport_info` is False.
+
+    Args:
+        template: The original Dataproc Workflow Template definition.
+        context: The Airflow context in which the job is running.
+        inject_parent_job_info: Flag indicating whether to inject parent job information.
+        inject_transport_info: Flag indicating whether to inject transport information.
+
+    Returns:
+        The modified Workflow Template definition with OpenLineage properties injected, if applicable.
+    """
+    if not inject_parent_job_info and not inject_transport_info:
+        log.debug("Automatic injection of OpenLineage information is disabled.")
+        return template
+
+    if not _is_openlineage_provider_accessible():
+        log.warning(
+            "Could not access OpenLineage provider for automatic OpenLineage "
+            "properties injection. No action will be performed."
+        )
+        return template
+
+    final_jobs = []
+    for single_job_definition in template["jobs"]:
+        step_id = single_job_definition["step_id"]
+        log.debug("Injecting OpenLineage properties into Workflow step: `%s`", step_id)
+
+        if (job_type := _extract_supported_job_type_from_dataproc_job(single_job_definition)) is None:
+            log.debug(
+                "Could not find a supported Dataproc job type for automatic OpenLineage "
+                "properties injection. No action will be performed.",
+            )
+            final_jobs.append(single_job_definition)
+            continue
+
+        properties = single_job_definition[job_type].get("properties", {})
+
+        if inject_parent_job_info:
+            log.debug("Injecting OpenLineage parent job information into Spark properties.")
+            properties = inject_parent_job_information_into_spark_properties(
+                properties=properties, context=context
+            )
+
+        if inject_transport_info:
+            log.debug("Injecting OpenLineage transport information into Spark properties.")
+            properties = inject_transport_information_into_spark_properties(
+                properties=properties, context=context
+            )
+
+        job_with_ol_config = _replace_dataproc_job_properties(
+            job=single_job_definition, job_type=job_type, new_properties=properties
+        )
+        final_jobs.append(job_with_ol_config)
+
+    template["jobs"] = final_jobs
+    return template
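To make the new batch handling concrete, here is a small sketch of the dictionary-based path through _replace_dataproc_batch_properties. This is illustrative only: the function is a private helper, its module path (cloud/openlineage/utils.py) is inferred from the file listing, and the injected property name and value are placeholders rather than output captured from the provider.

```python
# Illustrative sketch only: _replace_dataproc_batch_properties is private and its module
# path is an assumption based on the file listing above.
from airflow.providers.google.cloud.openlineage.utils import _replace_dataproc_batch_properties

batch = {
    "pyspark_batch": {"main_python_file_uri": "gs://my-bucket/job.py"},
    "runtime_config": {"properties": {"spark.executor.instances": "2"}},
}

updated = _replace_dataproc_batch_properties(
    batch,
    new_properties={
        "spark.executor.instances": "2",
        "spark.openlineage.parentJobName": "my_dag.my_task",  # placeholder value
    },
)

# The helper deep-copies its input, so the original dict keeps the old properties
# while the returned copy carries the replaced ones.
print(batch["runtime_config"]["properties"])
print(updated["runtime_config"]["properties"])
```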