apache-airflow-providers-google 12.0.0rc2__py3-none-any.whl → 14.0.0__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries and is provided for informational purposes only.
Files changed (163)
  1. airflow/providers/google/LICENSE +0 -52
  2. airflow/providers/google/__init__.py +1 -1
  3. airflow/providers/google/ads/hooks/ads.py +27 -13
  4. airflow/providers/google/ads/transfers/ads_to_gcs.py +18 -4
  5. airflow/providers/google/assets/bigquery.py +17 -0
  6. airflow/providers/google/cloud/_internal_client/secret_manager_client.py +2 -3
  7. airflow/providers/google/cloud/hooks/alloy_db.py +736 -8
  8. airflow/providers/google/cloud/hooks/automl.py +10 -4
  9. airflow/providers/google/cloud/hooks/bigquery.py +125 -22
  10. airflow/providers/google/cloud/hooks/bigquery_dts.py +8 -8
  11. airflow/providers/google/cloud/hooks/bigtable.py +2 -3
  12. airflow/providers/google/cloud/hooks/cloud_batch.py +3 -4
  13. airflow/providers/google/cloud/hooks/cloud_build.py +4 -5
  14. airflow/providers/google/cloud/hooks/cloud_composer.py +3 -4
  15. airflow/providers/google/cloud/hooks/cloud_memorystore.py +3 -4
  16. airflow/providers/google/cloud/hooks/cloud_run.py +3 -4
  17. airflow/providers/google/cloud/hooks/cloud_sql.py +7 -3
  18. airflow/providers/google/cloud/hooks/cloud_storage_transfer_service.py +119 -7
  19. airflow/providers/google/cloud/hooks/compute.py +3 -3
  20. airflow/providers/google/cloud/hooks/datacatalog.py +3 -4
  21. airflow/providers/google/cloud/hooks/dataflow.py +12 -12
  22. airflow/providers/google/cloud/hooks/dataform.py +2 -3
  23. airflow/providers/google/cloud/hooks/datafusion.py +2 -2
  24. airflow/providers/google/cloud/hooks/dataplex.py +1032 -11
  25. airflow/providers/google/cloud/hooks/dataproc.py +4 -5
  26. airflow/providers/google/cloud/hooks/dataproc_metastore.py +3 -4
  27. airflow/providers/google/cloud/hooks/dlp.py +3 -4
  28. airflow/providers/google/cloud/hooks/gcs.py +7 -6
  29. airflow/providers/google/cloud/hooks/kms.py +2 -3
  30. airflow/providers/google/cloud/hooks/kubernetes_engine.py +8 -8
  31. airflow/providers/google/cloud/hooks/life_sciences.py +1 -1
  32. airflow/providers/google/cloud/hooks/managed_kafka.py +482 -0
  33. airflow/providers/google/cloud/hooks/natural_language.py +2 -3
  34. airflow/providers/google/cloud/hooks/os_login.py +2 -3
  35. airflow/providers/google/cloud/hooks/pubsub.py +6 -6
  36. airflow/providers/google/cloud/hooks/secret_manager.py +2 -3
  37. airflow/providers/google/cloud/hooks/spanner.py +2 -2
  38. airflow/providers/google/cloud/hooks/speech_to_text.py +2 -3
  39. airflow/providers/google/cloud/hooks/stackdriver.py +4 -4
  40. airflow/providers/google/cloud/hooks/tasks.py +3 -4
  41. airflow/providers/google/cloud/hooks/text_to_speech.py +2 -3
  42. airflow/providers/google/cloud/hooks/translate.py +236 -5
  43. airflow/providers/google/cloud/hooks/vertex_ai/auto_ml.py +9 -4
  44. airflow/providers/google/cloud/hooks/vertex_ai/batch_prediction_job.py +3 -4
  45. airflow/providers/google/cloud/hooks/vertex_ai/custom_job.py +4 -5
  46. airflow/providers/google/cloud/hooks/vertex_ai/dataset.py +3 -4
  47. airflow/providers/google/cloud/hooks/vertex_ai/endpoint_service.py +2 -3
  48. airflow/providers/google/cloud/hooks/vertex_ai/feature_store.py +3 -4
  49. airflow/providers/google/cloud/hooks/vertex_ai/generative_model.py +1 -181
  50. airflow/providers/google/cloud/hooks/vertex_ai/hyperparameter_tuning_job.py +3 -4
  51. airflow/providers/google/cloud/hooks/vertex_ai/model_service.py +2 -3
  52. airflow/providers/google/cloud/hooks/vertex_ai/pipeline_job.py +3 -4
  53. airflow/providers/google/cloud/hooks/vertex_ai/prediction_service.py +2 -3
  54. airflow/providers/google/cloud/hooks/video_intelligence.py +2 -3
  55. airflow/providers/google/cloud/hooks/vision.py +3 -4
  56. airflow/providers/google/cloud/hooks/workflows.py +2 -3
  57. airflow/providers/google/cloud/links/alloy_db.py +46 -0
  58. airflow/providers/google/cloud/links/bigquery.py +25 -0
  59. airflow/providers/google/cloud/links/dataplex.py +172 -2
  60. airflow/providers/google/cloud/links/kubernetes_engine.py +1 -2
  61. airflow/providers/google/cloud/links/managed_kafka.py +104 -0
  62. airflow/providers/google/cloud/links/translate.py +28 -0
  63. airflow/providers/google/cloud/log/gcs_task_handler.py +3 -3
  64. airflow/providers/google/cloud/log/stackdriver_task_handler.py +11 -10
  65. airflow/providers/google/cloud/openlineage/facets.py +67 -0
  66. airflow/providers/google/cloud/openlineage/mixins.py +438 -173
  67. airflow/providers/google/cloud/openlineage/utils.py +394 -61
  68. airflow/providers/google/cloud/operators/alloy_db.py +980 -69
  69. airflow/providers/google/cloud/operators/automl.py +83 -245
  70. airflow/providers/google/cloud/operators/bigquery.py +377 -74
  71. airflow/providers/google/cloud/operators/bigquery_dts.py +126 -13
  72. airflow/providers/google/cloud/operators/bigtable.py +1 -3
  73. airflow/providers/google/cloud/operators/cloud_base.py +1 -2
  74. airflow/providers/google/cloud/operators/cloud_batch.py +2 -4
  75. airflow/providers/google/cloud/operators/cloud_build.py +3 -5
  76. airflow/providers/google/cloud/operators/cloud_composer.py +5 -7
  77. airflow/providers/google/cloud/operators/cloud_memorystore.py +4 -6
  78. airflow/providers/google/cloud/operators/cloud_run.py +6 -5
  79. airflow/providers/google/cloud/operators/cloud_sql.py +20 -8
  80. airflow/providers/google/cloud/operators/cloud_storage_transfer_service.py +62 -8
  81. airflow/providers/google/cloud/operators/compute.py +3 -4
  82. airflow/providers/google/cloud/operators/datacatalog.py +9 -11
  83. airflow/providers/google/cloud/operators/dataflow.py +1 -112
  84. airflow/providers/google/cloud/operators/dataform.py +3 -5
  85. airflow/providers/google/cloud/operators/datafusion.py +1 -1
  86. airflow/providers/google/cloud/operators/dataplex.py +2046 -7
  87. airflow/providers/google/cloud/operators/dataproc.py +102 -17
  88. airflow/providers/google/cloud/operators/dataproc_metastore.py +7 -9
  89. airflow/providers/google/cloud/operators/dlp.py +17 -19
  90. airflow/providers/google/cloud/operators/gcs.py +14 -17
  91. airflow/providers/google/cloud/operators/kubernetes_engine.py +2 -2
  92. airflow/providers/google/cloud/operators/managed_kafka.py +788 -0
  93. airflow/providers/google/cloud/operators/natural_language.py +3 -5
  94. airflow/providers/google/cloud/operators/pubsub.py +39 -7
  95. airflow/providers/google/cloud/operators/speech_to_text.py +3 -5
  96. airflow/providers/google/cloud/operators/stackdriver.py +3 -5
  97. airflow/providers/google/cloud/operators/tasks.py +4 -6
  98. airflow/providers/google/cloud/operators/text_to_speech.py +2 -4
  99. airflow/providers/google/cloud/operators/translate.py +414 -5
  100. airflow/providers/google/cloud/operators/translate_speech.py +2 -4
  101. airflow/providers/google/cloud/operators/vertex_ai/auto_ml.py +9 -8
  102. airflow/providers/google/cloud/operators/vertex_ai/batch_prediction_job.py +4 -6
  103. airflow/providers/google/cloud/operators/vertex_ai/custom_job.py +6 -8
  104. airflow/providers/google/cloud/operators/vertex_ai/dataset.py +4 -6
  105. airflow/providers/google/cloud/operators/vertex_ai/endpoint_service.py +4 -6
  106. airflow/providers/google/cloud/operators/vertex_ai/generative_model.py +0 -322
  107. airflow/providers/google/cloud/operators/vertex_ai/hyperparameter_tuning_job.py +4 -6
  108. airflow/providers/google/cloud/operators/vertex_ai/model_service.py +4 -6
  109. airflow/providers/google/cloud/operators/vertex_ai/pipeline_job.py +4 -6
  110. airflow/providers/google/cloud/operators/video_intelligence.py +3 -5
  111. airflow/providers/google/cloud/operators/vision.py +4 -6
  112. airflow/providers/google/cloud/operators/workflows.py +5 -7
  113. airflow/providers/google/cloud/secrets/secret_manager.py +1 -2
  114. airflow/providers/google/cloud/sensors/bigquery_dts.py +3 -5
  115. airflow/providers/google/cloud/sensors/bigtable.py +2 -3
  116. airflow/providers/google/cloud/sensors/cloud_composer.py +32 -8
  117. airflow/providers/google/cloud/sensors/cloud_storage_transfer_service.py +39 -1
  118. airflow/providers/google/cloud/sensors/dataplex.py +4 -6
  119. airflow/providers/google/cloud/sensors/dataproc.py +2 -3
  120. airflow/providers/google/cloud/sensors/dataproc_metastore.py +1 -2
  121. airflow/providers/google/cloud/sensors/gcs.py +2 -4
  122. airflow/providers/google/cloud/sensors/pubsub.py +2 -3
  123. airflow/providers/google/cloud/sensors/workflows.py +3 -5
  124. airflow/providers/google/cloud/transfers/bigquery_to_gcs.py +5 -5
  125. airflow/providers/google/cloud/transfers/gcs_to_bigquery.py +10 -12
  126. airflow/providers/google/cloud/transfers/gcs_to_gcs.py +1 -1
  127. airflow/providers/google/cloud/transfers/gcs_to_sftp.py +36 -4
  128. airflow/providers/google/cloud/transfers/mssql_to_gcs.py +27 -2
  129. airflow/providers/google/cloud/transfers/mysql_to_gcs.py +27 -2
  130. airflow/providers/google/cloud/transfers/postgres_to_gcs.py +27 -2
  131. airflow/providers/google/cloud/transfers/sftp_to_gcs.py +34 -5
  132. airflow/providers/google/cloud/transfers/sql_to_gcs.py +15 -0
  133. airflow/providers/google/cloud/transfers/trino_to_gcs.py +25 -2
  134. airflow/providers/google/cloud/triggers/bigquery_dts.py +1 -2
  135. airflow/providers/google/cloud/triggers/cloud_batch.py +1 -2
  136. airflow/providers/google/cloud/triggers/cloud_build.py +1 -2
  137. airflow/providers/google/cloud/triggers/cloud_composer.py +13 -3
  138. airflow/providers/google/cloud/triggers/cloud_storage_transfer_service.py +102 -4
  139. airflow/providers/google/cloud/triggers/dataflow.py +2 -3
  140. airflow/providers/google/cloud/triggers/dataplex.py +1 -2
  141. airflow/providers/google/cloud/triggers/dataproc.py +2 -3
  142. airflow/providers/google/cloud/triggers/kubernetes_engine.py +1 -1
  143. airflow/providers/google/cloud/triggers/pubsub.py +1 -2
  144. airflow/providers/google/cloud/triggers/vertex_ai.py +7 -8
  145. airflow/providers/google/cloud/utils/credentials_provider.py +15 -8
  146. airflow/providers/google/cloud/utils/external_token_supplier.py +1 -0
  147. airflow/providers/google/common/auth_backend/google_openid.py +4 -4
  148. airflow/providers/google/common/consts.py +1 -2
  149. airflow/providers/google/common/hooks/base_google.py +8 -7
  150. airflow/providers/google/get_provider_info.py +186 -134
  151. airflow/providers/google/marketing_platform/hooks/analytics_admin.py +2 -3
  152. airflow/providers/google/marketing_platform/hooks/search_ads.py +1 -1
  153. airflow/providers/google/marketing_platform/operators/analytics_admin.py +5 -7
  154. {apache_airflow_providers_google-12.0.0rc2.dist-info → apache_airflow_providers_google-14.0.0.dist-info}/METADATA +41 -58
  155. {apache_airflow_providers_google-12.0.0rc2.dist-info → apache_airflow_providers_google-14.0.0.dist-info}/RECORD +157 -159
  156. airflow/providers/google/cloud/example_dags/example_facebook_ads_to_gcs.py +0 -141
  157. airflow/providers/google/cloud/example_dags/example_looker.py +0 -64
  158. airflow/providers/google/cloud/example_dags/example_presto_to_gcs.py +0 -194
  159. airflow/providers/google/cloud/example_dags/example_salesforce_to_gcs.py +0 -129
  160. airflow/providers/google/marketing_platform/example_dags/__init__.py +0 -16
  161. airflow/providers/google/marketing_platform/example_dags/example_display_video.py +0 -213
  162. {apache_airflow_providers_google-12.0.0rc2.dist-info → apache_airflow_providers_google-14.0.0.dist-info}/WHEEL +0 -0
  163. {apache_airflow_providers_google-12.0.0rc2.dist-info → apache_airflow_providers_google-14.0.0.dist-info}/entry_points.txt +0 -0
@@ -17,36 +17,38 @@
 # under the License.
 from __future__ import annotations
 
+import copy
 import logging
 import os
 import pathlib
+import re
+from collections import defaultdict
+from collections.abc import Iterable
 from typing import TYPE_CHECKING, Any
 
-from attr import define, field
-
-if TYPE_CHECKING:
-    from google.cloud.bigquery.table import Table
-
-    from airflow.providers.common.compat.openlineage.facet import Dataset
-    from airflow.utils.context import Context
-
 from airflow.providers.common.compat.openlineage.facet import (
-    BaseFacet,
     ColumnLineageDatasetFacet,
+    DatasetFacet,
    DocumentationDatasetFacet,
     Fields,
     Identifier,
     InputField,
-    RunFacet,
     SchemaDatasetFacet,
     SchemaDatasetFacetFields,
     SymlinksDatasetFacet,
 )
 from airflow.providers.common.compat.openlineage.utils.spark import (
     inject_parent_job_information_into_spark_properties,
+    inject_transport_information_into_spark_properties,
 )
-from airflow.providers.google import __version__ as provider_version
 from airflow.providers.google.cloud.hooks.gcs import _parse_gcs_url
+from google.cloud.dataproc_v1 import Batch, RuntimeConfig
+
+if TYPE_CHECKING:
+    from airflow.providers.common.compat.openlineage.facet import Dataset
+    from airflow.utils.context import Context
+    from google.cloud.bigquery.table import Table
+
 
 log = logging.getLogger(__name__)
 
@@ -55,6 +57,106 @@ BIGQUERY_URI = "bigquery"
 WILDCARD = "*"
 
 
+def merge_column_lineage_facets(facets: list[ColumnLineageDatasetFacet]) -> ColumnLineageDatasetFacet:
+    """
+    Merge multiple column lineage facets into a single consolidated facet.
+
+    Specifically, it aggregates input fields and transformations for each field across all provided facets.
+
+    Args:
+        facets: Column Lineage Facets to be merged.
+
+    Returns:
+        A new Column Lineage Facet containing all fields, their respective input fields and transformations.
+
+    Notes:
+        - Input fields are uniquely identified by their `(namespace, name, field)` tuple.
+        - If multiple facets contain the same field with the same input field, those input
+          fields are merged without duplication.
+        - Transformations associated with input fields are also merged. If transformations
+          are not supported by the version of the `InputField` class, they will be omitted.
+        - Transformation merging relies on a composite key of the field name and input field
+          tuple to track and consolidate transformations.
+
+    Examples:
+        Case 1: Two facets with the same input field
+
+        ```
+        >>> facet1 = ColumnLineageDatasetFacet(
+        ...     fields={"columnA": Fields(inputFields=[InputField("namespace1", "dataset1", "field1")])}
+        ... )
+        >>> facet2 = ColumnLineageDatasetFacet(
+        ...     fields={"columnA": Fields(inputFields=[InputField("namespace1", "dataset1", "field1")])}
+        ... )
+        >>> merged = merge_column_lineage_facets([facet1, facet2])
+        >>> merged.fields["columnA"].inputFields
+        [InputField("namespace1", "dataset1", "field1")]
+        ```
+
+        Case 2: Two facets with different transformations for the same input field
+
+        ```
+        >>> facet1 = ColumnLineageDatasetFacet(
+        ...     fields={
+        ...         "columnA": Fields(
+        ...             inputFields=[InputField("namespace1", "dataset1", "field1", transformations=["t1"])]
+        ...         )
+        ...     }
+        ... )
+        >>> facet2 = ColumnLineageDatasetFacet(
+        ...     fields={
+        ...         "columnA": Fields(
+        ...             inputFields=[InputField("namespace1", "dataset1", "field1", transformations=["t2"])]
+        ...         )
+        ...     }
+        ... )
+        >>> merged = merge_column_lineage_facets([facet1, facet2])
+        >>> merged.fields["columnA"].inputFields[0].transformations
+        ["t1", "t2"]
+        ```
+    """
+    # Dictionary to collect all unique input fields for each field name
+    fields_sources: dict[str, set[tuple[str, str, str]]] = defaultdict(set)
+    # Dictionary to aggregate transformations for each input field
+    transformations: dict[str, list] = defaultdict(list)
+
+    for facet in facets:
+        for field_name, single_field in facet.fields.items():
+            for input_field in single_field.inputFields:
+                input_key_fields = (input_field.namespace, input_field.name, input_field.field)
+                fields_sources[field_name].add(input_key_fields)
+
+                if single_transformations := getattr(input_field, "transformations", []):
+                    transformation_key = "".join((field_name, *input_key_fields))
+                    transformations[transformation_key].extend(single_transformations)
+
+    # Check if the `InputField` class supports the `transformations` attribute (since OL client 1.17.1)
+    input_field_allows_transformation_info = True
+    try:
+        InputField(namespace="a", name="b", field="c", transformations=[])
+    except TypeError:
+        input_field_allows_transformation_info = False
+
+    return ColumnLineageDatasetFacet(
+        fields={
+            field_name: Fields(
+                inputFields=[
+                    InputField(
+                        namespace,
+                        name,
+                        column,
+                        transformations.get("".join((field_name, namespace, name, column)), []),
+                    )
+                    if input_field_allows_transformation_info
+                    else InputField(namespace, name, column)
+                    for namespace, name, column in sorted(input_fields)
+                ],
+                transformationType="",  # Legacy transformation information
+                transformationDescription="",  # Legacy transformation information
+            )
+            for field_name, input_fields in fields_sources.items()
+        }
+    )
+
+
 def extract_ds_name_from_gcs_path(path: str) -> str:
     """
     Extract and process the dataset name from a given path.
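
Reviewer note: a minimal usage sketch of the new `merge_column_lineage_facets` helper, assuming this hunk belongs to `airflow/providers/google/cloud/openlineage/utils.py` (entry 67 in the file list). The facet contents are invented and only illustrate merging distinct input columns that feed the same output column; this is not code from the diff.

```python
from airflow.providers.common.compat.openlineage.facet import (
    ColumnLineageDatasetFacet,
    Fields,
    InputField,
)
from airflow.providers.google.cloud.openlineage.utils import merge_column_lineage_facets

# Hypothetical facets, e.g. collected from two query jobs that both write the same table.
facet_a = ColumnLineageDatasetFacet(
    fields={"total": Fields(inputFields=[InputField("bigquery", "proj.ds.orders", "amount")])}
)
facet_b = ColumnLineageDatasetFacet(
    fields={"total": Fields(inputFields=[InputField("bigquery", "proj.ds.refunds", "amount")])}
)

merged = merge_column_lineage_facets([facet_a, facet_b])
# merged.fields["total"].inputFields now lists both source columns exactly once,
# deduplicated by their (namespace, name, field) tuple.
```
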
@@ -108,9 +210,9 @@ def extract_ds_name_from_gcs_path(path: str) -> str:
     return path
 
 
-def get_facets_from_bq_table(table: Table) -> dict[str, BaseFacet]:
+def get_facets_from_bq_table(table: Table) -> dict[str, DatasetFacet]:
     """Get facets from BigQuery table object."""
-    facets: dict[str, BaseFacet] = {}
+    facets: dict[str, DatasetFacet] = {}
     if table.schema:
         facets["schema"] = SchemaDatasetFacet(
             fields=[
@@ -124,26 +226,37 @@ def get_facets_from_bq_table(table: Table) -> dict[str, BaseFacet]:
         facets["documentation"] = DocumentationDatasetFacet(description=table.description)
 
     if table.external_data_configuration:
-        symlinks = set()
-        for uri in table.external_data_configuration.source_uris:
-            if uri.startswith("gs://"):
-                bucket, blob = _parse_gcs_url(uri)
-                blob = extract_ds_name_from_gcs_path(blob)
-                symlinks.add((f"gs://{bucket}", blob))
-
+        symlinks = get_namespace_name_from_source_uris(table.external_data_configuration.source_uris)
         facets["symlink"] = SymlinksDatasetFacet(
             identifiers=[
-                Identifier(namespace=namespace, name=name, type="file")
+                Identifier(
+                    namespace=namespace, name=name, type="file" if namespace.startswith("gs://") else "table"
+                )
                 for namespace, name in sorted(symlinks)
             ]
         )
     return facets
 
 
+def get_namespace_name_from_source_uris(source_uris: Iterable[str]) -> set[tuple[str, str]]:
+    result = set()
+    for uri in source_uris:
+        if uri.startswith("gs://"):
+            bucket, blob = _parse_gcs_url(uri)
+            result.add((f"gs://{bucket}", extract_ds_name_from_gcs_path(blob)))
+        elif uri.startswith("https://googleapis.com/bigtable"):
+            regex = r"https://googleapis.com/bigtable/projects/([^/]+)/instances/([^/]+)(?:/appProfiles/([^/]+))?/tables/([^/]+)"
+            match = re.match(regex, uri)
+            if match:
+                project_id, instance_id, table_id = match.groups()[0], match.groups()[1], match.groups()[3]
+                result.add((f"bigtable://{project_id}/{instance_id}", table_id))
+    return result
+
+
 def get_identity_column_lineage_facet(
-    dest_field_names: list[str],
-    input_datasets: list[Dataset],
-) -> dict[str, ColumnLineageDatasetFacet]:
+    dest_field_names: Iterable[str],
+    input_datasets: Iterable[Dataset],
+) -> dict[str, DatasetFacet]:
     """
     Get column lineage facet for identity transformations.
 
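
Reviewer note: to make the Bigtable branch of `get_namespace_name_from_source_uris` concrete, here is a standalone sketch of the same regular expression with invented URI values; the GCS branch simply reuses `_parse_gcs_url` and `extract_ds_name_from_gcs_path` from this file.

```python
import re

# Same pattern as in the hunk above, split across two string literals only for readability.
BIGTABLE_REGEX = (
    r"https://googleapis.com/bigtable/projects/([^/]+)/instances/([^/]+)"
    r"(?:/appProfiles/([^/]+))?/tables/([^/]+)"
)

uri = "https://googleapis.com/bigtable/projects/my-project/instances/my-instance/tables/my-table"
match = re.match(BIGTABLE_REGEX, uri)
assert match is not None
project_id, instance_id, _app_profile, table_id = match.groups()
print(f"bigtable://{project_id}/{instance_id}", table_id)
# -> bigtable://my-project/my-instance my-table
```
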
@@ -200,31 +313,6 @@ def get_identity_column_lineage_facet(
     return {"columnLineage": column_lineage_facet}
 
 
-@define
-class BigQueryJobRunFacet(RunFacet):
-    """
-    Facet that represents relevant statistics of bigquery run.
-
-    This facet is used to provide statistics about bigquery run.
-
-    :param cached: BigQuery caches query results. Rest of the statistics will not be provided for cached queries.
-    :param billedBytes: How many bytes BigQuery bills for.
-    :param properties: Full property tree of BigQUery run.
-    """
-
-    cached: bool
-    billedBytes: int | None = field(default=None)
-    properties: str | None = field(default=None)
-
-    @staticmethod
-    def _get_schema() -> str:
-        return (
-            "https://raw.githubusercontent.com/apache/airflow/"
-            f"providers-google/{provider_version}/airflow/providers/google/"
-            "openlineage/BigQueryJobRunFacet.json"
-        )
-
-
 def get_from_nullable_chain(source: Any, chain: list[str]) -> Any | None:
     """
     Get object from nested structure of objects, where it's not guaranteed that all keys in the nested structure exist.
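
Reviewer note: the removal of `BigQueryJobRunFacet` here appears to line up with the new `airflow/providers/google/cloud/openlineage/facets.py` module added in this release (entry 65 in the file list, +67 lines). Code that previously imported the facet from `utils` would presumably switch to an import like the one below; the new location is inferred from the file list, not shown in this hunk.

```python
# Inferred new import path; verify against facets.py in 14.0.0 before relying on it.
from airflow.providers.google.cloud.openlineage.facets import BigQueryJobRunFacet
```
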
@@ -336,31 +424,38 @@ def _replace_dataproc_job_properties(job: dict, job_type: str, new_properties: d
 
 
 def inject_openlineage_properties_into_dataproc_job(
-    job: dict, context: Context, inject_parent_job_info: bool
+    job: dict, context: Context, inject_parent_job_info: bool, inject_transport_info: bool
 ) -> dict:
     """
     Inject OpenLineage properties into Spark job definition.
 
-    Function is not removing any configuration or modifying the job in any other way,
-    apart from adding desired OpenLineage properties to Dataproc job definition if not already present.
+    This function does not remove existing configurations or modify the job definition in any way,
+    except to add the required OpenLineage properties if they are not already present.
+
+    The entire properties injection process will be skipped if any condition is met:
+        - The OpenLineage provider is not accessible.
+        - The job type is unsupported.
+        - Both `inject_parent_job_info` and `inject_transport_info` are set to False.
 
-    Note:
-        Any modification to job will be skipped if:
-            - OpenLineage provider is not accessible.
-            - The job type is not supported.
-            - Automatic parent job information injection is disabled.
-            - Any OpenLineage properties with parent job information are already present
-              in the Spark job definition.
+    Additionally, specific information will not be injected if relevant OpenLineage properties already exist.
+
+    Parent job information will not be injected if:
+        - Any property prefixed with `spark.openlineage.parent` exists.
+        - `inject_parent_job_info` is False.
+    Transport information will not be injected if:
+        - Any property prefixed with `spark.openlineage.transport` exists.
+        - `inject_transport_info` is False.
 
     Args:
         job: The original Dataproc job definition.
         context: The Airflow context in which the job is running.
         inject_parent_job_info: Flag indicating whether to inject parent job information.
+        inject_transport_info: Flag indicating whether to inject transport information.
 
     Returns:
         The modified job definition with OpenLineage properties injected, if applicable.
     """
-    if not inject_parent_job_info:
+    if not inject_parent_job_info and not inject_transport_info:
         log.debug("Automatic injection of OpenLineage information is disabled.")
         return job
 
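
Reviewer note: a hypothetical call site for the updated `inject_openlineage_properties_into_dataproc_job` signature, e.g. from an operator's `execute(context)`. The wrapper name and job values are invented, and `pyspark_job` is assumed to be among the job types accepted by `_extract_supported_job_type_from_dataproc_job`.

```python
from __future__ import annotations

from airflow.providers.google.cloud.openlineage.utils import (
    inject_openlineage_properties_into_dataproc_job,
)


def add_openlineage_properties(job: dict, context) -> dict:
    """Hypothetical wrapper called with the Airflow task context."""
    return inject_openlineage_properties_into_dataproc_job(
        job=job,
        context=context,
        inject_parent_job_info=True,
        inject_transport_info=False,
    )


# Shape of a job definition the helper expects (values invented).
example_job = {
    "placement": {"cluster_name": "my-cluster"},
    "pyspark_job": {"main_python_file_uri": "gs://my-bucket/jobs/etl.py"},
}
# If the OpenLineage provider is importable and no `spark.openlineage.parent*` keys exist yet,
# the returned job definition gains parent-job properties under ["pyspark_job"]["properties"].
```
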
@@ -380,9 +475,247 @@ def inject_openlineage_properties_into_dataproc_job(
 
     properties = job[job_type].get("properties", {})
 
-    properties = inject_parent_job_information_into_spark_properties(properties=properties, context=context)
+    if inject_parent_job_info:
+        log.debug("Injecting OpenLineage parent job information into Spark properties.")
+        properties = inject_parent_job_information_into_spark_properties(
+            properties=properties, context=context
+        )
+
+    if inject_transport_info:
+        log.debug("Injecting OpenLineage transport information into Spark properties.")
+        properties = inject_transport_information_into_spark_properties(
+            properties=properties, context=context
+        )
 
     job_with_ol_config = _replace_dataproc_job_properties(
         job=job, job_type=job_type, new_properties=properties
     )
     return job_with_ol_config
+
+
+def _is_dataproc_batch_of_supported_type(batch: dict | Batch) -> bool:
+    """
+    Check if a Dataproc batch is of a supported type for Openlineage automatic injection.
+
+    This function determines if the given batch is of a supported type
+    by checking for specific job type attributes or keys in the batch.
+
+    Args:
+        batch: The Dataproc batch to check.
+
+    Returns:
+        True if the batch is of a supported type (`spark_batch` or
+        `pyspark_batch`), otherwise False.
+    """
+    supported_job_types = ("spark_batch", "pyspark_batch")
+    if isinstance(batch, Batch):
+        if any(getattr(batch, job_type) for job_type in supported_job_types):
+            return True
+        return False
+
+    # For dictionary-based batch
+    if any(job_type in batch for job_type in supported_job_types):
+        return True
+    return False
+
+
+def _extract_dataproc_batch_properties(batch: dict | Batch) -> dict:
+    """
+    Extract Dataproc batch properties from a Batch object or dictionary.
+
+    This function retrieves the `properties` from the `runtime_config` of a
+    Dataproc `Batch` object or a dictionary representation of a batch.
+
+    Args:
+        batch: The Dataproc batch to extract properties from.
+
+    Returns:
+        Extracted `properties` if found, otherwise an empty dictionary.
+    """
+    if isinstance(batch, Batch):
+        return dict(batch.runtime_config.properties)
+
+    # For dictionary-based batch
+    run_time_config = batch.get("runtime_config", {})
+    if isinstance(run_time_config, RuntimeConfig):
+        return dict(run_time_config.properties)
+    return run_time_config.get("properties", {})
+
+
+def _replace_dataproc_batch_properties(batch: dict | Batch, new_properties: dict) -> dict | Batch:
+    """
+    Replace the properties of a Dataproc batch.
+
+    Args:
+        batch: The original Dataproc batch definition.
+        new_properties: The new properties to replace the existing ones.
+
+    Returns:
+        A modified copy of the Dataproc batch definition with updated properties.
+    """
+    batch = copy.deepcopy(batch)
+    if isinstance(batch, Batch):
+        if not batch.runtime_config:
+            batch.runtime_config = RuntimeConfig(properties=new_properties)
+        elif isinstance(batch.runtime_config, dict):
+            batch.runtime_config["properties"] = new_properties
+        else:
+            batch.runtime_config.properties = new_properties
+        return batch
+
+    # For dictionary-based batch
+    run_time_config = batch.get("runtime_config")
+    if not run_time_config:
+        batch["runtime_config"] = {"properties": new_properties}
+    elif isinstance(run_time_config, dict):
+        run_time_config["properties"] = new_properties
+    else:
+        run_time_config.properties = new_properties
+    return batch
+
+
+def inject_openlineage_properties_into_dataproc_batch(
+    batch: dict | Batch, context: Context, inject_parent_job_info: bool, inject_transport_info: bool
+) -> dict | Batch:
+    """
+    Inject OpenLineage properties into Dataproc batch definition.
+
+    This function does not remove existing configurations or modify the batch definition in any way,
+    except to add the required OpenLineage properties if they are not already present.
+
+    The entire properties injection process will be skipped if any condition is met:
+        - The OpenLineage provider is not accessible.
+        - The batch type is unsupported.
+        - Both `inject_parent_job_info` and `inject_transport_info` are set to False.
+
+    Additionally, specific information will not be injected if relevant OpenLineage properties already exist.
+
+    Parent job information will not be injected if:
+        - Any property prefixed with `spark.openlineage.parent` exists.
+        - `inject_parent_job_info` is False.
+    Transport information will not be injected if:
+        - Any property prefixed with `spark.openlineage.transport` exists.
+        - `inject_transport_info` is False.
+
+    Args:
+        batch: The original Dataproc batch definition.
+        context: The Airflow context in which the job is running.
+        inject_parent_job_info: Flag indicating whether to inject parent job information.
+        inject_transport_info: Flag indicating whether to inject transport information.
+
+    Returns:
+        The modified batch definition with OpenLineage properties injected, if applicable.
+    """
+    if not inject_parent_job_info and not inject_transport_info:
+        log.debug("Automatic injection of OpenLineage information is disabled.")
+        return batch
+
+    if not _is_openlineage_provider_accessible():
+        log.warning(
+            "Could not access OpenLineage provider for automatic OpenLineage "
+            "properties injection. No action will be performed."
+        )
+        return batch
+
+    if not _is_dataproc_batch_of_supported_type(batch):
+        log.warning(
+            "Could not find a supported Dataproc batch type for automatic OpenLineage "
+            "properties injection. No action will be performed.",
+        )
+        return batch
+
+    properties = _extract_dataproc_batch_properties(batch)
+
+    if inject_parent_job_info:
+        log.debug("Injecting OpenLineage parent job information into Spark properties.")
+        properties = inject_parent_job_information_into_spark_properties(
+            properties=properties, context=context
+        )
+
+    if inject_transport_info:
+        log.debug("Injecting OpenLineage transport information into Spark properties.")
+        properties = inject_transport_information_into_spark_properties(
+            properties=properties, context=context
+        )
+
+    batch_with_ol_config = _replace_dataproc_batch_properties(batch=batch, new_properties=properties)
+    return batch_with_ol_config
+
+
+def inject_openlineage_properties_into_dataproc_workflow_template(
+    template: dict, context: Context, inject_parent_job_info: bool, inject_transport_info: bool
+) -> dict:
+    """
+    Inject OpenLineage properties into all Spark jobs within Workflow Template.
+
+    This function does not remove existing configurations or modify the jobs definition in any way,
+    except to add the required OpenLineage properties if they are not already present.
+
+    The entire properties injection process for each job will be skipped if any condition is met:
+        - The OpenLineage provider is not accessible.
+        - The job type is unsupported.
+        - Both `inject_parent_job_info` and `inject_transport_info` are set to False.
+
+    Additionally, specific information will not be injected if relevant OpenLineage properties already exist.
+
+    Parent job information will not be injected if:
+        - Any property prefixed with `spark.openlineage.parent` exists.
+        - `inject_parent_job_info` is False.
+    Transport information will not be injected if:
+        - Any property prefixed with `spark.openlineage.transport` exists.
+        - `inject_transport_info` is False.
+
+    Args:
+        template: The original Dataproc Workflow Template definition.
+        context: The Airflow context in which the job is running.
+        inject_parent_job_info: Flag indicating whether to inject parent job information.
+        inject_transport_info: Flag indicating whether to inject transport information.
+
+    Returns:
+        The modified Workflow Template definition with OpenLineage properties injected, if applicable.
+    """
+    if not inject_parent_job_info and not inject_transport_info:
+        log.debug("Automatic injection of OpenLineage information is disabled.")
+        return template
+
+    if not _is_openlineage_provider_accessible():
+        log.warning(
+            "Could not access OpenLineage provider for automatic OpenLineage "
+            "properties injection. No action will be performed."
+        )
+        return template
+
+    final_jobs = []
+    for single_job_definition in template["jobs"]:
+        step_id = single_job_definition["step_id"]
+        log.debug("Injecting OpenLineage properties into Workflow step: `%s`", step_id)
+
+        if (job_type := _extract_supported_job_type_from_dataproc_job(single_job_definition)) is None:
+            log.debug(
+                "Could not find a supported Dataproc job type for automatic OpenLineage "
+                "properties injection. No action will be performed.",
+            )
+            final_jobs.append(single_job_definition)
+            continue
+
+        properties = single_job_definition[job_type].get("properties", {})
+
+        if inject_parent_job_info:
+            log.debug("Injecting OpenLineage parent job information into Spark properties.")
+            properties = inject_parent_job_information_into_spark_properties(
+                properties=properties, context=context
+            )
+
+        if inject_transport_info:
+            log.debug("Injecting OpenLineage transport information into Spark properties.")
+            properties = inject_transport_information_into_spark_properties(
+                properties=properties, context=context
+            )
+
+        job_with_ol_config = _replace_dataproc_job_properties(
+            job=single_job_definition, job_type=job_type, new_properties=properties
+        )
+        final_jobs.append(job_with_ol_config)
+
+    template["jobs"] = final_jobs
+    return template
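
Reviewer note: the batch helper accepts either a plain dict or a `google.cloud.dataproc_v1.Batch` message, as the hunk above shows. Below is a minimal, hypothetical sketch of both shapes and of a wrapper call site (names and values invented); the operator-level flags that ultimately drive `inject_parent_job_info` / `inject_transport_info` are outside this hunk.

```python
from __future__ import annotations

from google.cloud.dataproc_v1 import Batch, RuntimeConfig

from airflow.providers.google.cloud.openlineage.utils import (
    inject_openlineage_properties_into_dataproc_batch,
)


def add_openlineage_properties_to_batch(batch: dict | Batch, context) -> dict | Batch:
    """Hypothetical wrapper called with the Airflow task context."""
    return inject_openlineage_properties_into_dataproc_batch(
        batch=batch,
        context=context,
        inject_parent_job_info=True,
        inject_transport_info=True,
    )


# Both representations satisfy _is_dataproc_batch_of_supported_type (values invented).
batch_as_dict = {"pyspark_batch": {"main_python_file_uri": "gs://my-bucket/jobs/etl.py"}}
batch_as_message = Batch(
    pyspark_batch={"main_python_file_uri": "gs://my-bucket/jobs/etl.py"},
    runtime_config=RuntimeConfig(properties={"spark.executor.memory": "4g"}),
)
```
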