apache-airflow-providers-google 12.0.0rc1__py3-none-any.whl → 13.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163)
  1. airflow/providers/google/LICENSE +0 -52
  2. airflow/providers/google/__init__.py +1 -1
  3. airflow/providers/google/ads/hooks/ads.py +27 -13
  4. airflow/providers/google/ads/transfers/ads_to_gcs.py +18 -4
  5. airflow/providers/google/assets/bigquery.py +17 -0
  6. airflow/providers/google/cloud/_internal_client/secret_manager_client.py +2 -3
  7. airflow/providers/google/cloud/hooks/alloy_db.py +736 -8
  8. airflow/providers/google/cloud/hooks/automl.py +10 -4
  9. airflow/providers/google/cloud/hooks/bigquery.py +125 -22
  10. airflow/providers/google/cloud/hooks/bigquery_dts.py +8 -8
  11. airflow/providers/google/cloud/hooks/bigtable.py +2 -3
  12. airflow/providers/google/cloud/hooks/cloud_batch.py +3 -4
  13. airflow/providers/google/cloud/hooks/cloud_build.py +4 -5
  14. airflow/providers/google/cloud/hooks/cloud_composer.py +3 -4
  15. airflow/providers/google/cloud/hooks/cloud_memorystore.py +3 -4
  16. airflow/providers/google/cloud/hooks/cloud_run.py +3 -4
  17. airflow/providers/google/cloud/hooks/cloud_sql.py +7 -3
  18. airflow/providers/google/cloud/hooks/cloud_storage_transfer_service.py +119 -7
  19. airflow/providers/google/cloud/hooks/compute.py +3 -3
  20. airflow/providers/google/cloud/hooks/datacatalog.py +3 -4
  21. airflow/providers/google/cloud/hooks/dataflow.py +12 -12
  22. airflow/providers/google/cloud/hooks/dataform.py +2 -3
  23. airflow/providers/google/cloud/hooks/datafusion.py +2 -2
  24. airflow/providers/google/cloud/hooks/dataplex.py +1032 -11
  25. airflow/providers/google/cloud/hooks/dataproc.py +4 -5
  26. airflow/providers/google/cloud/hooks/dataproc_metastore.py +3 -4
  27. airflow/providers/google/cloud/hooks/dlp.py +3 -4
  28. airflow/providers/google/cloud/hooks/gcs.py +7 -6
  29. airflow/providers/google/cloud/hooks/kms.py +2 -3
  30. airflow/providers/google/cloud/hooks/kubernetes_engine.py +8 -8
  31. airflow/providers/google/cloud/hooks/life_sciences.py +1 -1
  32. airflow/providers/google/cloud/hooks/managed_kafka.py +482 -0
  33. airflow/providers/google/cloud/hooks/natural_language.py +2 -3
  34. airflow/providers/google/cloud/hooks/os_login.py +2 -3
  35. airflow/providers/google/cloud/hooks/pubsub.py +6 -6
  36. airflow/providers/google/cloud/hooks/secret_manager.py +2 -3
  37. airflow/providers/google/cloud/hooks/spanner.py +2 -2
  38. airflow/providers/google/cloud/hooks/speech_to_text.py +2 -3
  39. airflow/providers/google/cloud/hooks/stackdriver.py +4 -4
  40. airflow/providers/google/cloud/hooks/tasks.py +3 -4
  41. airflow/providers/google/cloud/hooks/text_to_speech.py +2 -3
  42. airflow/providers/google/cloud/hooks/translate.py +236 -5
  43. airflow/providers/google/cloud/hooks/vertex_ai/auto_ml.py +9 -4
  44. airflow/providers/google/cloud/hooks/vertex_ai/batch_prediction_job.py +3 -4
  45. airflow/providers/google/cloud/hooks/vertex_ai/custom_job.py +4 -5
  46. airflow/providers/google/cloud/hooks/vertex_ai/dataset.py +3 -4
  47. airflow/providers/google/cloud/hooks/vertex_ai/endpoint_service.py +2 -3
  48. airflow/providers/google/cloud/hooks/vertex_ai/feature_store.py +3 -4
  49. airflow/providers/google/cloud/hooks/vertex_ai/generative_model.py +1 -181
  50. airflow/providers/google/cloud/hooks/vertex_ai/hyperparameter_tuning_job.py +3 -4
  51. airflow/providers/google/cloud/hooks/vertex_ai/model_service.py +2 -3
  52. airflow/providers/google/cloud/hooks/vertex_ai/pipeline_job.py +3 -4
  53. airflow/providers/google/cloud/hooks/vertex_ai/prediction_service.py +2 -3
  54. airflow/providers/google/cloud/hooks/video_intelligence.py +2 -3
  55. airflow/providers/google/cloud/hooks/vision.py +3 -4
  56. airflow/providers/google/cloud/hooks/workflows.py +2 -3
  57. airflow/providers/google/cloud/links/alloy_db.py +46 -0
  58. airflow/providers/google/cloud/links/bigquery.py +25 -0
  59. airflow/providers/google/cloud/links/dataplex.py +172 -2
  60. airflow/providers/google/cloud/links/kubernetes_engine.py +1 -2
  61. airflow/providers/google/cloud/links/managed_kafka.py +104 -0
  62. airflow/providers/google/cloud/links/translate.py +28 -0
  63. airflow/providers/google/cloud/log/gcs_task_handler.py +3 -3
  64. airflow/providers/google/cloud/log/stackdriver_task_handler.py +11 -10
  65. airflow/providers/google/cloud/openlineage/facets.py +67 -0
  66. airflow/providers/google/cloud/openlineage/mixins.py +438 -173
  67. airflow/providers/google/cloud/openlineage/utils.py +394 -61
  68. airflow/providers/google/cloud/operators/alloy_db.py +980 -69
  69. airflow/providers/google/cloud/operators/automl.py +83 -245
  70. airflow/providers/google/cloud/operators/bigquery.py +377 -74
  71. airflow/providers/google/cloud/operators/bigquery_dts.py +126 -13
  72. airflow/providers/google/cloud/operators/bigtable.py +1 -3
  73. airflow/providers/google/cloud/operators/cloud_base.py +1 -2
  74. airflow/providers/google/cloud/operators/cloud_batch.py +2 -4
  75. airflow/providers/google/cloud/operators/cloud_build.py +3 -5
  76. airflow/providers/google/cloud/operators/cloud_composer.py +5 -7
  77. airflow/providers/google/cloud/operators/cloud_memorystore.py +4 -6
  78. airflow/providers/google/cloud/operators/cloud_run.py +6 -5
  79. airflow/providers/google/cloud/operators/cloud_sql.py +20 -8
  80. airflow/providers/google/cloud/operators/cloud_storage_transfer_service.py +62 -8
  81. airflow/providers/google/cloud/operators/compute.py +3 -4
  82. airflow/providers/google/cloud/operators/datacatalog.py +9 -11
  83. airflow/providers/google/cloud/operators/dataflow.py +1 -112
  84. airflow/providers/google/cloud/operators/dataform.py +3 -5
  85. airflow/providers/google/cloud/operators/datafusion.py +1 -1
  86. airflow/providers/google/cloud/operators/dataplex.py +2046 -7
  87. airflow/providers/google/cloud/operators/dataproc.py +102 -17
  88. airflow/providers/google/cloud/operators/dataproc_metastore.py +7 -9
  89. airflow/providers/google/cloud/operators/dlp.py +17 -19
  90. airflow/providers/google/cloud/operators/gcs.py +14 -17
  91. airflow/providers/google/cloud/operators/kubernetes_engine.py +2 -2
  92. airflow/providers/google/cloud/operators/managed_kafka.py +788 -0
  93. airflow/providers/google/cloud/operators/natural_language.py +3 -5
  94. airflow/providers/google/cloud/operators/pubsub.py +39 -7
  95. airflow/providers/google/cloud/operators/speech_to_text.py +3 -5
  96. airflow/providers/google/cloud/operators/stackdriver.py +3 -5
  97. airflow/providers/google/cloud/operators/tasks.py +4 -6
  98. airflow/providers/google/cloud/operators/text_to_speech.py +2 -4
  99. airflow/providers/google/cloud/operators/translate.py +414 -5
  100. airflow/providers/google/cloud/operators/translate_speech.py +2 -4
  101. airflow/providers/google/cloud/operators/vertex_ai/auto_ml.py +9 -8
  102. airflow/providers/google/cloud/operators/vertex_ai/batch_prediction_job.py +4 -6
  103. airflow/providers/google/cloud/operators/vertex_ai/custom_job.py +6 -8
  104. airflow/providers/google/cloud/operators/vertex_ai/dataset.py +4 -6
  105. airflow/providers/google/cloud/operators/vertex_ai/endpoint_service.py +4 -6
  106. airflow/providers/google/cloud/operators/vertex_ai/generative_model.py +0 -322
  107. airflow/providers/google/cloud/operators/vertex_ai/hyperparameter_tuning_job.py +4 -6
  108. airflow/providers/google/cloud/operators/vertex_ai/model_service.py +4 -6
  109. airflow/providers/google/cloud/operators/vertex_ai/pipeline_job.py +4 -6
  110. airflow/providers/google/cloud/operators/video_intelligence.py +3 -5
  111. airflow/providers/google/cloud/operators/vision.py +4 -6
  112. airflow/providers/google/cloud/operators/workflows.py +5 -7
  113. airflow/providers/google/cloud/secrets/secret_manager.py +1 -2
  114. airflow/providers/google/cloud/sensors/bigquery_dts.py +3 -5
  115. airflow/providers/google/cloud/sensors/bigtable.py +2 -3
  116. airflow/providers/google/cloud/sensors/cloud_composer.py +32 -8
  117. airflow/providers/google/cloud/sensors/cloud_storage_transfer_service.py +39 -1
  118. airflow/providers/google/cloud/sensors/dataplex.py +4 -6
  119. airflow/providers/google/cloud/sensors/dataproc.py +2 -3
  120. airflow/providers/google/cloud/sensors/dataproc_metastore.py +1 -2
  121. airflow/providers/google/cloud/sensors/gcs.py +2 -4
  122. airflow/providers/google/cloud/sensors/pubsub.py +2 -3
  123. airflow/providers/google/cloud/sensors/workflows.py +3 -5
  124. airflow/providers/google/cloud/transfers/bigquery_to_gcs.py +5 -5
  125. airflow/providers/google/cloud/transfers/gcs_to_bigquery.py +10 -12
  126. airflow/providers/google/cloud/transfers/gcs_to_gcs.py +1 -1
  127. airflow/providers/google/cloud/transfers/gcs_to_sftp.py +36 -4
  128. airflow/providers/google/cloud/transfers/mssql_to_gcs.py +27 -2
  129. airflow/providers/google/cloud/transfers/mysql_to_gcs.py +27 -2
  130. airflow/providers/google/cloud/transfers/postgres_to_gcs.py +27 -2
  131. airflow/providers/google/cloud/transfers/sftp_to_gcs.py +34 -5
  132. airflow/providers/google/cloud/transfers/sql_to_gcs.py +15 -0
  133. airflow/providers/google/cloud/transfers/trino_to_gcs.py +25 -2
  134. airflow/providers/google/cloud/triggers/bigquery_dts.py +1 -2
  135. airflow/providers/google/cloud/triggers/cloud_batch.py +1 -2
  136. airflow/providers/google/cloud/triggers/cloud_build.py +1 -2
  137. airflow/providers/google/cloud/triggers/cloud_composer.py +13 -3
  138. airflow/providers/google/cloud/triggers/cloud_storage_transfer_service.py +102 -4
  139. airflow/providers/google/cloud/triggers/dataflow.py +2 -3
  140. airflow/providers/google/cloud/triggers/dataplex.py +1 -2
  141. airflow/providers/google/cloud/triggers/dataproc.py +2 -3
  142. airflow/providers/google/cloud/triggers/kubernetes_engine.py +1 -1
  143. airflow/providers/google/cloud/triggers/pubsub.py +1 -2
  144. airflow/providers/google/cloud/triggers/vertex_ai.py +7 -8
  145. airflow/providers/google/cloud/utils/credentials_provider.py +15 -8
  146. airflow/providers/google/cloud/utils/external_token_supplier.py +1 -0
  147. airflow/providers/google/common/auth_backend/google_openid.py +4 -4
  148. airflow/providers/google/common/consts.py +1 -2
  149. airflow/providers/google/common/hooks/base_google.py +8 -7
  150. airflow/providers/google/get_provider_info.py +186 -134
  151. airflow/providers/google/marketing_platform/hooks/analytics_admin.py +2 -3
  152. airflow/providers/google/marketing_platform/hooks/search_ads.py +1 -1
  153. airflow/providers/google/marketing_platform/operators/analytics_admin.py +5 -7
  154. {apache_airflow_providers_google-12.0.0rc1.dist-info → apache_airflow_providers_google-13.0.0.dist-info}/METADATA +41 -58
  155. {apache_airflow_providers_google-12.0.0rc1.dist-info → apache_airflow_providers_google-13.0.0.dist-info}/RECORD +157 -159
  156. airflow/providers/google/cloud/example_dags/example_facebook_ads_to_gcs.py +0 -141
  157. airflow/providers/google/cloud/example_dags/example_looker.py +0 -64
  158. airflow/providers/google/cloud/example_dags/example_presto_to_gcs.py +0 -194
  159. airflow/providers/google/cloud/example_dags/example_salesforce_to_gcs.py +0 -129
  160. airflow/providers/google/marketing_platform/example_dags/__init__.py +0 -16
  161. airflow/providers/google/marketing_platform/example_dags/example_display_video.py +0 -213
  162. {apache_airflow_providers_google-12.0.0rc1.dist-info → apache_airflow_providers_google-13.0.0.dist-info}/WHEEL +0 -0
  163. {apache_airflow_providers_google-12.0.0rc1.dist-info → apache_airflow_providers_google-13.0.0.dist-info}/entry_points.txt +0 -0
airflow/providers/google/cloud/openlineage/mixins.py
@@ -20,61 +20,71 @@ from __future__ import annotations
  import copy
  import json
  import traceback
+ from collections.abc import Iterable
  from typing import TYPE_CHECKING, cast

- if TYPE_CHECKING:
-     from airflow.providers.common.compat.openlineage.facet import (
-         Dataset,
-         InputDataset,
-         OutputDataset,
-         OutputStatisticsOutputDatasetFacet,
-         RunFacet,
-         SchemaDatasetFacet,
-     )
-     from airflow.providers.google.cloud.openlineage.utils import BigQueryJobRunFacet
+ from airflow.providers.common.compat.openlineage.facet import (
+     ColumnLineageDatasetFacet,
+     DatasetFacet,
+     ErrorMessageRunFacet,
+     ExternalQueryRunFacet,
+     Fields,
+     InputDataset,
+     InputField,
+     OutputDataset,
+     OutputStatisticsOutputDatasetFacet,
+     SchemaDatasetFacet,
+     SQLJobFacet,
+ )
+ from airflow.providers.google.cloud.openlineage.utils import (
+     BIGQUERY_NAMESPACE,
+     get_facets_from_bq_table,
+     get_from_nullable_chain,
+     get_identity_column_lineage_facet,
+     get_namespace_name_from_source_uris,
+     merge_column_lineage_facets,
+ )

+ if TYPE_CHECKING:
+     from airflow.providers.common.compat.openlineage.facet import Dataset, RunFacet
+     from airflow.providers.google.cloud.openlineage.facets import BigQueryJobRunFacet

- BIGQUERY_NAMESPACE = "bigquery"

+ class _BigQueryInsertJobOperatorOpenLineageMixin:
+     """Mixin for BigQueryInsertJobOperator to extract OpenLineage metadata."""

- class _BigQueryOpenLineageMixin:
      def get_openlineage_facets_on_complete(self, _):
          """
-         Retrieve OpenLineage data for a COMPLETE BigQuery job.
+         Retrieve OpenLineage data for a completed BigQuery job.

-         This method retrieves statistics for the specified job_ids using the BigQueryDatasetsProvider.
-         It calls BigQuery API, retrieving input and output dataset info from it, as well as run-level
-         usage statistics.
-
-         Run facets should contain:
-         - ExternalQueryRunFacet
-         - BigQueryJobRunFacet
+         This method calls BigQuery API, retrieving input and output dataset info from it,
+         as well as run-level statistics.

          Run facets may contain:
-         - ErrorMessageRunFacet
+         - ExternalQueryRunFacet (for QUERY job type)
+         - BigQueryJobRunFacet
+         - ErrorMessageRunFacet (if an error occurred)

          Job facets should contain:
-         - SqlJobFacet if operator has self.sql
+         - SqlJobFacet (for QUERY job type)

-         Input datasets should contain facets:
-         - DataSourceDatasetFacet
+         Input datasets should contain:
          - SchemaDatasetFacet

-         Output datasets should contain facets:
-         - DataSourceDatasetFacet
+         Output datasets should contain:
          - SchemaDatasetFacet
-         - OutputStatisticsOutputDatasetFacet
+         - OutputStatisticsOutputDatasetFacet (for QUERY job type)
+         - ColumnLineageDatasetFacet (for QUERY job type)
          """
-         from airflow.providers.common.compat.openlineage.facet import ExternalQueryRunFacet, SQLJobFacet
          from airflow.providers.openlineage.extractors import OperatorLineage
          from airflow.providers.openlineage.sqlparser import SQLParser

          if not self.job_id:
-             if hasattr(self, "log"):
-                 self.log.warning("No BigQuery job_id was found by OpenLineage.")
+             self.log.warning("No BigQuery job_id was found by OpenLineage.")  # type: ignore[attr-defined]
              return OperatorLineage()

          if not self.hook:
+             # This can occur when in deferrable mode
              from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook

              self.hook = BigQueryHook(
@@ -82,64 +92,34 @@ class _BigQueryOpenLineageMixin:
                  impersonation_chain=self.impersonation_chain,
              )

+         self.log.debug("Extracting data from bigquery job: `%s`", self.job_id)  # type: ignore[attr-defined]
+         inputs, outputs = [], []
          run_facets: dict[str, RunFacet] = {
              "externalQuery": ExternalQueryRunFacet(externalQueryId=self.job_id, source="bigquery")
          }
-
-         job_facets = {"sql": SQLJobFacet(query=SQLParser.normalize_sql(self.sql))}
-
-         self.client = self.hook.get_client(project_id=self.hook.project_id, location=self.location)
-         job_ids = self.job_id
-         if isinstance(self.job_id, str):
-             job_ids = [self.job_id]
-         inputs, outputs = [], []
-         for job_id in job_ids:
-             inner_inputs, inner_outputs, inner_run_facets = self.get_facets(job_id=job_id)
-             inputs.extend(inner_inputs)
-             outputs.extend(inner_outputs)
-             run_facets.update(inner_run_facets)
-
-         return OperatorLineage(
-             inputs=inputs,
-             outputs=outputs,
-             run_facets=run_facets,
-             job_facets=job_facets,
-         )
-
-     def get_facets(self, job_id: str):
-         from airflow.providers.common.compat.openlineage.facet import ErrorMessageRunFacet
-         from airflow.providers.google.cloud.openlineage.utils import get_from_nullable_chain
-
-         inputs = []
-         outputs = []
-         run_facets: dict[str, RunFacet] = {}
-         if hasattr(self, "log"):
-             self.log.debug("Extracting data from bigquery job: `%s`", job_id)
+         self._client = self.hook.get_client(project_id=self.hook.project_id, location=self.location)
          try:
-             job = self.client.get_job(job_id=job_id)  # type: ignore
-             props = job._properties
+             job_properties = self._client.get_job(job_id=self.job_id)._properties  # type: ignore

-             if get_from_nullable_chain(props, ["status", "state"]) != "DONE":
-                 raise ValueError(f"Trying to extract data from running bigquery job: `{job_id}`")
+             if get_from_nullable_chain(job_properties, ["status", "state"]) != "DONE":
+                 raise ValueError(f"Trying to extract data from running bigquery job: `{self.job_id}`")

-             run_facets["bigQueryJob"] = self._get_bigquery_job_run_facet(props)
+             run_facets["bigQueryJob"] = self._get_bigquery_job_run_facet(job_properties)

-             if get_from_nullable_chain(props, ["statistics", "numChildJobs"]):
-                 if hasattr(self, "log"):
-                     self.log.debug("Found SCRIPT job. Extracting lineage from child jobs instead.")
+             if get_from_nullable_chain(job_properties, ["statistics", "numChildJobs"]):
+                 self.log.debug("Found SCRIPT job. Extracting lineage from child jobs instead.")  # type: ignore[attr-defined]
                  # SCRIPT job type has no input / output information but spawns child jobs that have one
                  # https://cloud.google.com/bigquery/docs/information-schema-jobs#multi-statement_query_job
-                 for child_job_id in self.client.list_jobs(parent_job=job_id):
-                     child_job = self.client.get_job(job_id=child_job_id)  # type: ignore
-                     child_inputs, child_output = self._get_inputs_outputs_from_job(child_job._properties)
+                 for child_job_id in self._client.list_jobs(parent_job=self.job_id):
+                     child_job_properties = self._client.get_job(job_id=child_job_id)._properties  # type: ignore
+                     child_inputs, child_outputs = self._get_inputs_and_outputs(child_job_properties)
                      inputs.extend(child_inputs)
-                     outputs.append(child_output)
+                     outputs.extend(child_outputs)
              else:
-                 inputs, _output = self._get_inputs_outputs_from_job(props)
-                 outputs.append(_output)
+                 inputs, outputs = self._get_inputs_and_outputs(job_properties)
+
          except Exception as e:
-             if hasattr(self, "log"):
-                 self.log.warning("Cannot retrieve job details from BigQuery.Client. %s", e, exc_info=True)
+             self.log.warning("Cannot retrieve job details from BigQuery.Client. %s", e, exc_info=True)  # type: ignore[attr-defined]
              exception_msg = traceback.format_exc()
              run_facets.update(
                  {
@@ -149,16 +129,37 @@ class _BigQueryOpenLineageMixin:
                      )
                  }
              )
-         deduplicated_outputs = self._deduplicate_outputs(outputs)
-         return inputs, deduplicated_outputs, run_facets

-     def _deduplicate_outputs(self, outputs: list[OutputDataset | None]) -> list[OutputDataset]:
-         # Sources are the same so we can compare only names
+         return OperatorLineage(
+             inputs=list(inputs),
+             outputs=self._deduplicate_outputs(outputs),
+             run_facets=run_facets,
+             job_facets={"sql": SQLJobFacet(query=SQLParser.normalize_sql(self.sql))} if self.sql else {},
+         )
+
+     def _get_inputs_and_outputs(self, properties: dict) -> tuple[list[InputDataset], list[OutputDataset]]:
+         job_type = get_from_nullable_chain(properties, ["configuration", "jobType"])
+
+         if job_type == "QUERY":
+             inputs, outputs = self._get_inputs_and_outputs_for_query_job(properties)
+         elif job_type == "LOAD":
+             inputs, outputs = self._get_inputs_and_outputs_for_load_job(properties)
+         elif job_type == "COPY":
+             inputs, outputs = self._get_inputs_and_outputs_for_copy_job(properties)
+         elif job_type == "EXTRACT":
+             inputs, outputs = self._get_inputs_and_outputs_for_extract_job(properties)
+         else:
+             self.log.debug("Unsupported job type for input/output extraction: `%s`.", job_type)  # type: ignore[attr-defined]
+             inputs, outputs = [], []
+
+         return inputs, outputs
+
+     def _deduplicate_outputs(self, outputs: Iterable[OutputDataset | None]) -> list[OutputDataset]:
          final_outputs = {}
          for single_output in outputs:
              if not single_output:
                  continue
-             key = single_output.name
+             key = f"{single_output.namespace}.{single_output.name}"
              if key not in final_outputs:
                  final_outputs[key] = single_output
                  continue
@@ -167,139 +168,403 @@ class _BigQueryOpenLineageMixin:
              # if the rowCount or size can be summed together.
              if single_output.outputFacets:
                  single_output.outputFacets.pop("outputStatistics", None)
-             final_outputs[key] = single_output
-
-         return list(final_outputs.values())
-
-     def _get_inputs_outputs_from_job(
-         self, properties: dict
-     ) -> tuple[list[InputDataset], OutputDataset | None]:
-         from airflow.providers.google.cloud.openlineage.utils import get_from_nullable_chain
-
-         input_tables = get_from_nullable_chain(properties, ["statistics", "query", "referencedTables"]) or []
-         output_table = get_from_nullable_chain(properties, ["configuration", "query", "destinationTable"])
-         inputs = [(self._get_input_dataset(input_table)) for input_table in input_tables]
-         if output_table:
-             output = self._get_output_dataset(output_table)
-             dataset_stat_facet = self._get_statistics_dataset_facet(properties)
-             output.outputFacets = output.outputFacets or {}
-             if dataset_stat_facet:
-                 output.outputFacets["outputStatistics"] = dataset_stat_facet
-
-         return inputs, output

-     @staticmethod
-     def _get_bigquery_job_run_facet(properties: dict) -> BigQueryJobRunFacet:
-         from airflow.providers.google.cloud.openlineage.utils import (
-             BigQueryJobRunFacet,
-             get_from_nullable_chain,
-         )
-
-         if get_from_nullable_chain(properties, ["configuration", "query", "query"]):
-             # Exclude the query to avoid event size issues and duplicating SqlJobFacet information.
-             properties = copy.deepcopy(properties)
-             properties["configuration"]["query"].pop("query")
-         cache_hit = get_from_nullable_chain(properties, ["statistics", "query", "cacheHit"])
-         billed_bytes = get_from_nullable_chain(properties, ["statistics", "query", "totalBytesBilled"])
-         return BigQueryJobRunFacet(
-             cached=str(cache_hit).lower() == "true",
-             billedBytes=int(billed_bytes) if billed_bytes else None,
-             properties=json.dumps(properties),
-         )
-
-     @staticmethod
-     def _get_statistics_dataset_facet(
-         properties,
-     ) -> OutputStatisticsOutputDatasetFacet | None:
-         from airflow.providers.common.compat.openlineage.facet import OutputStatisticsOutputDatasetFacet
-         from airflow.providers.google.cloud.openlineage.utils import get_from_nullable_chain
+             # If multiple outputs contain Column Level Lineage Facet - merge the facets
+             if (
+                 single_output.facets
+                 and final_outputs[key].facets
+                 and "columnLineage" in single_output.facets  # type: ignore
+                 and "columnLineage" in final_outputs[key].facets  # type: ignore
+             ):
+                 single_output.facets["columnLineage"] = merge_column_lineage_facets(
+                     [
+                         single_output.facets["columnLineage"],  # type: ignore
+                         final_outputs[key].facets["columnLineage"],  # type: ignore
+                     ]
+                 )

-         query_plan = get_from_nullable_chain(properties, chain=["statistics", "query", "queryPlan"])
-         if not query_plan:
-             return None
+             final_outputs[key] = single_output

-         out_stage = query_plan[-1]
-         out_rows = out_stage.get("recordsWritten", None)
-         out_bytes = out_stage.get("shuffleOutputBytes", None)
-         if out_bytes and out_rows:
-             return OutputStatisticsOutputDatasetFacet(rowCount=int(out_rows), size=int(out_bytes))
-         return None
+         return list(final_outputs.values())

      def _get_input_dataset(self, table: dict) -> InputDataset:
-         from airflow.providers.common.compat.openlineage.facet import InputDataset
-
          return cast(InputDataset, self._get_dataset(table, "input"))

      def _get_output_dataset(self, table: dict) -> OutputDataset:
-         from airflow.providers.common.compat.openlineage.facet import OutputDataset
-
          return cast(OutputDataset, self._get_dataset(table, "output"))

      def _get_dataset(self, table: dict, dataset_type: str) -> Dataset:
-         from airflow.providers.common.compat.openlineage.facet import InputDataset, OutputDataset
-
          project = table.get("projectId")
          dataset = table.get("datasetId")
          table_name = table.get("tableId")
          dataset_name = f"{project}.{dataset}.{table_name}"

-         dataset_schema = self._get_table_schema_safely(dataset_name)
+         dataset_facets = self._get_table_facets_safely(dataset_name)
          if dataset_type == "input":
              # Logic specific to creating InputDataset (if needed)
              return InputDataset(
                  namespace=BIGQUERY_NAMESPACE,
                  name=dataset_name,
-                 facets={
-                     "schema": dataset_schema,
-                 }
-                 if dataset_schema
-                 else {},
+                 facets=dataset_facets,
              )
          elif dataset_type == "output":
              # Logic specific to creating OutputDataset (if needed)
              return OutputDataset(
                  namespace=BIGQUERY_NAMESPACE,
                  name=dataset_name,
-                 facets={
-                     "schema": dataset_schema,
-                 }
-                 if dataset_schema
-                 else {},
+                 facets=dataset_facets,
              )
          else:
              raise ValueError("Invalid dataset_type. Must be 'input' or 'output'")

-     def _get_table_schema_safely(self, table_name: str) -> SchemaDatasetFacet | None:
+     def _get_table_facets_safely(self, table_name: str) -> dict[str, DatasetFacet]:
          try:
-             return self._get_table_schema(table_name)
+             bq_table = self._client.get_table(table_name)
+             return get_facets_from_bq_table(bq_table)
          except Exception as e:
-             if hasattr(self, "log"):
-                 self.log.warning("Could not extract output schema from bigquery. %s", e)
+             self.log.warning("Could not extract facets from bigquery table: `%s`. %s", table_name, e)  # type: ignore[attr-defined]
+             return {}
+
+     def _get_inputs_and_outputs_for_query_job(
+         self, properties: dict
+     ) -> tuple[list[InputDataset], list[OutputDataset]]:
+         input_tables = get_from_nullable_chain(properties, ["statistics", "query", "referencedTables"]) or []
+         output_table = get_from_nullable_chain(properties, ["configuration", "query", "destinationTable"])
+
+         inputs = [
+             self._get_input_dataset(input_table)
+             for input_table in input_tables
+             if input_table != output_table  # Output table is in `referencedTables` and needs to be removed
+         ]
+
+         if not output_table:
+             return inputs, []
+
+         output = self._get_output_dataset(output_table)
+         if dataset_stat_facet := self._get_output_statistics_dataset_facet(properties):
+             output.outputFacets = output.outputFacets or {}
+             output.outputFacets["outputStatistics"] = dataset_stat_facet
+         if cll_facet := self._get_column_level_lineage_facet_for_query_job(properties, output, inputs):
+             output.facets = output.facets or {}
+             output.facets["columnLineage"] = cll_facet
+         return inputs, [output]
+
+     def _get_inputs_and_outputs_for_load_job(
+         self, properties: dict
+     ) -> tuple[list[InputDataset], list[OutputDataset]]:
+         output = self._get_output_dataset(properties["configuration"]["load"]["destinationTable"])
+         output_table_schema_facet = output.facets.get("schema") if output.facets else None
+
+         source_uris = properties["configuration"]["load"]["sourceUris"]
+         inputs = [
+             InputDataset(
+                 namespace=namespace,
+                 name=name,
+                 facets={"schema": output_table_schema_facet} if output_table_schema_facet else {},
+             )
+             for namespace, name in get_namespace_name_from_source_uris(source_uris)
+         ]
+
+         if dataset_stat_facet := self._get_output_statistics_dataset_facet(properties):
+             output.outputFacets = output.outputFacets or {}
+             output.outputFacets["outputStatistics"] = dataset_stat_facet
+         if cll_facet := get_identity_column_lineage_facet(self._extract_column_names(output), inputs):
+             output.facets = {**output.facets, **cll_facet} if output.facets else cll_facet
+         return inputs, [output]
+
+     def _get_inputs_and_outputs_for_copy_job(
+         self, properties: dict
+     ) -> tuple[list[InputDataset], list[OutputDataset]]:
+         input_tables = get_from_nullable_chain(properties, ["configuration", "copy", "sourceTables"]) or [
+             get_from_nullable_chain(properties, ["configuration", "copy", "sourceTable"])
+         ]
+         inputs = [self._get_input_dataset(input_table) for input_table in input_tables]
+
+         output = self._get_output_dataset(properties["configuration"]["copy"]["destinationTable"])
+         if dataset_stat_facet := self._get_output_statistics_dataset_facet(properties):
+             output.outputFacets = output.outputFacets or {}
+             output.outputFacets["outputStatistics"] = dataset_stat_facet
+         if cll_facet := get_identity_column_lineage_facet(self._extract_column_names(output), inputs):
+             output.facets = {**output.facets, **cll_facet} if output.facets else cll_facet
+         return inputs, [output]
+
+     def _get_inputs_and_outputs_for_extract_job(
+         self, properties: dict
+     ) -> tuple[list[InputDataset], list[OutputDataset]]:
+         source_table = get_from_nullable_chain(properties, ["configuration", "extract", "sourceTable"])
+         input_dataset = self._get_input_dataset(source_table) if source_table else None
+
+         destination_uris = get_from_nullable_chain(
+             properties, ["configuration", "extract", "destinationUris"]
+         ) or [get_from_nullable_chain(properties, ["configuration", "extract", "destinationUri"])]
+
+         outputs = []
+         for namespace, name in get_namespace_name_from_source_uris(destination_uris):
+             output_facets = {}
+             if input_dataset:
+                 input_schema = input_dataset.facets.get("schema") if input_dataset.facets else None
+                 if input_schema:
+                     output_facets["schema"] = input_schema
+                 if cll_facet := get_identity_column_lineage_facet(
+                     self._extract_column_names(input_dataset), [input_dataset]
+                 ):
+                     output_facets = {**output_facets, **cll_facet}
+             outputs.append(OutputDataset(namespace=namespace, name=name, facets=output_facets))
+
+         inputs = [input_dataset] if input_dataset else []
+         return inputs, outputs
+
+     @staticmethod
+     def _get_bigquery_job_run_facet(properties: dict) -> BigQueryJobRunFacet:
+         from airflow.providers.google.cloud.openlineage.facets import BigQueryJobRunFacet
+
+         job_type = get_from_nullable_chain(properties, ["configuration", "jobType"])
+         cache_hit, billed_bytes = None, None
+         if job_type == "QUERY":
+             if get_from_nullable_chain(properties, ["configuration", "query", "query"]):
+                 # Exclude the query to avoid event size issues and duplicating SqlJobFacet information.
+                 properties = copy.deepcopy(properties)
+                 properties["configuration"]["query"].pop("query")
+             cache_hit = get_from_nullable_chain(properties, ["statistics", "query", "cacheHit"])
+             billed_bytes = get_from_nullable_chain(properties, ["statistics", "query", "totalBytesBilled"])
+
+         return BigQueryJobRunFacet(
+             cached=str(cache_hit).lower() == "true",
+             billedBytes=int(billed_bytes) if billed_bytes else None,
+             properties=json.dumps(properties),
+         )
+
+     @staticmethod
+     def _get_output_statistics_dataset_facet(
+         properties,
+     ) -> OutputStatisticsOutputDatasetFacet | None:
+         job_type = get_from_nullable_chain(properties, ["configuration", "jobType"])
+         out_rows, out_bytes = None, None
+         if job_type == "QUERY":
+             query_plan = get_from_nullable_chain(properties, chain=["statistics", "query", "queryPlan"])
+             if not query_plan:  # Without query plan there is no statistics
+                 return None
+             out_stage = query_plan[-1]  # Last stage of query plan writes the data and has all the statistics
+             out_rows = out_stage.get("recordsWritten", None)
+             out_bytes = out_stage.get("shuffleOutputBytes", None)
+         elif job_type == "LOAD":
+             out_rows = get_from_nullable_chain(properties, ["statistics", "load", "outputRows"])
+             out_bytes = get_from_nullable_chain(properties, ["statistics", "load", "outputBytes"])
+         elif job_type == "COPY":
+             out_rows = get_from_nullable_chain(properties, ["statistics", "copy", "copiedRows"])
+             out_bytes = get_from_nullable_chain(properties, ["statistics", "copy", "copiedLogicalBytes"])
+         # No statistics available for EXTRACT job type
+
+         if out_bytes and out_rows:
+             return OutputStatisticsOutputDatasetFacet(rowCount=int(out_rows), size=int(out_bytes))
          return None

-     def _get_table_schema(self, table: str) -> SchemaDatasetFacet | None:
-         from airflow.providers.common.compat.openlineage.facet import (
-             SchemaDatasetFacet,
-             SchemaDatasetFacetFields,
+     def _get_column_level_lineage_facet_for_query_job(
+         self, properties: dict, output: OutputDataset, inputs: Iterable[InputDataset]
+     ) -> ColumnLineageDatasetFacet | None:
+         """
+         Extract column-level lineage information from a BigQuery job and return it as a facet.
+
+         The Column Level Lineage Facet will NOT be returned if any of the following condition is met:
+         - The parsed result does not contain column lineage information.
+         - The parsed result does not contain exactly one output table.
+         - The parsed result has a different output table than the output table from the BQ job.
+         - The parsed result has at least one input table not present in the input tables from the BQ job.
+         - The parsed result has a column not present in the schema of given dataset from the BQ job.
+
+         Args:
+             properties: The properties of the BigQuery job.
+             output: The output dataset for which the column lineage is being extracted.
+
+         Returns:
+             The extracted Column Lineage Dataset Facet, or None if conditions are not met.
+         """
+         from airflow.providers.openlineage.sqlparser import SQLParser
+
+         # Extract SQL query and parse it
+         self.log.debug("Extracting column-level lineage facet from BigQuery query.")  # type: ignore[attr-defined]
+
+         query = get_from_nullable_chain(properties, ["configuration", "query", "query"])
+         if query is None:
+             self.log.debug("No query found in BQ job configuration. Facet generation skipped.")  # type: ignore[attr-defined]
+             return None
+
+         parse_result = SQLParser("bigquery").parse(SQLParser.split_sql_string(SQLParser.normalize_sql(query)))
+         if parse_result is None or parse_result.column_lineage == []:
+             self.log.debug("No column-level lineage found in the SQL query. Facet generation skipped.")  # type: ignore[attr-defined]
+             return None
+
+         default_dataset, default_project = self._extract_default_dataset_and_project(
+             properties,
+             self.project_id,  # type: ignore[attr-defined]
          )
-         from airflow.providers.google.cloud.openlineage.utils import get_from_nullable_chain

-         bq_table = self.client.get_table(table)
+         # Verify if the output table id from the parse result matches the BQ job output table
+         if not self._validate_output_table_id(
+             parse_result,
+             output,
+             default_project,
+             default_dataset,
+         ):
+             return None

-         if not bq_table._properties:
+         # Verify if all columns from parse results are present in the output dataset schema
+         if not self._validate_output_columns(parse_result, output):
              return None

-         fields = get_from_nullable_chain(bq_table._properties, ["schema", "fields"])
-         if not fields:
+         input_tables_from_parse_result = self._extract_parsed_input_tables(
+             parse_result, default_project, default_dataset
+         )
+         input_tables_from_bq = {input_ds.name: self._extract_column_names(input_ds) for input_ds in inputs}
+
+         # Verify if all datasets from parse results are present in bq job input datasets
+         if not self._validate_input_tables(input_tables_from_parse_result, input_tables_from_bq):
+             return None
+
+         # Verify if all columns from parse results are present in their respective bq job input datasets
+         if not self._validate_input_columns(input_tables_from_parse_result, input_tables_from_bq):
              return None

-         return SchemaDatasetFacet(
-             fields=[
-                 SchemaDatasetFacetFields(
-                     name=field.get("name"),
-                     type=field.get("type"),
-                     description=field.get("description"),
+         return self._generate_column_lineage_facet(parse_result, default_project, default_dataset)
+
+     @staticmethod
+     def _get_qualified_name_from_parse_result(table, default_project: str, default_dataset: str) -> str:
+         """Get the qualified name of a table from the parse result."""
+         return ".".join(
+             (
+                 table.database or default_project,
+                 table.schema or default_dataset,
+                 table.name,
+             )
+         )
+
+     @staticmethod
+     def _extract_default_dataset_and_project(properties: dict, default_project: str) -> tuple[str, str]:
+         """Extract the default dataset and project from the BigQuery job properties."""
+         default_dataset_obj = get_from_nullable_chain(
+             properties, ["configuration", "query", "defaultDataset"]
+         )
+         default_dataset = default_dataset_obj.get("datasetId", "") if default_dataset_obj else ""
+         default_project = (
+             default_dataset_obj.get("projectId", default_project) if default_dataset_obj else default_project
+         )
+         return default_dataset, default_project
+
+     def _validate_output_table_id(
+         self, parse_result, output: OutputDataset, default_project: str, default_dataset: str
+     ) -> bool:
+         """Check if the output table id from the parse result matches the BQ job output table."""
+         if len(parse_result.out_tables) != 1:
+             self.log.debug(  # type: ignore[attr-defined]
+                 "Invalid output tables in the parse result: `%s`. Expected exactly one output table.",
+                 parse_result.out_tables,
+             )
+             return False
+
+         parsed_output_table = self._get_qualified_name_from_parse_result(
+             parse_result.out_tables[0], default_project, default_dataset
+         )
+         if parsed_output_table != output.name:
+             self.log.debug(  # type: ignore[attr-defined]
+                 "Mismatch between parsed output table `%s` and BQ job output table `%s`.",
+                 parsed_output_table,
+                 output.name,
+             )
+             return False
+         return True
+
+     @staticmethod
+     def _extract_column_names(dataset: Dataset) -> list[str]:
+         """Extract column names from a dataset's schema."""
+         return [
+             f.name
+             for f in dataset.facets.get("schema", SchemaDatasetFacet(fields=[])).fields  # type: ignore[union-attr]
+             if dataset.facets
+         ]
+
+     def _validate_output_columns(self, parse_result, output: OutputDataset) -> bool:
+         """Validate if all descendant columns in parse result exist in output dataset schema."""
+         output_column_names = self._extract_column_names(output)
+         missing_columns = [
+             lineage.descendant.name
+             for lineage in parse_result.column_lineage
+             if lineage.descendant.name not in output_column_names
+         ]
+         if missing_columns:
+             self.log.debug(  # type: ignore[attr-defined]
+                 "Output dataset schema is missing columns from the parse result: `%s`.", missing_columns
+             )
+             return False
+         return True
+
+     def _extract_parsed_input_tables(
+         self, parse_result, default_project: str, default_dataset: str
+     ) -> dict[str, list[str]]:
+         """Extract input tables and their columns from the parse result."""
+         input_tables: dict[str, list[str]] = {}
+         for lineage in parse_result.column_lineage:
+             for column_meta in lineage.lineage:
+                 if not column_meta.origin:
+                     self.log.debug(  # type: ignore[attr-defined]
+                         "Column `%s` lacks origin information. Skipping facet generation.", column_meta.name
+                     )
+                     return {}
+
+                 input_table_id = self._get_qualified_name_from_parse_result(
+                     column_meta.origin, default_project, default_dataset
+                 )
+                 input_tables.setdefault(input_table_id, []).append(column_meta.name)
+         return input_tables
+
+     def _validate_input_tables(
+         self, parsed_input_tables: dict[str, list[str]], input_tables_from_bq: dict[str, list[str]]
+     ) -> bool:
+         """Validate if all parsed input tables exist in the BQ job's input datasets."""
+         if not parsed_input_tables:
+             self.log.debug("No input tables found in the parse result. Facet generation skipped.")  # type: ignore[attr-defined]
+             return False
+         if missing_tables := set(parsed_input_tables) - set(input_tables_from_bq):
+             self.log.debug(  # type: ignore[attr-defined]
+                 "Parsed input tables not found in the BQ job's input datasets: `%s`.", missing_tables
+             )
+             return False
+         return True
+
+     def _validate_input_columns(
+         self, parsed_input_tables: dict[str, list[str]], input_tables_from_bq: dict[str, list[str]]
+     ) -> bool:
+         """Validate if all parsed input columns exist in their respective BQ job input table schemas."""
+         if not parsed_input_tables:
+             self.log.debug("No input tables found in the parse result. Facet generation skipped.")  # type: ignore[attr-defined]
+             return False
+         for table, columns in parsed_input_tables.items():
+             if missing_columns := set(columns) - set(input_tables_from_bq.get(table, [])):
+                 self.log.debug(  # type: ignore[attr-defined]
+                     "Input table `%s` is missing columns from the parse result: `%s`.", table, missing_columns
+                 )
+                 return False
+         return True
+
+     def _generate_column_lineage_facet(
+         self, parse_result, default_project: str, default_dataset: str
+     ) -> ColumnLineageDatasetFacet:
+         """Generate the ColumnLineageDatasetFacet based on the parsed result."""
+         return ColumnLineageDatasetFacet(
+             fields={
+                 lineage.descendant.name: Fields(
+                     inputFields=[
+                         InputField(
+                             namespace=BIGQUERY_NAMESPACE,
+                             name=self._get_qualified_name_from_parse_result(
+                                 column_meta.origin, default_project, default_dataset
+                             ),
+                             field=column_meta.name,
+                         )
+                         for column_meta in lineage.lineage
+                     ],
+                     transformationType="",
+                     transformationDescription="",
                  )
-                 for field in fields
-             ]
+                 for lineage in parse_result.column_lineage
+             }
          )