apache-airflow-providers-google 12.0.0rc1__py3-none-any.whl → 13.0.0__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- airflow/providers/google/LICENSE +0 -52
- airflow/providers/google/__init__.py +1 -1
- airflow/providers/google/ads/hooks/ads.py +27 -13
- airflow/providers/google/ads/transfers/ads_to_gcs.py +18 -4
- airflow/providers/google/assets/bigquery.py +17 -0
- airflow/providers/google/cloud/_internal_client/secret_manager_client.py +2 -3
- airflow/providers/google/cloud/hooks/alloy_db.py +736 -8
- airflow/providers/google/cloud/hooks/automl.py +10 -4
- airflow/providers/google/cloud/hooks/bigquery.py +125 -22
- airflow/providers/google/cloud/hooks/bigquery_dts.py +8 -8
- airflow/providers/google/cloud/hooks/bigtable.py +2 -3
- airflow/providers/google/cloud/hooks/cloud_batch.py +3 -4
- airflow/providers/google/cloud/hooks/cloud_build.py +4 -5
- airflow/providers/google/cloud/hooks/cloud_composer.py +3 -4
- airflow/providers/google/cloud/hooks/cloud_memorystore.py +3 -4
- airflow/providers/google/cloud/hooks/cloud_run.py +3 -4
- airflow/providers/google/cloud/hooks/cloud_sql.py +7 -3
- airflow/providers/google/cloud/hooks/cloud_storage_transfer_service.py +119 -7
- airflow/providers/google/cloud/hooks/compute.py +3 -3
- airflow/providers/google/cloud/hooks/datacatalog.py +3 -4
- airflow/providers/google/cloud/hooks/dataflow.py +12 -12
- airflow/providers/google/cloud/hooks/dataform.py +2 -3
- airflow/providers/google/cloud/hooks/datafusion.py +2 -2
- airflow/providers/google/cloud/hooks/dataplex.py +1032 -11
- airflow/providers/google/cloud/hooks/dataproc.py +4 -5
- airflow/providers/google/cloud/hooks/dataproc_metastore.py +3 -4
- airflow/providers/google/cloud/hooks/dlp.py +3 -4
- airflow/providers/google/cloud/hooks/gcs.py +7 -6
- airflow/providers/google/cloud/hooks/kms.py +2 -3
- airflow/providers/google/cloud/hooks/kubernetes_engine.py +8 -8
- airflow/providers/google/cloud/hooks/life_sciences.py +1 -1
- airflow/providers/google/cloud/hooks/managed_kafka.py +482 -0
- airflow/providers/google/cloud/hooks/natural_language.py +2 -3
- airflow/providers/google/cloud/hooks/os_login.py +2 -3
- airflow/providers/google/cloud/hooks/pubsub.py +6 -6
- airflow/providers/google/cloud/hooks/secret_manager.py +2 -3
- airflow/providers/google/cloud/hooks/spanner.py +2 -2
- airflow/providers/google/cloud/hooks/speech_to_text.py +2 -3
- airflow/providers/google/cloud/hooks/stackdriver.py +4 -4
- airflow/providers/google/cloud/hooks/tasks.py +3 -4
- airflow/providers/google/cloud/hooks/text_to_speech.py +2 -3
- airflow/providers/google/cloud/hooks/translate.py +236 -5
- airflow/providers/google/cloud/hooks/vertex_ai/auto_ml.py +9 -4
- airflow/providers/google/cloud/hooks/vertex_ai/batch_prediction_job.py +3 -4
- airflow/providers/google/cloud/hooks/vertex_ai/custom_job.py +4 -5
- airflow/providers/google/cloud/hooks/vertex_ai/dataset.py +3 -4
- airflow/providers/google/cloud/hooks/vertex_ai/endpoint_service.py +2 -3
- airflow/providers/google/cloud/hooks/vertex_ai/feature_store.py +3 -4
- airflow/providers/google/cloud/hooks/vertex_ai/generative_model.py +1 -181
- airflow/providers/google/cloud/hooks/vertex_ai/hyperparameter_tuning_job.py +3 -4
- airflow/providers/google/cloud/hooks/vertex_ai/model_service.py +2 -3
- airflow/providers/google/cloud/hooks/vertex_ai/pipeline_job.py +3 -4
- airflow/providers/google/cloud/hooks/vertex_ai/prediction_service.py +2 -3
- airflow/providers/google/cloud/hooks/video_intelligence.py +2 -3
- airflow/providers/google/cloud/hooks/vision.py +3 -4
- airflow/providers/google/cloud/hooks/workflows.py +2 -3
- airflow/providers/google/cloud/links/alloy_db.py +46 -0
- airflow/providers/google/cloud/links/bigquery.py +25 -0
- airflow/providers/google/cloud/links/dataplex.py +172 -2
- airflow/providers/google/cloud/links/kubernetes_engine.py +1 -2
- airflow/providers/google/cloud/links/managed_kafka.py +104 -0
- airflow/providers/google/cloud/links/translate.py +28 -0
- airflow/providers/google/cloud/log/gcs_task_handler.py +3 -3
- airflow/providers/google/cloud/log/stackdriver_task_handler.py +11 -10
- airflow/providers/google/cloud/openlineage/facets.py +67 -0
- airflow/providers/google/cloud/openlineage/mixins.py +438 -173
- airflow/providers/google/cloud/openlineage/utils.py +394 -61
- airflow/providers/google/cloud/operators/alloy_db.py +980 -69
- airflow/providers/google/cloud/operators/automl.py +83 -245
- airflow/providers/google/cloud/operators/bigquery.py +377 -74
- airflow/providers/google/cloud/operators/bigquery_dts.py +126 -13
- airflow/providers/google/cloud/operators/bigtable.py +1 -3
- airflow/providers/google/cloud/operators/cloud_base.py +1 -2
- airflow/providers/google/cloud/operators/cloud_batch.py +2 -4
- airflow/providers/google/cloud/operators/cloud_build.py +3 -5
- airflow/providers/google/cloud/operators/cloud_composer.py +5 -7
- airflow/providers/google/cloud/operators/cloud_memorystore.py +4 -6
- airflow/providers/google/cloud/operators/cloud_run.py +6 -5
- airflow/providers/google/cloud/operators/cloud_sql.py +20 -8
- airflow/providers/google/cloud/operators/cloud_storage_transfer_service.py +62 -8
- airflow/providers/google/cloud/operators/compute.py +3 -4
- airflow/providers/google/cloud/operators/datacatalog.py +9 -11
- airflow/providers/google/cloud/operators/dataflow.py +1 -112
- airflow/providers/google/cloud/operators/dataform.py +3 -5
- airflow/providers/google/cloud/operators/datafusion.py +1 -1
- airflow/providers/google/cloud/operators/dataplex.py +2046 -7
- airflow/providers/google/cloud/operators/dataproc.py +102 -17
- airflow/providers/google/cloud/operators/dataproc_metastore.py +7 -9
- airflow/providers/google/cloud/operators/dlp.py +17 -19
- airflow/providers/google/cloud/operators/gcs.py +14 -17
- airflow/providers/google/cloud/operators/kubernetes_engine.py +2 -2
- airflow/providers/google/cloud/operators/managed_kafka.py +788 -0
- airflow/providers/google/cloud/operators/natural_language.py +3 -5
- airflow/providers/google/cloud/operators/pubsub.py +39 -7
- airflow/providers/google/cloud/operators/speech_to_text.py +3 -5
- airflow/providers/google/cloud/operators/stackdriver.py +3 -5
- airflow/providers/google/cloud/operators/tasks.py +4 -6
- airflow/providers/google/cloud/operators/text_to_speech.py +2 -4
- airflow/providers/google/cloud/operators/translate.py +414 -5
- airflow/providers/google/cloud/operators/translate_speech.py +2 -4
- airflow/providers/google/cloud/operators/vertex_ai/auto_ml.py +9 -8
- airflow/providers/google/cloud/operators/vertex_ai/batch_prediction_job.py +4 -6
- airflow/providers/google/cloud/operators/vertex_ai/custom_job.py +6 -8
- airflow/providers/google/cloud/operators/vertex_ai/dataset.py +4 -6
- airflow/providers/google/cloud/operators/vertex_ai/endpoint_service.py +4 -6
- airflow/providers/google/cloud/operators/vertex_ai/generative_model.py +0 -322
- airflow/providers/google/cloud/operators/vertex_ai/hyperparameter_tuning_job.py +4 -6
- airflow/providers/google/cloud/operators/vertex_ai/model_service.py +4 -6
- airflow/providers/google/cloud/operators/vertex_ai/pipeline_job.py +4 -6
- airflow/providers/google/cloud/operators/video_intelligence.py +3 -5
- airflow/providers/google/cloud/operators/vision.py +4 -6
- airflow/providers/google/cloud/operators/workflows.py +5 -7
- airflow/providers/google/cloud/secrets/secret_manager.py +1 -2
- airflow/providers/google/cloud/sensors/bigquery_dts.py +3 -5
- airflow/providers/google/cloud/sensors/bigtable.py +2 -3
- airflow/providers/google/cloud/sensors/cloud_composer.py +32 -8
- airflow/providers/google/cloud/sensors/cloud_storage_transfer_service.py +39 -1
- airflow/providers/google/cloud/sensors/dataplex.py +4 -6
- airflow/providers/google/cloud/sensors/dataproc.py +2 -3
- airflow/providers/google/cloud/sensors/dataproc_metastore.py +1 -2
- airflow/providers/google/cloud/sensors/gcs.py +2 -4
- airflow/providers/google/cloud/sensors/pubsub.py +2 -3
- airflow/providers/google/cloud/sensors/workflows.py +3 -5
- airflow/providers/google/cloud/transfers/bigquery_to_gcs.py +5 -5
- airflow/providers/google/cloud/transfers/gcs_to_bigquery.py +10 -12
- airflow/providers/google/cloud/transfers/gcs_to_gcs.py +1 -1
- airflow/providers/google/cloud/transfers/gcs_to_sftp.py +36 -4
- airflow/providers/google/cloud/transfers/mssql_to_gcs.py +27 -2
- airflow/providers/google/cloud/transfers/mysql_to_gcs.py +27 -2
- airflow/providers/google/cloud/transfers/postgres_to_gcs.py +27 -2
- airflow/providers/google/cloud/transfers/sftp_to_gcs.py +34 -5
- airflow/providers/google/cloud/transfers/sql_to_gcs.py +15 -0
- airflow/providers/google/cloud/transfers/trino_to_gcs.py +25 -2
- airflow/providers/google/cloud/triggers/bigquery_dts.py +1 -2
- airflow/providers/google/cloud/triggers/cloud_batch.py +1 -2
- airflow/providers/google/cloud/triggers/cloud_build.py +1 -2
- airflow/providers/google/cloud/triggers/cloud_composer.py +13 -3
- airflow/providers/google/cloud/triggers/cloud_storage_transfer_service.py +102 -4
- airflow/providers/google/cloud/triggers/dataflow.py +2 -3
- airflow/providers/google/cloud/triggers/dataplex.py +1 -2
- airflow/providers/google/cloud/triggers/dataproc.py +2 -3
- airflow/providers/google/cloud/triggers/kubernetes_engine.py +1 -1
- airflow/providers/google/cloud/triggers/pubsub.py +1 -2
- airflow/providers/google/cloud/triggers/vertex_ai.py +7 -8
- airflow/providers/google/cloud/utils/credentials_provider.py +15 -8
- airflow/providers/google/cloud/utils/external_token_supplier.py +1 -0
- airflow/providers/google/common/auth_backend/google_openid.py +4 -4
- airflow/providers/google/common/consts.py +1 -2
- airflow/providers/google/common/hooks/base_google.py +8 -7
- airflow/providers/google/get_provider_info.py +186 -134
- airflow/providers/google/marketing_platform/hooks/analytics_admin.py +2 -3
- airflow/providers/google/marketing_platform/hooks/search_ads.py +1 -1
- airflow/providers/google/marketing_platform/operators/analytics_admin.py +5 -7
- {apache_airflow_providers_google-12.0.0rc1.dist-info → apache_airflow_providers_google-13.0.0.dist-info}/METADATA +41 -58
- {apache_airflow_providers_google-12.0.0rc1.dist-info → apache_airflow_providers_google-13.0.0.dist-info}/RECORD +157 -159
- airflow/providers/google/cloud/example_dags/example_facebook_ads_to_gcs.py +0 -141
- airflow/providers/google/cloud/example_dags/example_looker.py +0 -64
- airflow/providers/google/cloud/example_dags/example_presto_to_gcs.py +0 -194
- airflow/providers/google/cloud/example_dags/example_salesforce_to_gcs.py +0 -129
- airflow/providers/google/marketing_platform/example_dags/__init__.py +0 -16
- airflow/providers/google/marketing_platform/example_dags/example_display_video.py +0 -213
- {apache_airflow_providers_google-12.0.0rc1.dist-info → apache_airflow_providers_google-13.0.0.dist-info}/WHEEL +0 -0
- {apache_airflow_providers_google-12.0.0rc1.dist-info → apache_airflow_providers_google-13.0.0.dist-info}/entry_points.txt +0 -0
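The changed-file listing above shows both brand-new modules (for example the `managed_kafka` hook, operators, and links) and large removals (for example `operators/vertex_ai/generative_model.py` drops 322 lines, and several `example_dags` disappear entirely). Before pinning a codebase to 13.0.0, a generic pre-upgrade smoke check along these lines can confirm that every provider module a DAG imports still resolves; the module list here is hypothetical and not part of the package:

```python
# Generic pre-upgrade smoke check (illustrative, not provider code): verify that the
# provider modules a DAG codebase relies on still import under the candidate version.
import importlib

modules_we_use = [
    "airflow.providers.google.cloud.operators.bigquery",
    "airflow.providers.google.cloud.operators.managed_kafka",  # new in this release per the listing
]

for module in modules_we_use:
    try:
        importlib.import_module(module)
        print(f"OK      {module}")
    except ImportError as err:
        print(f"FAILED  {module}: {err}")
```

The remainder of this page shows the diff of `airflow/providers/google/cloud/openlineage/mixins.py` (+438 −173), which rewrites the BigQuery OpenLineage mixin.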
```diff
@@ -20,61 +20,71 @@ from __future__ import annotations
 import copy
 import json
 import traceback
+from collections.abc import Iterable
 from typing import TYPE_CHECKING, cast
 
-
-
-
-
-
-
-
-
-
-
+from airflow.providers.common.compat.openlineage.facet import (
+    ColumnLineageDatasetFacet,
+    DatasetFacet,
+    ErrorMessageRunFacet,
+    ExternalQueryRunFacet,
+    Fields,
+    InputDataset,
+    InputField,
+    OutputDataset,
+    OutputStatisticsOutputDatasetFacet,
+    SchemaDatasetFacet,
+    SQLJobFacet,
+)
+from airflow.providers.google.cloud.openlineage.utils import (
+    BIGQUERY_NAMESPACE,
+    get_facets_from_bq_table,
+    get_from_nullable_chain,
+    get_identity_column_lineage_facet,
+    get_namespace_name_from_source_uris,
+    merge_column_lineage_facets,
+)
 
+if TYPE_CHECKING:
+    from airflow.providers.common.compat.openlineage.facet import Dataset, RunFacet
+    from airflow.providers.google.cloud.openlineage.facets import BigQueryJobRunFacet
 
-BIGQUERY_NAMESPACE = "bigquery"
 
+class _BigQueryInsertJobOperatorOpenLineageMixin:
+    """Mixin for BigQueryInsertJobOperator to extract OpenLineage metadata."""
 
-class _BigQueryOpenLineageMixin:
     def get_openlineage_facets_on_complete(self, _):
         """
-        Retrieve OpenLineage data for a
+        Retrieve OpenLineage data for a completed BigQuery job.
 
-        This method
-
-        usage statistics.
-
-        Run facets should contain:
-            - ExternalQueryRunFacet
-            - BigQueryJobRunFacet
+        This method calls BigQuery API, retrieving input and output dataset info from it,
+        as well as run-level statistics.
 
         Run facets may contain:
-            -
+            - ExternalQueryRunFacet (for QUERY job type)
+            - BigQueryJobRunFacet
+            - ErrorMessageRunFacet (if an error occurred)
 
         Job facets should contain:
-            - SqlJobFacet
+            - SqlJobFacet (for QUERY job type)
 
-        Input datasets should contain
-            - DataSourceDatasetFacet
+        Input datasets should contain:
             - SchemaDatasetFacet
 
-        Output datasets should contain
-            - DataSourceDatasetFacet
+        Output datasets should contain:
             - SchemaDatasetFacet
-            - OutputStatisticsOutputDatasetFacet
+            - OutputStatisticsOutputDatasetFacet (for QUERY job type)
+            - ColumnLineageDatasetFacet (for QUERY job type)
         """
-        from airflow.providers.common.compat.openlineage.facet import ExternalQueryRunFacet, SQLJobFacet
         from airflow.providers.openlineage.extractors import OperatorLineage
         from airflow.providers.openlineage.sqlparser import SQLParser
 
         if not self.job_id:
-
-            self.log.warning("No BigQuery job_id was found by OpenLineage.")
+            self.log.warning("No BigQuery job_id was found by OpenLineage.")  # type: ignore[attr-defined]
             return OperatorLineage()
 
         if not self.hook:
+            # This can occur when in deferrable mode
             from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook
 
             self.hook = BigQueryHook(
```
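The new docstring enumerates the facets the mixin now emits. As a rough, hand-assembled illustration of that shape (not provider code; the table names and job id below are made up), the lineage for a simple QUERY job could look like this:

```python
# Illustrative only: hand-assembled lineage mirroring the facet shapes named in the
# docstring above. Table names and the job id are hypothetical.
from airflow.providers.common.compat.openlineage.facet import (
    ExternalQueryRunFacet,
    InputDataset,
    OutputDataset,
    SQLJobFacet,
)
from airflow.providers.openlineage.extractors import OperatorLineage

lineage = OperatorLineage(
    inputs=[InputDataset(namespace="bigquery", name="my-project.my_dataset.source_table")],
    outputs=[OutputDataset(namespace="bigquery", name="my-project.my_dataset.dest_table")],
    run_facets={"externalQuery": ExternalQueryRunFacet(externalQueryId="job_123", source="bigquery")},
    job_facets={"sql": SQLJobFacet(query="SELECT * FROM my_dataset.source_table")},
)
```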
```diff
@@ -82,64 +92,34 @@ class _BigQueryOpenLineageMixin:
                 impersonation_chain=self.impersonation_chain,
             )
 
+        self.log.debug("Extracting data from bigquery job: `%s`", self.job_id)  # type: ignore[attr-defined]
+        inputs, outputs = [], []
         run_facets: dict[str, RunFacet] = {
             "externalQuery": ExternalQueryRunFacet(externalQueryId=self.job_id, source="bigquery")
         }
-
-        job_facets = {"sql": SQLJobFacet(query=SQLParser.normalize_sql(self.sql))}
-
-        self.client = self.hook.get_client(project_id=self.hook.project_id, location=self.location)
-        job_ids = self.job_id
-        if isinstance(self.job_id, str):
-            job_ids = [self.job_id]
-        inputs, outputs = [], []
-        for job_id in job_ids:
-            inner_inputs, inner_outputs, inner_run_facets = self.get_facets(job_id=job_id)
-            inputs.extend(inner_inputs)
-            outputs.extend(inner_outputs)
-            run_facets.update(inner_run_facets)
-
-        return OperatorLineage(
-            inputs=inputs,
-            outputs=outputs,
-            run_facets=run_facets,
-            job_facets=job_facets,
-        )
-
-    def get_facets(self, job_id: str):
-        from airflow.providers.common.compat.openlineage.facet import ErrorMessageRunFacet
-        from airflow.providers.google.cloud.openlineage.utils import get_from_nullable_chain
-
-        inputs = []
-        outputs = []
-        run_facets: dict[str, RunFacet] = {}
-        if hasattr(self, "log"):
-            self.log.debug("Extracting data from bigquery job: `%s`", job_id)
+        self._client = self.hook.get_client(project_id=self.hook.project_id, location=self.location)
         try:
-
-            props = job._properties
+            job_properties = self._client.get_job(job_id=self.job_id)._properties  # type: ignore
 
-            if get_from_nullable_chain(
-                raise ValueError(f"Trying to extract data from running bigquery job: `{job_id}`")
+            if get_from_nullable_chain(job_properties, ["status", "state"]) != "DONE":
+                raise ValueError(f"Trying to extract data from running bigquery job: `{self.job_id}`")
 
-            run_facets["bigQueryJob"] = self._get_bigquery_job_run_facet(
+            run_facets["bigQueryJob"] = self._get_bigquery_job_run_facet(job_properties)
 
-            if get_from_nullable_chain(
-
-                self.log.debug("Found SCRIPT job. Extracting lineage from child jobs instead.")
+            if get_from_nullable_chain(job_properties, ["statistics", "numChildJobs"]):
+                self.log.debug("Found SCRIPT job. Extracting lineage from child jobs instead.")  # type: ignore[attr-defined]
                 # SCRIPT job type has no input / output information but spawns child jobs that have one
                 # https://cloud.google.com/bigquery/docs/information-schema-jobs#multi-statement_query_job
-                for child_job_id in self.
-
-                    child_inputs,
+                for child_job_id in self._client.list_jobs(parent_job=self.job_id):
+                    child_job_properties = self._client.get_job(job_id=child_job_id)._properties  # type: ignore
+                    child_inputs, child_outputs = self._get_inputs_and_outputs(child_job_properties)
                     inputs.extend(child_inputs)
-                    outputs.
+                    outputs.extend(child_outputs)
             else:
-                inputs,
-
+                inputs, outputs = self._get_inputs_and_outputs(job_properties)
+
         except Exception as e:
-
-            self.log.warning("Cannot retrieve job details from BigQuery.Client. %s", e, exc_info=True)
+            self.log.warning("Cannot retrieve job details from BigQuery.Client. %s", e, exc_info=True)  # type: ignore[attr-defined]
             exception_msg = traceback.format_exc()
             run_facets.update(
                 {
```
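Most of the rewritten extraction leans on `get_from_nullable_chain`, now imported once at module level, to walk the nested job `_properties` dict without raising on missing keys. A behaviorally similar stand-in (the real helper lives in `airflow.providers.google.cloud.openlineage.utils` and may differ in detail) is:

```python
# Simplified stand-in for get_from_nullable_chain, for illustration only.
def get_from_nullable_chain(source, chain):
    """Walk nested dicts key by key, returning None instead of raising when a key is absent."""
    for key in chain:
        if not isinstance(source, dict):
            return None
        source = source.get(key)
        if source is None:
            return None
    return source

job_properties = {"status": {"state": "DONE"}, "statistics": {"numChildJobs": "2"}}
assert get_from_nullable_chain(job_properties, ["status", "state"]) == "DONE"
assert get_from_nullable_chain(job_properties, ["configuration", "jobType"]) is None
```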
```diff
@@ -149,16 +129,37 @@ class _BigQueryOpenLineageMixin:
                     )
                 }
             )
-        deduplicated_outputs = self._deduplicate_outputs(outputs)
-        return inputs, deduplicated_outputs, run_facets
 
-
-
+        return OperatorLineage(
+            inputs=list(inputs),
+            outputs=self._deduplicate_outputs(outputs),
+            run_facets=run_facets,
+            job_facets={"sql": SQLJobFacet(query=SQLParser.normalize_sql(self.sql))} if self.sql else {},
+        )
+
+    def _get_inputs_and_outputs(self, properties: dict) -> tuple[list[InputDataset], list[OutputDataset]]:
+        job_type = get_from_nullable_chain(properties, ["configuration", "jobType"])
+
+        if job_type == "QUERY":
+            inputs, outputs = self._get_inputs_and_outputs_for_query_job(properties)
+        elif job_type == "LOAD":
+            inputs, outputs = self._get_inputs_and_outputs_for_load_job(properties)
+        elif job_type == "COPY":
+            inputs, outputs = self._get_inputs_and_outputs_for_copy_job(properties)
+        elif job_type == "EXTRACT":
+            inputs, outputs = self._get_inputs_and_outputs_for_extract_job(properties)
+        else:
+            self.log.debug("Unsupported job type for input/output extraction: `%s`.", job_type)  # type: ignore[attr-defined]
+            inputs, outputs = [], []
+
+        return inputs, outputs
+
+    def _deduplicate_outputs(self, outputs: Iterable[OutputDataset | None]) -> list[OutputDataset]:
         final_outputs = {}
         for single_output in outputs:
             if not single_output:
                 continue
-            key = single_output.name
+            key = f"{single_output.namespace}.{single_output.name}"
             if key not in final_outputs:
                 final_outputs[key] = single_output
                 continue
```
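Two behavior changes stand out in this hunk: input/output extraction now dispatches on the job's `configuration.jobType` (QUERY, LOAD, COPY, EXTRACT), and the output-deduplication key now includes the dataset namespace, so same-named datasets in different namespaces are no longer collapsed together. A minimal sketch of the new key's effect (dataset names are made up):

```python
# Minimal sketch of the namespace-qualified dedup key; not provider code.
from airflow.providers.common.compat.openlineage.facet import OutputDataset

outputs = [
    OutputDataset(namespace="bigquery", name="my-project.my_dataset.tbl"),
    OutputDataset(namespace="bigquery", name="my-project.my_dataset.tbl"),  # true duplicate
    OutputDataset(namespace="gs://my-bucket", name="exports/tbl.csv"),
]

final_outputs = {}
for out in outputs:
    key = f"{out.namespace}.{out.name}"  # previously just `out.name`
    final_outputs.setdefault(key, out)

print(len(final_outputs))  # 2: the duplicated BigQuery output collapses; the GCS output survives
```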
```diff
@@ -167,139 +168,403 @@ class _BigQueryOpenLineageMixin:
             # if the rowCount or size can be summed together.
             if single_output.outputFacets:
                 single_output.outputFacets.pop("outputStatistics", None)
-            final_outputs[key] = single_output
-
-        return list(final_outputs.values())
-
-    def _get_inputs_outputs_from_job(
-        self, properties: dict
-    ) -> tuple[list[InputDataset], OutputDataset | None]:
-        from airflow.providers.google.cloud.openlineage.utils import get_from_nullable_chain
-
-        input_tables = get_from_nullable_chain(properties, ["statistics", "query", "referencedTables"]) or []
-        output_table = get_from_nullable_chain(properties, ["configuration", "query", "destinationTable"])
-        inputs = [(self._get_input_dataset(input_table)) for input_table in input_tables]
-        if output_table:
-            output = self._get_output_dataset(output_table)
-            dataset_stat_facet = self._get_statistics_dataset_facet(properties)
-            output.outputFacets = output.outputFacets or {}
-            if dataset_stat_facet:
-                output.outputFacets["outputStatistics"] = dataset_stat_facet
-
-        return inputs, output
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-        return BigQueryJobRunFacet(
-            cached=str(cache_hit).lower() == "true",
-            billedBytes=int(billed_bytes) if billed_bytes else None,
-            properties=json.dumps(properties),
-        )
-
-    @staticmethod
-    def _get_statistics_dataset_facet(
-        properties,
-    ) -> OutputStatisticsOutputDatasetFacet | None:
-        from airflow.providers.common.compat.openlineage.facet import OutputStatisticsOutputDatasetFacet
-        from airflow.providers.google.cloud.openlineage.utils import get_from_nullable_chain
+            # If multiple outputs contain Column Level Lineage Facet - merge the facets
+            if (
+                single_output.facets
+                and final_outputs[key].facets
+                and "columnLineage" in single_output.facets  # type: ignore
+                and "columnLineage" in final_outputs[key].facets  # type: ignore
+            ):
+                single_output.facets["columnLineage"] = merge_column_lineage_facets(
+                    [
+                        single_output.facets["columnLineage"],  # type: ignore
+                        final_outputs[key].facets["columnLineage"],  # type: ignore
+                    ]
+                )
 
-
-        if not query_plan:
-            return None
+            final_outputs[key] = single_output
 
-
-        out_rows = out_stage.get("recordsWritten", None)
-        out_bytes = out_stage.get("shuffleOutputBytes", None)
-        if out_bytes and out_rows:
-            return OutputStatisticsOutputDatasetFacet(rowCount=int(out_rows), size=int(out_bytes))
-        return None
+        return list(final_outputs.values())
 
     def _get_input_dataset(self, table: dict) -> InputDataset:
-        from airflow.providers.common.compat.openlineage.facet import InputDataset
-
         return cast(InputDataset, self._get_dataset(table, "input"))
 
     def _get_output_dataset(self, table: dict) -> OutputDataset:
-        from airflow.providers.common.compat.openlineage.facet import OutputDataset
-
         return cast(OutputDataset, self._get_dataset(table, "output"))
 
     def _get_dataset(self, table: dict, dataset_type: str) -> Dataset:
-        from airflow.providers.common.compat.openlineage.facet import InputDataset, OutputDataset
-
         project = table.get("projectId")
         dataset = table.get("datasetId")
         table_name = table.get("tableId")
         dataset_name = f"{project}.{dataset}.{table_name}"
 
-
+        dataset_facets = self._get_table_facets_safely(dataset_name)
         if dataset_type == "input":
             # Logic specific to creating InputDataset (if needed)
             return InputDataset(
                 namespace=BIGQUERY_NAMESPACE,
                 name=dataset_name,
-                facets=
-                    "schema": dataset_schema,
-                }
-                if dataset_schema
-                else {},
+                facets=dataset_facets,
             )
         elif dataset_type == "output":
             # Logic specific to creating OutputDataset (if needed)
             return OutputDataset(
                 namespace=BIGQUERY_NAMESPACE,
                 name=dataset_name,
-                facets=
-                    "schema": dataset_schema,
-                }
-                if dataset_schema
-                else {},
+                facets=dataset_facets,
             )
         else:
             raise ValueError("Invalid dataset_type. Must be 'input' or 'output'")
 
-    def
+    def _get_table_facets_safely(self, table_name: str) -> dict[str, DatasetFacet]:
         try:
-
+            bq_table = self._client.get_table(table_name)
+            return get_facets_from_bq_table(bq_table)
         except Exception as e:
-
-
+            self.log.warning("Could not extract facets from bigquery table: `%s`. %s", table_name, e)  # type: ignore[attr-defined]
+            return {}
+
+    def _get_inputs_and_outputs_for_query_job(
+        self, properties: dict
+    ) -> tuple[list[InputDataset], list[OutputDataset]]:
+        input_tables = get_from_nullable_chain(properties, ["statistics", "query", "referencedTables"]) or []
+        output_table = get_from_nullable_chain(properties, ["configuration", "query", "destinationTable"])
+
+        inputs = [
+            self._get_input_dataset(input_table)
+            for input_table in input_tables
+            if input_table != output_table  # Output table is in `referencedTables` and needs to be removed
+        ]
+
+        if not output_table:
+            return inputs, []
+
+        output = self._get_output_dataset(output_table)
+        if dataset_stat_facet := self._get_output_statistics_dataset_facet(properties):
+            output.outputFacets = output.outputFacets or {}
+            output.outputFacets["outputStatistics"] = dataset_stat_facet
+        if cll_facet := self._get_column_level_lineage_facet_for_query_job(properties, output, inputs):
+            output.facets = output.facets or {}
+            output.facets["columnLineage"] = cll_facet
+        return inputs, [output]
+
+    def _get_inputs_and_outputs_for_load_job(
+        self, properties: dict
+    ) -> tuple[list[InputDataset], list[OutputDataset]]:
+        output = self._get_output_dataset(properties["configuration"]["load"]["destinationTable"])
+        output_table_schema_facet = output.facets.get("schema") if output.facets else None
+
+        source_uris = properties["configuration"]["load"]["sourceUris"]
+        inputs = [
+            InputDataset(
+                namespace=namespace,
+                name=name,
+                facets={"schema": output_table_schema_facet} if output_table_schema_facet else {},
+            )
+            for namespace, name in get_namespace_name_from_source_uris(source_uris)
+        ]
+
+        if dataset_stat_facet := self._get_output_statistics_dataset_facet(properties):
+            output.outputFacets = output.outputFacets or {}
+            output.outputFacets["outputStatistics"] = dataset_stat_facet
+        if cll_facet := get_identity_column_lineage_facet(self._extract_column_names(output), inputs):
+            output.facets = {**output.facets, **cll_facet} if output.facets else cll_facet
+        return inputs, [output]
+
+    def _get_inputs_and_outputs_for_copy_job(
+        self, properties: dict
+    ) -> tuple[list[InputDataset], list[OutputDataset]]:
+        input_tables = get_from_nullable_chain(properties, ["configuration", "copy", "sourceTables"]) or [
+            get_from_nullable_chain(properties, ["configuration", "copy", "sourceTable"])
+        ]
+        inputs = [self._get_input_dataset(input_table) for input_table in input_tables]
+
+        output = self._get_output_dataset(properties["configuration"]["copy"]["destinationTable"])
+        if dataset_stat_facet := self._get_output_statistics_dataset_facet(properties):
+            output.outputFacets = output.outputFacets or {}
+            output.outputFacets["outputStatistics"] = dataset_stat_facet
+        if cll_facet := get_identity_column_lineage_facet(self._extract_column_names(output), inputs):
+            output.facets = {**output.facets, **cll_facet} if output.facets else cll_facet
+        return inputs, [output]
+
+    def _get_inputs_and_outputs_for_extract_job(
+        self, properties: dict
+    ) -> tuple[list[InputDataset], list[OutputDataset]]:
+        source_table = get_from_nullable_chain(properties, ["configuration", "extract", "sourceTable"])
+        input_dataset = self._get_input_dataset(source_table) if source_table else None
+
+        destination_uris = get_from_nullable_chain(
+            properties, ["configuration", "extract", "destinationUris"]
+        ) or [get_from_nullable_chain(properties, ["configuration", "extract", "destinationUri"])]
+
+        outputs = []
+        for namespace, name in get_namespace_name_from_source_uris(destination_uris):
+            output_facets = {}
+            if input_dataset:
+                input_schema = input_dataset.facets.get("schema") if input_dataset.facets else None
+                if input_schema:
+                    output_facets["schema"] = input_schema
+                if cll_facet := get_identity_column_lineage_facet(
+                    self._extract_column_names(input_dataset), [input_dataset]
+                ):
+                    output_facets = {**output_facets, **cll_facet}
+            outputs.append(OutputDataset(namespace=namespace, name=name, facets=output_facets))
+
+        inputs = [input_dataset] if input_dataset else []
+        return inputs, outputs
+
+    @staticmethod
+    def _get_bigquery_job_run_facet(properties: dict) -> BigQueryJobRunFacet:
+        from airflow.providers.google.cloud.openlineage.facets import BigQueryJobRunFacet
+
+        job_type = get_from_nullable_chain(properties, ["configuration", "jobType"])
+        cache_hit, billed_bytes = None, None
+        if job_type == "QUERY":
+            if get_from_nullable_chain(properties, ["configuration", "query", "query"]):
+                # Exclude the query to avoid event size issues and duplicating SqlJobFacet information.
+                properties = copy.deepcopy(properties)
+                properties["configuration"]["query"].pop("query")
+            cache_hit = get_from_nullable_chain(properties, ["statistics", "query", "cacheHit"])
+            billed_bytes = get_from_nullable_chain(properties, ["statistics", "query", "totalBytesBilled"])
+
+        return BigQueryJobRunFacet(
+            cached=str(cache_hit).lower() == "true",
+            billedBytes=int(billed_bytes) if billed_bytes else None,
+            properties=json.dumps(properties),
+        )
+
+    @staticmethod
+    def _get_output_statistics_dataset_facet(
+        properties,
+    ) -> OutputStatisticsOutputDatasetFacet | None:
+        job_type = get_from_nullable_chain(properties, ["configuration", "jobType"])
+        out_rows, out_bytes = None, None
+        if job_type == "QUERY":
+            query_plan = get_from_nullable_chain(properties, chain=["statistics", "query", "queryPlan"])
+            if not query_plan:  # Without query plan there is no statistics
+                return None
+            out_stage = query_plan[-1]  # Last stage of query plan writes the data and has all the statistics
+            out_rows = out_stage.get("recordsWritten", None)
+            out_bytes = out_stage.get("shuffleOutputBytes", None)
+        elif job_type == "LOAD":
+            out_rows = get_from_nullable_chain(properties, ["statistics", "load", "outputRows"])
+            out_bytes = get_from_nullable_chain(properties, ["statistics", "load", "outputBytes"])
+        elif job_type == "COPY":
+            out_rows = get_from_nullable_chain(properties, ["statistics", "copy", "copiedRows"])
+            out_bytes = get_from_nullable_chain(properties, ["statistics", "copy", "copiedLogicalBytes"])
+        # No statistics available for EXTRACT job type
+
+        if out_bytes and out_rows:
+            return OutputStatisticsOutputDatasetFacet(rowCount=int(out_rows), size=int(out_bytes))
         return None
 
-    def
-
-
-
+    def _get_column_level_lineage_facet_for_query_job(
+        self, properties: dict, output: OutputDataset, inputs: Iterable[InputDataset]
+    ) -> ColumnLineageDatasetFacet | None:
+        """
+        Extract column-level lineage information from a BigQuery job and return it as a facet.
+
+        The Column Level Lineage Facet will NOT be returned if any of the following condition is met:
+        - The parsed result does not contain column lineage information.
+        - The parsed result does not contain exactly one output table.
+        - The parsed result has a different output table than the output table from the BQ job.
+        - The parsed result has at least one input table not present in the input tables from the BQ job.
+        - The parsed result has a column not present in the schema of given dataset from the BQ job.
+
+        Args:
+            properties: The properties of the BigQuery job.
+            output: The output dataset for which the column lineage is being extracted.
+
+        Returns:
+            The extracted Column Lineage Dataset Facet, or None if conditions are not met.
+        """
+        from airflow.providers.openlineage.sqlparser import SQLParser
+
+        # Extract SQL query and parse it
+        self.log.debug("Extracting column-level lineage facet from BigQuery query.")  # type: ignore[attr-defined]
+
+        query = get_from_nullable_chain(properties, ["configuration", "query", "query"])
+        if query is None:
+            self.log.debug("No query found in BQ job configuration. Facet generation skipped.")  # type: ignore[attr-defined]
+            return None
+
+        parse_result = SQLParser("bigquery").parse(SQLParser.split_sql_string(SQLParser.normalize_sql(query)))
+        if parse_result is None or parse_result.column_lineage == []:
+            self.log.debug("No column-level lineage found in the SQL query. Facet generation skipped.")  # type: ignore[attr-defined]
+            return None
+
+        default_dataset, default_project = self._extract_default_dataset_and_project(
+            properties,
+            self.project_id,  # type: ignore[attr-defined]
         )
-        from airflow.providers.google.cloud.openlineage.utils import get_from_nullable_chain
 
-
+        # Verify if the output table id from the parse result matches the BQ job output table
+        if not self._validate_output_table_id(
+            parse_result,
+            output,
+            default_project,
+            default_dataset,
+        ):
+            return None
 
-        if
+        # Verify if all columns from parse results are present in the output dataset schema
+        if not self._validate_output_columns(parse_result, output):
             return None
 
-
-
+        input_tables_from_parse_result = self._extract_parsed_input_tables(
+            parse_result, default_project, default_dataset
+        )
+        input_tables_from_bq = {input_ds.name: self._extract_column_names(input_ds) for input_ds in inputs}
+
+        # Verify if all datasets from parse results are present in bq job input datasets
+        if not self._validate_input_tables(input_tables_from_parse_result, input_tables_from_bq):
+            return None
+
+        # Verify if all columns from parse results are present in their respective bq job input datasets
+        if not self._validate_input_columns(input_tables_from_parse_result, input_tables_from_bq):
             return None
 
-        return
-
-
-
-
-
+        return self._generate_column_lineage_facet(parse_result, default_project, default_dataset)
+
+    @staticmethod
+    def _get_qualified_name_from_parse_result(table, default_project: str, default_dataset: str) -> str:
+        """Get the qualified name of a table from the parse result."""
+        return ".".join(
+            (
+                table.database or default_project,
+                table.schema or default_dataset,
+                table.name,
+            )
+        )
+
+    @staticmethod
+    def _extract_default_dataset_and_project(properties: dict, default_project: str) -> tuple[str, str]:
+        """Extract the default dataset and project from the BigQuery job properties."""
+        default_dataset_obj = get_from_nullable_chain(
+            properties, ["configuration", "query", "defaultDataset"]
+        )
+        default_dataset = default_dataset_obj.get("datasetId", "") if default_dataset_obj else ""
+        default_project = (
+            default_dataset_obj.get("projectId", default_project) if default_dataset_obj else default_project
+        )
+        return default_dataset, default_project
+
+    def _validate_output_table_id(
+        self, parse_result, output: OutputDataset, default_project: str, default_dataset: str
+    ) -> bool:
+        """Check if the output table id from the parse result matches the BQ job output table."""
+        if len(parse_result.out_tables) != 1:
+            self.log.debug(  # type: ignore[attr-defined]
+                "Invalid output tables in the parse result: `%s`. Expected exactly one output table.",
+                parse_result.out_tables,
+            )
+            return False
+
+        parsed_output_table = self._get_qualified_name_from_parse_result(
+            parse_result.out_tables[0], default_project, default_dataset
+        )
+        if parsed_output_table != output.name:
+            self.log.debug(  # type: ignore[attr-defined]
+                "Mismatch between parsed output table `%s` and BQ job output table `%s`.",
+                parsed_output_table,
+                output.name,
+            )
+            return False
+        return True
+
+    @staticmethod
+    def _extract_column_names(dataset: Dataset) -> list[str]:
+        """Extract column names from a dataset's schema."""
+        return [
+            f.name
+            for f in dataset.facets.get("schema", SchemaDatasetFacet(fields=[])).fields  # type: ignore[union-attr]
+            if dataset.facets
+        ]
+
+    def _validate_output_columns(self, parse_result, output: OutputDataset) -> bool:
+        """Validate if all descendant columns in parse result exist in output dataset schema."""
+        output_column_names = self._extract_column_names(output)
+        missing_columns = [
+            lineage.descendant.name
+            for lineage in parse_result.column_lineage
+            if lineage.descendant.name not in output_column_names
+        ]
+        if missing_columns:
+            self.log.debug(  # type: ignore[attr-defined]
+                "Output dataset schema is missing columns from the parse result: `%s`.", missing_columns
+            )
+            return False
+        return True
+
+    def _extract_parsed_input_tables(
+        self, parse_result, default_project: str, default_dataset: str
+    ) -> dict[str, list[str]]:
+        """Extract input tables and their columns from the parse result."""
+        input_tables: dict[str, list[str]] = {}
+        for lineage in parse_result.column_lineage:
+            for column_meta in lineage.lineage:
+                if not column_meta.origin:
+                    self.log.debug(  # type: ignore[attr-defined]
+                        "Column `%s` lacks origin information. Skipping facet generation.", column_meta.name
+                    )
+                    return {}
+
+                input_table_id = self._get_qualified_name_from_parse_result(
+                    column_meta.origin, default_project, default_dataset
+                )
+                input_tables.setdefault(input_table_id, []).append(column_meta.name)
+        return input_tables
+
+    def _validate_input_tables(
+        self, parsed_input_tables: dict[str, list[str]], input_tables_from_bq: dict[str, list[str]]
+    ) -> bool:
+        """Validate if all parsed input tables exist in the BQ job's input datasets."""
+        if not parsed_input_tables:
+            self.log.debug("No input tables found in the parse result. Facet generation skipped.")  # type: ignore[attr-defined]
+            return False
+        if missing_tables := set(parsed_input_tables) - set(input_tables_from_bq):
+            self.log.debug(  # type: ignore[attr-defined]
+                "Parsed input tables not found in the BQ job's input datasets: `%s`.", missing_tables
+            )
+            return False
+        return True
+
+    def _validate_input_columns(
+        self, parsed_input_tables: dict[str, list[str]], input_tables_from_bq: dict[str, list[str]]
+    ) -> bool:
+        """Validate if all parsed input columns exist in their respective BQ job input table schemas."""
+        if not parsed_input_tables:
+            self.log.debug("No input tables found in the parse result. Facet generation skipped.")  # type: ignore[attr-defined]
+            return False
+        for table, columns in parsed_input_tables.items():
+            if missing_columns := set(columns) - set(input_tables_from_bq.get(table, [])):
+                self.log.debug(  # type: ignore[attr-defined]
+                    "Input table `%s` is missing columns from the parse result: `%s`.", table, missing_columns
+                )
+                return False
+        return True
+
+    def _generate_column_lineage_facet(
+        self, parse_result, default_project: str, default_dataset: str
+    ) -> ColumnLineageDatasetFacet:
+        """Generate the ColumnLineageDatasetFacet based on the parsed result."""
+        return ColumnLineageDatasetFacet(
+            fields={
+                lineage.descendant.name: Fields(
+                    inputFields=[
+                        InputField(
+                            namespace=BIGQUERY_NAMESPACE,
+                            name=self._get_qualified_name_from_parse_result(
+                                column_meta.origin, default_project, default_dataset
+                            ),
+                            field=column_meta.name,
+                        )
+                        for column_meta in lineage.lineage
+                    ],
+                    transformationType="",
+                    transformationDescription="",
                 )
-                for
-
+                for lineage in parse_result.column_lineage
+            }
         )
```
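Throughout the column-lineage validation above, every table reference from the SQL parser is first expanded to a fully qualified `project.dataset.table` name, with missing parts filled from the job's default project and `defaultDataset`. A standalone sketch of that fallback (the identifiers are made up):

```python
# Standalone sketch of the fallback in _get_qualified_name_from_parse_result; not provider code.
def qualified_name(database, schema, name, default_project, default_dataset):
    return ".".join((database or default_project, schema or default_dataset, name))

# e.g. "SELECT ... FROM orders" run with defaultDataset {"projectId": "my-proj", "datasetId": "sales"}
print(qualified_name(None, None, "orders", "my-proj", "sales"))           # my-proj.sales.orders
print(qualified_name("other-proj", "raw", "orders", "my-proj", "sales"))  # other-proj.raw.orders
```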