apache-airflow-providers-google 14.0.0__py3-none-any.whl → 19.1.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airflow/providers/google/3rd-party-licenses/LICENSES.txt +14 -0
- airflow/providers/google/3rd-party-licenses/NOTICE +5 -0
- airflow/providers/google/__init__.py +3 -3
- airflow/providers/google/_vendor/__init__.py +0 -0
- airflow/providers/google/_vendor/json_merge_patch.py +91 -0
- airflow/providers/google/ads/hooks/ads.py +52 -43
- airflow/providers/google/ads/operators/ads.py +2 -2
- airflow/providers/google/ads/transfers/ads_to_gcs.py +3 -19
- airflow/providers/google/assets/gcs.py +1 -11
- airflow/providers/google/cloud/_internal_client/secret_manager_client.py +3 -2
- airflow/providers/google/cloud/bundles/gcs.py +161 -0
- airflow/providers/google/cloud/hooks/alloy_db.py +2 -3
- airflow/providers/google/cloud/hooks/bigquery.py +195 -318
- airflow/providers/google/cloud/hooks/bigquery_dts.py +8 -8
- airflow/providers/google/cloud/hooks/bigtable.py +3 -2
- airflow/providers/google/cloud/hooks/cloud_batch.py +8 -9
- airflow/providers/google/cloud/hooks/cloud_build.py +6 -65
- airflow/providers/google/cloud/hooks/cloud_composer.py +292 -24
- airflow/providers/google/cloud/hooks/cloud_logging.py +109 -0
- airflow/providers/google/cloud/hooks/cloud_memorystore.py +4 -3
- airflow/providers/google/cloud/hooks/cloud_run.py +20 -11
- airflow/providers/google/cloud/hooks/cloud_sql.py +136 -64
- airflow/providers/google/cloud/hooks/cloud_storage_transfer_service.py +35 -15
- airflow/providers/google/cloud/hooks/compute.py +7 -6
- airflow/providers/google/cloud/hooks/compute_ssh.py +7 -4
- airflow/providers/google/cloud/hooks/datacatalog.py +12 -3
- airflow/providers/google/cloud/hooks/dataflow.py +87 -242
- airflow/providers/google/cloud/hooks/dataform.py +9 -14
- airflow/providers/google/cloud/hooks/datafusion.py +7 -9
- airflow/providers/google/cloud/hooks/dataplex.py +13 -12
- airflow/providers/google/cloud/hooks/dataprep.py +2 -2
- airflow/providers/google/cloud/hooks/dataproc.py +76 -74
- airflow/providers/google/cloud/hooks/dataproc_metastore.py +4 -3
- airflow/providers/google/cloud/hooks/dlp.py +5 -4
- airflow/providers/google/cloud/hooks/gcs.py +144 -33
- airflow/providers/google/cloud/hooks/gen_ai.py +196 -0
- airflow/providers/google/cloud/hooks/kms.py +3 -2
- airflow/providers/google/cloud/hooks/kubernetes_engine.py +22 -17
- airflow/providers/google/cloud/hooks/looker.py +6 -1
- airflow/providers/google/cloud/hooks/managed_kafka.py +227 -3
- airflow/providers/google/cloud/hooks/mlengine.py +7 -8
- airflow/providers/google/cloud/hooks/natural_language.py +3 -2
- airflow/providers/google/cloud/hooks/os_login.py +3 -2
- airflow/providers/google/cloud/hooks/pubsub.py +6 -6
- airflow/providers/google/cloud/hooks/secret_manager.py +105 -12
- airflow/providers/google/cloud/hooks/spanner.py +75 -10
- airflow/providers/google/cloud/hooks/speech_to_text.py +3 -2
- airflow/providers/google/cloud/hooks/stackdriver.py +18 -18
- airflow/providers/google/cloud/hooks/tasks.py +4 -3
- airflow/providers/google/cloud/hooks/text_to_speech.py +3 -2
- airflow/providers/google/cloud/hooks/translate.py +8 -17
- airflow/providers/google/cloud/hooks/vertex_ai/auto_ml.py +8 -222
- airflow/providers/google/cloud/hooks/vertex_ai/batch_prediction_job.py +9 -15
- airflow/providers/google/cloud/hooks/vertex_ai/custom_job.py +33 -283
- airflow/providers/google/cloud/hooks/vertex_ai/dataset.py +5 -12
- airflow/providers/google/cloud/hooks/vertex_ai/endpoint_service.py +6 -12
- airflow/providers/google/cloud/hooks/vertex_ai/experiment_service.py +202 -0
- airflow/providers/google/cloud/hooks/vertex_ai/feature_store.py +311 -10
- airflow/providers/google/cloud/hooks/vertex_ai/generative_model.py +79 -75
- airflow/providers/google/cloud/hooks/vertex_ai/hyperparameter_tuning_job.py +7 -13
- airflow/providers/google/cloud/hooks/vertex_ai/model_service.py +8 -12
- airflow/providers/google/cloud/hooks/vertex_ai/pipeline_job.py +6 -12
- airflow/providers/google/cloud/hooks/vertex_ai/prediction_service.py +3 -2
- airflow/providers/google/cloud/hooks/vertex_ai/ray.py +223 -0
- airflow/providers/google/cloud/hooks/video_intelligence.py +3 -2
- airflow/providers/google/cloud/hooks/vision.py +7 -7
- airflow/providers/google/cloud/hooks/workflows.py +4 -3
- airflow/providers/google/cloud/links/alloy_db.py +0 -46
- airflow/providers/google/cloud/links/base.py +77 -7
- airflow/providers/google/cloud/links/bigquery.py +0 -47
- airflow/providers/google/cloud/links/bigquery_dts.py +0 -20
- airflow/providers/google/cloud/links/bigtable.py +0 -48
- airflow/providers/google/cloud/links/cloud_build.py +0 -73
- airflow/providers/google/cloud/links/cloud_functions.py +0 -33
- airflow/providers/google/cloud/links/cloud_memorystore.py +0 -58
- airflow/providers/google/cloud/links/{life_sciences.py → cloud_run.py} +5 -27
- airflow/providers/google/cloud/links/cloud_sql.py +0 -33
- airflow/providers/google/cloud/links/cloud_storage_transfer.py +17 -46
- airflow/providers/google/cloud/links/cloud_tasks.py +7 -26
- airflow/providers/google/cloud/links/compute.py +0 -58
- airflow/providers/google/cloud/links/data_loss_prevention.py +0 -169
- airflow/providers/google/cloud/links/datacatalog.py +23 -54
- airflow/providers/google/cloud/links/dataflow.py +0 -34
- airflow/providers/google/cloud/links/dataform.py +0 -64
- airflow/providers/google/cloud/links/datafusion.py +1 -90
- airflow/providers/google/cloud/links/dataplex.py +0 -154
- airflow/providers/google/cloud/links/dataprep.py +0 -24
- airflow/providers/google/cloud/links/dataproc.py +11 -89
- airflow/providers/google/cloud/links/datastore.py +0 -31
- airflow/providers/google/cloud/links/kubernetes_engine.py +11 -61
- airflow/providers/google/cloud/links/managed_kafka.py +11 -51
- airflow/providers/google/cloud/links/mlengine.py +0 -70
- airflow/providers/google/cloud/links/pubsub.py +0 -32
- airflow/providers/google/cloud/links/spanner.py +0 -33
- airflow/providers/google/cloud/links/stackdriver.py +0 -30
- airflow/providers/google/cloud/links/translate.py +17 -187
- airflow/providers/google/cloud/links/vertex_ai.py +28 -195
- airflow/providers/google/cloud/links/workflows.py +0 -52
- airflow/providers/google/cloud/log/gcs_task_handler.py +166 -118
- airflow/providers/google/cloud/log/stackdriver_task_handler.py +14 -9
- airflow/providers/google/cloud/openlineage/CloudStorageTransferJobFacet.json +68 -0
- airflow/providers/google/cloud/openlineage/CloudStorageTransferRunFacet.json +60 -0
- airflow/providers/google/cloud/openlineage/DataFusionRunFacet.json +32 -0
- airflow/providers/google/cloud/openlineage/facets.py +141 -40
- airflow/providers/google/cloud/openlineage/mixins.py +14 -13
- airflow/providers/google/cloud/openlineage/utils.py +19 -3
- airflow/providers/google/cloud/operators/alloy_db.py +76 -61
- airflow/providers/google/cloud/operators/bigquery.py +104 -667
- airflow/providers/google/cloud/operators/bigquery_dts.py +12 -12
- airflow/providers/google/cloud/operators/bigtable.py +38 -7
- airflow/providers/google/cloud/operators/cloud_base.py +22 -1
- airflow/providers/google/cloud/operators/cloud_batch.py +18 -18
- airflow/providers/google/cloud/operators/cloud_build.py +80 -36
- airflow/providers/google/cloud/operators/cloud_composer.py +157 -71
- airflow/providers/google/cloud/operators/cloud_logging_sink.py +341 -0
- airflow/providers/google/cloud/operators/cloud_memorystore.py +74 -46
- airflow/providers/google/cloud/operators/cloud_run.py +39 -20
- airflow/providers/google/cloud/operators/cloud_sql.py +46 -61
- airflow/providers/google/cloud/operators/cloud_storage_transfer_service.py +92 -14
- airflow/providers/google/cloud/operators/compute.py +18 -50
- airflow/providers/google/cloud/operators/datacatalog.py +167 -29
- airflow/providers/google/cloud/operators/dataflow.py +38 -15
- airflow/providers/google/cloud/operators/dataform.py +19 -7
- airflow/providers/google/cloud/operators/datafusion.py +43 -43
- airflow/providers/google/cloud/operators/dataplex.py +212 -126
- airflow/providers/google/cloud/operators/dataprep.py +1 -5
- airflow/providers/google/cloud/operators/dataproc.py +134 -207
- airflow/providers/google/cloud/operators/dataproc_metastore.py +102 -84
- airflow/providers/google/cloud/operators/datastore.py +22 -6
- airflow/providers/google/cloud/operators/dlp.py +24 -45
- airflow/providers/google/cloud/operators/functions.py +21 -14
- airflow/providers/google/cloud/operators/gcs.py +15 -12
- airflow/providers/google/cloud/operators/gen_ai.py +389 -0
- airflow/providers/google/cloud/operators/kubernetes_engine.py +115 -106
- airflow/providers/google/cloud/operators/looker.py +1 -1
- airflow/providers/google/cloud/operators/managed_kafka.py +362 -40
- airflow/providers/google/cloud/operators/natural_language.py +5 -3
- airflow/providers/google/cloud/operators/pubsub.py +69 -21
- airflow/providers/google/cloud/operators/spanner.py +53 -45
- airflow/providers/google/cloud/operators/speech_to_text.py +5 -4
- airflow/providers/google/cloud/operators/stackdriver.py +5 -11
- airflow/providers/google/cloud/operators/tasks.py +6 -15
- airflow/providers/google/cloud/operators/text_to_speech.py +4 -3
- airflow/providers/google/cloud/operators/translate.py +46 -20
- airflow/providers/google/cloud/operators/translate_speech.py +4 -3
- airflow/providers/google/cloud/operators/vertex_ai/auto_ml.py +44 -34
- airflow/providers/google/cloud/operators/vertex_ai/batch_prediction_job.py +34 -12
- airflow/providers/google/cloud/operators/vertex_ai/custom_job.py +62 -53
- airflow/providers/google/cloud/operators/vertex_ai/dataset.py +75 -11
- airflow/providers/google/cloud/operators/vertex_ai/endpoint_service.py +48 -12
- airflow/providers/google/cloud/operators/vertex_ai/experiment_service.py +435 -0
- airflow/providers/google/cloud/operators/vertex_ai/feature_store.py +532 -1
- airflow/providers/google/cloud/operators/vertex_ai/generative_model.py +135 -116
- airflow/providers/google/cloud/operators/vertex_ai/hyperparameter_tuning_job.py +16 -12
- airflow/providers/google/cloud/operators/vertex_ai/model_service.py +62 -14
- airflow/providers/google/cloud/operators/vertex_ai/pipeline_job.py +35 -10
- airflow/providers/google/cloud/operators/vertex_ai/ray.py +393 -0
- airflow/providers/google/cloud/operators/video_intelligence.py +5 -3
- airflow/providers/google/cloud/operators/vision.py +7 -5
- airflow/providers/google/cloud/operators/workflows.py +24 -19
- airflow/providers/google/cloud/secrets/secret_manager.py +2 -1
- airflow/providers/google/cloud/sensors/bigquery.py +2 -2
- airflow/providers/google/cloud/sensors/bigquery_dts.py +6 -4
- airflow/providers/google/cloud/sensors/bigtable.py +14 -6
- airflow/providers/google/cloud/sensors/cloud_composer.py +535 -33
- airflow/providers/google/cloud/sensors/cloud_storage_transfer_service.py +6 -5
- airflow/providers/google/cloud/sensors/dataflow.py +27 -10
- airflow/providers/google/cloud/sensors/dataform.py +2 -2
- airflow/providers/google/cloud/sensors/datafusion.py +4 -4
- airflow/providers/google/cloud/sensors/dataplex.py +7 -5
- airflow/providers/google/cloud/sensors/dataprep.py +2 -2
- airflow/providers/google/cloud/sensors/dataproc.py +10 -9
- airflow/providers/google/cloud/sensors/dataproc_metastore.py +4 -3
- airflow/providers/google/cloud/sensors/gcs.py +22 -21
- airflow/providers/google/cloud/sensors/looker.py +5 -5
- airflow/providers/google/cloud/sensors/pubsub.py +20 -20
- airflow/providers/google/cloud/sensors/tasks.py +2 -2
- airflow/providers/google/cloud/sensors/vertex_ai/feature_store.py +2 -2
- airflow/providers/google/cloud/sensors/workflows.py +6 -4
- airflow/providers/google/cloud/transfers/adls_to_gcs.py +1 -1
- airflow/providers/google/cloud/transfers/azure_blob_to_gcs.py +2 -2
- airflow/providers/google/cloud/transfers/azure_fileshare_to_gcs.py +2 -2
- airflow/providers/google/cloud/transfers/bigquery_to_bigquery.py +11 -8
- airflow/providers/google/cloud/transfers/bigquery_to_gcs.py +14 -13
- airflow/providers/google/cloud/transfers/bigquery_to_mssql.py +7 -3
- airflow/providers/google/cloud/transfers/bigquery_to_mysql.py +12 -1
- airflow/providers/google/cloud/transfers/bigquery_to_postgres.py +24 -10
- airflow/providers/google/cloud/transfers/bigquery_to_sql.py +104 -5
- airflow/providers/google/cloud/transfers/calendar_to_gcs.py +1 -1
- airflow/providers/google/cloud/transfers/cassandra_to_gcs.py +18 -22
- airflow/providers/google/cloud/transfers/facebook_ads_to_gcs.py +4 -5
- airflow/providers/google/cloud/transfers/gcs_to_bigquery.py +45 -38
- airflow/providers/google/cloud/transfers/gcs_to_gcs.py +2 -2
- airflow/providers/google/cloud/transfers/gcs_to_local.py +5 -3
- airflow/providers/google/cloud/transfers/gcs_to_sftp.py +10 -4
- airflow/providers/google/cloud/transfers/gdrive_to_gcs.py +6 -2
- airflow/providers/google/cloud/transfers/gdrive_to_local.py +2 -2
- airflow/providers/google/cloud/transfers/http_to_gcs.py +193 -0
- airflow/providers/google/cloud/transfers/local_to_gcs.py +2 -2
- airflow/providers/google/cloud/transfers/mssql_to_gcs.py +1 -1
- airflow/providers/google/cloud/transfers/oracle_to_gcs.py +36 -11
- airflow/providers/google/cloud/transfers/postgres_to_gcs.py +44 -12
- airflow/providers/google/cloud/transfers/s3_to_gcs.py +12 -6
- airflow/providers/google/cloud/transfers/salesforce_to_gcs.py +2 -2
- airflow/providers/google/cloud/transfers/sftp_to_gcs.py +36 -14
- airflow/providers/google/cloud/transfers/sheets_to_gcs.py +3 -3
- airflow/providers/google/cloud/transfers/sql_to_gcs.py +10 -10
- airflow/providers/google/cloud/triggers/bigquery.py +75 -34
- airflow/providers/google/cloud/triggers/bigquery_dts.py +2 -1
- airflow/providers/google/cloud/triggers/cloud_batch.py +2 -1
- airflow/providers/google/cloud/triggers/cloud_build.py +3 -2
- airflow/providers/google/cloud/triggers/cloud_composer.py +303 -47
- airflow/providers/google/cloud/triggers/cloud_run.py +2 -2
- airflow/providers/google/cloud/triggers/cloud_storage_transfer_service.py +96 -5
- airflow/providers/google/cloud/triggers/dataflow.py +125 -2
- airflow/providers/google/cloud/triggers/datafusion.py +1 -1
- airflow/providers/google/cloud/triggers/dataplex.py +16 -3
- airflow/providers/google/cloud/triggers/dataproc.py +124 -53
- airflow/providers/google/cloud/triggers/kubernetes_engine.py +46 -28
- airflow/providers/google/cloud/triggers/mlengine.py +1 -1
- airflow/providers/google/cloud/triggers/pubsub.py +17 -20
- airflow/providers/google/cloud/triggers/vertex_ai.py +8 -7
- airflow/providers/google/cloud/utils/bigquery.py +5 -7
- airflow/providers/google/cloud/utils/bigquery_get_data.py +1 -1
- airflow/providers/google/cloud/utils/credentials_provider.py +4 -3
- airflow/providers/google/cloud/utils/dataform.py +1 -1
- airflow/providers/google/cloud/utils/external_token_supplier.py +0 -1
- airflow/providers/google/cloud/utils/field_validator.py +1 -2
- airflow/providers/google/cloud/utils/validators.py +43 -0
- airflow/providers/google/common/auth_backend/google_openid.py +26 -9
- airflow/providers/google/common/consts.py +2 -1
- airflow/providers/google/common/deprecated.py +2 -1
- airflow/providers/google/common/hooks/base_google.py +40 -43
- airflow/providers/google/common/hooks/operation_helpers.py +78 -0
- airflow/providers/google/common/links/storage.py +0 -22
- airflow/providers/google/common/utils/get_secret.py +31 -0
- airflow/providers/google/common/utils/id_token_credentials.py +4 -5
- airflow/providers/google/firebase/operators/firestore.py +2 -2
- airflow/providers/google/get_provider_info.py +61 -216
- airflow/providers/google/go_module_utils.py +35 -3
- airflow/providers/google/leveldb/hooks/leveldb.py +30 -6
- airflow/providers/google/leveldb/operators/leveldb.py +2 -2
- airflow/providers/google/marketing_platform/hooks/analytics_admin.py +3 -2
- airflow/providers/google/marketing_platform/hooks/display_video.py +3 -109
- airflow/providers/google/marketing_platform/hooks/search_ads.py +1 -1
- airflow/providers/google/marketing_platform/links/analytics_admin.py +4 -5
- airflow/providers/google/marketing_platform/operators/analytics_admin.py +7 -6
- airflow/providers/google/marketing_platform/operators/campaign_manager.py +5 -5
- airflow/providers/google/marketing_platform/operators/display_video.py +28 -489
- airflow/providers/google/marketing_platform/operators/search_ads.py +2 -2
- airflow/providers/google/marketing_platform/sensors/campaign_manager.py +2 -2
- airflow/providers/google/marketing_platform/sensors/display_video.py +4 -64
- airflow/providers/google/suite/hooks/calendar.py +1 -1
- airflow/providers/google/suite/hooks/drive.py +2 -2
- airflow/providers/google/suite/hooks/sheets.py +15 -1
- airflow/providers/google/suite/operators/sheets.py +8 -3
- airflow/providers/google/suite/sensors/drive.py +2 -2
- airflow/providers/google/suite/transfers/gcs_to_gdrive.py +2 -2
- airflow/providers/google/suite/transfers/gcs_to_sheets.py +1 -1
- airflow/providers/google/suite/transfers/local_to_drive.py +3 -3
- airflow/providers/google/suite/transfers/sql_to_sheets.py +5 -4
- airflow/providers/google/version_compat.py +15 -1
- {apache_airflow_providers_google-14.0.0.dist-info → apache_airflow_providers_google-19.1.0rc1.dist-info}/METADATA +117 -72
- apache_airflow_providers_google-19.1.0rc1.dist-info/RECORD +331 -0
- {apache_airflow_providers_google-14.0.0.dist-info → apache_airflow_providers_google-19.1.0rc1.dist-info}/WHEEL +1 -1
- apache_airflow_providers_google-19.1.0rc1.dist-info/licenses/NOTICE +5 -0
- airflow/providers/google/cloud/example_dags/example_cloud_task.py +0 -54
- airflow/providers/google/cloud/hooks/automl.py +0 -679
- airflow/providers/google/cloud/hooks/life_sciences.py +0 -159
- airflow/providers/google/cloud/links/automl.py +0 -193
- airflow/providers/google/cloud/operators/automl.py +0 -1360
- airflow/providers/google/cloud/operators/life_sciences.py +0 -119
- airflow/providers/google/cloud/operators/mlengine.py +0 -1515
- airflow/providers/google/cloud/utils/mlengine_operator_utils.py +0 -273
- apache_airflow_providers_google-14.0.0.dist-info/RECORD +0 -318
- /airflow/providers/google/cloud/{example_dags → bundles}/__init__.py +0 -0
- {apache_airflow_providers_google-14.0.0.dist-info → apache_airflow_providers_google-19.1.0rc1.dist-info}/entry_points.txt +0 -0
- {airflow/providers/google → apache_airflow_providers_google-19.1.0rc1.dist-info/licenses}/LICENSE +0 -0
@@ -27,9 +27,15 @@ from collections.abc import Sequence
 from functools import cached_property
 from typing import TYPE_CHECKING, Any, SupportsAbs
 
+from google.api_core.exceptions import Conflict
+from google.api_core.gapic_v1.method import DEFAULT, _MethodDefault
+from google.cloud.bigquery import DEFAULT_RETRY, CopyJob, ExtractJob, LoadJob, QueryJob, Row
+from google.cloud.bigquery.table import RowIterator, Table, TableListItem, TableReference
+
 from airflow.configuration import conf
-from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning
-from airflow.providers.common.
+from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning
+from airflow.providers.common.compat.sdk import AirflowSkipException
+from airflow.providers.common.sql.operators.sql import (  # for _parse_boolean
     SQLCheckOperator,
     SQLColumnCheckOperator,
     SQLIntervalCheckOperator,
@@ -54,19 +60,15 @@ from airflow.providers.google.cloud.triggers.bigquery import (
     BigQueryValueCheckTrigger,
 )
 from airflow.providers.google.cloud.utils.bigquery import convert_job_id
-from airflow.providers.google.common.deprecated import deprecated
 from airflow.providers.google.common.hooks.base_google import PROVIDE_PROJECT_ID
 from airflow.utils.helpers import exactly_one
-from google.api_core.exceptions import Conflict
-from google.api_core.gapic_v1.method import DEFAULT, _MethodDefault
-from google.cloud.bigquery import DEFAULT_RETRY, CopyJob, ExtractJob, LoadJob, QueryJob, Row
-from google.cloud.bigquery.table import RowIterator, Table, TableListItem, TableReference
 
 if TYPE_CHECKING:
-    from airflow.utils.context import Context
     from google.api_core.retry import Retry
     from google.cloud.bigquery import UnknownJob
 
+    from airflow.providers.common.compat.sdk import Context
+
 
 BIGQUERY_JOB_DETAILS_LINK_FMT = "https://console.cloud.google.com/bigquery?j={job_id}"
 
@@ -91,10 +93,23 @@ class IfExistAction(enum.Enum):
     SKIP = "skip"
 
 
+class _BigQueryHookWithFlexibleProjectId(BigQueryHook):
+    @property
+    def project_id(self) -> str:
+        _, project_id = self.get_credentials_and_project_id()
+        return project_id or PROVIDE_PROJECT_ID
+
+    @project_id.setter
+    def project_id(self, value: str) -> None:
+        cached_creds, _ = self.get_credentials_and_project_id()
+        self._cached_project_id = value or PROVIDE_PROJECT_ID
+        self._cached_credntials = cached_creds
+
+
 class _BigQueryDbHookMixin:
-    def get_db_hook(self: BigQueryCheckOperator) ->
+    def get_db_hook(self: BigQueryCheckOperator) -> _BigQueryHookWithFlexibleProjectId:  # type:ignore[misc]
         """Get BigQuery DB Hook."""
-
+        hook = _BigQueryHookWithFlexibleProjectId(
             gcp_conn_id=self.gcp_conn_id,
             use_legacy_sql=self.use_legacy_sql,
             location=self.location,
@@ -102,6 +117,11 @@ class _BigQueryDbHookMixin:
             labels=self.labels,
         )
 
+        # mypy assuming project_id is read only, as project_id is a property in GoogleBaseHook.
+        if self.project_id:
+            hook.project_id = self.project_id  # type:ignore[misc]
+        return hook
+
 
 class _BigQueryOperatorsEncryptionConfigurationMixin:
     """A class to handle the configuration for BigQueryHook.insert_job method."""
@@ -188,6 +208,7 @@ class BigQueryCheckOperator(
         https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs.
         For example, [{ 'name': 'corpus', 'parameterType': { 'type': 'STRING' },
         'parameterValue': { 'value': 'romeoandjuliet' } }]. (templated)
+    :param project_id: Google Cloud Project where the job is running
     """
 
     template_fields: Sequence[str] = (
@@ -206,6 +227,7 @@ class BigQueryCheckOperator(
         *,
         sql: str,
         gcp_conn_id: str = "google_cloud_default",
+        project_id: str = PROVIDE_PROJECT_ID,
         use_legacy_sql: bool = True,
         location: str | None = None,
         impersonation_chain: str | Sequence[str] | None = None,
@@ -226,6 +248,7 @@ class BigQueryCheckOperator(
         self.deferrable = deferrable
         self.poll_interval = poll_interval
         self.query_params = query_params
+        self.project_id = project_id
 
     def _submit_job(
         self,
@@ -241,7 +264,7 @@ class BigQueryCheckOperator(
 
         return hook.insert_job(
             configuration=configuration,
-            project_id=
+            project_id=self.project_id,
             location=self.location,
             job_id=job_id,
             nowait=True,
@@ -255,6 +278,8 @@ class BigQueryCheckOperator(
                 gcp_conn_id=self.gcp_conn_id,
                 impersonation_chain=self.impersonation_chain,
             )
+            if self.project_id is None:
+                self.project_id = hook.project_id
             job = self._submit_job(hook, job_id="")
             context["ti"].xcom_push(key="job_id", value=job.job_id)
             if job.running():
@@ -263,7 +288,7 @@ class BigQueryCheckOperator(
                 trigger=BigQueryCheckTrigger(
                     conn_id=self.gcp_conn_id,
                     job_id=job.job_id,
-                    project_id=
+                    project_id=self.project_id,
                     location=self.location or hook.location,
                     poll_interval=self.poll_interval,
                     impersonation_chain=self.impersonation_chain,
@@ -285,10 +310,8 @@ class BigQueryCheckOperator(
     def _validate_records(self, records) -> None:
         if not records:
             raise AirflowException(f"The following query returned zero rows: {self.sql}")
-
-            self._raise_exception(
-                f"Test failed.\nQuery:\n{self.sql}\nResults:\n{records!s}"
-            )
+        if not all(records):
+            self._raise_exception(f"Test failed.\nQuery:\n{self.sql}\nResults:\n{records!s}")
 
     def execute_complete(self, context: Context, event: dict[str, Any]) -> None:
         """
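Taken together, the BigQueryCheckOperator hunks above add an explicit project_id parameter that is stored on the operator, forwarded to BigQueryHook.insert_job, and passed to the deferrable trigger, falling back to the hook's own project when left unset. A minimal usage sketch under those assumptions; the task id, SQL, and project name below are illustrative and not taken from this diff:

    from airflow.providers.google.cloud.operators.bigquery import BigQueryCheckOperator

    check_rows = BigQueryCheckOperator(
        task_id="check_rows_exist",  # hypothetical task id
        sql="SELECT COUNT(*) FROM `my_dataset.my_table`",  # hypothetical query
        project_id="my-analytics-project",  # new parameter; defaults to PROVIDE_PROJECT_ID (the hook's project)
        use_legacy_sql=False,
        gcp_conn_id="google_cloud_default",
    )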
@@ -340,6 +363,7 @@ class BigQueryValueCheckOperator(
     :param deferrable: Run operator in the deferrable mode.
     :param poll_interval: (Deferrable mode only) polling period in seconds to
         check for the status of job.
+    :param project_id: Google Cloud Project where the job is running
     """
 
     template_fields: Sequence[str] = (
@@ -361,6 +385,7 @@ class BigQueryValueCheckOperator(
         tolerance: Any = None,
         encryption_configuration: dict | None = None,
         gcp_conn_id: str = "google_cloud_default",
+        project_id: str = PROVIDE_PROJECT_ID,
         use_legacy_sql: bool = True,
         location: str | None = None,
         impersonation_chain: str | Sequence[str] | None = None,
@@ -378,6 +403,7 @@ class BigQueryValueCheckOperator(
         self.labels = labels
         self.deferrable = deferrable
         self.poll_interval = poll_interval
+        self.project_id = project_id
 
     def _submit_job(
         self,
@@ -396,18 +422,19 @@ class BigQueryValueCheckOperator(
 
         return hook.insert_job(
             configuration=configuration,
-            project_id=
+            project_id=self.project_id,
             location=self.location,
             job_id=job_id,
             nowait=True,
         )
 
-    def execute(self, context: Context) -> None:
+    def execute(self, context: Context) -> None:
         if not self.deferrable:
             super().execute(context=context)
         else:
             hook = BigQueryHook(gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain)
-
+            if self.project_id is None:
+                self.project_id = hook.project_id
             job = self._submit_job(hook, job_id="")
             context["ti"].xcom_push(key="job_id", value=job.job_id)
             if job.running():
@@ -416,7 +443,7 @@ class BigQueryValueCheckOperator(
                 trigger=BigQueryValueCheckTrigger(
                     conn_id=self.gcp_conn_id,
                     job_id=job.job_id,
-                    project_id=
+                    project_id=self.project_id,
                     location=self.location or hook.location,
                     sql=self.sql,
                     pass_value=self.pass_value,
@@ -573,6 +600,9 @@ class BigQueryIntervalCheckOperator(
             hook = BigQueryHook(gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain)
             self.log.info("Using ratio formula: %s", self.ratio_formula)
 
+            if self.project_id is None:
+                self.project_id = hook.project_id
+
             self.log.info("Executing SQL check: %s", self.sql1)
             job_1 = self._submit_job(hook, sql=self.sql1, job_id="")
             context["ti"].xcom_push(key="job_id", value=job_1.job_id)
@@ -585,7 +615,7 @@ class BigQueryIntervalCheckOperator(
                     conn_id=self.gcp_conn_id,
                     first_job_id=job_1.job_id,
                     second_job_id=job_2.job_id,
-                    project_id=
+                    project_id=self.project_id,
                     table=self.table,
                     location=self.location or hook.location,
                     metrics_thresholds=self.metrics_thresholds,
@@ -652,6 +682,7 @@ class BigQueryColumnCheckOperator(
         Service Account Token Creator IAM role to the directly preceding identity, with first
         account from the list granting this role to the originating account (templated).
     :param labels: a dictionary containing labels for the table, passed to BigQuery
+    :param project_id: Google Cloud Project where the job is running
     """
 
     template_fields: Sequence[str] = tuple(set(SQLColumnCheckOperator.template_fields) | {"gcp_conn_id"})
@@ -668,6 +699,7 @@ class BigQueryColumnCheckOperator(
         accept_none: bool = True,
         encryption_configuration: dict | None = None,
         gcp_conn_id: str = "google_cloud_default",
+        project_id: str = PROVIDE_PROJECT_ID,
         use_legacy_sql: bool = True,
         location: str | None = None,
         impersonation_chain: str | Sequence[str] | None = None,
@@ -693,6 +725,7 @@ class BigQueryColumnCheckOperator(
         self.location = location
         self.impersonation_chain = impersonation_chain
         self.labels = labels
+        self.project_id = project_id
 
     def _submit_job(
         self,
@@ -704,7 +737,7 @@ class BigQueryColumnCheckOperator(
         self.include_encryption_configuration(configuration, "query")
         return hook.insert_job(
             configuration=configuration,
-            project_id=
+            project_id=self.project_id,
             location=self.location,
             job_id=job_id,
             nowait=False,
@@ -713,6 +746,9 @@ class BigQueryColumnCheckOperator(
     def execute(self, context=None):
         """Perform checks on the given columns."""
         hook = self.get_db_hook()
+
+        if self.project_id is None:
+            self.project_id = hook.project_id
         failed_tests = []
 
         job = self._submit_job(hook, job_id="")
@@ -784,6 +820,7 @@ class BigQueryTableCheckOperator(
         account from the list granting this role to the originating account (templated).
     :param labels: a dictionary containing labels for the table, passed to BigQuery
     :param encryption_configuration: (Optional) Custom encryption configuration (e.g., Cloud KMS keys).
+    :param project_id: Google Cloud Project where the job is running
 
         .. code-block:: python
 
@@ -803,6 +840,7 @@ class BigQueryTableCheckOperator(
         checks: dict,
         partition_clause: str | None = None,
         gcp_conn_id: str = "google_cloud_default",
+        project_id: str = PROVIDE_PROJECT_ID,
         use_legacy_sql: bool = True,
         location: str | None = None,
         impersonation_chain: str | Sequence[str] | None = None,
@@ -817,6 +855,7 @@ class BigQueryTableCheckOperator(
         self.impersonation_chain = impersonation_chain
         self.labels = labels
         self.encryption_configuration = encryption_configuration
+        self.project_id = project_id
 
     def _submit_job(
         self,
@@ -830,7 +869,7 @@ class BigQueryTableCheckOperator(
 
         return hook.insert_job(
             configuration=configuration,
-            project_id=
+            project_id=self.project_id,
             location=self.location,
             job_id=job_id,
             nowait=False,
@@ -839,6 +878,8 @@ class BigQueryTableCheckOperator(
     def execute(self, context=None):
         """Execute the given checks on the table."""
        hook = self.get_db_hook()
+        if self.project_id is None:
+            self.project_id = hook.project_id
         job = self._submit_job(hook, job_id="")
         context["ti"].xcom_push(key="job_id", value=job.job_id)
         records = job.result().to_dataframe()
@@ -972,6 +1013,7 @@ class BigQueryGetDataOperator(GoogleCloudBaseOperator, _BigQueryOperatorsEncrypt
         "project_id",
         "max_results",
         "selected_fields",
+        "gcp_conn_id",
         "impersonation_chain",
     )
     ui_color = BigQueryUIColors.QUERY.value
@@ -1115,7 +1157,7 @@ class BigQueryGetDataOperator(GoogleCloudBaseOperator, _BigQueryOperatorsEncrypt
                 "BigQueryHook.list_rows() returns iterator when return_iterator is False (default)"
             )
         self.log.info("Total extracted rows: %s", len(rows))
-
+        table_data: list[dict[str, Any]] | list[Any]
         if self.as_dict:
             table_data = [dict(row) for row in rows]
         else:
@@ -1213,6 +1255,7 @@ class BigQueryCreateTableOperator(GoogleCloudBaseOperator):
         "table_resource",
         "project_id",
         "gcs_schema_object",
+        "gcp_conn_id",
         "impersonation_chain",
     )
     template_fields_renderers = {"table_resource": "json"}
@@ -1283,7 +1326,6 @@ class BigQueryCreateTableOperator(GoogleCloudBaseOperator):
             if self._table:
                 persist_kwargs = {
                     "context": context,
-                    "task_instance": self,
                     "project_id": self._table.to_api_repr()["tableReference"]["projectId"],
                     "dataset_id": self._table.to_api_repr()["tableReference"]["datasetId"],
                     "table_id": self._table.to_api_repr()["tableReference"]["tableId"],
@@ -1302,7 +1344,6 @@ class BigQueryCreateTableOperator(GoogleCloudBaseOperator):
                 self.log.info(error_msg)
                 persist_kwargs = {
                     "context": context,
-                    "task_instance": self,
                     "project_id": self.project_id or bq_hook.project_id,
                     "dataset_id": self.dataset_id,
                     "table_id": self.table_id,
@@ -1336,610 +1377,6 @@ class BigQueryCreateTableOperator(GoogleCloudBaseOperator):
         return OperatorLineage(outputs=[output_dataset])
 
 
-@deprecated(
-    planned_removal_date="July 30, 2025",
-    use_instead="airflow.providers.google.cloud.operators.bigquery.BigQueryCreateTableOperator",
-    category=AirflowProviderDeprecationWarning,
-)
-class BigQueryCreateEmptyTableOperator(GoogleCloudBaseOperator):
-    """
-    Creates a new table in the specified BigQuery dataset, optionally with schema.
-
-    The schema to be used for the BigQuery table may be specified in one of
-    two ways. You may either directly pass the schema fields in, or you may
-    point the operator to a Google Cloud Storage object name. The object in
-    Google Cloud Storage must be a JSON file with the schema fields in it.
-    You can also create a table without schema.
-
-    .. seealso::
-        For more information on how to use this operator, take a look at the guide:
-        :ref:`howto/operator:BigQueryCreateEmptyTableOperator`
-
-    :param project_id: The project to create the table into. (templated)
-    :param dataset_id: The dataset to create the table into. (templated)
-    :param table_id: The Name of the table to be created. (templated)
-    :param table_resource: Table resource as described in documentation:
-        https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#Table
-        If provided all other parameters are ignored. (templated)
-    :param schema_fields: If set, the schema field list as defined here:
-        https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.schema
-
-        **Example**::
-
-            schema_fields = [
-                {"name": "emp_name", "type": "STRING", "mode": "REQUIRED"},
-                {"name": "salary", "type": "INTEGER", "mode": "NULLABLE"},
-            ]
-
-    :param gcs_schema_object: Full path to the JSON file containing
-        schema (templated). For
-        example: ``gs://test-bucket/dir1/dir2/employee_schema.json``
-    :param time_partitioning: configure optional time partitioning fields i.e.
-        partition by field, type and expiration as per API specifications.
-
-        .. seealso::
-            https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#timePartitioning
-    :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud and
-        interact with the Bigquery service.
-    :param google_cloud_storage_conn_id: (Optional) The connection ID used to connect to Google Cloud.
-        and interact with the Google Cloud Storage service.
-    :param labels: a dictionary containing labels for the table, passed to BigQuery
-
-        **Example (with schema JSON in GCS)**::
-
-            CreateTable = BigQueryCreateEmptyTableOperator(
-                task_id="BigQueryCreateEmptyTableOperator_task",
-                dataset_id="ODS",
-                table_id="Employees",
-                project_id="internal-gcp-project",
-                gcs_schema_object="gs://schema-bucket/employee_schema.json",
-                gcp_conn_id="airflow-conn-id",
-                google_cloud_storage_conn_id="airflow-conn-id",
-            )
-
-        **Corresponding Schema file** (``employee_schema.json``)::
-
-            [
-                {"mode": "NULLABLE", "name": "emp_name", "type": "STRING"},
-                {"mode": "REQUIRED", "name": "salary", "type": "INTEGER"},
-            ]
-
-        **Example (with schema in the DAG)**::
-
-            CreateTable = BigQueryCreateEmptyTableOperator(
-                task_id="BigQueryCreateEmptyTableOperator_task",
-                dataset_id="ODS",
-                table_id="Employees",
-                project_id="internal-gcp-project",
-                schema_fields=[
-                    {"name": "emp_name", "type": "STRING", "mode": "REQUIRED"},
-                    {"name": "salary", "type": "INTEGER", "mode": "NULLABLE"},
-                ],
-                gcp_conn_id="airflow-conn-id-account",
-                google_cloud_storage_conn_id="airflow-conn-id",
-            )
-
-    :param view: (Optional) A dictionary containing definition for the view.
-        If set, it will create a view instead of a table:
-
-        .. seealso::
-            https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ViewDefinition
-    :param materialized_view: (Optional) The materialized view definition.
-    :param encryption_configuration: (Optional) Custom encryption configuration (e.g., Cloud KMS keys).
-
-        .. code-block:: python
-
-            encryption_configuration = {
-                "kmsKeyName": "projects/PROJECT/locations/LOCATION/keyRings/KEY_RING/cryptoKeys/KEY",
-            }
-    :param location: The location used for the operation.
-    :param cluster_fields: (Optional) The fields used for clustering.
-        BigQuery supports clustering for both partitioned and
-        non-partitioned tables.
-
-        .. seealso::
-            https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#clustering.fields
-    :param impersonation_chain: Optional service account to impersonate using short-term
-        credentials, or chained list of accounts required to get the access_token
-        of the last account in the list, which will be impersonated in the request.
-        If set as a string, the account must grant the originating account
-        the Service Account Token Creator IAM role.
-        If set as a sequence, the identities from the list must grant
-        Service Account Token Creator IAM role to the directly preceding identity, with first
-        account from the list granting this role to the originating account (templated).
-    :param if_exists: What should Airflow do if the table exists. If set to `log`, the TI will be passed to
-        success and an error message will be logged. Set to `ignore` to ignore the error, set to `fail` to
-        fail the TI, and set to `skip` to skip it.
-    :param exists_ok: Deprecated - use `if_exists="ignore"` instead.
-    """
-
-    template_fields: Sequence[str] = (
-        "dataset_id",
-        "table_id",
-        "table_resource",
-        "project_id",
-        "gcs_schema_object",
-        "labels",
-        "view",
-        "materialized_view",
-        "impersonation_chain",
-    )
-    template_fields_renderers = {"table_resource": "json", "materialized_view": "json"}
-    ui_color = BigQueryUIColors.TABLE.value
-    operator_extra_links = (BigQueryTableLink(),)
-
-    def __init__(
-        self,
-        *,
-        dataset_id: str,
-        table_id: str,
-        table_resource: dict[str, Any] | None = None,
-        project_id: str = PROVIDE_PROJECT_ID,
-        schema_fields: list | None = None,
-        gcs_schema_object: str | None = None,
-        time_partitioning: dict | None = None,
-        gcp_conn_id: str = "google_cloud_default",
-        google_cloud_storage_conn_id: str = "google_cloud_default",
-        labels: dict | None = None,
-        view: dict | None = None,
-        materialized_view: dict | None = None,
-        encryption_configuration: dict | None = None,
-        location: str | None = None,
-        cluster_fields: list[str] | None = None,
-        impersonation_chain: str | Sequence[str] | None = None,
-        if_exists: str = "log",
-        bigquery_conn_id: str | None = None,
-        exists_ok: bool | None = None,
-        **kwargs,
-    ) -> None:
-        if bigquery_conn_id:
-            warnings.warn(
-                "The bigquery_conn_id parameter has been deprecated. Use the gcp_conn_id parameter instead.",
-                AirflowProviderDeprecationWarning,
-                stacklevel=2,
-            )
-            gcp_conn_id = bigquery_conn_id
-
-        super().__init__(**kwargs)
-
-        self.project_id = project_id
-        self.dataset_id = dataset_id
-        self.table_id = table_id
-        self.schema_fields = schema_fields
-        self.gcs_schema_object = gcs_schema_object
-        self.gcp_conn_id = gcp_conn_id
-        self.google_cloud_storage_conn_id = google_cloud_storage_conn_id
-        self.time_partitioning = time_partitioning or {}
-        self.labels = labels
-        self.view = view
-        self.materialized_view = materialized_view
-        self.encryption_configuration = encryption_configuration
-        self.location = location
-        self.cluster_fields = cluster_fields
-        self.table_resource = table_resource
-        self.impersonation_chain = impersonation_chain
-        self._table: Table | None = None
-        if exists_ok is not None:
-            warnings.warn(
-                "`exists_ok` parameter is deprecated, please use `if_exists`",
-                AirflowProviderDeprecationWarning,
-                stacklevel=2,
-            )
-            self.if_exists = IfExistAction.IGNORE if exists_ok else IfExistAction.LOG
-        else:
-            self.if_exists = IfExistAction(if_exists)
-
-    def execute(self, context: Context) -> None:
-        bq_hook = BigQueryHook(
-            gcp_conn_id=self.gcp_conn_id,
-            location=self.location,
-            impersonation_chain=self.impersonation_chain,
-        )
-
-        if not self.schema_fields and self.gcs_schema_object:
-            gcs_bucket, gcs_object = _parse_gcs_url(self.gcs_schema_object)
-            gcs_hook = GCSHook(
-                gcp_conn_id=self.google_cloud_storage_conn_id,
-                impersonation_chain=self.impersonation_chain,
-            )
-            schema_fields_string = gcs_hook.download_as_byte_array(gcs_bucket, gcs_object).decode("utf-8")
-            schema_fields = json.loads(schema_fields_string)
-        else:
-            schema_fields = self.schema_fields
-
-        try:
-            self.log.info("Creating table")
-            # Save table as attribute for further use by OpenLineage
-            self._table = bq_hook.create_empty_table(
-                project_id=self.project_id,
-                dataset_id=self.dataset_id,
-                table_id=self.table_id,
-                schema_fields=schema_fields,
-                time_partitioning=self.time_partitioning,
-                cluster_fields=self.cluster_fields,
-                labels=self.labels,
-                view=self.view,
-                materialized_view=self.materialized_view,
-                encryption_configuration=self.encryption_configuration,
-                table_resource=self.table_resource,
-                exists_ok=self.if_exists == IfExistAction.IGNORE,
-            )
-            if self._table:
-                persist_kwargs = {
-                    "context": context,
-                    "task_instance": self,
-                    "project_id": self._table.to_api_repr()["tableReference"]["projectId"],
-                    "dataset_id": self._table.to_api_repr()["tableReference"]["datasetId"],
-                    "table_id": self._table.to_api_repr()["tableReference"]["tableId"],
-                }
-                self.log.info(
-                    "Table %s.%s.%s created successfully",
-                    self._table.project,
-                    self._table.dataset_id,
-                    self._table.table_id,
-                )
-            else:
-                raise AirflowException("Table creation failed.")
-        except Conflict:
-            error_msg = f"Table {self.dataset_id}.{self.table_id} already exists."
-            if self.if_exists == IfExistAction.LOG:
-                self.log.info(error_msg)
-                persist_kwargs = {
-                    "context": context,
-                    "task_instance": self,
-                    "project_id": self.project_id or bq_hook.project_id,
-                    "dataset_id": self.dataset_id,
-                    "table_id": self.table_id,
-                }
-            elif self.if_exists == IfExistAction.FAIL:
-                raise AirflowException(error_msg)
-            else:
-                raise AirflowSkipException(error_msg)
-
-        BigQueryTableLink.persist(**persist_kwargs)
-
-    def get_openlineage_facets_on_complete(self, _):
-        """Implement _on_complete as we will use table resource returned by create method."""
-        from airflow.providers.common.compat.openlineage.facet import Dataset
-        from airflow.providers.google.cloud.openlineage.utils import (
-            BIGQUERY_NAMESPACE,
-            get_facets_from_bq_table,
-        )
-        from airflow.providers.openlineage.extractors import OperatorLineage
-
-        if not self._table:
-            self.log.debug("OpenLineage did not find `self._table` attribute.")
-            return OperatorLineage()
-
-        output_dataset = Dataset(
-            namespace=BIGQUERY_NAMESPACE,
-            name=f"{self._table.project}.{self._table.dataset_id}.{self._table.table_id}",
-            facets=get_facets_from_bq_table(self._table),
-        )
-
-        return OperatorLineage(outputs=[output_dataset])
-
-
-@deprecated(
-    planned_removal_date="July 30, 2025",
-    use_instead="airflow.providers.google.cloud.operators.bigquery.BigQueryCreateTableOperator",
-    category=AirflowProviderDeprecationWarning,
-)
-class BigQueryCreateExternalTableOperator(GoogleCloudBaseOperator):
-    """
-    Create a new external table with data from Google Cloud Storage.
-
-    The schema to be used for the BigQuery table may be specified in one of
-    two ways. You may either directly pass the schema fields in, or you may
-    point the operator to a Google Cloud Storage object name. The object in
-    Google Cloud Storage must be a JSON file with the schema fields in it.
-
-    .. seealso::
-        For more information on how to use this operator, take a look at the guide:
-        :ref:`howto/operator:BigQueryCreateExternalTableOperator`
-
-    :param bucket: The bucket to point the external table to. (templated)
-    :param source_objects: List of Google Cloud Storage URIs to point
-        table to. If source_format is 'DATASTORE_BACKUP', the list must only contain a single URI.
-    :param destination_project_dataset_table: The dotted ``(<project>.)<dataset>.<table>``
-        BigQuery table to load data into (templated). If ``<project>`` is not included,
-        project will be the project defined in the connection json.
-    :param schema_fields: If set, the schema field list as defined here:
-        https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.schema
-
-        **Example**::
-
-            schema_fields = [
-                {"name": "emp_name", "type": "STRING", "mode": "REQUIRED"},
-                {"name": "salary", "type": "INTEGER", "mode": "NULLABLE"},
-            ]
-
-        Should not be set when source_format is 'DATASTORE_BACKUP'.
-    :param table_resource: Table resource as described in documentation:
-        https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#Table
-        If provided all other parameters are ignored. External schema from object will be resolved.
-    :param schema_object: If set, a GCS object path pointing to a .json file that
-        contains the schema for the table. (templated)
-    :param gcs_schema_bucket: GCS bucket name where the schema JSON is stored (templated).
-        The default value is self.bucket.
-    :param source_format: File format of the data.
-    :param autodetect: Try to detect schema and format options automatically.
-        The schema_fields and schema_object options will be honored when specified explicitly.
-        https://cloud.google.com/bigquery/docs/schema-detect#schema_auto-detection_for_external_data_sources
-    :param compression: (Optional) The compression type of the data source.
-        Possible values include GZIP and NONE.
-        The default value is NONE.
-        This setting is ignored for Google Cloud Bigtable,
-        Google Cloud Datastore backups and Avro formats.
-    :param skip_leading_rows: Number of rows to skip when loading from a CSV.
-    :param field_delimiter: The delimiter to use for the CSV.
-    :param max_bad_records: The maximum number of bad records that BigQuery can
-        ignore when running the job.
-    :param quote_character: The value that is used to quote data sections in a CSV file.
-    :param allow_quoted_newlines: Whether to allow quoted newlines (true) or not (false).
-    :param allow_jagged_rows: Accept rows that are missing trailing optional columns.
-        The missing values are treated as nulls. If false, records with missing trailing
-        columns are treated as bad records, and if there are too many bad records, an
-        invalid error is returned in the job result. Only applicable to CSV, ignored
-        for other formats.
-    :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud and
-        interact with the Bigquery service.
-    :param google_cloud_storage_conn_id: (Optional) The connection ID used to connect to Google Cloud
-        and interact with the Google Cloud Storage service.
-    :param src_fmt_configs: configure optional fields specific to the source format
-    :param labels: a dictionary containing labels for the table, passed to BigQuery
-    :param encryption_configuration: (Optional) Custom encryption configuration (e.g., Cloud KMS keys).
-
-        .. code-block:: python
-
-            encryption_configuration = {
-                "kmsKeyName": "projects/PROJECT/locations/LOCATION/keyRings/KEY_RING/cryptoKeys/KEY",
-            }
-    :param location: The location used for the operation.
-    :param impersonation_chain: Optional service account to impersonate using short-term
-        credentials, or chained list of accounts required to get the access_token
-        of the last account in the list, which will be impersonated in the request.
-        If set as a string, the account must grant the originating account
-        the Service Account Token Creator IAM role.
-        If set as a sequence, the identities from the list must grant
-        Service Account Token Creator IAM role to the directly preceding identity, with first
-        account from the list granting this role to the originating account (templated).
-    """
-
-    template_fields: Sequence[str] = (
-        "bucket",
-        "source_objects",
-        "schema_object",
-        "gcs_schema_bucket",
-        "destination_project_dataset_table",
-        "labels",
-        "table_resource",
-        "impersonation_chain",
-    )
-    template_fields_renderers = {"table_resource": "json"}
-    ui_color = BigQueryUIColors.TABLE.value
-    operator_extra_links = (BigQueryTableLink(),)
-
-    def __init__(
-        self,
-        *,
-        bucket: str | None = None,
-        source_objects: list[str] | None = None,
-        destination_project_dataset_table: str | None = None,
-        table_resource: dict[str, Any] | None = None,
-        schema_fields: list | None = None,
-        schema_object: str | None = None,
-        gcs_schema_bucket: str | None = None,
-        source_format: str | None = None,
-        autodetect: bool = False,
-        compression: str | None = None,
-        skip_leading_rows: int | None = None,
-        field_delimiter: str | None = None,
-        max_bad_records: int = 0,
-        quote_character: str | None = None,
-        allow_quoted_newlines: bool = False,
-        allow_jagged_rows: bool = False,
-        gcp_conn_id: str = "google_cloud_default",
-        google_cloud_storage_conn_id: str = "google_cloud_default",
-        src_fmt_configs: dict | None = None,
-        labels: dict | None = None,
-        encryption_configuration: dict | None = None,
-        location: str | None = None,
-        impersonation_chain: str | Sequence[str] | None = None,
-        bigquery_conn_id: str | None = None,
-        **kwargs,
-    ) -> None:
-        if bigquery_conn_id:
-            warnings.warn(
-                "The bigquery_conn_id parameter has been deprecated. Use the gcp_conn_id parameter instead.",
-                AirflowProviderDeprecationWarning,
-                stacklevel=2,
-            )
-            gcp_conn_id = bigquery_conn_id
-
-        super().__init__(**kwargs)
-
-        self.table_resource = table_resource
-        self.bucket = bucket or ""
-        self.source_objects = source_objects or []
-        self.schema_object = schema_object or None
-        self.gcs_schema_bucket = gcs_schema_bucket or ""
-        self.destination_project_dataset_table = destination_project_dataset_table or ""
-
-        # BQ config
-        kwargs_passed = any(
-            [
-                destination_project_dataset_table,
-                schema_fields,
-                source_format,
-                compression,
-                skip_leading_rows,
-                field_delimiter,
-                max_bad_records,
-                autodetect,
-                quote_character,
-                allow_quoted_newlines,
-                allow_jagged_rows,
-                src_fmt_configs,
-                labels,
-                encryption_configuration,
-            ]
-        )
-
-        if not table_resource:
-            warnings.warn(
-                "Passing table parameters via keywords arguments will be deprecated. "
-                "Please provide table definition using `table_resource` parameter.",
-                AirflowProviderDeprecationWarning,
-                stacklevel=2,
-            )
-            if not bucket:
-                raise ValueError("`bucket` is required when not using `table_resource`.")
-            if not gcs_schema_bucket:
-                gcs_schema_bucket = bucket
-            if not source_objects:
-                raise ValueError("`source_objects` is required when not using `table_resource`.")
-            if not source_format:
-                source_format = "CSV"
-            if not compression:
-                compression = "NONE"
-            if not skip_leading_rows:
-                skip_leading_rows = 0
-            if not field_delimiter:
-                field_delimiter = ","
-            if not destination_project_dataset_table:
-                raise ValueError(
-                    "`destination_project_dataset_table` is required when not using `table_resource`."
-                )
-            self.bucket = bucket
-            self.source_objects = source_objects
-            self.schema_object = schema_object
-            self.gcs_schema_bucket = gcs_schema_bucket
-            self.destination_project_dataset_table = destination_project_dataset_table
-            self.schema_fields = schema_fields
-            self.source_format = source_format
-            self.compression = compression
-            self.skip_leading_rows = skip_leading_rows
-            self.field_delimiter = field_delimiter
-            self.table_resource = None
-        else:
-            pass
-
-        if table_resource and kwargs_passed:
-            raise ValueError("You provided both `table_resource` and exclusive keywords arguments.")
-
-        self.max_bad_records = max_bad_records
-        self.quote_character = quote_character
-        self.allow_quoted_newlines = allow_quoted_newlines
-        self.allow_jagged_rows = allow_jagged_rows
-        self.gcp_conn_id = gcp_conn_id
-        self.google_cloud_storage_conn_id = google_cloud_storage_conn_id
-        self.autodetect = autodetect
-
-        self.src_fmt_configs = src_fmt_configs or {}
-        self.labels = labels
-        self.encryption_configuration = encryption_configuration
-        self.location = location
-        self.impersonation_chain = impersonation_chain
-        self._table: Table | None = None
|
|
1845
|
-
|
|
1846
|
-
def execute(self, context: Context) -> None:
|
|
1847
|
-
bq_hook = BigQueryHook(
|
|
1848
|
-
gcp_conn_id=self.gcp_conn_id,
|
|
1849
|
-
location=self.location,
|
|
1850
|
-
impersonation_chain=self.impersonation_chain,
|
|
1851
|
-
)
|
|
1852
|
-
if self.table_resource:
|
|
1853
|
-
# Save table as attribute for further use by OpenLineage
|
|
1854
|
-
self._table = bq_hook.create_empty_table(
|
|
1855
|
-
table_resource=self.table_resource,
|
|
1856
|
-
)
|
|
1857
|
-
if self._table:
|
|
1858
|
-
BigQueryTableLink.persist(
|
|
1859
|
-
context=context,
|
|
1860
|
-
task_instance=self,
|
|
1861
|
-
dataset_id=self._table.dataset_id,
|
|
1862
|
-
project_id=self._table.project,
|
|
1863
|
-
table_id=self._table.table_id,
|
|
1864
|
-
)
|
|
1865
|
-
return
|
|
1866
|
-
|
|
1867
|
-
if not self.schema_fields and self.schema_object and self.source_format != "DATASTORE_BACKUP":
|
|
1868
|
-
gcs_hook = GCSHook(
|
|
1869
|
-
gcp_conn_id=self.google_cloud_storage_conn_id,
|
|
1870
|
-
impersonation_chain=self.impersonation_chain,
|
|
1871
|
-
)
|
|
1872
|
-
schema_fields = json.loads(
|
|
1873
|
-
gcs_hook.download(self.gcs_schema_bucket, self.schema_object).decode("utf-8")
|
|
1874
|
-
)
|
|
1875
|
-
else:
|
|
1876
|
-
schema_fields = self.schema_fields
|
|
1877
|
-
|
|
1878
|
-
source_uris = [f"gs://{self.bucket}/{source_object}" for source_object in self.source_objects]
|
|
1879
|
-
|
|
1880
|
-
project_id, dataset_id, table_id = bq_hook.split_tablename(
|
|
1881
|
-
table_input=self.destination_project_dataset_table,
|
|
1882
|
-
default_project_id=bq_hook.project_id or "",
|
|
1883
|
-
)
|
|
1884
|
-
|
|
1885
|
-
external_data_configuration = {
|
|
1886
|
-
"source_uris": source_uris,
|
|
1887
|
-
"source_format": self.source_format,
|
|
1888
|
-
"autodetect": self.autodetect,
|
|
1889
|
-
"compression": self.compression,
|
|
1890
|
-
"maxBadRecords": self.max_bad_records,
|
|
1891
|
-
}
|
|
1892
|
-
if self.source_format == "CSV":
|
|
1893
|
-
external_data_configuration["csvOptions"] = {
|
|
1894
|
-
"fieldDelimiter": self.field_delimiter,
|
|
1895
|
-
"skipLeadingRows": self.skip_leading_rows,
|
|
1896
|
-
"quote": self.quote_character,
|
|
1897
|
-
"allowQuotedNewlines": self.allow_quoted_newlines,
|
|
1898
|
-
"allowJaggedRows": self.allow_jagged_rows,
|
|
1899
|
-
}
|
|
1900
|
-
|
|
1901
|
-
table_resource = {
|
|
1902
|
-
"tableReference": {
|
|
1903
|
-
"projectId": project_id,
|
|
1904
|
-
"datasetId": dataset_id,
|
|
1905
|
-
"tableId": table_id,
|
|
1906
|
-
},
|
|
1907
|
-
"labels": self.labels,
|
|
1908
|
-
"schema": {"fields": schema_fields},
|
|
1909
|
-
"externalDataConfiguration": external_data_configuration,
|
|
1910
|
-
"location": self.location,
|
|
1911
|
-
"encryptionConfiguration": self.encryption_configuration,
|
|
1912
|
-
}
|
|
1913
|
-
|
|
1914
|
-
# Save table as attribute for further use by OpenLineage
|
|
1915
|
-
self._table = bq_hook.create_empty_table(table_resource=table_resource)
|
|
1916
|
-
if self._table:
|
|
1917
|
-
BigQueryTableLink.persist(
|
|
1918
|
-
context=context,
|
|
1919
|
-
task_instance=self,
|
|
1920
|
-
dataset_id=self._table.dataset_id,
|
|
1921
|
-
project_id=self._table.project,
|
|
1922
|
-
table_id=self._table.table_id,
|
|
1923
|
-
)
|
|
1924
|
-
|
|
1925
|
-
def get_openlineage_facets_on_complete(self, _):
|
|
1926
|
-
"""Implement _on_complete as we will use table resource returned by create method."""
|
|
1927
|
-
from airflow.providers.common.compat.openlineage.facet import Dataset
|
|
1928
|
-
from airflow.providers.google.cloud.openlineage.utils import (
|
|
1929
|
-
BIGQUERY_NAMESPACE,
|
|
1930
|
-
get_facets_from_bq_table,
|
|
1931
|
-
)
|
|
1932
|
-
from airflow.providers.openlineage.extractors import OperatorLineage
|
|
1933
|
-
|
|
1934
|
-
output_dataset = Dataset(
|
|
1935
|
-
namespace=BIGQUERY_NAMESPACE,
|
|
1936
|
-
name=f"{self._table.project}.{self._table.dataset_id}.{self._table.table_id}",
|
|
1937
|
-
facets=get_facets_from_bq_table(self._table),
|
|
1938
|
-
)
|
|
1939
|
-
|
|
1940
|
-
return OperatorLineage(outputs=[output_dataset])
|
|
1941
|
-
|
|
1942
|
-
|
|
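The removed keyword-argument path above assembled an external-table `table_resource` from `bucket`, `source_objects`, the schema and the CSV options before calling `create_empty_table`; the deprecation warning it carried already pointed callers at the `table_resource` parameter. Below is a minimal sketch of such a dict, mirroring the keys the removed code built (all project, dataset, bucket, and schema values are illustrative placeholders, not values from this diff):

```python
# Illustrative table_resource for a GCS-backed external table, mirroring the
# structure the removed keyword-argument path assembled. Identifiers are
# placeholders.
table_resource = {
    "tableReference": {
        "projectId": "my-project",
        "datasetId": "my_dataset",
        "tableId": "my_external_table",
    },
    "schema": {
        "fields": [
            {"name": "name", "type": "STRING"},
            {"name": "value", "type": "INTEGER"},
        ]
    },
    "externalDataConfiguration": {
        "source_uris": ["gs://my-bucket/data/*.csv"],
        "source_format": "CSV",
        "autodetect": False,
        "compression": "NONE",
        "maxBadRecords": 0,
        "csvOptions": {
            "fieldDelimiter": ",",
            "skipLeadingRows": 1,
            "allowQuotedNewlines": False,
            "allowJaggedRows": False,
        },
    },
}
```

The removed code also attached `labels`, `location`, and `encryptionConfiguration` to the same dict when those arguments were given, so callers migrating to `table_resource` can set them there directly.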
 class BigQueryDeleteDatasetOperator(GoogleCloudBaseOperator):
     """
     Delete an existing dataset from your Project in BigQuery.
@@ -1981,6 +1418,7 @@ class BigQueryDeleteDatasetOperator(GoogleCloudBaseOperator):
     template_fields: Sequence[str] = (
         "dataset_id",
         "project_id",
+        "gcp_conn_id",
         "impersonation_chain",
     )
     ui_color = BigQueryUIColors.DATASET.value
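With `gcp_conn_id` added to `template_fields` here (and in the operators that follow), the connection id itself can now be rendered with Jinja. A minimal sketch of how that could be used; the DAG id, Variable name, project and dataset ids are illustrative, not taken from this diff:

```python
from __future__ import annotations

import pendulum

from airflow import DAG
from airflow.providers.google.cloud.operators.bigquery import BigQueryDeleteDatasetOperator

with DAG(
    dag_id="example_templated_gcp_conn_id",  # illustrative
    start_date=pendulum.datetime(2024, 1, 1, tz="UTC"),
    schedule=None,
):
    # Because "gcp_conn_id" is now a template field, the connection id can be
    # resolved at runtime, for example from an Airflow Variable.
    BigQueryDeleteDatasetOperator(
        task_id="delete_dataset",
        project_id="my-project",  # illustrative
        dataset_id="my_dataset",  # illustrative
        delete_contents=True,
        gcp_conn_id="{{ var.value.bq_conn_id }}",
    )
```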
@@ -2060,6 +1498,7 @@ class BigQueryCreateEmptyDatasetOperator(GoogleCloudBaseOperator):
         "dataset_id",
         "project_id",
         "dataset_reference",
+        "gcp_conn_id",
         "impersonation_chain",
     )
     template_fields_renderers = {"dataset_reference": "json"}
@@ -2114,7 +1553,6 @@ class BigQueryCreateEmptyDatasetOperator(GoogleCloudBaseOperator):
             )
             persist_kwargs = {
                 "context": context,
-                "task_instance": self,
                 "project_id": dataset["datasetReference"]["projectId"],
                 "dataset_id": dataset["datasetReference"]["datasetId"],
             }
@@ -2126,7 +1564,6 @@ class BigQueryCreateEmptyDatasetOperator(GoogleCloudBaseOperator):
             )
             persist_kwargs = {
                 "context": context,
-                "task_instance": self,
                 "project_id": project_id,
                 "dataset_id": dataset_id,
             }
@@ -2166,6 +1603,7 @@ class BigQueryGetDatasetOperator(GoogleCloudBaseOperator):
     template_fields: Sequence[str] = (
         "dataset_id",
         "project_id",
+        "gcp_conn_id",
         "impersonation_chain",
     )
     ui_color = BigQueryUIColors.DATASET.value
@@ -2198,7 +1636,6 @@ class BigQueryGetDatasetOperator(GoogleCloudBaseOperator):
         dataset_api_repr = dataset.to_api_repr()
         BigQueryDatasetLink.persist(
             context=context,
-            task_instance=self,
             dataset_id=dataset_api_repr["datasetReference"]["datasetId"],
             project_id=dataset_api_repr["datasetReference"]["projectId"],
         )
@@ -2231,6 +1668,7 @@ class BigQueryGetDatasetTablesOperator(GoogleCloudBaseOperator):
     template_fields: Sequence[str] = (
         "dataset_id",
         "project_id",
+        "gcp_conn_id",
         "impersonation_chain",
     )
     ui_color = BigQueryUIColors.DATASET.value
@@ -2301,6 +1739,7 @@ class BigQueryUpdateTableOperator(GoogleCloudBaseOperator):
         "dataset_id",
         "table_id",
         "project_id",
+        "gcp_conn_id",
         "impersonation_chain",
     )
     template_fields_renderers = {"table_resource": "json"}
@@ -2347,7 +1786,6 @@ class BigQueryUpdateTableOperator(GoogleCloudBaseOperator):
         if self._table:
             BigQueryTableLink.persist(
                 context=context,
-                task_instance=self,
                 dataset_id=self._table["tableReference"]["datasetId"],
                 project_id=self._table["tableReference"]["projectId"],
                 table_id=self._table["tableReference"]["tableId"],
@@ -2408,6 +1846,7 @@ class BigQueryUpdateDatasetOperator(GoogleCloudBaseOperator):
     template_fields: Sequence[str] = (
         "dataset_id",
         "project_id",
+        "gcp_conn_id",
         "impersonation_chain",
     )
     template_fields_renderers = {"dataset_resource": "json"}
@@ -2450,7 +1889,6 @@ class BigQueryUpdateDatasetOperator(GoogleCloudBaseOperator):
         dataset_api_repr = dataset.to_api_repr()
         BigQueryDatasetLink.persist(
             context=context,
-            task_instance=self,
             dataset_id=dataset_api_repr["datasetReference"]["datasetId"],
             project_id=dataset_api_repr["datasetReference"]["projectId"],
         )
@@ -2484,6 +1922,7 @@ class BigQueryDeleteTableOperator(GoogleCloudBaseOperator):
 
     template_fields: Sequence[str] = (
         "deletion_dataset_table",
+        "gcp_conn_id",
         "impersonation_chain",
     )
     ui_color = BigQueryUIColors.TABLE.value
@@ -2578,6 +2017,7 @@ class BigQueryUpsertTableOperator(GoogleCloudBaseOperator):
     template_fields: Sequence[str] = (
         "dataset_id",
         "table_resource",
+        "gcp_conn_id",
         "impersonation_chain",
         "project_id",
     )
@@ -2622,7 +2062,6 @@ class BigQueryUpsertTableOperator(GoogleCloudBaseOperator):
         if self._table:
             BigQueryTableLink.persist(
                 context=context,
-                task_instance=self,
                 dataset_id=self._table["tableReference"]["datasetId"],
                 project_id=self._table["tableReference"]["projectId"],
                 table_id=self._table["tableReference"]["tableId"],
@@ -2706,6 +2145,7 @@ class BigQueryUpdateTableSchemaOperator(GoogleCloudBaseOperator):
         "dataset_id",
         "table_id",
         "project_id",
+        "gcp_conn_id",
         "impersonation_chain",
     )
     template_fields_renderers = {"schema_fields_updates": "json"}
@@ -2752,7 +2192,6 @@ class BigQueryUpdateTableSchemaOperator(GoogleCloudBaseOperator):
         if self._table:
             BigQueryTableLink.persist(
                 context=context,
-                task_instance=self,
                 dataset_id=self._table["tableReference"]["datasetId"],
                 project_id=self._table["tableReference"]["projectId"],
                 table_id=self._table["tableReference"]["tableId"],
@@ -2836,6 +2275,7 @@ class BigQueryInsertJobOperator(GoogleCloudBaseOperator, _BigQueryInsertJobOpera
     template_fields: Sequence[str] = (
         "configuration",
         "job_id",
+        "gcp_conn_id",
         "impersonation_chain",
         "project_id",
     )
@@ -2895,7 +2335,7 @@ class BigQueryInsertJobOperator(GoogleCloudBaseOperator, _BigQueryInsertJobOpera
 
     def _add_job_labels(self) -> None:
         dag_label = self.dag_id.lower()
-        task_label = self.task_id.lower()
+        task_label = self.task_id.lower().replace(".", "-")
 
         if LABEL_REGEX.match(dag_label) and LABEL_REGEX.match(task_label):
             automatic_labels = {"airflow-dag": dag_label, "airflow-task": task_label}
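The one-line change above makes task ids that contain dots (as produced by TaskGroups, e.g. "group.task") usable as automatic BigQuery label values, since label values do not allow dots. A small self-contained sketch of the effect; the regex here is an assumed stand-in for illustration, not the provider's actual `LABEL_REGEX`:

```python
import re

# Assumed stand-in for the provider's LABEL_REGEX: lowercase letter first,
# then lowercase letters, digits, underscores, or dashes (illustrative only).
LABEL_REGEX = re.compile(r"^[a-z][\w-]{0,63}$")


def make_task_label(task_id: str) -> str:
    # TaskGroup task ids look like "group.task"; lowering the id and replacing
    # dots with dashes mirrors the new behaviour shown in the diff.
    return task_id.lower().replace(".", "-")


label = make_task_label("Extract_Group.Load_Table")
assert label == "extract_group-load_table"
assert LABEL_REGEX.match(label)
```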
@@ -2947,8 +2387,9 @@ class BigQueryInsertJobOperator(GoogleCloudBaseOperator, _BigQueryInsertJobOpera
             job_id=self.job_id,
             dag_id=self.dag_id,
             task_id=self.task_id,
-            logical_date=
+            logical_date=None,
             configuration=self.configuration,
+            run_after=hook.get_run_after_or_logical_date(context),
             force_rerun=self.force_rerun,
         )
 
@@ -2974,14 +2415,13 @@ class BigQueryInsertJobOperator(GoogleCloudBaseOperator, _BigQueryInsertJobOpera
                        f"Or, if you want to reattach in this scenario add {job.state} to `reattach_states`"
                    )
 
-
-
-
-                raise AirflowException("Job is already in state DONE. Can not reattach to this job.")
+                # Job already reached state DONE
+                if job.state == "DONE":
+                    raise AirflowException("Job is already in state DONE. Can not reattach to this job.")
 
-
-
-
+                # We are reattaching to a job
+                self.log.info("Reattaching to existing Job in state %s", job.state)
+                self._handle_job_error(job)
 
         job_types = {
             LoadJob._JOB_TYPE: ["sourceTable", "destinationTable"],
@@ -2999,7 +2439,6 @@ class BigQueryInsertJobOperator(GoogleCloudBaseOperator, _BigQueryInsertJobOpera
                     table = job_configuration[job_type][table_prop]
                     persist_kwargs = {
                         "context": context,
-                        "task_instance": self,
                         "project_id": self.project_id,
                         "table_id": table,
                     }
@@ -3013,7 +2452,7 @@ class BigQueryInsertJobOperator(GoogleCloudBaseOperator, _BigQueryInsertJobOpera
 
         if self.project_id:
             job_id_path = convert_job_id(
-                job_id=self.job_id,
+                job_id=self.job_id,
                 project_id=self.project_id,
                 location=self.location,
             )
@@ -3021,7 +2460,6 @@ class BigQueryInsertJobOperator(GoogleCloudBaseOperator, _BigQueryInsertJobOpera
 
         persist_kwargs = {
             "context": context,
-            "task_instance": self,
             "project_id": self.project_id,
             "location": self.location,
             "job_id": self.job_id,
@@ -3034,24 +2472,23 @@ class BigQueryInsertJobOperator(GoogleCloudBaseOperator, _BigQueryInsertJobOpera
         self._handle_job_error(job)
 
         return self.job_id
-
-
-        self.
-
-
-
-
-
-
-
-
-
-
-
-
-
-        return self.job_id
+        if job.running():
+            self.defer(
+                timeout=self.execution_timeout,
+                trigger=BigQueryInsertJobTrigger(
+                    conn_id=self.gcp_conn_id,
+                    job_id=self.job_id,
+                    project_id=self.project_id,
+                    location=self.location or hook.location,
+                    poll_interval=self.poll_interval,
+                    impersonation_chain=self.impersonation_chain,
+                    cancel_on_kill=self.cancel_on_kill,
+                ),
+                method_name="execute_complete",
+            )
+        self.log.info("Current state of job %s is %s", job.job_id, job.state)
+        self._handle_job_error(job)
+        return self.job_id
 
     def execute_complete(self, context: Context, event: dict[str, Any]) -> str | None:
         """