apache-airflow-providers-google 14.0.0__py3-none-any.whl → 19.1.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airflow/providers/google/3rd-party-licenses/LICENSES.txt +14 -0
- airflow/providers/google/3rd-party-licenses/NOTICE +5 -0
- airflow/providers/google/__init__.py +3 -3
- airflow/providers/google/_vendor/__init__.py +0 -0
- airflow/providers/google/_vendor/json_merge_patch.py +91 -0
- airflow/providers/google/ads/hooks/ads.py +52 -43
- airflow/providers/google/ads/operators/ads.py +2 -2
- airflow/providers/google/ads/transfers/ads_to_gcs.py +3 -19
- airflow/providers/google/assets/gcs.py +1 -11
- airflow/providers/google/cloud/_internal_client/secret_manager_client.py +3 -2
- airflow/providers/google/cloud/bundles/gcs.py +161 -0
- airflow/providers/google/cloud/hooks/alloy_db.py +2 -3
- airflow/providers/google/cloud/hooks/bigquery.py +195 -318
- airflow/providers/google/cloud/hooks/bigquery_dts.py +8 -8
- airflow/providers/google/cloud/hooks/bigtable.py +3 -2
- airflow/providers/google/cloud/hooks/cloud_batch.py +8 -9
- airflow/providers/google/cloud/hooks/cloud_build.py +6 -65
- airflow/providers/google/cloud/hooks/cloud_composer.py +292 -24
- airflow/providers/google/cloud/hooks/cloud_logging.py +109 -0
- airflow/providers/google/cloud/hooks/cloud_memorystore.py +4 -3
- airflow/providers/google/cloud/hooks/cloud_run.py +20 -11
- airflow/providers/google/cloud/hooks/cloud_sql.py +136 -64
- airflow/providers/google/cloud/hooks/cloud_storage_transfer_service.py +35 -15
- airflow/providers/google/cloud/hooks/compute.py +7 -6
- airflow/providers/google/cloud/hooks/compute_ssh.py +7 -4
- airflow/providers/google/cloud/hooks/datacatalog.py +12 -3
- airflow/providers/google/cloud/hooks/dataflow.py +87 -242
- airflow/providers/google/cloud/hooks/dataform.py +9 -14
- airflow/providers/google/cloud/hooks/datafusion.py +7 -9
- airflow/providers/google/cloud/hooks/dataplex.py +13 -12
- airflow/providers/google/cloud/hooks/dataprep.py +2 -2
- airflow/providers/google/cloud/hooks/dataproc.py +76 -74
- airflow/providers/google/cloud/hooks/dataproc_metastore.py +4 -3
- airflow/providers/google/cloud/hooks/dlp.py +5 -4
- airflow/providers/google/cloud/hooks/gcs.py +144 -33
- airflow/providers/google/cloud/hooks/gen_ai.py +196 -0
- airflow/providers/google/cloud/hooks/kms.py +3 -2
- airflow/providers/google/cloud/hooks/kubernetes_engine.py +22 -17
- airflow/providers/google/cloud/hooks/looker.py +6 -1
- airflow/providers/google/cloud/hooks/managed_kafka.py +227 -3
- airflow/providers/google/cloud/hooks/mlengine.py +7 -8
- airflow/providers/google/cloud/hooks/natural_language.py +3 -2
- airflow/providers/google/cloud/hooks/os_login.py +3 -2
- airflow/providers/google/cloud/hooks/pubsub.py +6 -6
- airflow/providers/google/cloud/hooks/secret_manager.py +105 -12
- airflow/providers/google/cloud/hooks/spanner.py +75 -10
- airflow/providers/google/cloud/hooks/speech_to_text.py +3 -2
- airflow/providers/google/cloud/hooks/stackdriver.py +18 -18
- airflow/providers/google/cloud/hooks/tasks.py +4 -3
- airflow/providers/google/cloud/hooks/text_to_speech.py +3 -2
- airflow/providers/google/cloud/hooks/translate.py +8 -17
- airflow/providers/google/cloud/hooks/vertex_ai/auto_ml.py +8 -222
- airflow/providers/google/cloud/hooks/vertex_ai/batch_prediction_job.py +9 -15
- airflow/providers/google/cloud/hooks/vertex_ai/custom_job.py +33 -283
- airflow/providers/google/cloud/hooks/vertex_ai/dataset.py +5 -12
- airflow/providers/google/cloud/hooks/vertex_ai/endpoint_service.py +6 -12
- airflow/providers/google/cloud/hooks/vertex_ai/experiment_service.py +202 -0
- airflow/providers/google/cloud/hooks/vertex_ai/feature_store.py +311 -10
- airflow/providers/google/cloud/hooks/vertex_ai/generative_model.py +79 -75
- airflow/providers/google/cloud/hooks/vertex_ai/hyperparameter_tuning_job.py +7 -13
- airflow/providers/google/cloud/hooks/vertex_ai/model_service.py +8 -12
- airflow/providers/google/cloud/hooks/vertex_ai/pipeline_job.py +6 -12
- airflow/providers/google/cloud/hooks/vertex_ai/prediction_service.py +3 -2
- airflow/providers/google/cloud/hooks/vertex_ai/ray.py +223 -0
- airflow/providers/google/cloud/hooks/video_intelligence.py +3 -2
- airflow/providers/google/cloud/hooks/vision.py +7 -7
- airflow/providers/google/cloud/hooks/workflows.py +4 -3
- airflow/providers/google/cloud/links/alloy_db.py +0 -46
- airflow/providers/google/cloud/links/base.py +77 -7
- airflow/providers/google/cloud/links/bigquery.py +0 -47
- airflow/providers/google/cloud/links/bigquery_dts.py +0 -20
- airflow/providers/google/cloud/links/bigtable.py +0 -48
- airflow/providers/google/cloud/links/cloud_build.py +0 -73
- airflow/providers/google/cloud/links/cloud_functions.py +0 -33
- airflow/providers/google/cloud/links/cloud_memorystore.py +0 -58
- airflow/providers/google/cloud/links/{life_sciences.py → cloud_run.py} +5 -27
- airflow/providers/google/cloud/links/cloud_sql.py +0 -33
- airflow/providers/google/cloud/links/cloud_storage_transfer.py +17 -46
- airflow/providers/google/cloud/links/cloud_tasks.py +7 -26
- airflow/providers/google/cloud/links/compute.py +0 -58
- airflow/providers/google/cloud/links/data_loss_prevention.py +0 -169
- airflow/providers/google/cloud/links/datacatalog.py +23 -54
- airflow/providers/google/cloud/links/dataflow.py +0 -34
- airflow/providers/google/cloud/links/dataform.py +0 -64
- airflow/providers/google/cloud/links/datafusion.py +1 -90
- airflow/providers/google/cloud/links/dataplex.py +0 -154
- airflow/providers/google/cloud/links/dataprep.py +0 -24
- airflow/providers/google/cloud/links/dataproc.py +11 -89
- airflow/providers/google/cloud/links/datastore.py +0 -31
- airflow/providers/google/cloud/links/kubernetes_engine.py +11 -61
- airflow/providers/google/cloud/links/managed_kafka.py +11 -51
- airflow/providers/google/cloud/links/mlengine.py +0 -70
- airflow/providers/google/cloud/links/pubsub.py +0 -32
- airflow/providers/google/cloud/links/spanner.py +0 -33
- airflow/providers/google/cloud/links/stackdriver.py +0 -30
- airflow/providers/google/cloud/links/translate.py +17 -187
- airflow/providers/google/cloud/links/vertex_ai.py +28 -195
- airflow/providers/google/cloud/links/workflows.py +0 -52
- airflow/providers/google/cloud/log/gcs_task_handler.py +166 -118
- airflow/providers/google/cloud/log/stackdriver_task_handler.py +14 -9
- airflow/providers/google/cloud/openlineage/CloudStorageTransferJobFacet.json +68 -0
- airflow/providers/google/cloud/openlineage/CloudStorageTransferRunFacet.json +60 -0
- airflow/providers/google/cloud/openlineage/DataFusionRunFacet.json +32 -0
- airflow/providers/google/cloud/openlineage/facets.py +141 -40
- airflow/providers/google/cloud/openlineage/mixins.py +14 -13
- airflow/providers/google/cloud/openlineage/utils.py +19 -3
- airflow/providers/google/cloud/operators/alloy_db.py +76 -61
- airflow/providers/google/cloud/operators/bigquery.py +104 -667
- airflow/providers/google/cloud/operators/bigquery_dts.py +12 -12
- airflow/providers/google/cloud/operators/bigtable.py +38 -7
- airflow/providers/google/cloud/operators/cloud_base.py +22 -1
- airflow/providers/google/cloud/operators/cloud_batch.py +18 -18
- airflow/providers/google/cloud/operators/cloud_build.py +80 -36
- airflow/providers/google/cloud/operators/cloud_composer.py +157 -71
- airflow/providers/google/cloud/operators/cloud_logging_sink.py +341 -0
- airflow/providers/google/cloud/operators/cloud_memorystore.py +74 -46
- airflow/providers/google/cloud/operators/cloud_run.py +39 -20
- airflow/providers/google/cloud/operators/cloud_sql.py +46 -61
- airflow/providers/google/cloud/operators/cloud_storage_transfer_service.py +92 -14
- airflow/providers/google/cloud/operators/compute.py +18 -50
- airflow/providers/google/cloud/operators/datacatalog.py +167 -29
- airflow/providers/google/cloud/operators/dataflow.py +38 -15
- airflow/providers/google/cloud/operators/dataform.py +19 -7
- airflow/providers/google/cloud/operators/datafusion.py +43 -43
- airflow/providers/google/cloud/operators/dataplex.py +212 -126
- airflow/providers/google/cloud/operators/dataprep.py +1 -5
- airflow/providers/google/cloud/operators/dataproc.py +134 -207
- airflow/providers/google/cloud/operators/dataproc_metastore.py +102 -84
- airflow/providers/google/cloud/operators/datastore.py +22 -6
- airflow/providers/google/cloud/operators/dlp.py +24 -45
- airflow/providers/google/cloud/operators/functions.py +21 -14
- airflow/providers/google/cloud/operators/gcs.py +15 -12
- airflow/providers/google/cloud/operators/gen_ai.py +389 -0
- airflow/providers/google/cloud/operators/kubernetes_engine.py +115 -106
- airflow/providers/google/cloud/operators/looker.py +1 -1
- airflow/providers/google/cloud/operators/managed_kafka.py +362 -40
- airflow/providers/google/cloud/operators/natural_language.py +5 -3
- airflow/providers/google/cloud/operators/pubsub.py +69 -21
- airflow/providers/google/cloud/operators/spanner.py +53 -45
- airflow/providers/google/cloud/operators/speech_to_text.py +5 -4
- airflow/providers/google/cloud/operators/stackdriver.py +5 -11
- airflow/providers/google/cloud/operators/tasks.py +6 -15
- airflow/providers/google/cloud/operators/text_to_speech.py +4 -3
- airflow/providers/google/cloud/operators/translate.py +46 -20
- airflow/providers/google/cloud/operators/translate_speech.py +4 -3
- airflow/providers/google/cloud/operators/vertex_ai/auto_ml.py +44 -34
- airflow/providers/google/cloud/operators/vertex_ai/batch_prediction_job.py +34 -12
- airflow/providers/google/cloud/operators/vertex_ai/custom_job.py +62 -53
- airflow/providers/google/cloud/operators/vertex_ai/dataset.py +75 -11
- airflow/providers/google/cloud/operators/vertex_ai/endpoint_service.py +48 -12
- airflow/providers/google/cloud/operators/vertex_ai/experiment_service.py +435 -0
- airflow/providers/google/cloud/operators/vertex_ai/feature_store.py +532 -1
- airflow/providers/google/cloud/operators/vertex_ai/generative_model.py +135 -116
- airflow/providers/google/cloud/operators/vertex_ai/hyperparameter_tuning_job.py +16 -12
- airflow/providers/google/cloud/operators/vertex_ai/model_service.py +62 -14
- airflow/providers/google/cloud/operators/vertex_ai/pipeline_job.py +35 -10
- airflow/providers/google/cloud/operators/vertex_ai/ray.py +393 -0
- airflow/providers/google/cloud/operators/video_intelligence.py +5 -3
- airflow/providers/google/cloud/operators/vision.py +7 -5
- airflow/providers/google/cloud/operators/workflows.py +24 -19
- airflow/providers/google/cloud/secrets/secret_manager.py +2 -1
- airflow/providers/google/cloud/sensors/bigquery.py +2 -2
- airflow/providers/google/cloud/sensors/bigquery_dts.py +6 -4
- airflow/providers/google/cloud/sensors/bigtable.py +14 -6
- airflow/providers/google/cloud/sensors/cloud_composer.py +535 -33
- airflow/providers/google/cloud/sensors/cloud_storage_transfer_service.py +6 -5
- airflow/providers/google/cloud/sensors/dataflow.py +27 -10
- airflow/providers/google/cloud/sensors/dataform.py +2 -2
- airflow/providers/google/cloud/sensors/datafusion.py +4 -4
- airflow/providers/google/cloud/sensors/dataplex.py +7 -5
- airflow/providers/google/cloud/sensors/dataprep.py +2 -2
- airflow/providers/google/cloud/sensors/dataproc.py +10 -9
- airflow/providers/google/cloud/sensors/dataproc_metastore.py +4 -3
- airflow/providers/google/cloud/sensors/gcs.py +22 -21
- airflow/providers/google/cloud/sensors/looker.py +5 -5
- airflow/providers/google/cloud/sensors/pubsub.py +20 -20
- airflow/providers/google/cloud/sensors/tasks.py +2 -2
- airflow/providers/google/cloud/sensors/vertex_ai/feature_store.py +2 -2
- airflow/providers/google/cloud/sensors/workflows.py +6 -4
- airflow/providers/google/cloud/transfers/adls_to_gcs.py +1 -1
- airflow/providers/google/cloud/transfers/azure_blob_to_gcs.py +2 -2
- airflow/providers/google/cloud/transfers/azure_fileshare_to_gcs.py +2 -2
- airflow/providers/google/cloud/transfers/bigquery_to_bigquery.py +11 -8
- airflow/providers/google/cloud/transfers/bigquery_to_gcs.py +14 -13
- airflow/providers/google/cloud/transfers/bigquery_to_mssql.py +7 -3
- airflow/providers/google/cloud/transfers/bigquery_to_mysql.py +12 -1
- airflow/providers/google/cloud/transfers/bigquery_to_postgres.py +24 -10
- airflow/providers/google/cloud/transfers/bigquery_to_sql.py +104 -5
- airflow/providers/google/cloud/transfers/calendar_to_gcs.py +1 -1
- airflow/providers/google/cloud/transfers/cassandra_to_gcs.py +18 -22
- airflow/providers/google/cloud/transfers/facebook_ads_to_gcs.py +4 -5
- airflow/providers/google/cloud/transfers/gcs_to_bigquery.py +45 -38
- airflow/providers/google/cloud/transfers/gcs_to_gcs.py +2 -2
- airflow/providers/google/cloud/transfers/gcs_to_local.py +5 -3
- airflow/providers/google/cloud/transfers/gcs_to_sftp.py +10 -4
- airflow/providers/google/cloud/transfers/gdrive_to_gcs.py +6 -2
- airflow/providers/google/cloud/transfers/gdrive_to_local.py +2 -2
- airflow/providers/google/cloud/transfers/http_to_gcs.py +193 -0
- airflow/providers/google/cloud/transfers/local_to_gcs.py +2 -2
- airflow/providers/google/cloud/transfers/mssql_to_gcs.py +1 -1
- airflow/providers/google/cloud/transfers/oracle_to_gcs.py +36 -11
- airflow/providers/google/cloud/transfers/postgres_to_gcs.py +44 -12
- airflow/providers/google/cloud/transfers/s3_to_gcs.py +12 -6
- airflow/providers/google/cloud/transfers/salesforce_to_gcs.py +2 -2
- airflow/providers/google/cloud/transfers/sftp_to_gcs.py +36 -14
- airflow/providers/google/cloud/transfers/sheets_to_gcs.py +3 -3
- airflow/providers/google/cloud/transfers/sql_to_gcs.py +10 -10
- airflow/providers/google/cloud/triggers/bigquery.py +75 -34
- airflow/providers/google/cloud/triggers/bigquery_dts.py +2 -1
- airflow/providers/google/cloud/triggers/cloud_batch.py +2 -1
- airflow/providers/google/cloud/triggers/cloud_build.py +3 -2
- airflow/providers/google/cloud/triggers/cloud_composer.py +303 -47
- airflow/providers/google/cloud/triggers/cloud_run.py +2 -2
- airflow/providers/google/cloud/triggers/cloud_storage_transfer_service.py +96 -5
- airflow/providers/google/cloud/triggers/dataflow.py +125 -2
- airflow/providers/google/cloud/triggers/datafusion.py +1 -1
- airflow/providers/google/cloud/triggers/dataplex.py +16 -3
- airflow/providers/google/cloud/triggers/dataproc.py +124 -53
- airflow/providers/google/cloud/triggers/kubernetes_engine.py +46 -28
- airflow/providers/google/cloud/triggers/mlengine.py +1 -1
- airflow/providers/google/cloud/triggers/pubsub.py +17 -20
- airflow/providers/google/cloud/triggers/vertex_ai.py +8 -7
- airflow/providers/google/cloud/utils/bigquery.py +5 -7
- airflow/providers/google/cloud/utils/bigquery_get_data.py +1 -1
- airflow/providers/google/cloud/utils/credentials_provider.py +4 -3
- airflow/providers/google/cloud/utils/dataform.py +1 -1
- airflow/providers/google/cloud/utils/external_token_supplier.py +0 -1
- airflow/providers/google/cloud/utils/field_validator.py +1 -2
- airflow/providers/google/cloud/utils/validators.py +43 -0
- airflow/providers/google/common/auth_backend/google_openid.py +26 -9
- airflow/providers/google/common/consts.py +2 -1
- airflow/providers/google/common/deprecated.py +2 -1
- airflow/providers/google/common/hooks/base_google.py +40 -43
- airflow/providers/google/common/hooks/operation_helpers.py +78 -0
- airflow/providers/google/common/links/storage.py +0 -22
- airflow/providers/google/common/utils/get_secret.py +31 -0
- airflow/providers/google/common/utils/id_token_credentials.py +4 -5
- airflow/providers/google/firebase/operators/firestore.py +2 -2
- airflow/providers/google/get_provider_info.py +61 -216
- airflow/providers/google/go_module_utils.py +35 -3
- airflow/providers/google/leveldb/hooks/leveldb.py +30 -6
- airflow/providers/google/leveldb/operators/leveldb.py +2 -2
- airflow/providers/google/marketing_platform/hooks/analytics_admin.py +3 -2
- airflow/providers/google/marketing_platform/hooks/display_video.py +3 -109
- airflow/providers/google/marketing_platform/hooks/search_ads.py +1 -1
- airflow/providers/google/marketing_platform/links/analytics_admin.py +4 -5
- airflow/providers/google/marketing_platform/operators/analytics_admin.py +7 -6
- airflow/providers/google/marketing_platform/operators/campaign_manager.py +5 -5
- airflow/providers/google/marketing_platform/operators/display_video.py +28 -489
- airflow/providers/google/marketing_platform/operators/search_ads.py +2 -2
- airflow/providers/google/marketing_platform/sensors/campaign_manager.py +2 -2
- airflow/providers/google/marketing_platform/sensors/display_video.py +4 -64
- airflow/providers/google/suite/hooks/calendar.py +1 -1
- airflow/providers/google/suite/hooks/drive.py +2 -2
- airflow/providers/google/suite/hooks/sheets.py +15 -1
- airflow/providers/google/suite/operators/sheets.py +8 -3
- airflow/providers/google/suite/sensors/drive.py +2 -2
- airflow/providers/google/suite/transfers/gcs_to_gdrive.py +2 -2
- airflow/providers/google/suite/transfers/gcs_to_sheets.py +1 -1
- airflow/providers/google/suite/transfers/local_to_drive.py +3 -3
- airflow/providers/google/suite/transfers/sql_to_sheets.py +5 -4
- airflow/providers/google/version_compat.py +15 -1
- {apache_airflow_providers_google-14.0.0.dist-info → apache_airflow_providers_google-19.1.0rc1.dist-info}/METADATA +117 -72
- apache_airflow_providers_google-19.1.0rc1.dist-info/RECORD +331 -0
- {apache_airflow_providers_google-14.0.0.dist-info → apache_airflow_providers_google-19.1.0rc1.dist-info}/WHEEL +1 -1
- apache_airflow_providers_google-19.1.0rc1.dist-info/licenses/NOTICE +5 -0
- airflow/providers/google/cloud/example_dags/example_cloud_task.py +0 -54
- airflow/providers/google/cloud/hooks/automl.py +0 -679
- airflow/providers/google/cloud/hooks/life_sciences.py +0 -159
- airflow/providers/google/cloud/links/automl.py +0 -193
- airflow/providers/google/cloud/operators/automl.py +0 -1360
- airflow/providers/google/cloud/operators/life_sciences.py +0 -119
- airflow/providers/google/cloud/operators/mlengine.py +0 -1515
- airflow/providers/google/cloud/utils/mlengine_operator_utils.py +0 -273
- apache_airflow_providers_google-14.0.0.dist-info/RECORD +0 -318
- /airflow/providers/google/cloud/{example_dags → bundles}/__init__.py +0 -0
- {apache_airflow_providers_google-14.0.0.dist-info → apache_airflow_providers_google-19.1.0rc1.dist-info}/entry_points.txt +0 -0
- {airflow/providers/google → apache_airflow_providers_google-19.1.0rc1.dist-info/licenses}/LICENSE +0 -0

airflow/providers/google/cloud/hooks/bigquery.py

@@ -20,41 +20,20 @@
 
 from __future__ import annotations
 
-import asyncio
 import json
 import logging
 import re
 import time
 import uuid
+import warnings
 from collections.abc import Iterable, Mapping, Sequence
 from copy import deepcopy
 from datetime import datetime, timedelta
-from typing import TYPE_CHECKING, Any, NoReturn,
+from typing import TYPE_CHECKING, Any, Literal, NoReturn, cast, overload
 
+import pendulum
 from aiohttp import ClientSession as ClientSession
 from gcloud.aio.bigquery import Job, Table as Table_async
-from googleapiclient.discovery import build
-from pandas_gbq import read_gbq
-from pandas_gbq.gbq import GbqConnector  # noqa: F401 used in ``airflow.contrib.hooks.bigquery``
-from requests import Session
-from sqlalchemy import create_engine
-
-from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning
-from airflow.providers.common.compat.lineage.hook import get_hook_lineage_collector
-from airflow.providers.common.sql.hooks.sql import DbApiHook
-from airflow.providers.google.cloud.utils.bigquery import bq_cast
-from airflow.providers.google.cloud.utils.credentials_provider import _get_scopes
-from airflow.providers.google.common.consts import CLIENT_INFO
-from airflow.providers.google.common.deprecated import deprecated
-from airflow.providers.google.common.hooks.base_google import (
-    PROVIDE_PROJECT_ID,
-    GoogleBaseAsyncHook,
-    GoogleBaseHook,
-    get_field,
-)
-from airflow.utils.hashlib_wrapper import md5
-from airflow.utils.helpers import convert_camel_to_snake
-from airflow.utils.log.logging_mixin import LoggingMixin
 from google.cloud.bigquery import (
     DEFAULT_RETRY,
     Client,

@@ -75,16 +54,47 @@ from google.cloud.bigquery.table import (
     TableReference,
 )
 from google.cloud.exceptions import NotFound
+from googleapiclient.discovery import build
+from pandas_gbq import read_gbq
+from pandas_gbq.gbq import GbqConnector  # noqa: F401 used in ``airflow.contrib.hooks.bigquery``
+from sqlalchemy import create_engine
+
+from airflow.exceptions import (
+    AirflowException,
+    AirflowOptionalProviderFeatureException,
+    AirflowProviderDeprecationWarning,
+)
+from airflow.providers.common.compat.lineage.hook import get_hook_lineage_collector
+from airflow.providers.common.sql.hooks.sql import DbApiHook
+from airflow.providers.google.cloud.utils.bigquery import bq_cast
+from airflow.providers.google.cloud.utils.credentials_provider import _get_scopes
+from airflow.providers.google.common.consts import CLIENT_INFO
+from airflow.providers.google.common.deprecated import deprecated
+from airflow.providers.google.common.hooks.base_google import (
+    _UNSET,
+    PROVIDE_PROJECT_ID,
+    GoogleBaseAsyncHook,
+    GoogleBaseHook,
+    get_field,
+)
+from airflow.providers.google.version_compat import AIRFLOW_V_3_0_PLUS
+from airflow.utils.hashlib_wrapper import md5
+from airflow.utils.helpers import convert_camel_to_snake
+from airflow.utils.log.logging_mixin import LoggingMixin
+from airflow.utils.types import DagRunType
 
 if TYPE_CHECKING:
     import pandas as pd
-
+    import polars as pl
     from google.api_core.page_iterator import HTTPIterator
     from google.api_core.retry import Retry
+    from requests import Session
+
+    from airflow.sdk import Context
 
 log = logging.getLogger(__name__)
 
-BigQueryJob =
+BigQueryJob = CopyJob | QueryJob | LoadJob | ExtractJob
 
 
 class BigQueryHook(GoogleBaseHook, DbApiHook):

@@ -121,10 +131,10 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
         from wtforms import validators
         from wtforms.fields.simple import BooleanField, StringField
 
-        from airflow.
+        from airflow.providers.google.cloud.utils.validators import ValidJson
 
         connection_form_widgets = super().get_connection_form_widgets()
-        connection_form_widgets["use_legacy_sql"] = BooleanField(lazy_gettext("Use Legacy SQL")
+        connection_form_widgets["use_legacy_sql"] = BooleanField(lazy_gettext("Use Legacy SQL"))
         connection_form_widgets["location"] = StringField(
             lazy_gettext("Location"), widget=BS3TextFieldWidget()
         )

@@ -152,21 +162,47 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
 
     def __init__(
         self,
-        use_legacy_sql: bool =
-        location: str | None =
-        priority: str =
-        api_resource_configs: dict | None =
+        use_legacy_sql: bool | object = _UNSET,
+        location: str | None | object = _UNSET,
+        priority: str | object = _UNSET,
+        api_resource_configs: dict | None | object = _UNSET,
         impersonation_scopes: str | Sequence[str] | None = None,
-        labels: dict | None =
+        labels: dict | None | object = _UNSET,
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
-
-
-
+        # Use sentinel pattern to distinguish "not provided" from "explicitly provided"
+        if use_legacy_sql is _UNSET:
+            value = self._get_field("use_legacy_sql", _UNSET)
+            self.use_legacy_sql: bool = value if value is not None else True
+        else:
+            self.use_legacy_sql = use_legacy_sql  # type: ignore[assignment]
+
+        if location is _UNSET:
+            self.location: str | None = self._get_field("location", _UNSET)
+        else:
+            self.location = location  # type: ignore[assignment]
+
+        if priority is _UNSET:
+            value = self._get_field("priority", _UNSET)
+            self.priority: str = value if value is not None else "INTERACTIVE"
+        else:
+            self.priority = priority  # type: ignore[assignment]
+
         self.running_job_id: str | None = None
-
-
+
+        if api_resource_configs is _UNSET:
+            value = self._get_field("api_resource_configs", _UNSET)
+            self.api_resource_configs: dict = value if value is not None else {}
+        else:
+            self.api_resource_configs = api_resource_configs or {}  # type: ignore[assignment]
+
+        if labels is _UNSET:
+            value = self._get_field("labels", _UNSET)
+            self.labels = value if value is not None else {}
+        else:
+            self.labels = labels or {}  # type: ignore[assignment]
+
         self.impersonation_scopes: str | Sequence[str] | None = impersonation_scopes
 
     def get_conn(self) -> BigQueryConnection:

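The hunk above replaces plain defaults in BigQueryHook.__init__ with an _UNSET sentinel, so a value stored on the Airflow connection only applies when the caller passed nothing at all. A minimal sketch of the idiom, with stand-in names (_UNSET and _get_field here are illustrative placeholders, not the provider's actual objects):

    _UNSET = object()  # unique marker meaning "caller passed nothing"

    class Hook:
        def __init__(self, use_legacy_sql=_UNSET):
            if use_legacy_sql is _UNSET:
                # fall back to the value stored in the connection extras
                value = self._get_field("use_legacy_sql")
                self.use_legacy_sql = value if value is not None else True
            else:
                # an explicit argument always wins, even an explicit False
                self.use_legacy_sql = use_legacy_sql

        def _get_field(self, name):
            return None  # placeholder for the connection-extra lookup

    assert Hook().use_legacy_sql is True               # nothing passed: default applies
    assert Hook(use_legacy_sql=False).use_legacy_sql is False  # explicit value wins

A plain `use_legacy_sql: bool = True` default cannot make that distinction, because an explicitly passed True is indistinguishable from the default.
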
@@ -276,15 +312,57 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
         """
         raise NotImplementedError()
 
-    def
+    def _get_pandas_df(
         self,
         sql: str,
         parameters: Iterable | Mapping[str, Any] | None = None,
         dialect: str | None = None,
         **kwargs,
     ) -> pd.DataFrame:
+        if dialect is None:
+            dialect = "legacy" if self.use_legacy_sql else "standard"
+
+        credentials, project_id = self.get_credentials_and_project_id()
+
+        return read_gbq(sql, project_id=project_id, dialect=dialect, credentials=credentials, **kwargs)
+
+    def _get_polars_df(self, sql, parameters=None, dialect=None, **kwargs) -> pl.DataFrame:
+        try:
+            import polars as pl
+        except ImportError:
+            raise AirflowOptionalProviderFeatureException(
+                "Polars is not installed. Please install it with `pip install polars`."
+            )
+
+        if dialect is None:
+            dialect = "legacy" if self.use_legacy_sql else "standard"
+
+        credentials, project_id = self.get_credentials_and_project_id()
+
+        pandas_df = read_gbq(sql, project_id=project_id, dialect=dialect, credentials=credentials, **kwargs)
+        return pl.from_pandas(pandas_df)
+
+    @overload
+    def get_df(
+        self, sql, parameters=None, dialect=None, *, df_type: Literal["pandas"] = "pandas", **kwargs
+    ) -> pd.DataFrame: ...
+
+    @overload
+    def get_df(
+        self, sql, parameters=None, dialect=None, *, df_type: Literal["polars"], **kwargs
+    ) -> pl.DataFrame: ...
+
+    def get_df(
+        self,
+        sql,
+        parameters=None,
+        dialect=None,
+        *,
+        df_type: Literal["pandas", "polars"] = "pandas",
+        **kwargs,
+    ) -> pd.DataFrame | pl.DataFrame:
         """
-        Get a
+        Get a DataFrame for the BigQuery results.
 
         The DbApiHook method must be overridden because Pandas doesn't support
         PEP 249 connections, except for SQLite.

@@ -300,12 +378,19 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
             defaults to use `self.use_legacy_sql` if not specified
         :param kwargs: (optional) passed into pandas_gbq.read_gbq method
         """
-        if
-
+        if df_type == "polars":
+            return self._get_polars_df(sql, parameters, dialect, **kwargs)
 
-
+        if df_type == "pandas":
+            return self._get_pandas_df(sql, parameters, dialect, **kwargs)
 
-
+    @deprecated(
+        planned_removal_date="November 30, 2025",
+        use_instead="airflow.providers.google.cloud.hooks.bigquery.BigQueryHook.get_df",
+        category=AirflowProviderDeprecationWarning,
+    )
+    def get_pandas_df(self, sql, parameters=None, dialect=None, **kwargs):
+        return self._get_pandas_df(sql, parameters, dialect, **kwargs)
 
     @GoogleBaseHook.fallback_to_default_project_id
     def table_exists(self, dataset_id: str, table_id: str, project_id: str) -> bool:

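Taken together, these two hunks replace get_pandas_df() with a df_type-dispatching get_df(); the old name survives behind a deprecation warning until November 30, 2025. A hypothetical usage sketch, assuming a configured "google_cloud_default" connection (and the optional polars package for the second call):

    from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook

    hook = BigQueryHook(gcp_conn_id="google_cloud_default", use_legacy_sql=False)
    pdf = hook.get_df("SELECT 1 AS x")                     # pandas.DataFrame (default)
    pldf = hook.get_df("SELECT 1 AS x", df_type="polars")  # polars.DataFrame

The typing.overload pair gives static type checkers the correct return type for each df_type literal, while polars stays an optional dependency imported only when requested.
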
@@ -347,135 +432,6 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
         except NotFound:
             return False
 
-    @deprecated(
-        planned_removal_date="July 30, 2025",
-        use_instead="airflow.providers.google.cloud.hooks.bigquery.BigQueryHook.create_table",
-        category=AirflowProviderDeprecationWarning,
-    )
-    @GoogleBaseHook.fallback_to_default_project_id
-    def create_empty_table(
-        self,
-        project_id: str = PROVIDE_PROJECT_ID,
-        dataset_id: str | None = None,
-        table_id: str | None = None,
-        table_resource: dict[str, Any] | None = None,
-        schema_fields: list | None = None,
-        time_partitioning: dict | None = None,
-        cluster_fields: list[str] | None = None,
-        labels: dict | None = None,
-        view: dict | None = None,
-        materialized_view: dict | None = None,
-        encryption_configuration: dict | None = None,
-        retry: Retry = DEFAULT_RETRY,
-        location: str | None = None,
-        exists_ok: bool = True,
-    ) -> Table:
-        """
-        Create a new, empty table in the dataset.
-
-        To create a view, which is defined by a SQL query, parse a dictionary to
-        the *view* argument.
-
-        :param project_id: The project to create the table into.
-        :param dataset_id: The dataset to create the table into.
-        :param table_id: The Name of the table to be created.
-        :param table_resource: Table resource as described in documentation:
-            https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#Table
-            If provided all other parameters are ignored.
-        :param schema_fields: If set, the schema field list as defined here:
-            https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.schema
-
-            .. code-block:: python
-
-                schema_fields = [
-                    {"name": "emp_name", "type": "STRING", "mode": "REQUIRED"},
-                    {"name": "salary", "type": "INTEGER", "mode": "NULLABLE"},
-                ]
-
-        :param labels: a dictionary containing labels for the table, passed to BigQuery
-        :param retry: Optional. How to retry the RPC.
-        :param time_partitioning: configure optional time partitioning fields i.e.
-            partition by field, type and expiration as per API specifications.
-
-            .. seealso::
-                https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#timePartitioning
-        :param cluster_fields: [Optional] The fields used for clustering.
-            BigQuery supports clustering for both partitioned and
-            non-partitioned tables.
-            https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#clustering.fields
-        :param view: [Optional] A dictionary containing definition for the view.
-            If set, it will create a view instead of a table:
-            https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ViewDefinition
-
-            .. code-block:: python
-
-                view = {
-                    "query": "SELECT * FROM `test-project-id.test_dataset_id.test_table_prefix*` LIMIT 1000",
-                    "useLegacySql": False,
-                }
-
-        :param materialized_view: [Optional] The materialized view definition.
-        :param encryption_configuration: [Optional] Custom encryption configuration (e.g., Cloud KMS keys).
-
-            .. code-block:: python
-
-                encryption_configuration = {
-                    "kmsKeyName": "projects/testp/locations/us/keyRings/test-kr/cryptoKeys/test-key",
-                }
-
-        :param num_retries: Maximum number of retries in case of connection problems.
-        :param location: (Optional) The geographic location where the table should reside.
-        :param exists_ok: If ``True``, ignore "already exists" errors when creating the table.
-        :return: Created table
-        """
-        _table_resource: dict[str, Any] = {}
-
-        if self.location:
-            _table_resource["location"] = self.location
-
-        if schema_fields:
-            _table_resource["schema"] = {"fields": schema_fields}
-
-        if time_partitioning:
-            _table_resource["timePartitioning"] = time_partitioning
-
-        if cluster_fields:
-            _table_resource["clustering"] = {"fields": cluster_fields}
-
-        if labels:
-            _table_resource["labels"] = labels
-
-        if view:
-            _table_resource["view"] = view
-
-        if materialized_view:
-            _table_resource["materializedView"] = materialized_view
-
-        if encryption_configuration:
-            _table_resource["encryptionConfiguration"] = encryption_configuration
-
-        table_resource = table_resource or _table_resource
-        table_resource = self._resolve_table_reference(
-            table_resource=table_resource,
-            project_id=project_id,
-            dataset_id=dataset_id,
-            table_id=table_id,
-        )
-        table = Table.from_api_repr(table_resource)
-        result = self.get_client(project_id=project_id, location=location).create_table(
-            table=table, exists_ok=exists_ok, retry=retry
-        )
-        get_hook_lineage_collector().add_output_asset(
-            context=self,
-            scheme="bigquery",
-            asset_kwargs={
-                "project_id": result.project,
-                "dataset_id": result.dataset_id,
-                "table_id": result.table_id,
-            },
-        )
-        return result
-
     @GoogleBaseHook.fallback_to_default_project_id
     def create_table(
         self,

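create_empty_table(), deprecated with a planned removal date of July 30, 2025, is gone in 19.x; its deprecation notice points at create_table() on the same hook. A hedged migration sketch — the full 19.x create_table signature is not shown in this hunk, so verify the parameters against the released code; the project, dataset, and table names here are hypothetical:

    hook.create_table(
        project_id="my-project",
        dataset_id="my_dataset",
        table_id="my_table",
        schema_fields=[
            {"name": "emp_name", "type": "STRING", "mode": "REQUIRED"},
            {"name": "salary", "type": "INTEGER", "mode": "NULLABLE"},
        ],
    )
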
@@ -862,7 +818,7 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
         if return_iterator:
             # The iterator returned by list_datasets() is a HTTPIterator but annotated
             # as Iterator
-            return iterator  #
+            return iterator  # type: ignore
 
         datasets_list = list(iterator)
         self.log.info("Datasets List: %s", len(datasets_list))

@@ -1350,7 +1306,16 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
             job_api_repr.result(timeout=timeout, retry=retry)
         return job_api_repr
 
-    def generate_job_id(
+    def generate_job_id(
+        self,
+        job_id: str | None,
+        dag_id: str,
+        task_id: str,
+        logical_date: datetime | None,
+        configuration: dict,
+        run_after: pendulum.DateTime | datetime | None = None,
+        force_rerun: bool = False,
+    ) -> str:
         if force_rerun:
             hash_base = str(uuid.uuid4())
         else:

@@ -1361,10 +1326,31 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
         if job_id:
            return f"{job_id}_{uniqueness_suffix}"
 
-
-
+        if logical_date is not None:
+            if AIRFLOW_V_3_0_PLUS:
+                warnings.warn(
+                    "The 'logical_date' parameter is deprecated. Please use 'run_after' instead.",
+                    AirflowProviderDeprecationWarning,
+                    stacklevel=1,
+                )
+            job_id_timestamp = logical_date
+        elif run_after is not None:
+            job_id_timestamp = run_after
+        else:
+            job_id_timestamp = pendulum.now("UTC")
+
+        job_id = f"airflow_{dag_id}_{task_id}_{job_id_timestamp.isoformat()}_{uniqueness_suffix}"
         return re.sub(r"[:\-+.]", "_", job_id)
 
+    def get_run_after_or_logical_date(self, context: Context) -> pendulum.DateTime | datetime | None:
+        dag_run = context.get("dag_run")
+        if not dag_run:
+            return pendulum.now("UTC")
+
+        if AIRFLOW_V_3_0_PLUS:
+            return dag_run.start_date
+        return dag_run.start_date if dag_run.run_type == DagRunType.SCHEDULED else context.get("logical_date")
+
     def split_tablename(
         self, table_input: str, default_project_id: str, var_name: str | None = None
     ) -> tuple[str, str, str]:

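generate_job_id() now derives its timestamp from run_after, falling back to the deprecated logical_date (which warns on Airflow 3) and finally to the current UTC time; the companion get_run_after_or_logical_date() picks the right value out of a task context. A hedged sketch of calling the new signature directly (DAG and task names are hypothetical; the suffix is a hash derived from the job configuration, shown here as <hash>):

    import pendulum

    job_id = hook.generate_job_id(
        job_id=None,
        dag_id="example_dag",
        task_id="run_query",
        logical_date=None,  # deprecated; warns on Airflow 3 if set
        configuration={"query": {"query": "SELECT 1"}},
        run_after=pendulum.datetime(2025, 1, 1, tz="UTC"),
    )
    # roughly "airflow_example_dag_run_query_2025_01_01T00_00_00_00_00_<hash>"
    # after the re.sub above replaces ":", "-", "+", and "." with "_"
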
@@ -1377,8 +1363,7 @@ class BigQueryHook(GoogleBaseHook, DbApiHook):
         def var_print(var_name):
             if var_name is None:
                 return ""
-
-                return f"Format exception for {var_name}: "
+            return f"Format exception for {var_name}: "
 
         if table_input.count(".") + table_input.count(":") > 3:
             raise ValueError(f"{var_print(var_name)}Use either : or . to specify project got {table_input}")

@@ -1779,6 +1764,7 @@ class BigQueryCursor(BigQueryBaseCursor):
         schema_update_options: Iterable | None = None,
         priority: str | None = None,
         time_partitioning: dict | None = None,
+        range_partitioning: dict | None = None,
         api_resource_configs: dict | None = None,
         cluster_fields: list[str] | None = None,
         encryption_configuration: dict | None = None,

@@ -1791,6 +1777,10 @@ class BigQueryCursor(BigQueryBaseCursor):
 
         if time_partitioning is None:
             time_partitioning = {}
+        if range_partitioning is None:
+            range_partitioning = {}
+        if time_partitioning and range_partitioning:
+            raise ValueError("Only one of time_partitioning or range_partitioning can be set.")
 
         if not api_resource_configs:
             api_resource_configs = self.hook.api_resource_configs

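run_query() gains integer range partitioning alongside time partitioning, and the two are now explicitly mutually exclusive. A hypothetical configuration dict whose shape mirrors BigQuery's rangePartitioning REST field:

    range_partitioning = {
        "field": "customer_id",
        "range": {"start": "0", "end": "100000", "interval": "1000"},
    }
    # Passing both time_partitioning and range_partitioning now raises ValueError.
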
@@ -1820,14 +1810,6 @@ class BigQueryCursor(BigQueryBaseCursor):
                 f" Please only use one or more of the following options: {allowed_schema_update_options}"
             )
 
-        if schema_update_options:
-            if write_disposition not in ["WRITE_APPEND", "WRITE_TRUNCATE"]:
-                raise ValueError(
-                    "schema_update_options is only "
-                    "allowed if write_disposition is "
-                    "'WRITE_APPEND' or 'WRITE_TRUNCATE'."
-                )
-
         if destination_dataset_table:
             destination_project, destination_dataset, destination_table = self.hook.split_tablename(
                 table_input=destination_dataset_table, default_project_id=self.project_id

@@ -1851,16 +1833,21 @@ class BigQueryCursor(BigQueryBaseCursor):
             (maximum_billing_tier, "maximumBillingTier", None, int),
             (maximum_bytes_billed, "maximumBytesBilled", None, float),
             (time_partitioning, "timePartitioning", {}, dict),
+            (range_partitioning, "rangePartitioning", {}, dict),
             (schema_update_options, "schemaUpdateOptions", None, list),
             (destination_dataset_table, "destinationTable", None, dict),
             (cluster_fields, "clustering", None, dict),
         ]
 
-        for
-
+        for param_raw, param_name, param_default, param_type in query_param_list:
+            param: Any
+            if param_name not in configuration["query"] and param_raw in [None, {}, ()]:
                 if param_name == "timePartitioning":
-
-
+                    param = _cleanse_time_partitioning(destination_dataset_table, time_partitioning)
+                else:
+                    param = param_default
+            else:
+                param = param_raw
 
             if param in [None, {}, ()]:
                 continue

@@ -1887,15 +1874,14 @@ class BigQueryCursor(BigQueryBaseCursor):
                 "must be a dict with {'projectId':'', "
                 "'datasetId':'', 'tableId':''}"
             )
-
-
-
-
-
-
-
-
-        )
+        configuration["query"].update(
+            {
+                "allowLargeResults": allow_large_results,
+                "flattenResults": flatten_results,
+                "writeDisposition": write_disposition,
+                "createDisposition": create_disposition,
+            }
+        )
 
         if (
             "useLegacySql" in configuration["query"]

@@ -1939,75 +1925,6 @@ def _escape(s: str) -> str:
     return e
 
 
-@deprecated(
-    planned_removal_date="April 01, 2025",
-    use_instead="airflow.providers.google.cloud.hooks.bigquery.BigQueryHook.split_tablename",
-    category=AirflowProviderDeprecationWarning,
-)
-def split_tablename(
-    table_input: str, default_project_id: str, var_name: str | None = None
-) -> tuple[str, str, str]:
-    if "." not in table_input:
-        raise ValueError(f"Expected table name in the format of <dataset>.<table>. Got: {table_input}")
-
-    if not default_project_id:
-        raise ValueError("INTERNAL: No default project is specified")
-
-    def var_print(var_name):
-        if var_name is None:
-            return ""
-        else:
-            return f"Format exception for {var_name}: "
-
-    if table_input.count(".") + table_input.count(":") > 3:
-        raise ValueError(f"{var_print(var_name)}Use either : or . to specify project got {table_input}")
-    cmpt = table_input.rsplit(":", 1)
-    project_id = None
-    rest = table_input
-    if len(cmpt) == 1:
-        project_id = None
-        rest = cmpt[0]
-    elif len(cmpt) == 2 and cmpt[0].count(":") <= 1:
-        if cmpt[-1].count(".") != 2:
-            project_id = cmpt[0]
-            rest = cmpt[1]
-    else:
-        raise ValueError(
-            f"{var_print(var_name)}Expect format of (<project:)<dataset>.<table>, got {table_input}"
-        )
-
-    cmpt = rest.split(".")
-    if len(cmpt) == 3:
-        if project_id:
-            raise ValueError(f"{var_print(var_name)}Use either : or . to specify project")
-        project_id = cmpt[0]
-        dataset_id = cmpt[1]
-        table_id = cmpt[2]
-
-    elif len(cmpt) == 2:
-        dataset_id = cmpt[0]
-        table_id = cmpt[1]
-    else:
-        raise ValueError(
-            f"{var_print(var_name)}Expect format of (<project.|<project:)<dataset>.<table>, got {table_input}"
-        )
-
-    # Exclude partition from the table name
-    table_id = table_id.split("$")[0]
-
-    if project_id is None:
-        if var_name is not None:
-            log.info(
-                'Project is not included in %s: %s; using project "%s"',
-                var_name,
-                table_input,
-                default_project_id,
-            )
-        project_id = default_project_id
-
-    return project_id, dataset_id, table_id
-
-
 def _cleanse_time_partitioning(
     destination_dataset_table: str | None, time_partitioning_in: dict | None
 ) -> dict:  # if it is a partitioned table ($ is in the table name) add partition load option

@@ -2117,52 +2034,17 @@ class BigQueryAsyncHook(GoogleBaseAsyncHook):
             job_id=job_id,
             project=project_id,
             token=token,
-            session=cast(Session, session),
+            session=cast("Session", session),
         )
 
     async def _get_job(
         self, job_id: str | None, project_id: str = PROVIDE_PROJECT_ID, location: str | None = None
     ) -> BigQueryJob | UnknownJob:
-        """
-
-
-        WARNING.
-        This is a temporary workaround for issues below, and it's not intended to be used elsewhere!
-        https://github.com/apache/airflow/issues/35833
-        https://github.com/talkiq/gcloud-aio/issues/584
-
-        This method was developed, because neither the `google-cloud-bigquery` nor the `gcloud-aio-bigquery`
-        provides asynchronous access to a BigQuery jobs with location parameter. That's why this method wraps
-        synchronous client call with the event loop's run_in_executor() method.
-
-        This workaround must be deleted along with the method _get_job_sync() and replaced by more robust and
-        cleaner solution in one of two cases:
-        1. The `google-cloud-bigquery` library provides async client with get_job method, that supports
-        optional parameter `location`
-        2. The `gcloud-aio-bigquery` library supports the `location` parameter in get_job() method.
-        """
-        loop = asyncio.get_event_loop()
-        job = await loop.run_in_executor(None, self._get_job_sync, job_id, project_id, location)
+        """Get BigQuery job by its ID, project ID and location."""
+        sync_hook = await self.get_sync_hook()
+        job = sync_hook.get_job(job_id=job_id, project_id=project_id, location=location)
         return job
 
-    def _get_job_sync(self, job_id, project_id, location):
-        """
-        Get BigQuery job by its ID, project ID and location synchronously.
-
-        WARNING
-        This is a temporary workaround for issues below, and it's not intended to be used elsewhere!
-        https://github.com/apache/airflow/issues/35833
-        https://github.com/talkiq/gcloud-aio/issues/584
-
-        This workaround must be deleted along with the method _get_job() and replaced by more robust and
-        cleaner solution in one of two cases:
-        1. The `google-cloud-bigquery` library provides async client with get_job method, that supports
-        optional parameter `location`
-        2. The `gcloud-aio-bigquery` library supports the `location` parameter in get_job() method.
-        """
-        hook = BigQueryHook(**self._hook_kwargs)
-        return hook.get_job(job_id=job_id, project_id=project_id, location=location)
-
     async def get_job_status(
         self, job_id: str | None, project_id: str = PROVIDE_PROJECT_ID, location: str | None = None
     ) -> dict[str, str]:

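The cast(Session, ...) → cast("Session", ...) changes throughout this file follow from moving the requests import under TYPE_CHECKING: a bare Session at runtime would now raise NameError, while typing.cast accepts a forward-reference string and is a no-op at runtime. A minimal standalone illustration of the pattern (use() here is just a demo name):

    from typing import TYPE_CHECKING, cast

    if TYPE_CHECKING:
        from requests import Session  # seen by type checkers only

    def use(session):
        s = cast("Session", session)  # no runtime dependency on requests
        return s
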
@@ -2182,7 +2064,7 @@ class BigQueryAsyncHook(GoogleBaseAsyncHook):
         async with ClientSession() as session:
             self.log.info("Executing get_job_output..")
             job_client = await self.get_job_instance(project_id, job_id, session)
-            job_query_response = await job_client.get_query_results(cast(Session, session))
+            job_query_response = await job_client.get_query_results(cast("Session", session))
         return job_query_response
 
     async def create_job_for_partition_get(

@@ -2202,7 +2084,7 @@ class BigQueryAsyncHook(GoogleBaseAsyncHook):
                 + (f" WHERE table_name='{table_id}'" if table_id else ""),
                 "useLegacySql": False,
             }
-            job_query_resp = await job_client.query(query_request, cast(Session, session))
+            job_query_resp = await job_client.query(query_request, cast("Session", session))
             return job_query_resp["jobReference"]["jobId"]
 
     async def cancel_job(self, job_id: str, project_id: str | None, location: str | None) -> None:

@@ -2265,7 +2147,7 @@ class BigQueryAsyncHook(GoogleBaseAsyncHook):
         self,
         sql: str,
         pass_value: Any,
-        records: list[Any],
+        records: list[Any] | None = None,
         tolerance: float | None = None,
     ) -> None:
         """

@@ -2382,12 +2264,7 @@ class BigQueryAsyncHook(GoogleBaseAsyncHook):
             test_results[metric] = float(ratios[metric]) < threshold
 
         self.log.info(
-            (
-                "Current metric for %s: %s\n"
-                "Past metric for %s: %s\n"
-                "Ratio for %s: %s\n"
-                "Threshold: %s\n"
-            ),
+            ("Current metric for %s: %s\nPast metric for %s: %s\nRatio for %s: %s\nThreshold: %s\n"),
             metric,
             cur,
             metric,

@@ -2452,5 +2329,5 @@ class BigQueryTableAsyncHook(GoogleBaseAsyncHook):
             table_name=table_id,
             project=project_id,
             token=token,
-            session=cast(Session, session),
+            session=cast("Session", session),
         )